summaryrefslogtreecommitdiffstats
path: root/src/shared
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-10 20:49:52 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-10 20:49:52 +0000
commit55944e5e40b1be2afc4855d8d2baf4b73d1876b5 (patch)
tree33f869f55a1b149e9b7c2b7e201867ca5dd52992 /src/shared
parentInitial commit. (diff)
downloadsystemd-55944e5e40b1be2afc4855d8d2baf4b73d1876b5.tar.xz
systemd-55944e5e40b1be2afc4855d8d2baf4b73d1876b5.zip
Adding upstream version 255.4.upstream/255.4
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/shared')
-rw-r--r--src/shared/acl-util.c652
-rw-r--r--src/shared/acl-util.h60
-rw-r--r--src/shared/acpi-fpdt.c187
-rw-r--r--src/shared/acpi-fpdt.h6
-rw-r--r--src/shared/apparmor-util.c22
-rw-r--r--src/shared/apparmor-util.h6
-rw-r--r--src/shared/ask-password-api.c1002
-rw-r--r--src/shared/ask-password-api.h23
-rw-r--r--src/shared/async.c137
-rw-r--r--src/shared/async.h26
-rw-r--r--src/shared/barrier.c394
-rw-r--r--src/shared/barrier.h74
-rw-r--r--src/shared/base-filesystem.c210
-rw-r--r--src/shared/base-filesystem.h7
-rw-r--r--src/shared/battery-util.c283
-rw-r--r--src/shared/battery-util.h11
-rw-r--r--src/shared/binfmt-util.c55
-rw-r--r--src/shared/binfmt-util.h5
-rw-r--r--src/shared/bitmap.c211
-rw-r--r--src/shared/bitmap.h36
-rw-r--r--src/shared/blkid-util.h47
-rw-r--r--src/shared/blockdev-util.c828
-rw-r--r--src/shared/blockdev-util.h61
-rw-r--r--src/shared/bond-util.c73
-rw-r--r--src/shared/bond-util.h106
-rw-r--r--src/shared/boot-entry.c273
-rw-r--r--src/shared/boot-entry.h35
-rw-r--r--src/shared/boot-timestamps.c46
-rw-r--r--src/shared/boot-timestamps.h6
-rw-r--r--src/shared/bootspec.c1434
-rw-r--r--src/shared/bootspec.h129
-rw-r--r--src/shared/bpf-compat.h54
-rw-r--r--src/shared/bpf-dlopen.c146
-rw-r--r--src/shared/bpf-dlopen.h34
-rw-r--r--src/shared/bpf-link.c43
-rw-r--r--src/shared/bpf-link.h16
-rw-r--r--src/shared/bpf-program.c513
-rw-r--r--src/shared/bpf-program.h65
-rw-r--r--src/shared/bridge-util.c13
-rw-r--r--src/shared/bridge-util.h20
-rw-r--r--src/shared/btrfs-util.c2164
-rw-r--r--src/shared/btrfs-util.h149
-rw-r--r--src/shared/bus-get-properties.c166
-rw-r--r--src/shared/bus-get-properties.h101
-rw-r--r--src/shared/bus-locator.c231
-rw-r--r--src/shared/bus-locator.h37
-rw-r--r--src/shared/bus-log-control-api.c114
-rw-r--r--src/shared/bus-log-control-api.h19
-rw-r--r--src/shared/bus-map-properties.c251
-rw-r--r--src/shared/bus-map-properties.h25
-rw-r--r--src/shared/bus-message-util.c185
-rw-r--r--src/shared/bus-message-util.h18
-rw-r--r--src/shared/bus-object.c177
-rw-r--r--src/shared/bus-object.h34
-rw-r--r--src/shared/bus-polkit.c575
-rw-r--r--src/shared/bus-polkit.h11
-rw-r--r--src/shared/bus-print-properties.c440
-rw-r--r--src/shared/bus-print-properties.h21
-rw-r--r--src/shared/bus-unit-procs.c402
-rw-r--r--src/shared/bus-unit-procs.h8
-rw-r--r--src/shared/bus-unit-util.c2938
-rw-r--r--src/shared/bus-unit-util.h37
-rw-r--r--src/shared/bus-util.c711
-rw-r--r--src/shared/bus-util.h75
-rw-r--r--src/shared/bus-wait-for-jobs.c333
-rw-r--r--src/shared/bus-wait-for-jobs.h16
-rw-r--r--src/shared/bus-wait-for-units.c426
-rw-r--r--src/shared/bus-wait-for-units.h35
-rw-r--r--src/shared/calendarspec.c1435
-rw-r--r--src/shared/calendarspec.h44
-rw-r--r--src/shared/cgroup-setup.c1008
-rw-r--r--src/shared/cgroup-setup.h38
-rw-r--r--src/shared/cgroup-show.c471
-rw-r--r--src/shared/cgroup-show.h24
-rw-r--r--src/shared/chown-recursive.c177
-rw-r--r--src/shared/chown-recursive.h8
-rw-r--r--src/shared/clean-ipc.c452
-rw-r--r--src/shared/clean-ipc.h17
-rw-r--r--src/shared/clock-util.c167
-rw-r--r--src/shared/clock-util.h20
-rw-r--r--src/shared/common-signal.c85
-rw-r--r--src/shared/common-signal.h63
-rw-r--r--src/shared/compare-operator.c119
-rw-r--r--src/shared/compare-operator.h62
-rw-r--r--src/shared/condition.c1360
-rw-r--r--src/shared/condition.h113
-rw-r--r--src/shared/conf-parser.c1984
-rw-r--r--src/shared/conf-parser.h481
-rw-r--r--src/shared/copy.c1635
-rw-r--r--src/shared/copy.h106
-rw-r--r--src/shared/coredump-util.c179
-rw-r--r--src/shared/coredump-util.h43
-rw-r--r--src/shared/cpu-set-util.c292
-rw-r--r--src/shared/cpu-set-util.h52
-rw-r--r--src/shared/creds-util.c1395
-rw-r--r--src/shared/creds-util.h79
-rw-r--r--src/shared/cryptsetup-fido2.c276
-rw-r--r--src/shared/cryptsetup-fido2.h82
-rw-r--r--src/shared/cryptsetup-util.c349
-rw-r--r--src/shared/cryptsetup-util.h111
-rw-r--r--src/shared/daemon-util.c76
-rw-r--r--src/shared/daemon-util.h28
-rw-r--r--src/shared/data-fd-util.c391
-rw-r--r--src/shared/data-fd-util.h16
-rw-r--r--src/shared/dev-setup.c137
-rw-r--r--src/shared/dev-setup.h10
-rw-r--r--src/shared/device-nodes.c87
-rw-r--r--src/shared/device-nodes.h9
-rw-r--r--src/shared/devnode-acl.c226
-rw-r--r--src/shared/devnode-acl.h34
-rw-r--r--src/shared/discover-image.c1385
-rw-r--r--src/shared/discover-image.h122
-rw-r--r--src/shared/dissect-image.c4069
-rw-r--r--src/shared/dissect-image.h230
-rw-r--r--src/shared/dlfcn-util.c64
-rw-r--r--src/shared/dlfcn-util.h39
-rw-r--r--src/shared/dm-util.c45
-rw-r--r--src/shared/dm-util.h4
-rw-r--r--src/shared/dns-domain.c1421
-rw-r--r--src/shared/dns-domain.h104
-rw-r--r--src/shared/dropin.c278
-rw-r--r--src/shared/dropin.h26
-rw-r--r--src/shared/edit-util.c370
-rw-r--r--src/shared/edit-util.h40
-rw-r--r--src/shared/efi-api.c556
-rw-r--r--src/shared/efi-api.h74
-rw-r--r--src/shared/efi-loader.c363
-rw-r--r--src/shared/efi-loader.h63
-rw-r--r--src/shared/elf-util.c899
-rw-r--r--src/shared/elf-util.h18
-rw-r--r--src/shared/enable-mempool.c19
-rw-r--r--src/shared/env-file-label.c35
-rw-r--r--src/shared/env-file-label.h10
-rw-r--r--src/shared/ethtool-link-mode.py61
-rw-r--r--src/shared/ethtool-util.c1423
-rw-r--r--src/shared/ethtool-util.h205
-rw-r--r--src/shared/exec-util.c605
-rw-r--r--src/shared/exec-util.h64
-rw-r--r--src/shared/exit-status.c179
-rw-r--r--src/shared/exit-status.h113
-rw-r--r--src/shared/extension-util.c166
-rw-r--r--src/shared/extension-util.h23
-rw-r--r--src/shared/fdisk-util.c163
-rw-r--r--src/shared/fdisk-util.h25
-rw-r--r--src/shared/fdset.c323
-rw-r--r--src/shared/fdset.h47
-rw-r--r--src/shared/fileio-label.c43
-rw-r--r--src/shared/fileio-label.h15
-rw-r--r--src/shared/find-esp.c909
-rw-r--r--src/shared/find-esp.h15
-rw-r--r--src/shared/firewall-util-iptables.c392
-rw-r--r--src/shared/firewall-util-nft.c1372
-rw-r--r--src/shared/firewall-util-private.h69
-rw-r--r--src/shared/firewall-util.c160
-rw-r--r--src/shared/firewall-util.h104
-rw-r--r--src/shared/format-table.c3061
-rw-r--r--src/shared/format-table.h165
-rw-r--r--src/shared/fsck-util.h14
-rw-r--r--src/shared/fstab-util.c366
-rw-r--r--src/shared/fstab-util.h59
-rwxr-xr-xsrc/shared/generate-ip-protocol-list.sh9
-rwxr-xr-xsrc/shared/generate-syscall-list.py7
-rw-r--r--src/shared/generator.c888
-rw-r--r--src/shared/generator.h105
-rw-r--r--src/shared/geneve-util.c12
-rw-r--r--src/shared/geneve-util.h17
-rw-r--r--src/shared/gpt.c361
-rw-r--r--src/shared/gpt.h102
-rw-r--r--src/shared/group-record.c347
-rw-r--r--src/shared/group-record.h46
-rw-r--r--src/shared/hibernate-util.c520
-rw-r--r--src/shared/hibernate-util.h26
-rw-r--r--src/shared/hostname-setup.c213
-rw-r--r--src/shared/hostname-setup.h25
-rw-r--r--src/shared/hwdb-util.c712
-rw-r--r--src/shared/hwdb-util.h10
-rw-r--r--src/shared/id128-print.c74
-rw-r--r--src/shared/id128-print.h19
-rw-r--r--src/shared/idn-util.c69
-rw-r--r--src/shared/idn-util.h32
-rw-r--r--src/shared/ima-util.c15
-rw-r--r--src/shared/ima-util.h6
-rw-r--r--src/shared/image-policy.c774
-rw-r--r--src/shared/image-policy.h104
-rw-r--r--src/shared/import-util.c233
-rw-r--r--src/shared/import-util.h36
-rw-r--r--src/shared/in-addr-prefix-util.c325
-rw-r--r--src/shared/in-addr-prefix-util.h23
-rw-r--r--src/shared/initreq.h74
-rw-r--r--src/shared/install-file.c270
-rw-r--r--src/shared/install-file.h14
-rw-r--r--src/shared/install-printf.c125
-rw-r--r--src/shared/install-printf.h11
-rw-r--r--src/shared/install.c3760
-rw-r--r--src/shared/install.h244
-rw-r--r--src/shared/ip-protocol-list.c84
-rw-r--r--src/shared/ip-protocol-list.h14
-rw-r--r--src/shared/ip-protocol-to-name.awk11
-rw-r--r--src/shared/ipvlan-util.c22
-rw-r--r--src/shared/ipvlan-util.h29
-rw-r--r--src/shared/journal-file-util.c534
-rw-r--r--src/shared/journal-file-util.h29
-rw-r--r--src/shared/journal-importer.c482
-rw-r--r--src/shared/journal-importer.h60
-rw-r--r--src/shared/journal-util.c188
-rw-r--r--src/shared/journal-util.h11
-rw-r--r--src/shared/json-internal.h76
-rw-r--r--src/shared/json.c5132
-rw-r--r--src/shared/json.h474
-rw-r--r--src/shared/kbd-util.c155
-rw-r--r--src/shared/kbd-util.h13
-rw-r--r--src/shared/kernel-image.c178
-rw-r--r--src/shared/kernel-image.h24
-rw-r--r--src/shared/keyring-util.c35
-rw-r--r--src/shared/keyring-util.h11
-rw-r--r--src/shared/killall.c319
-rw-r--r--src/shared/killall.h6
-rw-r--r--src/shared/label-util.c141
-rw-r--r--src/shared/label-util.h29
-rw-r--r--src/shared/libcrypt-util.c211
-rw-r--r--src/shared/libcrypt-util.h13
-rw-r--r--src/shared/libfido2-util.c1296
-rw-r--r--src/shared/libfido2-util.h131
-rw-r--r--src/shared/libmount-util.c59
-rw-r--r--src/shared/libmount-util.h20
-rw-r--r--src/shared/libshared.sym3
-rw-r--r--src/shared/linux/README9
-rw-r--r--src/shared/linux/auto_dev-ioctl.h220
-rw-r--r--src/shared/linux/bpf.h7053
-rw-r--r--src/shared/linux/bpf_common.h57
-rw-r--r--src/shared/linux/bpf_insn.h241
-rw-r--r--src/shared/linux/dm-ioctl.h385
-rw-r--r--src/shared/linux/ethtool.h2164
-rw-r--r--src/shared/local-addresses.c506
-rw-r--r--src/shared/local-addresses.h19
-rw-r--r--src/shared/locale-setup.c294
-rw-r--r--src/shared/locale-setup.h29
-rw-r--r--src/shared/log-link.h59
-rw-r--r--src/shared/logs-show.c2102
-rw-r--r--src/shared/logs-show.h77
-rw-r--r--src/shared/loop-util.c1209
-rw-r--r--src/shared/loop-util.h59
-rw-r--r--src/shared/loopback-setup.c232
-rw-r--r--src/shared/loopback-setup.h4
-rw-r--r--src/shared/lsm-util.c33
-rw-r--r--src/shared/lsm-util.h4
-rw-r--r--src/shared/machine-credential.c127
-rw-r--r--src/shared/machine-credential.h14
-rw-r--r--src/shared/machine-id-setup.c295
-rw-r--r--src/shared/machine-id-setup.h7
-rw-r--r--src/shared/machine-pool.c51
-rw-r--r--src/shared/machine-pool.h8
-rw-r--r--src/shared/macvlan-util.c15
-rw-r--r--src/shared/macvlan-util.h17
-rw-r--r--src/shared/main-func.h42
-rw-r--r--src/shared/meson.build375
-rw-r--r--src/shared/mkdir-label.c42
-rw-r--r--src/shared/mkdir-label.h26
-rw-r--r--src/shared/mkfs-util.c684
-rw-r--r--src/shared/mkfs-util.h25
-rw-r--r--src/shared/module-util.c124
-rw-r--r--src/shared/module-util.h12
-rw-r--r--src/shared/mount-setup.c591
-rw-r--r--src/shared/mount-setup.h12
-rw-r--r--src/shared/mount-util.c1785
-rw-r--r--src/shared/mount-util.h143
-rw-r--r--src/shared/net-condition.c399
-rw-r--r--src/shared/net-condition.h47
-rw-r--r--src/shared/netif-naming-scheme.c103
-rw-r--r--src/shared/netif-naming-scheme.h97
-rw-r--r--src/shared/netif-sriov.c643
-rw-r--r--src/shared/netif-sriov.h50
-rw-r--r--src/shared/netif-util.c206
-rw-r--r--src/shared/netif-util.h22
-rw-r--r--src/shared/nscd-flush.c142
-rw-r--r--src/shared/nscd-flush.h8
-rw-r--r--src/shared/nsflags.c67
-rw-r--r--src/shared/nsflags.h23
-rw-r--r--src/shared/numa-util.c188
-rw-r--r--src/shared/numa-util.h35
-rw-r--r--src/shared/open-file.c147
-rw-r--r--src/shared/open-file.h36
-rw-r--r--src/shared/openssl-util.c1149
-rw-r--r--src/shared/openssl-util.h167
-rw-r--r--src/shared/output-mode.c43
-rw-r--r--src/shared/output-mode.h57
-rw-r--r--src/shared/pager.c330
-rw-r--r--src/shared/pager.h17
-rw-r--r--src/shared/pam-util.c211
-rw-r--r--src/shared/pam-util.h41
-rw-r--r--src/shared/parse-argument.c123
-rw-r--r--src/shared/parse-argument.h9
-rw-r--r--src/shared/parse-helpers.c237
-rw-r--r--src/shared/parse-helpers.h38
-rw-r--r--src/shared/password-quality-util-passwdqc.c142
-rw-r--r--src/shared/password-quality-util-passwdqc.h23
-rw-r--r--src/shared/password-quality-util-pwquality.c163
-rw-r--r--src/shared/password-quality-util-pwquality.h27
-rw-r--r--src/shared/password-quality-util.h30
-rw-r--r--src/shared/pcre2-util.c166
-rw-r--r--src/shared/pcre2-util.h44
-rw-r--r--src/shared/pcrextend-util.c152
-rw-r--r--src/shared/pcrextend-util.h5
-rw-r--r--src/shared/pe-binary.c241
-rw-r--r--src/shared/pe-binary.h144
-rw-r--r--src/shared/pkcs11-util.c1371
-rw-r--r--src/shared/pkcs11-util.h111
-rw-r--r--src/shared/plymouth-util.c33
-rw-r--r--src/shared/plymouth-util.h13
-rw-r--r--src/shared/pretty-print.c421
-rw-r--r--src/shared/pretty-print.h49
-rw-r--r--src/shared/ptyfwd.c677
-rw-r--r--src/shared/ptyfwd.h42
-rw-r--r--src/shared/qrcode-util.c221
-rw-r--r--src/shared/qrcode-util.h22
-rw-r--r--src/shared/quota-util.c42
-rw-r--r--src/shared/quota-util.h19
-rw-r--r--src/shared/reboot-util.c196
-rw-r--r--src/shared/reboot-util.h17
-rw-r--r--src/shared/recovery-key.c109
-rw-r--r--src/shared/recovery-key.h16
-rw-r--r--src/shared/resize-fs.c126
-rw-r--r--src/shared/resize-fs.h17
-rw-r--r--src/shared/resolve-util.c52
-rw-r--r--src/shared/resolve-util.h99
-rw-r--r--src/shared/rm-rf.c519
-rw-r--r--src/shared/rm-rf.h59
-rw-r--r--src/shared/seccomp-util.c2499
-rw-r--r--src/shared/seccomp-util.h180
-rw-r--r--src/shared/securebits-util.c66
-rw-r--r--src/shared/securebits-util.h20
-rw-r--r--src/shared/selinux-util.c762
-rw-r--r--src/shared/selinux-util.h50
-rw-r--r--src/shared/serialize.c552
-rw-r--r--src/shared/serialize.h53
-rw-r--r--src/shared/service-util.c87
-rw-r--r--src/shared/service-util.h10
-rw-r--r--src/shared/sleep-config.c390
-rw-r--r--src/shared/sleep-config.h59
-rw-r--r--src/shared/smack-util.c311
-rw-r--r--src/shared/smack-util.h53
-rw-r--r--src/shared/socket-label.c132
-rw-r--r--src/shared/socket-netlink.c409
-rw-r--r--src/shared/socket-netlink.h44
-rw-r--r--src/shared/spawn-ask-password-agent.c59
-rw-r--r--src/shared/spawn-ask-password-agent.h11
-rw-r--r--src/shared/spawn-polkit-agent.c96
-rw-r--r--src/shared/spawn-polkit-agent.h11
-rw-r--r--src/shared/specifier.c498
-rw-r--r--src/shared/specifier.h108
-rw-r--r--src/shared/switch-root.c212
-rw-r--r--src/shared/switch-root.h13
-rw-r--r--src/shared/test-tables.h43
-rw-r--r--src/shared/tests.c346
-rw-r--r--src/shared/tests.h181
-rw-r--r--src/shared/tmpfile-util-label.c30
-rw-r--r--src/shared/tmpfile-util-label.h14
-rw-r--r--src/shared/tomoyo-util.c15
-rw-r--r--src/shared/tomoyo-util.h6
-rw-r--r--src/shared/tpm2-event-log.c67
-rw-r--r--src/shared/tpm2-event-log.h139
-rw-r--r--src/shared/tpm2-util.c7664
-rw-r--r--src/shared/tpm2-util.h478
-rw-r--r--src/shared/udev-util.c439
-rw-r--r--src/shared/udev-util.h34
-rw-r--r--src/shared/user-record-nss.c529
-rw-r--r--src/shared/user-record-nss.h24
-rw-r--r--src/shared/user-record-show.c601
-rw-r--r--src/shared/user-record-show.h10
-rw-r--r--src/shared/user-record.c2319
-rw-r--r--src/shared/user-record.h450
-rw-r--r--src/shared/userdb-dropin.c304
-rw-r--r--src/shared/userdb-dropin.h22
-rw-r--r--src/shared/userdb.c1465
-rw-r--r--src/shared/userdb.h58
-rw-r--r--src/shared/utmp-wtmp.c278
-rw-r--r--src/shared/utmp-wtmp.h52
-rw-r--r--src/shared/varlink-idl.c1603
-rw-r--r--src/shared/varlink-idl.h158
-rw-r--r--src/shared/varlink-internal.h10
-rw-r--r--src/shared/varlink-io.systemd.Journal.c19
-rw-r--r--src/shared/varlink-io.systemd.Journal.h6
-rw-r--r--src/shared/varlink-io.systemd.ManagedOOM.c23
-rw-r--r--src/shared/varlink-io.systemd.ManagedOOM.h6
-rw-r--r--src/shared/varlink-io.systemd.PCRExtend.c14
-rw-r--r--src/shared/varlink-io.systemd.PCRExtend.h6
-rw-r--r--src/shared/varlink-io.systemd.Resolve.Monitor.c176
-rw-r--r--src/shared/varlink-io.systemd.Resolve.Monitor.h6
-rw-r--r--src/shared/varlink-io.systemd.Resolve.c76
-rw-r--r--src/shared/varlink-io.systemd.Resolve.h6
-rw-r--r--src/shared/varlink-io.systemd.UserDatabase.c46
-rw-r--r--src/shared/varlink-io.systemd.UserDatabase.h6
-rw-r--r--src/shared/varlink-io.systemd.c21
-rw-r--r--src/shared/varlink-io.systemd.h6
-rw-r--r--src/shared/varlink-io.systemd.oom.c25
-rw-r--r--src/shared/varlink-io.systemd.oom.h7
-rw-r--r--src/shared/varlink-io.systemd.service.c70
-rw-r--r--src/shared/varlink-io.systemd.service.h10
-rw-r--r--src/shared/varlink-io.systemd.sysext.c67
-rw-r--r--src/shared/varlink-io.systemd.sysext.h6
-rw-r--r--src/shared/varlink-org.varlink.service.c49
-rw-r--r--src/shared/varlink-org.varlink.service.h6
-rw-r--r--src/shared/varlink.c3767
-rw-r--r--src/shared/varlink.h224
-rw-r--r--src/shared/verb-log-control.c51
-rw-r--r--src/shared/verb-log-control.h8
-rw-r--r--src/shared/verbs.c171
-rw-r--r--src/shared/verbs.h23
-rw-r--r--src/shared/vlan-util.c98
-rw-r--r--src/shared/vlan-util.h21
-rw-r--r--src/shared/volatile-util.c46
-rw-r--r--src/shared/volatile-util.h16
-rw-r--r--src/shared/wall.c187
-rw-r--r--src/shared/wall.h27
-rw-r--r--src/shared/watchdog.c504
-rw-r--r--src/shared/watchdog.h21
-rw-r--r--src/shared/web-util.c66
-rw-r--r--src/shared/web-util.h13
-rw-r--r--src/shared/wifi-util.c306
-rw-r--r--src/shared/wifi-util.h16
-rw-r--r--src/shared/xml.c237
-rw-r--r--src/shared/xml.h14
422 files changed, 130213 insertions, 0 deletions
diff --git a/src/shared/acl-util.c b/src/shared/acl-util.c
new file mode 100644
index 0000000..7bfe025
--- /dev/null
+++ b/src/shared/acl-util.c
@@ -0,0 +1,652 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <stdbool.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "acl-util.h"
+#include "alloc-util.h"
+#include "errno-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "user-util.h"
+
+#if HAVE_ACL
+
+/* Looks for the ACL_USER entry of 'acl' whose qualifier equals 'uid'.
+ *
+ * Returns 1 and stores the entry in *ret_entry when one is found, 0 (with *ret_entry set to NULL) when
+ * the ACL contains no such entry, and -errno when one of the libacl calls fails. */
+int acl_find_uid(acl_t acl, uid_t uid, acl_entry_t *ret_entry) {
+        int which = ACL_FIRST_ENTRY;
+
+        assert(acl);
+        assert(uid_is_valid(uid));
+        assert(ret_entry);
+
+        for (;;) {
+                acl_entry_t entry;
+                acl_tag_t tag;
+                uid_t *qualifier;
+                bool match;
+                int k;
+
+                k = acl_get_entry(acl, which, &entry);
+                if (k < 0)
+                        return -errno;
+                if (k == 0)
+                        break;
+
+                /* Every later lookup continues after the entry just fetched. */
+                which = ACL_NEXT_ENTRY;
+
+                if (acl_get_tag_type(entry, &tag) < 0)
+                        return -errno;
+                if (tag != ACL_USER)
+                        continue;
+
+                qualifier = acl_get_qualifier(entry);
+                if (!qualifier)
+                        return -errno;
+
+                /* Compare first, then hand the qualifier back to acl_free(). */
+                match = *qualifier == uid;
+                acl_free(qualifier);
+
+                if (match) {
+                        *ret_entry = entry;
+                        return 1;
+                }
+        }
+
+        *ret_entry = NULL;
+        return 0;
+}
+
+/* Calls acl_calc_mask() on *acl_p unless a mask exists already or none is required.
+ *
+ * Returns 0 if the ACL already contains an ACL_MASK entry or has neither ACL_USER nor ACL_GROUP
+ * entries, 1 if acl_calc_mask() was invoked successfully, and -errno when a libacl call fails. */
+int calc_acl_mask_if_needed(acl_t *acl_p) {
+        bool has_named_entry = false;
+        acl_entry_t entry;
+        int k;
+
+        assert(acl_p);
+
+        k = acl_get_entry(*acl_p, ACL_FIRST_ENTRY, &entry);
+        while (k > 0) {
+                acl_tag_t tag;
+
+                if (acl_get_tag_type(entry, &tag) < 0)
+                        return -errno;
+
+                /* An existing mask is never recalculated. */
+                if (tag == ACL_MASK)
+                        return 0;
+
+                if (tag == ACL_USER || tag == ACL_GROUP)
+                        has_named_entry = true;
+
+                k = acl_get_entry(*acl_p, ACL_NEXT_ENTRY, &entry);
+        }
+        if (k < 0)
+                return -errno;
+
+        if (!has_named_entry)
+                return 0;
+
+        if (acl_calc_mask(acl_p) < 0)
+                return -errno;
+
+        return 1;
+}
+
+/* Makes sure *acl_p carries the three base entries ACL_USER_OBJ, ACL_GROUP_OBJ and ACL_OTHER.
+ *
+ * Base entries that are missing are copied from the ACL which acl_from_mode() builds out of the st_mode
+ * of 'path'; entries that are already present are left alone. Returns 0 on success (also when nothing
+ * had to be added) and -errno on failure. */
+int add_base_acls_if_needed(acl_t *acl_p, const char *path) {
+        _cleanup_(acl_freep) acl_t from_mode = NULL;
+        bool seen_user_obj = false, seen_group_obj = false, seen_other = false;
+        acl_entry_t entry;
+        struct stat st;
+        int k;
+
+        assert(acl_p);
+        assert(path);
+
+        /* Pass 1: find out which base entries exist already. */
+        for (k = acl_get_entry(*acl_p, ACL_FIRST_ENTRY, &entry);
+             k > 0;
+             k = acl_get_entry(*acl_p, ACL_NEXT_ENTRY, &entry)) {
+                acl_tag_t tag;
+
+                if (acl_get_tag_type(entry, &tag) < 0)
+                        return -errno;
+
+                switch (tag) {
+                case ACL_USER_OBJ:
+                        seen_user_obj = true;
+                        break;
+                case ACL_GROUP_OBJ:
+                        seen_group_obj = true;
+                        break;
+                case ACL_OTHER:
+                        seen_other = true;
+                        break;
+                default:
+                        break;
+                }
+
+                /* All three present: nothing to add, skip the stat() altogether. */
+                if (seen_user_obj && seen_group_obj && seen_other)
+                        return 0;
+        }
+        if (k < 0)
+                return -errno;
+
+        if (stat(path, &st) < 0)
+                return -errno;
+
+        from_mode = acl_from_mode(st.st_mode);
+        if (!from_mode)
+                return -errno;
+
+        /* Pass 2: copy over only those mode-derived entries that were not seen in pass 1. */
+        for (k = acl_get_entry(from_mode, ACL_FIRST_ENTRY, &entry);
+             k > 0;
+             k = acl_get_entry(from_mode, ACL_NEXT_ENTRY, &entry)) {
+                acl_entry_t added;
+                acl_tag_t tag;
+                bool present;
+
+                if (acl_get_tag_type(entry, &tag) < 0)
+                        return -errno;
+
+                present = (tag == ACL_USER_OBJ && seen_user_obj) ||
+                          (tag == ACL_GROUP_OBJ && seen_group_obj) ||
+                          (tag == ACL_OTHER && seen_other);
+                if (present)
+                        continue;
+
+                if (acl_create_entry(acl_p, &added) < 0)
+                        return -errno;
+
+                if (acl_copy_entry(added, entry) < 0)
+                        return -errno;
+        }
+        if (k < 0)
+                return -errno;
+
+        return 0;
+}
+
+int acl_search_groups(const char *path, char ***ret_groups) {
+ _cleanup_strv_free_ char **g = NULL;
+ _cleanup_(acl_freep) acl_t acl = NULL;
+ bool ret = false;
+ acl_entry_t entry;
+ int r;
+ /* Walks the ACL_TYPE_DEFAULT ACL of 'path' and inspects its ACL_GROUP entries.
+  *
+  * Returns > 0 if in_gid() gave a positive answer for the gid of at least one such entry, 0 if for
+  * none, and a negative errno-style value on failure. If 'ret_groups' is NULL the function returns at
+  * the first positive in_gid() answer. Otherwise the scan runs to the end and the gid_to_name() result
+  * of every ACL_GROUP entry (not only the ones in_gid() accepted) is returned in *ret_groups.
+  *
+  * NOTE(review): in_gid() is defined outside this view; presumably it tests whether the calling
+  * process is a member of the group — confirm. */
+ assert(path);
+
+ acl = acl_get_file(path, ACL_TYPE_DEFAULT);
+ if (!acl)
+ return -errno;
+
+ r = acl_get_entry(acl, ACL_FIRST_ENTRY, &entry);
+ for (;;) {
+ _cleanup_(acl_free_gid_tpp) gid_t *gid = NULL; /* declared per iteration, so released via its cleanup handler whenever an iteration ends */
+ acl_tag_t tag;
+
+ if (r < 0) /* r is the result of the acl_get_entry() call made before the loop or at 'next' below */
+ return -errno;
+ if (r == 0)
+ break;
+
+ if (acl_get_tag_type(entry, &tag) < 0)
+ return -errno;
+
+ if (tag != ACL_GROUP)
+ goto next;
+
+ gid = acl_get_qualifier(entry);
+ if (!gid)
+ return -errno;
+
+ if (in_gid(*gid) > 0) {
+ if (!ret_groups) /* no name list requested: the boolean answer is all that is needed */
+ return true;
+
+ ret = true;
+ }
+
+ if (ret_groups) {
+ char *name;
+
+ name = gid_to_name(*gid);
+ if (!name)
+ return -ENOMEM;
+
+ r = strv_consume(&g, name); /* on success r is overwritten again at 'next' before it is examined */
+ if (r < 0)
+ return r;
+ }
+
+ next:
+ r = acl_get_entry(acl, ACL_NEXT_ENTRY, &entry);
+ }
+
+ if (ret_groups)
+ *ret_groups = TAKE_PTR(g);
+
+ return ret;
+}
+
+int parse_acl(
+ const char *text,
+ acl_t *ret_acl_access,
+ acl_t *ret_acl_access_exec, /* extra rules to apply to inodes subject to uppercase X handling */
+ acl_t *ret_acl_default,
+ bool want_mask) {
+ /* Parses 'text', a comma-separated list of colon-separated ACL entries, into three ACL objects.
+  *
+  * - An entry with four fields must begin with "default" or "d"; its remaining three fields are added
+  *   to the ACL returned in *ret_acl_default.
+  * - An entry with three fields is added to *ret_acl_access, unless its last field contained an
+  *   upper-case 'X': then it is added to *ret_acl_access_exec instead, with each 'X' turned into 'x'.
+  * - An 'X' in the last field of a default entry is lower-cased too, but the entry stays in the
+  *   default list.
+  *
+  * An output is set to NULL when no entry of its kind was given. If 'want_mask' is true,
+  * calc_acl_mask_if_needed() is run on the access and the default ACL, never on the exec ACL.
+  *
+  * Returns 0 on success, -EINVAL for an entry with the wrong number of fields or a bad four-field
+  * prefix, -ENOMEM on allocation failure, or the error of the failing helper/libacl call. The three
+  * outputs are written only on success. */
+ _cleanup_strv_free_ char **a = NULL, **e = NULL, **d = NULL, **split = NULL;
+ _cleanup_(acl_freep) acl_t a_acl = NULL, e_acl = NULL, d_acl = NULL;
+ int r;
+
+ assert(text);
+ assert(ret_acl_access);
+ assert(ret_acl_access_exec);
+ assert(ret_acl_default);
+
+ split = strv_split(text, ",");
+ if (!split)
+ return -ENOMEM;
+
+ STRV_FOREACH(entry, split) {
+ _cleanup_strv_free_ char **entry_split = NULL;
+ _cleanup_free_ char *entry_join = NULL;
+ int n;
+
+ n = strv_split_full(&entry_split, *entry, ":", EXTRACT_DONT_COALESCE_SEPARATORS|EXTRACT_RETAIN_ESCAPE);
+ if (n < 0)
+ return n;
+
+ if (n < 3 || n > 4) /* three fields, or four when prefixed with the default marker */
+ return -EINVAL;
+
+ string_replace_char(entry_split[n-1], 'X', 'x'); /* only the last field is rewritten; *entry itself is left untouched */
+
+ if (n == 4) {
+ if (!STR_IN_SET(entry_split[0], "default", "d"))
+ return -EINVAL;
+
+ entry_join = strv_join(entry_split + 1, ":"); /* drop the marker field */
+ if (!entry_join)
+ return -ENOMEM;
+
+ r = strv_consume(&d, TAKE_PTR(entry_join));
+ } else { /* n == 3 */
+ entry_join = strv_join(entry_split, ":");
+ if (!entry_join)
+ return -ENOMEM;
+
+ if (!streq(*entry, entry_join)) /* NOTE(review): presumably differs from the input only when an 'X' was rewritten above — confirm for escaped input */
+ r = strv_consume(&e, TAKE_PTR(entry_join));
+ else
+ r = strv_consume(&a, TAKE_PTR(entry_join));
+ }
+ if (r < 0)
+ return r;
+ }
+
+ if (!strv_isempty(a)) {
+ _cleanup_free_ char *join = NULL;
+
+ join = strv_join(a, ",");
+ if (!join)
+ return -ENOMEM;
+
+ a_acl = acl_from_text(join);
+ if (!a_acl)
+ return -errno;
+
+ if (want_mask) {
+ r = calc_acl_mask_if_needed(&a_acl);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ if (!strv_isempty(e)) {
+ _cleanup_free_ char *join = NULL;
+
+ join = strv_join(e, ",");
+ if (!join)
+ return -ENOMEM;
+
+ e_acl = acl_from_text(join);
+ if (!e_acl)
+ return -errno;
+
+ /* The mask must be calculated after deciding whether the execute bit should be set. */
+ }
+
+ if (!strv_isempty(d)) {
+ _cleanup_free_ char *join = NULL;
+
+ join = strv_join(d, ",");
+ if (!join)
+ return -ENOMEM;
+
+ d_acl = acl_from_text(join);
+ if (!d_acl)
+ return -errno;
+
+ if (want_mask) {
+ r = calc_acl_mask_if_needed(&d_acl);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ *ret_acl_access = TAKE_PTR(a_acl); /* ownership moves to the caller; the cleanup handlers then see NULL */
+ *ret_acl_access_exec = TAKE_PTR(e_acl);
+ *ret_acl_default = TAKE_PTR(d_acl);
+
+ return 0;
+}
+
+/* Tells whether two ACL entries address the same subject, i.e. carry the same tag type and, for
+ * ACL_USER/ACL_GROUP entries, the same qualifier. Permissions are not compared.
+ *
+ * Returns > 0 if they match, 0 if not, -errno when a libacl call fails. */
+static int acl_entry_equal(acl_entry_t a, acl_entry_t b) {
+        acl_tag_t kind_a, kind_b;
+
+        if (acl_get_tag_type(a, &kind_a) < 0 ||
+            acl_get_tag_type(b, &kind_b) < 0)
+                return -errno;
+
+        if (kind_a != kind_b)
+                return false;
+
+        /* These tags carry no qualifier: an ACL can hold only one entry of each. */
+        if (IN_SET(kind_a, ACL_USER_OBJ, ACL_GROUP_OBJ, ACL_MASK, ACL_OTHER))
+                return true;
+
+        if (kind_a == ACL_USER) {
+                _cleanup_(acl_free_uid_tpp) uid_t *first = NULL, *second = NULL;
+
+                first = acl_get_qualifier(a);
+                if (!first)
+                        return -errno;
+
+                second = acl_get_qualifier(b);
+                if (!second)
+                        return -errno;
+
+                return *first == *second;
+        }
+
+        if (kind_a == ACL_GROUP) {
+                _cleanup_(acl_free_gid_tpp) gid_t *first = NULL, *second = NULL;
+
+                first = acl_get_qualifier(a);
+                if (!first)
+                        return -errno;
+
+                second = acl_get_qualifier(b);
+                if (!second)
+                        return -errno;
+
+                return *first == *second;
+        }
+
+        assert_not_reached();
+}
+
+/* Looks up the entry of 'acl' that acl_entry_equal() considers equal to 'entry'.
+ *
+ * Returns 0 when one is found (stored in *ret if 'ret' is non-NULL), -ENOENT when there is none, and
+ * another negative errno-style value on failure. */
+static int find_acl_entry(acl_t acl, acl_entry_t entry, acl_entry_t *ret) {
+        acl_entry_t candidate;
+        int k;
+
+        k = acl_get_entry(acl, ACL_FIRST_ENTRY, &candidate);
+        while (k > 0) {
+                int same;
+
+                same = acl_entry_equal(candidate, entry);
+                if (same < 0)
+                        return same;
+                if (same > 0) {
+                        if (ret)
+                                *ret = candidate;
+                        return 0;
+                }
+
+                k = acl_get_entry(acl, ACL_NEXT_ENTRY, &candidate);
+        }
+
+        return k < 0 ? -errno : -ENOENT;
+}
+
+/* Merges the entries of 'acl' into the ACL of kind 'type' that is currently set on 'path'.
+ *
+ * For every entry of 'acl' the matching entry of the file's ACL (as determined by find_acl_entry()) is
+ * overwritten, or a new entry is appended when there is no match. The merged ACL is not written back to
+ * the file; it is handed to the caller in *ret if 'ret' is non-NULL and released otherwise.
+ *
+ * Returns 0 on success, a negative errno-style value on failure. */
+int acls_for_file(const char *path, acl_type_t type, acl_t acl, acl_t *ret) {
+        _cleanup_(acl_freep) acl_t merged = NULL;
+        acl_entry_t src;
+        int k;
+
+        assert(path);
+
+        merged = acl_get_file(path, type);
+        if (!merged)
+                return -errno;
+
+        for (k = acl_get_entry(acl, ACL_FIRST_ENTRY, &src);
+             k > 0;
+             k = acl_get_entry(acl, ACL_NEXT_ENTRY, &src)) {
+                acl_entry_t dst;
+                int found;
+
+                found = find_acl_entry(merged, src, &dst);
+                if (found < 0 && found != -ENOENT)
+                        return found;
+
+                /* No counterpart yet: append a fresh entry to copy into. */
+                if (found == -ENOENT && acl_create_entry(&merged, &dst) < 0)
+                        return -errno;
+
+                if (acl_copy_entry(dst, src) < 0)
+                        return -errno;
+        }
+        if (k < 0)
+                return -errno;
+
+        if (ret)
+                *ret = TAKE_PTR(merged);
+
+        return 0;
+}
+
+/* POSIX says that ACL_{READ,WRITE,EXECUTE} don't have to be bitmasks. But that is a natural thing to do and
+ * all extant implementations do it. Let's make sure that we fail verbosely in the (imho unlikely) scenario
+ * that we get a new implementation that does not satisfy this. The first three checks verify that no two
+ * of the constants share a bit; the last three verify that each constant keeps its value when cast to
+ * unsigned, which is the type of the 'mask' parameter of fd_add_uid_acl_permission() below. */
+assert_cc(!(ACL_READ & ACL_WRITE));
+assert_cc(!(ACL_WRITE & ACL_EXECUTE));
+assert_cc(!(ACL_EXECUTE & ACL_READ));
+assert_cc((unsigned) ACL_READ == ACL_READ);
+assert_cc((unsigned) ACL_WRITE == ACL_WRITE);
+assert_cc((unsigned) ACL_EXECUTE == ACL_EXECUTE);
+
+/* Adds an ACL entry for the specified file to allow the indicated access to the specified user.
+ * Operates purely incrementally: permissions are only ever added to the user's entry, never removed.
+ *
+ * 'mask' is a combination of ACL_READ, ACL_WRITE and ACL_EXECUTE. If the ACL of 'fd' has no ACL_USER
+ * entry for 'uid' yet, one is created. The ACL mask is calculated via calc_acl_mask_if_needed() before
+ * the ACL is written back with acl_set_fd().
+ *
+ * Returns 0 on success, a negative errno-style value on failure. */
+int fd_add_uid_acl_permission(
+                int fd,
+                uid_t uid,
+                unsigned mask) {
+
+        _cleanup_(acl_freep) acl_t acl = NULL;
+        acl_permset_t permset;
+        acl_entry_t entry;
+        int r;
+
+        assert(fd >= 0);
+        assert(uid_is_valid(uid));
+
+        acl = acl_get_fd(fd);
+        if (!acl)
+                return -errno;
+
+        r = acl_find_uid(acl, uid, &entry);
+        if (r < 0)
+                /* A failed lookup does not mean "no entry": appending a new entry here could duplicate
+                 * an existing one and would hide the real error, so report it. */
+                return r;
+        if (r == 0) {
+                /* The user has no entry yet, create an empty one for it. */
+                if (acl_create_entry(&acl, &entry) < 0 ||
+                    acl_set_tag_type(entry, ACL_USER) < 0 ||
+                    acl_set_qualifier(entry, &uid) < 0)
+                        return -errno;
+        }
+
+        if (acl_get_permset(entry, &permset) < 0)
+                return -errno;
+
+        if ((mask & ACL_READ) && acl_add_perm(permset, ACL_READ) < 0)
+                return -errno;
+        if ((mask & ACL_WRITE) && acl_add_perm(permset, ACL_WRITE) < 0)
+                return -errno;
+        if ((mask & ACL_EXECUTE) && acl_add_perm(permset, ACL_EXECUTE) < 0)
+                return -errno;
+
+        r = calc_acl_mask_if_needed(&acl);
+        if (r < 0)
+                return r;
+
+        if (acl_set_fd(fd, acl) < 0)
+                return -errno;
+
+        return 0;
+}
+
+/* Drops the write permission from the ACL_USER_OBJ, ACL_MASK and ACL_OTHER entries of the ACL of 'fd';
+ * all other entries are left as they are.
+ *
+ * If reading or writing the ACL fails with an error that ERRNO_IS_NOT_SUPPORTED() recognizes, the work
+ * is delegated to fd_acl_make_read_only_fallback(), which operates on the regular mode_t.
+ *
+ * Returns 1 if the ACL was changed, 0 if no inspected entry had the write bit set, negative errno-style
+ * value on failure (or whatever the fallback returns when that path is taken).
+ *
+ * NOTE(review): ACL_GROUP_OBJ is never inspected here. For an ACL without an ACL_MASK entry this looks
+ * like it leaves an owning-group write bit in place — confirm whether that is intended. */
+int fd_acl_make_read_only(int fd) {
+        _cleanup_(acl_freep) acl_t acl = NULL;
+        bool modified = false;
+        acl_entry_t entry;
+        int k;
+
+        assert(fd >= 0);
+
+        acl = acl_get_fd(fd);
+        if (!acl) {
+                /* No ACLs? Then just update the regular mode_t */
+                if (ERRNO_IS_NOT_SUPPORTED(errno))
+                        return fd_acl_make_read_only_fallback(fd);
+
+                return -errno;
+        }
+
+        for (k = acl_get_entry(acl, ACL_FIRST_ENTRY, &entry);
+             k > 0;
+             k = acl_get_entry(acl, ACL_NEXT_ENTRY, &entry)) {
+                acl_permset_t permset;
+                acl_tag_t tag;
+                int has_write;
+
+                if (acl_get_tag_type(entry, &tag) < 0)
+                        return -errno;
+
+                /* Only these three tags are touched; this is where the w bits are controlled overall
+                 * (ACL_MASK affects all remaining tags). */
+                if (tag != ACL_USER_OBJ && tag != ACL_MASK && tag != ACL_OTHER)
+                        continue;
+
+                if (acl_get_permset(entry, &permset) < 0)
+                        return -errno;
+
+                has_write = acl_get_perm(permset, ACL_WRITE);
+                if (has_write < 0)
+                        return -errno;
+                if (has_write == 0)
+                        continue;
+
+                if (acl_delete_perm(permset, ACL_WRITE) < 0)
+                        return -errno;
+
+                modified = true;
+        }
+        if (k < 0)
+                return -errno;
+
+        /* Nothing was writable, hence there is nothing to write back. */
+        if (!modified)
+                return 0;
+
+        if (acl_set_fd(fd, acl) >= 0)
+                return 1;
+
+        if (ERRNO_IS_NOT_SUPPORTED(errno))
+                return fd_acl_make_read_only_fallback(fd);
+
+        return -errno;
+}
+
+/* Safely adds the writable bit to the owner's ACL entry (ACL_USER_OBJ) of this inode. Only the owner's
+ * entry is touched, hence this is not the obvious inverse of fd_acl_make_read_only().
+ *
+ * If reading or writing the ACL fails with an error that ERRNO_IS_NOT_SUPPORTED() recognizes, the work
+ * is delegated to fd_acl_make_writable_fallback(), which operates on the regular mode_t.
+ *
+ * Returns 0 if the owner entry had the write bit already, 1 after the ACL was written back, negative
+ * errno-style value on failure (or whatever the fallback returns when that path is taken). */
+int fd_acl_make_writable(int fd) {
+        _cleanup_(acl_freep) acl_t acl = NULL;
+        acl_entry_t entry;
+        int k;
+
+        acl = acl_get_fd(fd);
+        if (!acl) {
+                /* No ACLs? Then just update the regular mode_t */
+                if (ERRNO_IS_NOT_SUPPORTED(errno))
+                        return fd_acl_make_writable_fallback(fd);
+
+                return -errno;
+        }
+
+        for (k = acl_get_entry(acl, ACL_FIRST_ENTRY, &entry);
+             k > 0;
+             k = acl_get_entry(acl, ACL_NEXT_ENTRY, &entry)) {
+                acl_permset_t permset;
+                acl_tag_t tag;
+                int has_write;
+
+                if (acl_get_tag_type(entry, &tag) < 0)
+                        return -errno;
+
+                if (tag != ACL_USER_OBJ)
+                        continue;
+
+                if (acl_get_permset(entry, &permset) < 0)
+                        return -errno;
+
+                has_write = acl_get_perm(permset, ACL_WRITE);
+                if (has_write < 0)
+                        return -errno;
+                if (has_write > 0)
+                        return 0; /* Already set? Then there's nothing to do. */
+
+                if (acl_add_perm(permset, ACL_WRITE) < 0)
+                        return -errno;
+
+                /* The first owner entry has been handled, stop scanning. */
+                break;
+        }
+        if (k < 0)
+                return -errno;
+
+        if (acl_set_fd(fd, acl) >= 0)
+                return 1;
+
+        if (ERRNO_IS_NOT_SUPPORTED(errno))
+                return fd_acl_make_writable_fallback(fd);
+
+        return -errno;
+}
+#endif
+
+/* mode_t based variant of fd_acl_make_read_only(): clears the write bits of 'fd' with fchmod().
+ *
+ * The new mode is st_mode & 0555, i.e. every permission bit other than r-xr-xr-x is dropped along with
+ * the write bits. Returns 0 if no write bit was set, 1 if the mode was changed, -errno on failure. */
+int fd_acl_make_read_only_fallback(int fd) {
+        struct stat info;
+        mode_t write_bits;
+
+        assert(fd >= 0);
+
+        if (fstat(fd, &info) < 0)
+                return -errno;
+
+        write_bits = info.st_mode & 0222;
+        if (write_bits == 0)
+                return 0;
+
+        return fchmod(fd, info.st_mode & 0555) < 0 ? -errno : 1;
+}
+
+/* mode_t based variant of fd_acl_make_writable(): sets the owner write bit (0200) of 'fd' with fchmod(),
+ * keeping all other bits covered by 07777 as they are.
+ *
+ * Returns 0 if the bit was set already, 1 if the mode was changed, -errno on failure. */
+int fd_acl_make_writable_fallback(int fd) {
+        struct stat info;
+        mode_t wanted;
+
+        assert(fd >= 0);
+
+        if (fstat(fd, &info) < 0)
+                return -errno;
+
+        if (info.st_mode & 0200) /* already set */
+                return 0;
+
+        wanted = (info.st_mode & 07777) | 0200;
+        if (fchmod(fd, wanted) < 0)
+                return -errno;
+
+        return 1;
+}
diff --git a/src/shared/acl-util.h b/src/shared/acl-util.h
new file mode 100644
index 0000000..ef315c2
--- /dev/null
+++ b/src/shared/acl-util.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <errno.h>
+#include <unistd.h>
+
+/* mode_t based implementations. Declared outside of the HAVE_ACL conditional because the stubs in the
+ * #else branch below call them. */
+int fd_acl_make_read_only_fallback(int fd);
+int fd_acl_make_writable_fallback(int fd);
+
+#if HAVE_ACL
+#include <acl/libacl.h>
+#include <stdbool.h>
+#include <sys/acl.h>
+
+#include "macro.h"
+
+/* Helpers implemented in acl-util.c on top of libacl. Parameter names match the definitions there.
+ * All of them return a negative errno-style value on failure. */
+int acl_find_uid(acl_t acl, uid_t uid, acl_entry_t *ret_entry);
+int calc_acl_mask_if_needed(acl_t *acl_p);
+int add_base_acls_if_needed(acl_t *acl_p, const char *path);
+int acl_search_groups(const char* path, char ***ret_groups);
+int parse_acl(
+                const char *text,
+                acl_t *ret_acl_access,
+                acl_t *ret_acl_access_exec,
+                acl_t *ret_acl_default,
+                bool want_mask);
+int acls_for_file(const char *path, acl_type_t type, acl_t acl, acl_t *ret);
+int fd_add_uid_acl_permission(int fd, uid_t uid, unsigned mask);
+
+int fd_acl_make_read_only(int fd);
+int fd_acl_make_writable(int fd);
+
+/* acl_free takes multiple argument types.
+ * Multiple cleanup functions are necessary, one per pointer type that is passed to acl_free(). */
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(acl_t, acl_free, NULL);
+#define acl_free_charp acl_free
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(char*, acl_free_charp, NULL);
+#define acl_free_uid_tp acl_free
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(uid_t*, acl_free_uid_tp, NULL);
+#define acl_free_gid_tp acl_free
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(gid_t*, acl_free_gid_tp, NULL);
+
+#else
+/* Built without ACL support: define the permission constants locally so that callers passing them as
+ * 'mask' still compile. */
+#define ACL_READ    0x04
+#define ACL_WRITE   0x02
+#define ACL_EXECUTE 0x01
+
+/* Adding per-user ACL entries is impossible without ACL support. */
+static inline int fd_add_uid_acl_permission(int fd, uid_t uid, unsigned mask) {
+        return -EOPNOTSUPP;
+}
+
+/* Without ACL support these two go straight to the mode_t based implementations. */
+static inline int fd_acl_make_read_only(int fd) {
+        return fd_acl_make_read_only_fallback(fd);
+}
+
+static inline int fd_acl_make_writable(int fd) {
+        return fd_acl_make_writable_fallback(fd);
+}
+
+#endif
diff --git a/src/shared/acpi-fpdt.c b/src/shared/acpi-fpdt.c
new file mode 100644
index 0000000..22a36bd
--- /dev/null
+++ b/src/shared/acpi-fpdt.c
@@ -0,0 +1,187 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "acpi-fpdt.h"
+#include "alloc-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "time-util.h"
+
+/* Header found at the very start of the table read from /sys/firmware/acpi/tables/FPDT.
+ * acpi_get_boot_usec() checks 'length' against the number of bytes read and 'signature' against "FPDT";
+ * the remaining fields are not evaluated in this file. */
+struct acpi_table_header {
+ char signature[4];
+ uint32_t length;
+ uint8_t revision;
+ uint8_t checksum;
+ char oem_id[6];
+ char oem_table_id[8];
+ uint32_t oem_revision;
+ char asl_compiler_id[4];
+ uint32_t asl_compiler_revision;
+} _packed_;
+
+/* Values of acpi_fpdt_header.type. acpi_get_boot_usec() only looks for ACPI_FPDT_TYPE_BOOT records. */
+enum {
+ ACPI_FPDT_TYPE_BOOT = 0,
+ ACPI_FPDT_TYPE_S3PERF = 1,
+};
+
+/* One record following the acpi_table_header in the FPDT table. 'length' is the record size in bytes and
+ * is used to step to the next record; for a record of type ACPI_FPDT_TYPE_BOOT, 'ptr' is used by
+ * acpi_get_boot_usec() as the offset into /dev/mem at which the boot performance data is read. */
+struct acpi_fpdt_header {
+ uint16_t type;
+ uint8_t length;
+ uint8_t revision;
+ uint8_t reserved[4];
+ uint64_t ptr;
+} _packed_;
+
+/* Header read from /dev/mem at the offset taken from acpi_fpdt_header.ptr. acpi_get_boot_usec() requires
+ * 'signature' to be "FBPT" and 'length' to cover at least this header plus one struct acpi_fpdt_boot. */
+struct acpi_fpdt_boot_header {
+ char signature[4];
+ uint32_t length;
+} _packed_;
+
+/* Values of acpi_fpdt_boot.type. acpi_get_boot_usec() only accepts ACPI_FPDT_BOOT_REC; the other two
+ * constants are not referenced in this file. */
+enum {
+ ACPI_FPDT_S3PERF_RESUME_REC = 0,
+ ACPI_FPDT_S3PERF_SUSPEND_REC = 1,
+ ACPI_FPDT_BOOT_REC = 2,
+};
+
+/* Boot performance record read from /dev/mem right after the acpi_fpdt_boot_header.
+ * acpi_get_boot_usec() validates 'length' and 'type' and then uses 'startup_start' and
+ * 'exit_services_exit' (compared against NSEC_PER_HOUR and divided by 1000 before being returned).
+ *
+ * Note: the struct is marked with the _packed_ attribute macro like its siblings. The previous spelling
+ * "_packed" (without the trailing underscore) did not apply any attribute but instead declared an
+ * unused global variable of this type. */
+struct acpi_fpdt_boot {
+        uint16_t type;
+        uint8_t length;
+        uint8_t revision;
+        uint8_t reserved[4];
+        uint64_t reset_end;
+        uint64_t load_start;
+        uint64_t startup_start;
+        uint64_t exit_services_entry;
+        uint64_t exit_services_exit;
+} _packed_;
+
+/* Preferred way of obtaining the firmware boot timestamps: read the values the kernel already parsed and
+ * exports below /sys/firmware/acpi/fpdt/boot/, instead of poking at /dev/mem (which is deprecated on many
+ * systems). This requires kernel version 5.12 on x86 based machines or 6.2 for arm64.
+ *
+ * Both output parameters are optional. On success returns 0 and stores the values divided by 1000.
+ * Returns -ENODATA if the exit timestamp is zero, -EINVAL if the values are implausible, or the error
+ * reported by read_timestamp_file(). */
+static int acpi_get_boot_usec_kernel_parsed(usec_t *ret_loader_start, usec_t *ret_loader_exit) {
+        usec_t launch_ns, exit_ns;
+        int r;
+
+        r = read_timestamp_file("/sys/firmware/acpi/fpdt/boot/exitbootservice_end_ns", &exit_ns);
+        if (r < 0)
+                return r;
+
+        /* Non-UEFI compatible boot. */
+        if (exit_ns == 0)
+                return -ENODATA;
+
+        r = read_timestamp_file("/sys/firmware/acpi/fpdt/boot/bootloader_launch_ns", &launch_ns);
+        if (r < 0)
+                return r;
+
+        /* Refuse a missing start time, an exit before the start, and anything beyond one hour. */
+        if (launch_ns == 0 || exit_ns < launch_ns || exit_ns > NSEC_PER_HOUR)
+                return -EINVAL;
+
+        if (ret_loader_start)
+                *ret_loader_start = launch_ns / 1000;
+        if (ret_loader_exit)
+                *ret_loader_exit = exit_ns / 1000;
+
+        return 0;
+}
+
+/* Determines when the boot loader was started and when it exited boot services, according to the ACPI
+ * FPDT data.
+ *
+ * First tries acpi_get_boot_usec_kernel_parsed(); only if that returns -ENOENT the table in
+ * /sys/firmware/acpi/tables/FPDT is parsed manually and the referenced boot performance record is read
+ * from /dev/mem.
+ *
+ * Both output parameters are optional. Returns 0 on success, -ENODATA if no usable record/timestamp is
+ * present, -EINVAL if the table or record is malformed or implausible, or a negative errno-style error
+ * from the file operations. */
+int acpi_get_boot_usec(usec_t *ret_loader_start, usec_t *ret_loader_exit) {
+        _cleanup_free_ char *buf = NULL;
+        struct acpi_table_header *tbl;
+        size_t l;
+        ssize_t ll;
+        struct acpi_fpdt_header *rec;
+        int r;
+        uint64_t ptr = 0;
+        _cleanup_close_ int fd = -EBADF;
+        struct acpi_fpdt_boot_header hbrec;
+        struct acpi_fpdt_boot brec;
+
+        r = acpi_get_boot_usec_kernel_parsed(ret_loader_start, ret_loader_exit);
+        if (r != -ENOENT) /* fallback to /dev/mem hack only if kernel doesn't support the new sysfs files */
+                return r;
+
+        r = read_full_virtual_file("/sys/firmware/acpi/tables/FPDT", &buf, &l);
+        if (r < 0)
+                return r;
+
+        if (l < sizeof(struct acpi_table_header) + sizeof(struct acpi_fpdt_header))
+                return -EINVAL;
+
+        tbl = (struct acpi_table_header *)buf;
+        if (l != tbl->length)
+                return -EINVAL;
+
+        if (memcmp(tbl->signature, "FPDT", 4) != 0)
+                return -EINVAL;
+
+        /* find Firmware Basic Boot Performance Pointer Record */
+        for (rec = (struct acpi_fpdt_header *)(buf + sizeof(struct acpi_table_header));
+             (char *)rec + offsetof(struct acpi_fpdt_header, revision) <= buf + l;
+             rec = (struct acpi_fpdt_header *)((char *)rec + rec->length)) {
+                if (rec->length <= 0)
+                        break;
+                if (rec->type != ACPI_FPDT_TYPE_BOOT)
+                        continue;
+                if (rec->length != sizeof(struct acpi_fpdt_header))
+                        continue;
+
+                /* The loop condition only guarantees that 'type' and 'length' lie within the table.
+                 * Make sure the complete record does before reading 'ptr', so that a truncated
+                 * trailing record cannot make us read beyond the buffer. */
+                if ((size_t) ((buf + l) - (char *) rec) < sizeof(struct acpi_fpdt_header))
+                        return -EINVAL;
+
+                ptr = rec->ptr;
+                break;
+        }
+
+        if (ptr == 0)
+                return -ENODATA;
+
+        /* read Firmware Basic Boot Performance Data Record */
+        fd = open("/dev/mem", O_CLOEXEC|O_RDONLY);
+        if (fd < 0)
+                return -errno;
+
+        ll = pread(fd, &hbrec, sizeof(struct acpi_fpdt_boot_header), ptr);
+        if (ll < 0)
+                return -errno;
+        if ((size_t) ll != sizeof(struct acpi_fpdt_boot_header))
+                return -EINVAL;
+
+        if (memcmp(hbrec.signature, "FBPT", 4) != 0)
+                return -EINVAL;
+
+        if (hbrec.length < sizeof(struct acpi_fpdt_boot_header) + sizeof(struct acpi_fpdt_boot))
+                return -EINVAL;
+
+        ll = pread(fd, &brec, sizeof(struct acpi_fpdt_boot), ptr + sizeof(struct acpi_fpdt_boot_header));
+        if (ll < 0)
+                return -errno;
+        if ((size_t) ll != sizeof(struct acpi_fpdt_boot))
+                return -EINVAL;
+
+        if (brec.length != sizeof(struct acpi_fpdt_boot))
+                return -EINVAL;
+
+        if (brec.type != ACPI_FPDT_BOOT_REC)
+                return -EINVAL;
+
+        if (brec.exit_services_exit == 0)
+                /* Non-UEFI compatible boot. */
+                return -ENODATA;
+
+        if (brec.startup_start == 0 || brec.exit_services_exit < brec.startup_start)
+                return -EINVAL;
+        if (brec.exit_services_exit > NSEC_PER_HOUR)
+                return -EINVAL;
+
+        if (ret_loader_start)
+                *ret_loader_start = brec.startup_start / 1000;
+        if (ret_loader_exit)
+                *ret_loader_exit = brec.exit_services_exit / 1000;
+
+        return 0;
+}
diff --git a/src/shared/acpi-fpdt.h b/src/shared/acpi-fpdt.h
new file mode 100644
index 0000000..56f8c9e
--- /dev/null
+++ b/src/shared/acpi-fpdt.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <time-util.h>
+
+int acpi_get_boot_usec(usec_t *ret_loader_start, usec_t *ret_loader_exit);
diff --git a/src/shared/apparmor-util.c b/src/shared/apparmor-util.c
new file mode 100644
index 0000000..68e1c55
--- /dev/null
+++ b/src/shared/apparmor-util.c
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <stddef.h>
+
+#include "alloc-util.h"
+#include "apparmor-util.h"
+#include "fileio.h"
+#include "parse-util.h"
+
+/* Reports whether AppArmor is enabled, judging by /sys/module/apparmor/parameters/enabled. The answer is
+ * determined on the first call and served from a static cache afterwards. If the file cannot be read or
+ * does not parse as a true boolean, the result is false. */
+bool mac_apparmor_use(void) {
+        static int cache = -1;
+        _cleanup_free_ char *value = NULL;
+
+        if (cache >= 0)
+                return cache;
+
+        if (read_one_line_file("/sys/module/apparmor/parameters/enabled", &value) < 0)
+                cache = false;
+        else
+                cache = parse_boolean(value) > 0;
+
+        return cache;
+}
diff --git a/src/shared/apparmor-util.h b/src/shared/apparmor-util.h
new file mode 100644
index 0000000..8007aeb
--- /dev/null
+++ b/src/shared/apparmor-util.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+bool mac_apparmor_use(void);
diff --git a/src/shared/ask-password-api.c b/src/shared/ask-password-api.c
new file mode 100644
index 0000000..0e323f4
--- /dev/null
+++ b/src/shared/ask-password-api.c
@@ -0,0 +1,1002 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/inotify.h>
+#include <sys/signalfd.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/uio.h>
+#include <sys/un.h>
+#include <termios.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "ask-password-api.h"
+#include "creds-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "format-util.h"
+#include "fs-util.h"
+#include "glyph-util.h"
+#include "io-util.h"
+#include "iovec-util.h"
+#include "keyring-util.h"
+#include "log.h"
+#include "macro.h"
+#include "memory-util.h"
+#include "missing_syscall.h"
+#include "mkdir-label.h"
+#include "nulstr-util.h"
+#include "plymouth-util.h"
+#include "process-util.h"
+#include "random-util.h"
+#include "signal-util.h"
+#include "socket-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "terminal-util.h"
+#include "time-util.h"
+#include "tmpfile-util.h"
+#include "umask-util.h"
+#include "utf8.h"
+
+#define KEYRING_TIMEOUT_USEC ((5 * USEC_PER_MINUTE) / 2)
+
+/* Asks the kernel for the serial of the "user" type key called 'keyname' via request_key(). On success
+ * stores the serial in *ret and returns 0, otherwise returns negative_errno(). */
+static int lookup_key(const char *keyname, key_serial_t *ret) {
+        assert(keyname);
+        assert(ret);
+
+        key_serial_t found = request_key("user", keyname, NULL, 0);
+        if (found == -1)
+                return negative_errno();
+
+        *ret = found;
+        return 0;
+}
+
+/* Reads the payload of key 'serial' with keyring_read() and splits it with strv_parse_nulstr(). On
+ * success the newly allocated list is stored in *ret and 0 is returned. The raw payload buffer is erased
+ * and freed on return in all cases. Returns -ENOMEM if the list cannot be allocated, or the error from
+ * keyring_read(). */
+static int retrieve_key(key_serial_t serial, char ***ret) {
+        _cleanup_(erase_and_freep) void *payload = NULL;
+        size_t payload_size;
+        int r;
+
+        assert(ret);
+
+        r = keyring_read(serial, &payload, &payload_size);
+        if (r < 0)
+                return r;
+
+        char **passwords = strv_parse_nulstr(payload, payload_size);
+        if (!passwords)
+                return -ENOMEM;
+
+        *ret = passwords;
+        return 0;
+}
+
+/* Stores 'passwords' in the kernel keyring under 'keyname', merged with whatever list is already stored
+ * there.
+ *
+ * Does nothing (returns 0) unless ASK_PASSWORD_PUSH_CACHE is set and 'passwords' is non-empty, and also
+ * returns early with the result of strv_extend_strv() if that is zero or negative. Returns 1 once the key
+ * has been (re-)added, or a negative errno-style error. */
+static int add_to_keyring(const char *keyname, AskPasswordFlags flags, char **passwords) {
+        _cleanup_strv_free_erase_ char **merged = NULL;
+        _cleanup_(erase_and_freep) char *blob = NULL;
+        key_serial_t serial;
+        size_t blob_size;
+        int r;
+
+        assert(keyname);
+
+        if (!FLAGS_SET(flags, ASK_PASSWORD_PUSH_CACHE) || strv_isempty(passwords))
+                return 0;
+
+        /* Start out with what is cached already; a missing key (-ENOKEY) just means an empty list. */
+        r = lookup_key(keyname, &serial);
+        if (r < 0 && r != -ENOKEY)
+                return r;
+        if (r >= 0) {
+                r = retrieve_key(serial, &merged);
+                if (r < 0)
+                        return r;
+        }
+
+        r = strv_extend_strv(&merged, passwords, true);
+        if (r <= 0)
+                return r;
+
+        r = strv_make_nulstr(merged, &blob, &blob_size);
+        if (r < 0)
+                return r;
+
+        /* Drop the final NUL byte: NUL bytes shall act as separators only, i.e. appear only if there are
+         * multiple passwords. */
+        blob_size = LESS_BY(blob_size, (size_t) 1);
+
+        serial = add_key("user", keyname, blob, blob_size, KEY_SPEC_USER_KEYRING);
+        if (serial == -1)
+                return -errno;
+
+        /* Failing to set the expiry is logged but not fatal. */
+        if (keyctl(KEYCTL_SET_TIMEOUT,
+                   (unsigned long) serial,
+                   (unsigned long) DIV_ROUND_UP(KEYRING_TIMEOUT_USEC, USEC_PER_SEC), 0, 0) < 0)
+                log_debug_errno(errno, "Failed to adjust kernel keyring key timeout: %m");
+
+        /* Touch the directory so that everyone watching it re-checks the keyring */
+        (void) touch("/run/systemd/ask-password");
+
+        log_debug("Added key to kernel keyring as %" PRIi32 ".", serial);
+
+        return 1;
+}
+
+/* Wrapper around add_to_keyring() that logs failures at debug level. Returns 0 whenever
+ * add_to_keyring() did not fail, otherwise the value returned by log_debug_errno(). */
+static int add_to_keyring_and_log(const char *keyname, AskPasswordFlags flags, char **passwords) {
+        assert(keyname);
+
+        int r = add_to_keyring(keyname, flags, passwords);
+        if (r >= 0)
+                return 0;
+
+        return log_debug_errno(r, "Failed to add password to kernel keyring: %m");
+}
+
+/* Tries to fetch cached passwords for 'keyname' from the kernel keyring.
+ *
+ * Returns -EUNATCH if ASK_PASSWORD_ACCEPT_CACHED is not set, -ENOKEY if the lookup reports "not
+ * supported" or -EPERM, any other lookup error unmodified, and otherwise the result of retrieve_key()
+ * (which stores the list in *ret). */
+static int ask_password_keyring(const char *keyname, AskPasswordFlags flags, char ***ret) {
+        key_serial_t found;
+        int r;
+
+        assert(keyname);
+        assert(ret);
+
+        if (!FLAGS_SET(flags, ASK_PASSWORD_ACCEPT_CACHED))
+                return -EUNATCH;
+
+        r = lookup_key(keyname, &found);
+
+        /* For retrieval it makes no difference whether the kernel or container manager does not support or
+         * allow keyring access, or whether simply no matching key is known: both mean "no key". EACCES is
+         * deliberately propagated as is (unlike EPERM), since it is what we get if the keyring is available
+         * but we lack access to the key. */
+        if (r == -EPERM || ERRNO_IS_NEG_NOT_SUPPORTED(r))
+                return -ENOKEY;
+        if (r < 0)
+                return r;
+
+        return retrieve_key(found, ret);
+}
+
+/* Writes 'p' repetitions of the three byte sequence "\b \b" (backspace, space, backspace) to 'ttyfd'.
+ * A negative 'ttyfd' makes this a NOP returning 0. Returns the result of log_oom() if the buffer cannot
+ * be allocated, otherwise the result of loop_write(). */
+static int backspace_chars(int ttyfd, size_t p) {
+        static const char erase_seq[] = "\b \b";
+        const size_t seq_len = sizeof(erase_seq) - 1;
+
+        if (ttyfd < 0)
+                return 0;
+
+        _cleanup_free_ char *out = malloc_multiply(3, p);
+        if (!out)
+                return log_oom();
+
+        char *cursor = out;
+        for (size_t left = p; left > 0; left--, cursor += seq_len)
+                memcpy(cursor, erase_seq, seq_len);
+
+        return loop_write(ttyfd, out, seq_len * p);
+}
+
+/* Backspaces over as many characters as are needed to undo the printing of 'str' on 'ttyfd'. A negative
+ * 'ttyfd' makes this a NOP returning 0; otherwise the result of backspace_chars() is returned. */
+static int backspace_string(int ttyfd, const char *str) {
+        assert(str);
+
+        if (ttyfd < 0)
+                return 0;
+
+        /* utf8_n_codepoints() reports SIZE_MAX for strings that are not valid UTF-8. Most likely that
+         * means we are not in a UTF-8 locale, in which case backspacing one cell per byte is the correct
+         * thing to do. And even if it is not: terminals tend to stop backspacing at the leftmost column,
+         * hence backspacing too much should be mostly OK. */
+        size_t n_cells = utf8_n_codepoints(str);
+
+        return backspace_chars(ttyfd, n_cells != SIZE_MAX ? n_cells : strlen(str));
+}
+
+/* Queries a password through the socket returned by plymouth_connect().
+ *
+ * message: prompt text, defaults to "Password:".
+ * until: CLOCK_MONOTONIC deadline, 0 for no timeout.
+ * flags: if ASK_PASSWORD_ACCEPT_CACHED is set a "c" request is sent first; if that is answered with reply
+ *        byte 5 the flag is dropped and a regular "*" request carrying the message is sent.
+ * flag_file: optional path, watched via inotify; the call fails with the access() error as soon as the
+ *            file is no longer accessible.
+ * ret: receives the newly allocated list of answers.
+ *
+ * Returns 0 on success, -ETIME on timeout, -ENOENT if the reply byte is 5 ("no password, because UI not
+ * shown"), -EIO on EOF, unknown or oversized replies, or another negative errno-style error. */
+int ask_password_plymouth(
+                const char *message,
+                usec_t until,
+                AskPasswordFlags flags,
+                const char *flag_file,
+                char ***ret) {
+
+        _cleanup_close_ int fd = -EBADF, notify = -EBADF;
+        _cleanup_free_ char *packet = NULL;
+        ssize_t k;
+        int r, n;
+        struct pollfd pollfd[2] = {};
+        char buffer[LINE_MAX];
+        size_t p = 0;
+        enum {
+                POLL_SOCKET,
+                POLL_INOTIFY
+        };
+
+        assert(ret);
+
+        if (!message)
+                message = "Password:";
+
+        if (flag_file) {
+                notify = inotify_init1(IN_CLOEXEC|IN_NONBLOCK);
+                if (notify < 0)
+                        return -errno;
+
+                if (inotify_add_watch(notify, flag_file, IN_ATTRIB) < 0) /* for the link count */
+                        return -errno;
+        }
+
+        fd = plymouth_connect(SOCK_NONBLOCK);
+        if (fd < 0)
+                return fd;
+
+        if (FLAGS_SET(flags, ASK_PASSWORD_ACCEPT_CACHED)) {
+                packet = strdup("c");
+                n = 1;
+        } else if (asprintf(&packet, "*\002%c%s%n", (int) (strlen(message) + 1), message, &n) < 0)
+                packet = NULL;
+        if (!packet)
+                return -ENOMEM;
+
+        /* n + 1: the trailing NUL byte is sent along */
+        r = loop_write_full(fd, packet, n + 1, USEC_INFINITY);
+        if (r < 0)
+                return r;
+
+        /* From here on the reply buffer may contain secrets, make sure it is erased when we return */
+        CLEANUP_ERASE(buffer);
+
+        pollfd[POLL_SOCKET].fd = fd;
+        pollfd[POLL_SOCKET].events = POLLIN;
+        pollfd[POLL_INOTIFY].fd = notify;
+        pollfd[POLL_INOTIFY].events = POLLIN;
+
+        for (;;) {
+                usec_t timeout;
+
+                if (until > 0)
+                        timeout = usec_sub_unsigned(until, now(CLOCK_MONOTONIC));
+                else
+                        timeout = USEC_INFINITY;
+
+                if (flag_file && access(flag_file, F_OK) < 0)
+                        return -errno;
+
+                r = ppoll_usec(pollfd, notify >= 0 ? 2 : 1, timeout);
+                if (r == -EINTR)
+                        continue;
+                if (r < 0)
+                        return r;
+                if (r == 0)
+                        return -ETIME;
+
+                if (notify >= 0 && pollfd[POLL_INOTIFY].revents != 0)
+                        (void) flush_fd(notify);
+
+                if (pollfd[POLL_SOCKET].revents == 0)
+                        continue;
+
+                /* Append to whatever we collected so far, replies may arrive in pieces */
+                k = read(fd, buffer + p, sizeof(buffer) - p);
+                if (k < 0) {
+                        if (ERRNO_IS_TRANSIENT(errno))
+                                continue;
+
+                        return -errno;
+                }
+                if (k == 0)
+                        return -EIO;
+
+                p += k;
+
+                if (buffer[0] == 5) {
+
+                        if (FLAGS_SET(flags, ASK_PASSWORD_ACCEPT_CACHED)) {
+                                /* Hmm, first try with cached
+                                 * passwords failed, so let's retry
+                                 * with a normal password request */
+                                packet = mfree(packet);
+
+                                if (asprintf(&packet, "*\002%c%s%n", (int) (strlen(message) + 1), message, &n) < 0)
+                                        return -ENOMEM;
+
+                                r = loop_write_full(fd, packet, n + 1, USEC_INFINITY);
+                                if (r < 0)
+                                        return r;
+
+                                flags &= ~ASK_PASSWORD_ACCEPT_CACHED;
+                                p = 0;
+                                continue;
+                        }
+
+                        /* No password, because UI not shown */
+                        return -ENOENT;
+
+                } else if (IN_SET(buffer[0], 2, 9)) {
+                        uint32_t size;
+                        char **l;
+
+                        /* One or more answers: 1 byte type, 32-bit little-endian payload size, payload */
+                        if (p < 5)
+                                continue;
+
+                        memcpy(&size, buffer+1, sizeof(size));
+                        size = le32toh(size);
+
+                        /* Compare without adding to 'size': "size + 5" is computed in 32-bit unsigned
+                         * arithmetic and would wrap around for sizes close to UINT32_MAX, letting an
+                         * oversized length slip through this check. */
+                        if (size > sizeof(buffer) - 5)
+                                return -EIO;
+
+                        if (p-5 < size)
+                                continue;
+
+                        l = strv_parse_nulstr(buffer + 5, size);
+                        if (!l)
+                                return -ENOMEM;
+
+                        *ret = l;
+                        break;
+
+                } else
+                        /* Unknown packet */
+                        return -EIO;
+        }
+
+        return 0;
+}
+
+#define NO_ECHO "(no echo) "
+#define PRESS_TAB "(press TAB for no echo) "
+#define SKIPPED "(skipped)"
+
+/* Interactively reads one password, byte by byte, from a terminal.
+ *
+ * ttyfd: terminal to use. If negative, /dev/tty is opened; if that fails too, input is read from
+ *        STDIN_FILENO and nothing is written back.
+ * message: prompt text, defaults to "Password:".
+ * keyname: optional kernel keyring key name, used for ask_password_keyring() lookups and, after a
+ *          successful read, for add_to_keyring_and_log().
+ * until: CLOCK_MONOTONIC deadline, 0 for no timeout.
+ * flag_file: optional path; the query is aborted with the access() error once it is no longer accessible.
+ * ret: receives a newly allocated string list holding the password.
+ *
+ * Returns 0 on success, -EUNATCH if ASK_PASSWORD_NO_TTY is set, -ETIME on timeout, -ECANCELED if the user
+ * pressed C-d, or another negative errno-style error. If the terminal settings were changed they are
+ * restored before returning. */
+int ask_password_tty(
+ int ttyfd,
+ const char *message,
+ const char *keyname,
+ usec_t until,
+ AskPasswordFlags flags,
+ const char *flag_file,
+ char ***ret) {
+
+ enum {
+ POLL_TTY,
+ POLL_INOTIFY,
+ _POLL_MAX,
+ };
+
+ bool reset_tty = false, dirty = false, use_color = false, press_tab_visible = false;
+ _cleanup_close_ int cttyfd = -EBADF, notify = -EBADF;
+ struct termios old_termios, new_termios;
+ char passphrase[LINE_MAX + 1] = {}, *x;
+ _cleanup_strv_free_erase_ char **l = NULL;
+ struct pollfd pollfd[_POLL_MAX];
+ size_t p = 0, codepoint = 0;
+ int r;
+
+ assert(ret);
+
+ if (FLAGS_SET(flags, ASK_PASSWORD_NO_TTY))
+ return -EUNATCH;
+
+ if (!message)
+ message = "Password:";
+
+ if (!FLAGS_SET(flags, ASK_PASSWORD_HIDE_EMOJI) && emoji_enabled())
+ message = strjoina(special_glyph(SPECIAL_GLYPH_LOCK_AND_KEY), " ", message);
+
+ /* A single inotify descriptor serves both the flag file watch and the keyring notification watch */
+ if (flag_file || (FLAGS_SET(flags, ASK_PASSWORD_ACCEPT_CACHED) && keyname)) {
+ notify = inotify_init1(IN_CLOEXEC|IN_NONBLOCK);
+ if (notify < 0)
+ return -errno;
+ }
+ if (flag_file) {
+ if (inotify_add_watch(notify, flag_file, IN_ATTRIB /* for the link count */) < 0)
+ return -errno;
+ }
+ if (FLAGS_SET(flags, ASK_PASSWORD_ACCEPT_CACHED) && keyname) {
+ r = ask_password_keyring(keyname, flags, ret);
+ if (r >= 0)
+ return 0;
+ else if (r != -ENOKEY)
+ return r;
+
+ if (inotify_add_watch(notify, "/run/systemd/ask-password", IN_ATTRIB /* for mtime */) < 0)
+ return -errno;
+ }
+
+ CLEANUP_ERASE(passphrase);
+
+ /* If the caller didn't specify a TTY, then use the controlling tty, if we can. */
+ if (ttyfd < 0)
+ ttyfd = cttyfd = open("/dev/tty", O_RDWR|O_NOCTTY|O_CLOEXEC);
+
+ if (ttyfd >= 0) {
+ if (tcgetattr(ttyfd, &old_termios) < 0)
+ return -errno;
+
+ if (FLAGS_SET(flags, ASK_PASSWORD_CONSOLE_COLOR))
+ use_color = dev_console_colors_enabled();
+ else
+ use_color = colors_enabled();
+
+ if (use_color)
+ (void) loop_write(ttyfd, ANSI_HIGHLIGHT, SIZE_MAX);
+
+ (void) loop_write(ttyfd, message, SIZE_MAX);
+ (void) loop_write(ttyfd, " ", 1);
+
+ if (!FLAGS_SET(flags, ASK_PASSWORD_SILENT) && !FLAGS_SET(flags, ASK_PASSWORD_ECHO)) {
+ if (use_color)
+ (void) loop_write(ttyfd, ansi_grey(), SIZE_MAX);
+
+ (void) loop_write(ttyfd, PRESS_TAB, SIZE_MAX);
+ press_tab_visible = true;
+ }
+
+ if (use_color)
+ (void) loop_write(ttyfd, ANSI_NORMAL, SIZE_MAX);
+
+ /* Turn off canonical mode and echo, and have read() return after each single byte */
+ new_termios = old_termios;
+ new_termios.c_lflag &= ~(ICANON|ECHO);
+ new_termios.c_cc[VMIN] = 1;
+ new_termios.c_cc[VTIME] = 0;
+
+ r = RET_NERRNO(tcsetattr(ttyfd, TCSADRAIN, &new_termios));
+ if (r < 0)
+ goto finish;
+
+ reset_tty = true;
+ }
+
+ pollfd[POLL_TTY] = (struct pollfd) {
+ .fd = ttyfd >= 0 ? ttyfd : STDIN_FILENO,
+ .events = POLLIN,
+ };
+ pollfd[POLL_INOTIFY] = (struct pollfd) {
+ .fd = notify,
+ .events = POLLIN,
+ };
+
+ for (;;) {
+ _cleanup_(erase_char) char c;
+ usec_t timeout;
+ ssize_t n;
+
+ if (until > 0)
+ timeout = usec_sub_unsigned(until, now(CLOCK_MONOTONIC));
+ else
+ timeout = USEC_INFINITY;
+
+ if (flag_file) {
+ r = RET_NERRNO(access(flag_file, F_OK));
+ if (r < 0)
+ goto finish;
+ }
+
+ r = ppoll_usec(pollfd, notify >= 0 ? 2 : 1, timeout);
+ if (r == -EINTR)
+ continue;
+ if (r < 0)
+ goto finish;
+ if (r == 0) {
+ r = -ETIME;
+ goto finish;
+ }
+
+ /* NOTE(review): ask_password_keyring() returns -EUNATCH when ASK_PASSWORD_ACCEPT_CACHED is not
+ * set, which ends the query here if the wakeup came from the flag_file watch only; also the
+ * inotify fd is only flushed when keyname is set -- confirm both are intended. */
+ if (notify >= 0 && pollfd[POLL_INOTIFY].revents != 0 && keyname) {
+ (void) flush_fd(notify);
+
+ r = ask_password_keyring(keyname, flags, ret);
+ if (r >= 0) {
+ r = 0;
+ goto finish;
+ } else if (r != -ENOKEY)
+ goto finish;
+ }
+
+ if (pollfd[POLL_TTY].revents == 0)
+ continue;
+
+ n = read(ttyfd >= 0 ? ttyfd : STDIN_FILENO, &c, 1);
+ if (n < 0) {
+ if (ERRNO_IS_TRANSIENT(errno))
+ continue;
+
+ r = -errno;
+ goto finish;
+
+ }
+
+ /* The "press TAB" hint is removed again on the first input we get */
+ if (press_tab_visible) {
+ assert(ttyfd >= 0);
+ backspace_chars(ttyfd, strlen(PRESS_TAB));
+ press_tab_visible = false;
+ }
+
+ /* We treat EOF, newline and NUL byte all as valid end markers */
+ if (n == 0 || c == '\n' || c == 0)
+ break;
+
+ if (c == 4) { /* C-d also known as EOT */
+ if (ttyfd >= 0)
+ (void) loop_write(ttyfd, SKIPPED, SIZE_MAX);
+
+ goto skipped;
+ }
+
+ if (c == 21) { /* C-u */
+
+ if (!FLAGS_SET(flags, ASK_PASSWORD_SILENT))
+ (void) backspace_string(ttyfd, passphrase);
+
+ explicit_bzero_safe(passphrase, sizeof(passphrase));
+ p = codepoint = 0;
+
+ } else if (IN_SET(c, '\b', 127)) {
+
+ if (p > 0) {
+ size_t q;
+
+ if (!FLAGS_SET(flags, ASK_PASSWORD_SILENT))
+ (void) backspace_chars(ttyfd, 1);
+
+ /* Remove a full UTF-8 codepoint from the end. For that, figure out where the
+ * last one begins */
+ q = 0;
+ for (;;) {
+ int z;
+
+ z = utf8_encoded_valid_unichar(passphrase + q, SIZE_MAX);
+ if (z <= 0) {
+ q = SIZE_MAX; /* Invalid UTF8! */
+ break;
+ }
+
+ if (q + z >= p) /* This one brings us over the edge */
+ break;
+
+ q += z;
+ }
+
+ /* On invalid UTF-8 just drop a single byte */
+ p = codepoint = q == SIZE_MAX ? p - 1 : q;
+ explicit_bzero_safe(passphrase + p, sizeof(passphrase) - p);
+
+ } else if (!dirty && !FLAGS_SET(flags, ASK_PASSWORD_SILENT)) {
+
+ flags |= ASK_PASSWORD_SILENT;
+
+ /* There are two ways to enter silent mode. Either by pressing backspace as
+ * first key (and only as first key), or ... */
+
+ if (ttyfd >= 0)
+ (void) loop_write(ttyfd, NO_ECHO, SIZE_MAX);
+
+ } else if (ttyfd >= 0)
+ (void) loop_write(ttyfd, "\a", 1);
+
+ } else if (c == '\t' && !FLAGS_SET(flags, ASK_PASSWORD_SILENT)) {
+
+ (void) backspace_string(ttyfd, passphrase);
+ flags |= ASK_PASSWORD_SILENT;
+
+ /* ... or by pressing TAB at any time. */
+
+ if (ttyfd >= 0)
+ (void) loop_write(ttyfd, NO_ECHO, SIZE_MAX);
+
+ } else if (p >= sizeof(passphrase)-1) {
+
+ /* Reached the size limit */
+ if (ttyfd >= 0)
+ (void) loop_write(ttyfd, "\a", 1);
+
+ } else {
+ passphrase[p++] = c;
+
+ if (!FLAGS_SET(flags, ASK_PASSWORD_SILENT) && ttyfd >= 0) {
+ /* Check if we got a complete UTF-8 character now. If so, let's output one '*'. */
+ n = utf8_encoded_valid_unichar(passphrase + codepoint, SIZE_MAX);
+ if (n >= 0) {
+ if (FLAGS_SET(flags, ASK_PASSWORD_ECHO))
+ (void) loop_write(ttyfd, passphrase + codepoint, n);
+ else
+ (void) loop_write(ttyfd,
+ special_glyph(SPECIAL_GLYPH_BULLET),
+ SIZE_MAX);
+ codepoint = p;
+ }
+ }
+
+ dirty = true;
+ }
+ }
+
+ x = strndup(passphrase, p);
+ if (!x) {
+ r = -ENOMEM;
+ goto finish;
+ }
+
+ r = strv_consume(&l, x);
+ if (r < 0)
+ goto finish;
+
+skipped:
+ /* When we get here via C-d the list is still empty, which is reported as cancellation */
+ if (strv_isempty(l))
+ r = log_debug_errno(SYNTHETIC_ERRNO(ECANCELED), "Password query was cancelled.");
+ else {
+ if (keyname)
+ (void) add_to_keyring_and_log(keyname, flags, l);
+
+ *ret = TAKE_PTR(l);
+ r = 0;
+ }
+
+finish:
+ if (ttyfd >= 0 && reset_tty) {
+ (void) loop_write(ttyfd, "\n", 1);
+ (void) tcsetattr(ttyfd, TCSADRAIN, &old_termios);
+ }
+
+ return r;
+}
+
+/* Creates a non-blocking, close-on-exec AF_UNIX datagram socket, binds it to a randomly named path
+ * "/run/systemd/ask-password/sck.<hex>" (under umask 0177) and turns on SO_PASSCRED for it.
+ *
+ * On success returns the socket fd and stores the newly allocated path in *ret; ownership of both passes
+ * to the caller. On failure returns a negative errno-style error. */
+static int create_socket(char **ret) {
+        _cleanup_free_ char *socket_path = NULL;
+        _cleanup_close_ int sock = -EBADF;
+        union sockaddr_union addr;
+        socklen_t addr_len;
+        int r;
+
+        assert(ret);
+
+        sock = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
+        if (sock < 0)
+                return -errno;
+
+        if (asprintf(&socket_path, "/run/systemd/ask-password/sck.%" PRIx64, random_u64()) < 0)
+                return -ENOMEM;
+
+        /* On success sockaddr_un_set_path() returns the address length to pass to bind() */
+        r = sockaddr_un_set_path(&addr.un, socket_path);
+        if (r < 0)
+                return r;
+        addr_len = r;
+
+        WITH_UMASK(0177) {
+                if (bind(sock, &addr.sa, addr_len) < 0)
+                        return -errno;
+        }
+
+        r = setsockopt_int(sock, SOL_SOCKET, SO_PASSCRED, true);
+        if (r < 0)
+                return r;
+
+        *ret = TAKE_PTR(socket_path);
+        return TAKE_FD(sock);
+}
+
+/* Queries a password by writing a request file into /run/systemd/ask-password/ and waiting for an answer
+ * on a freshly created AF_UNIX datagram socket.
+ *
+ * message, icon, id: optional; written as Message=, Icon= and Id= into the request file if set.
+ * keyname: optional kernel keyring key name, used for ask_password_keyring() lookups (if
+ *          ASK_PASSWORD_ACCEPT_CACHED is set) and, after success, for add_to_keyring_and_log().
+ * until: CLOCK_MONOTONIC deadline (also written as NotAfter=), 0 for no timeout.
+ * ret: receives the newly allocated list of passwords.
+ *
+ * SIGINT and SIGTERM are blocked and received via a signalfd for the duration of the call; the previous
+ * signal mask is restored before returning.
+ *
+ * Returns 0 on success, -EUNATCH if ASK_PASSWORD_NO_AGENT is set, -ETIME on timeout, -EINTR if one of
+ * the signals arrived, -ECANCELED if a packet starting with '-' was received, or another negative
+ * errno-style error. */
+int ask_password_agent(
+ const char *message,
+ const char *icon,
+ const char *id,
+ const char *keyname,
+ usec_t until,
+ AskPasswordFlags flags,
+ char ***ret) {
+
+ enum {
+ FD_SOCKET,
+ FD_SIGNAL,
+ FD_INOTIFY,
+ _FD_MAX
+ };
+
+ _cleanup_close_ int socket_fd = -EBADF, signal_fd = -EBADF, notify = -EBADF, fd = -EBADF;
+ char temp[] = "/run/systemd/ask-password/tmp.XXXXXX";
+ char final[sizeof(temp)] = "";
+ _cleanup_free_ char *socket_name = NULL;
+ _cleanup_strv_free_erase_ char **l = NULL;
+ _cleanup_fclose_ FILE *f = NULL;
+ struct pollfd pollfd[_FD_MAX];
+ sigset_t mask, oldmask;
+ int r;
+
+ assert(ret);
+
+ if (FLAGS_SET(flags, ASK_PASSWORD_NO_AGENT))
+ return -EUNATCH;
+
+ assert_se(sigemptyset(&mask) >= 0);
+ assert_se(sigset_add_many(&mask, SIGINT, SIGTERM, -1) >= 0);
+ assert_se(sigprocmask(SIG_BLOCK, &mask, &oldmask) >= 0);
+
+ (void) mkdir_p_label("/run/systemd/ask-password", 0755);
+
+ if (FLAGS_SET(flags, ASK_PASSWORD_ACCEPT_CACHED) && keyname) {
+ r = ask_password_keyring(keyname, flags, ret);
+ if (r >= 0) {
+ r = 0;
+ goto finish;
+ } else if (r != -ENOKEY)
+ goto finish;
+
+ notify = inotify_init1(IN_CLOEXEC | IN_NONBLOCK);
+ if (notify < 0) {
+ r = -errno;
+ goto finish;
+ }
+
+ r = RET_NERRNO(inotify_add_watch(notify, "/run/systemd/ask-password", IN_ATTRIB /* for mtime */));
+ if (r < 0)
+ goto finish;
+ }
+
+ /* The request is first written to a temporary "tmp.XXXXXX" file and renamed into place further down */
+ fd = mkostemp_safe(temp);
+ if (fd < 0) {
+ r = fd;
+ goto finish;
+ }
+
+ (void) fchmod(fd, 0644);
+
+ f = take_fdopen(&fd, "w");
+ if (!f) {
+ r = -errno;
+ goto finish;
+ }
+
+ signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC);
+ if (signal_fd < 0) {
+ r = -errno;
+ goto finish;
+ }
+
+ socket_fd = create_socket(&socket_name);
+ if (socket_fd < 0) {
+ r = socket_fd;
+ goto finish;
+ }
+
+ fprintf(f,
+ "[Ask]\n"
+ "PID="PID_FMT"\n"
+ "Socket=%s\n"
+ "AcceptCached=%i\n"
+ "Echo=%i\n"
+ "NotAfter="USEC_FMT"\n"
+ "Silent=%i\n",
+ getpid_cached(),
+ socket_name,
+ FLAGS_SET(flags, ASK_PASSWORD_ACCEPT_CACHED),
+ FLAGS_SET(flags, ASK_PASSWORD_ECHO),
+ until,
+ FLAGS_SET(flags, ASK_PASSWORD_SILENT));
+
+ if (message)
+ fprintf(f, "Message=%s\n", message);
+
+ if (icon)
+ fprintf(f, "Icon=%s\n", icon);
+
+ if (id)
+ fprintf(f, "Id=%s\n", id);
+
+ r = fflush_and_check(f);
+ if (r < 0)
+ goto finish;
+
+ /* The final name is the temporary name with the "tmp" of the "tmp.XXXXXX" suffix replaced by "ask" */
+ memcpy(final, temp, sizeof(temp));
+
+ final[sizeof(final)-11] = 'a';
+ final[sizeof(final)-10] = 's';
+ final[sizeof(final)-9] = 'k';
+
+ r = RET_NERRNO(rename(temp, final));
+ if (r < 0)
+ goto finish;
+
+ zero(pollfd);
+ pollfd[FD_SOCKET].fd = socket_fd;
+ pollfd[FD_SOCKET].events = POLLIN;
+ pollfd[FD_SIGNAL].fd = signal_fd;
+ pollfd[FD_SIGNAL].events = POLLIN;
+ pollfd[FD_INOTIFY].fd = notify;
+ pollfd[FD_INOTIFY].events = POLLIN;
+
+ for (;;) {
+ CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred))) control;
+ char passphrase[LINE_MAX+1];
+ struct iovec iovec;
+ struct ucred *ucred;
+ usec_t timeout;
+ ssize_t n;
+
+ if (until > 0)
+ timeout = usec_sub_unsigned(until, now(CLOCK_MONOTONIC));
+ else
+ timeout = USEC_INFINITY;
+
+ /* The inotify entry is last in the array, hence it is simply left out if unused */
+ r = ppoll_usec(pollfd, notify >= 0 ? _FD_MAX : _FD_MAX - 1, timeout);
+ if (r == -EINTR)
+ continue;
+ if (r < 0)
+ goto finish;
+ if (r == 0) {
+ r = -ETIME;
+ goto finish;
+ }
+
+ if (pollfd[FD_SIGNAL].revents & POLLIN) {
+ r = -EINTR;
+ goto finish;
+ }
+
+ if (notify >= 0 && pollfd[FD_INOTIFY].revents != 0) {
+ (void) flush_fd(notify);
+
+ r = ask_password_keyring(keyname, flags, ret);
+ if (r >= 0) {
+ r = 0;
+ goto finish;
+ } else if (r != -ENOKEY)
+ goto finish;
+ }
+
+ if (pollfd[FD_SOCKET].revents == 0)
+ continue;
+
+ if (pollfd[FD_SOCKET].revents != POLLIN) {
+ r = -EIO;
+ goto finish;
+ }
+
+ iovec = IOVEC_MAKE(passphrase, sizeof(passphrase));
+
+ struct msghdr msghdr = {
+ .msg_iov = &iovec,
+ .msg_iovlen = 1,
+ .msg_control = &control,
+ .msg_controllen = sizeof(control),
+ };
+
+ n = recvmsg_safe(socket_fd, &msghdr, 0);
+ if (ERRNO_IS_NEG_TRANSIENT(n))
+ continue;
+ else if (n == -EXFULL) {
+ log_debug("Got message with truncated control data, ignoring.");
+ continue;
+ } else if (n < 0) {
+ r = (int) n;
+ goto finish;
+ }
+
+ CLEANUP_ERASE(passphrase);
+
+ cmsg_close_all(&msghdr);
+
+ if (n == 0) {
+ log_debug("Message too short");
+ continue;
+ }
+
+ /* Only answers that carry credentials, and come from UID 0, are accepted */
+ ucred = CMSG_FIND_DATA(&msghdr, SOL_SOCKET, SCM_CREDENTIALS, struct ucred);
+ if (!ucred) {
+ log_debug("Received message without credentials. Ignoring.");
+ continue;
+ }
+
+ if (ucred->uid != 0) {
+ log_debug("Got request from unprivileged user. Ignoring.");
+ continue;
+ }
+
+ /* First byte is the packet type: '+' carries the password(s), '-' signals cancellation */
+ if (passphrase[0] == '+') {
+ /* An empty message refers to the empty password */
+ if (n == 1)
+ l = strv_new("");
+ else
+ l = strv_parse_nulstr(passphrase+1, n-1);
+ if (!l) {
+ r = -ENOMEM;
+ goto finish;
+ }
+
+ if (strv_isempty(l)) {
+ l = strv_free(l);
+ log_debug("Invalid packet");
+ continue;
+ }
+
+ break;
+ }
+
+ if (passphrase[0] == '-') {
+ r = -ECANCELED;
+ goto finish;
+ }
+
+ log_debug("Invalid packet");
+ }
+
+ if (keyname)
+ (void) add_to_keyring_and_log(keyname, flags, l);
+
+ *ret = TAKE_PTR(l);
+ r = 0;
+
+finish:
+ /* Remove the socket and the request file (under both its temporary and final name) again */
+ if (socket_name)
+ (void) unlink(socket_name);
+
+ (void) unlink(temp);
+
+ if (final[0])
+ (void) unlink(final);
+
+ assert_se(sigprocmask(SIG_SETMASK, &oldmask, NULL) == 0);
+ return r;
+}
+
+/* Reads the password(s) from the credential called 'credential_name' via read_credential() and splits the
+ * data with strv_parse_nulstr().
+ *
+ * On success returns 0 and stores the newly allocated list in *ret. Returns -ENOKEY if read_credential()
+ * reports -ENXIO or -ENOENT, any other read_credential() failure unmodified, and -ENOMEM if the list
+ * cannot be allocated. The raw credential buffer is erased and freed on return. 'flags' is currently not
+ * evaluated here. */
+static int ask_password_credential(const char *credential_name, AskPasswordFlags flags, char ***ret) {
+        _cleanup_(erase_and_freep) char *buffer = NULL;
+        size_t size;
+        char **l;
+        int r;
+
+        assert(credential_name);
+        assert(ret);
+
+        r = read_credential(credential_name, (void**) &buffer, &size);
+        if (IN_SET(r, -ENXIO, -ENOENT)) /* No credentials passed or this credential not defined? */
+                return -ENOKEY;
+        if (r < 0)
+                /* Any other failure must be propagated: 'buffer' and 'size' were not filled in and must
+                 * not be passed on to strv_parse_nulstr(). */
+                return r;
+
+        l = strv_parse_nulstr(buffer, size);
+        if (!l)
+                return -ENOMEM;
+
+        *ret = l;
+        return 0;
+}
+
+/* Queries a password, picking the source automatically, in this order:
+ *
+ *   1. the credential 'credential_name', unless ASK_PASSWORD_NO_CREDENTIAL is set,
+ *   2. the kernel keyring entry 'key_name', but only if ASK_PASSWORD_ACCEPT_CACHED is set and neither the
+ *      TTY nor the agent can be used (those consult the keyring on their own),
+ *   3. the TTY, if permitted and stdin is a terminal,
+ *   4. the agent protocol, if permitted.
+ *
+ * For steps 1 and 2 a result of -ENOKEY means "try the next source"; any other result is returned as is.
+ * Returns -EUNATCH if no source is left. */
+int ask_password_auto(
+                const char *message,
+                const char *icon,
+                const char *id,                /* id in "ask-password" protocol */
+                const char *key_name,          /* name in kernel keyring */
+                const char *credential_name,   /* name in $CREDENTIALS_DIRECTORY directory */
+                usec_t until,
+                AskPasswordFlags flags,
+                char ***ret) {
+
+        bool use_tty, use_agent;
+        int r;
+
+        assert(ret);
+
+        if (credential_name && !FLAGS_SET(flags, ASK_PASSWORD_NO_CREDENTIAL)) {
+                r = ask_password_credential(credential_name, flags, ret);
+                if (r != -ENOKEY)
+                        return r;
+        }
+
+        use_tty = !FLAGS_SET(flags, ASK_PASSWORD_NO_TTY) && isatty(STDIN_FILENO);
+        use_agent = !FLAGS_SET(flags, ASK_PASSWORD_NO_AGENT);
+
+        if (FLAGS_SET(flags, ASK_PASSWORD_ACCEPT_CACHED) && key_name && !use_tty && !use_agent) {
+                r = ask_password_keyring(key_name, flags, ret);
+                if (r != -ENOKEY)
+                        return r;
+        }
+
+        if (use_tty)
+                return ask_password_tty(-1, message, key_name, until, flags, NULL, ret);
+
+        if (use_agent)
+                return ask_password_agent(message, icon, id, key_name, until, flags, ret);
+
+        return -EUNATCH;
+}
diff --git a/src/shared/ask-password-api.h b/src/shared/ask-password-api.h
new file mode 100644
index 0000000..7464e7f
--- /dev/null
+++ b/src/shared/ask-password-api.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "time-util.h"
+
+/* Bit flags accepted by the ask_password_*() calls declared below. */
+typedef enum AskPasswordFlags {
+ ASK_PASSWORD_ACCEPT_CACHED = 1 << 0, /* read from kernel keyring */
+ ASK_PASSWORD_PUSH_CACHE = 1 << 1, /* write to kernel keyring after getting password from elsewhere */
+ ASK_PASSWORD_ECHO = 1 << 2, /* show the password literally while reading, instead of "*" */
+ ASK_PASSWORD_SILENT = 1 << 3, /* do not show any password at all while reading */
+ ASK_PASSWORD_NO_TTY = 1 << 4, /* never ask for password on tty */
+ ASK_PASSWORD_NO_AGENT = 1 << 5, /* never ask for password via agent */
+ ASK_PASSWORD_CONSOLE_COLOR = 1 << 6, /* Use color if /dev/console points to a console that supports color */
+ ASK_PASSWORD_NO_CREDENTIAL = 1 << 7, /* never use $CREDENTIALS_DIRECTORY data */
+ ASK_PASSWORD_HIDE_EMOJI = 1 << 8, /* hide the lock and key emoji */
+} AskPasswordFlags;
+
+int ask_password_tty(int tty_fd, const char *message, const char *key_name, usec_t until, AskPasswordFlags flags, const char *flag_file, char ***ret);
+int ask_password_plymouth(const char *message, usec_t until, AskPasswordFlags flags, const char *flag_file, char ***ret);
+int ask_password_agent(const char *message, const char *icon, const char *id, const char *key_name, usec_t until, AskPasswordFlags flag, char ***ret);
+int ask_password_auto(const char *message, const char *icon, const char *id, const char *key_name, const char *credential_name, usec_t until, AskPasswordFlags flag, char ***ret);
diff --git a/src/shared/async.c b/src/shared/async.c
new file mode 100644
index 0000000..41f6b97
--- /dev/null
+++ b/src/shared/async.c
@@ -0,0 +1,137 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <stddef.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "async.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "log.h"
+#include "macro.h"
+#include "process-util.h"
+#include "signal-util.h"
+
+int asynchronous_sync(pid_t *ret_pid) { /* starts sync() in a forked helper; returns 0 in the parent or the negative safe_fork() error */
+ int r;
+
+ /* This forks off an invocation of sync() as a child process, in order to initiate synchronization to
+ * disk. Note that we implement this as helper process rather than thread as we don't want the sync() to hang our
+ * original process ever, and a thread would do that as the process can't exit with threads hanging in blocking
+ * syscalls. */
+
+ r = safe_fork("(sd-sync)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|(ret_pid ? 0 : FORK_DETACH), ret_pid); /* FORK_DETACH is only requested when the caller passes no ret_pid */
+ if (r < 0)
+ return r;
+ if (r == 0) {
+ /* Child process */
+ sync();
+ _exit(EXIT_SUCCESS);
+ }
+
+ return 0; /* parent: does not wait for the child here */
+}
+
+/* We encode the fd to close in the userdata pointer as an unsigned value. The highest bit indicates whether
+ * we need to fork again */
+#define NEED_DOUBLE_FORK (1U << (sizeof(unsigned) * 8 - 1))
+
+static int close_func(void *p) { /* entry point of the cloned helper: closes the fd encoded in p (see NEED_DOUBLE_FORK); always returns 0 */
+ unsigned v = PTR_TO_UINT(p);
+
+ (void) prctl(PR_SET_NAME, (unsigned long*) "(sd-close)"); /* best effort, result deliberately ignored */
+
+ /* Note: 💣 This function is invoked in a child process created via glibc's clone() wrapper. In such
+ * children memory allocation is not allowed, since glibc does not release malloc mutexes in
+ * clone() 💣 */
+
+ if (v & NEED_DOUBLE_FORK) {
+ pid_t pid;
+
+ v &= ~NEED_DOUBLE_FORK; /* strip the marker bit so the nested invocation goes straight to close() */
+
+ /* This inner child will be reparented to the subreaper/PID 1. Here we turn on SIGCHLD, so
+ * that the reaper knows when it's time to reap. */
+ pid = clone_with_nested_stack(close_func, SIGCHLD|CLONE_FILES, UINT_TO_PTR(v));
+ if (pid >= 0)
+ return 0; /* the nested child performs the close(); if the clone failed we fall through and close here */
+ }
+
+ close((int) v); /* no assert() here, we are in the child and the result would be eaten up anyway */
+ return 0;
+}
+
+int asynchronous_close(int fd) { /* closes fd from a cloned helper process; every path returns -EBADF */
+ unsigned v;
+ pid_t pid;
+ int r;
+
+ /* This is supposed to behave similar to safe_close(), but actually invoke close() asynchronously, so
+ * that it will never block. Ideally the kernel would have an API for this, but it doesn't, so we
+ * work around it, and hide this as far away as we can.
+ *
+ * It is important to us that we don't use threads (via glibc pthread) in PID 1, hence we'll do a
+ * minimal subprocess instead which shares our fd table via CLONE_FILES. */
+
+ if (fd < 0)
+ return -EBADF; /* already invalid */
+
+ PROTECT_ERRNO;
+
+ v = (unsigned) fd; /* fd is non-negative here, so the highest bit is free to carry NEED_DOUBLE_FORK */
+
+ /* We want to fork off a process that is automatically reaped. For that we'd usually double-fork. But
+ * we can optimize this a bit: if we are PID 1 or a subreaper anyway (the systemd service manager
+ * process qualifies as this), we can avoid the double forking, since the double forked process would
+ * be reparented back to us anyway. */
+ r = is_reaper_process();
+ if (r < 0)
+ log_debug_errno(r, "Cannot determine if we are a reaper process, assuming we are not: %m");
+ if (r <= 0)
+ v |= NEED_DOUBLE_FORK; /* also taken when the check itself failed */
+
+ pid = clone_with_nested_stack(close_func, CLONE_FILES | ((v & NEED_DOUBLE_FORK) ? 0 : SIGCHLD), UINT_TO_PTR(v)); /* SIGCHLD only in the single-fork (reaper) case */
+ if (pid < 0)
+ assert_se(close_nointr(fd) != -EBADF); /* local fallback */
+ else if (v & NEED_DOUBLE_FORK) {
+
+ /* Reap the intermediate child. Key here is that we specify __WCLONE, since we didn't ask for
+ * any signal to be sent to us on process exit, and otherwise waitid() would refuse waiting
+ * then.
+ *
+ * We usually prefer calling waitid(), but before kernel 4.7 it didn't support __WCLONE while
+ * waitpid() did. Hence let's use waitpid() here, it's good enough for our purposes here. */
+ for (;;)
+ if (waitpid(pid, NULL, __WCLONE) >= 0 || errno != EINTR) /* retry only when interrupted by a signal */
+ break;
+ }
+
+ return -EBADF; /* return an invalidated fd */
+}
+
+int asynchronous_rm_rf(const char *p, RemoveFlags flags) { /* runs rm_rf(p, flags) in a detached child; the parent returns safe_fork()'s result */
+ int r;
+
+ assert(p);
+
+ /* Forks off a child that destroys the specified path. This will be best effort only, i.e. the child
+ * will attempt to do its thing, but we won't wait for it or check its success. */
+
+ r = safe_fork("(sd-rmrf)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DETACH, NULL);
+ if (r != 0)
+ return r; /* fork error (negative) or parent side: pass safe_fork()'s value through unchanged */
+
+ /* Child */
+
+ r = rm_rf(p, flags);
+ if (r < 0) {
+ log_debug_errno(r, "Failed to rm -rf '%s', ignoring: %m", p);
+ _exit(EXIT_FAILURE); /* This is a detached process, hence no one really cares, but who knows
+ * maybe it's good for debugging/tracing to return an exit code
+ * indicative of our failure here. */
+ }
+
+ _exit(EXIT_SUCCESS); /* the child never returns to the caller's code */
+}
diff --git a/src/shared/async.h b/src/shared/async.h
new file mode 100644
index 0000000..96148f9
--- /dev/null
+++ b/src/shared/async.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <sys/types.h>
+
+#include "macro.h"
+#include "rm-rf.h"
+
+/* These functions implement various potentially slow operations that are executed asynchronously. They are
+ * carefully written to not use pthreads, but use fork() or clone() (without CLONE_VM) so that the child does
+ * not share any memory with the parent process, and thus cannot possibly interfere with the malloc()
+ * synchronization locks.
+ *
+ * Background: glibc only synchronizes malloc() locks when doing fork(), but not when doing clone()
+ * (regardless if through glibc's own wrapper or ours). This means if another thread in the parent has the
+ * malloc() lock taken while a thread is cloning, the mutex will remain locked in the child (but the other
+ * thread won't exist there), with no chance to ever be unlocked again. This will result in deadlocks. Hence
+ * one has to make the choice: either never use threads in the parent, or never do memory allocation in the
+ * child, or never use clone()/clone3() and stick to fork() only. Because we need clone()/clone3() we opted
+ * for avoiding threads. */
+
+int asynchronous_sync(pid_t *ret_pid);
+int asynchronous_close(int fd);
+int asynchronous_rm_rf(const char *p, RemoveFlags flags);
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(int, asynchronous_close);
diff --git a/src/shared/barrier.c b/src/shared/barrier.c
new file mode 100644
index 0000000..bd5bdd7
--- /dev/null
+++ b/src/shared/barrier.c
@@ -0,0 +1,394 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <sys/eventfd.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "barrier.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "io-util.h"
+#include "macro.h"
+
+/**
+ * Barriers
+ * This barrier implementation provides a simple synchronization method based
+ * on file-descriptors that can safely be used between threads and processes. A
+ * barrier object contains 2 shared counters based on eventfd. Both processes
+ * can now place barriers and wait for the other end to reach a random or
+ * specific barrier.
+ * Barriers are numbered, so you can either wait for the other end to reach any
+ * barrier or the last barrier that you placed. This way, you can use barriers
+ * for one-way *and* full synchronization. Note that even though barriers are
+ * numbered, these numbers are internal and recycled once both sides reached the
+ * same barrier (implemented as a simple signed counter). It is thus not
+ * possible to address barriers by their ID.
+ *
+ * Barrier-API: Both ends can place as many barriers via barrier_place() as
+ * they want and each pair of barriers on both sides will be implicitly linked.
+ * Each side can use the barrier_wait/sync_*() family of calls to wait for the
+ * other side to place a specific barrier. barrier_wait_next() waits until the
+ * other side calls barrier_place(). No links between the barriers are
+ * considered and this simply serves as most basic asynchronous barrier.
+ * barrier_sync_next() is like barrier_wait_next() and waits for the other side
+ * to place their next barrier via barrier_place(). However, it only waits for
+ * barriers that are linked to a barrier we already placed. If the other side
+ * already placed more barriers than we did, barrier_sync_next() returns
+ * immediately.
+ * barrier_sync() extends barrier_sync_next() and waits until the other end
+ * placed as many barriers via barrier_place() as we did. If they already placed
+ * as many as we did (or more), it returns immediately.
+ *
+ * Additionally to basic barriers, an abortion event is available.
+ * barrier_abort() places an abortion event that cannot be undone. An abortion
+ * immediately cancels all placed barriers and replaces them. Any running and
+ * following wait/sync call besides barrier_wait_abortion() will immediately
+ * return false on both sides (otherwise, they always return true).
+ * barrier_abort() can be called multiple times on both ends and will be a
+ * no-op if already called on this side.
+ * barrier_wait_abortion() can be used to wait for the other side to call
+ * barrier_abort() and is the only wait/sync call that does not return
+ * immediately if we aborted ourselves. It only returns once the other side
+ * called barrier_abort().
+ *
+ * Barriers can be used for in-process and inter-process synchronization.
+ * However, for in-process synchronization you could just use mutexes.
+ * Therefore, main target is IPC and we require both sides to *not* share the FD
+ * table. If that's given, barriers provide target tracking: If the remote side
+ * exit()s, an abortion event is implicitly queued on the other side. This way,
+ * a sync/wait call will be woken up if the remote side crashed or exited
+ * unexpectedly. However, note that these abortion events are only queued if the
+ * barrier-queue has been drained. Therefore, it is safe to place a barrier and
+ * exit. The other side can safely wait on the barrier even though the exit
+ * queued an abortion event. Usually, the abortion event would overwrite the
+ * barrier, however, that's not true for exit-abortion events. Those are only
+ * queued if the barrier-queue is drained (thus, the receiving side has placed
+ * more barriers than the remote side).
+ */
+
+/**
+ * barrier_create() - Initialize a barrier object
+ * @b: barrier to initialize
+ *
+ * This initializes a barrier object. The caller is responsible for allocating
+ * the memory and keeping it valid. The memory must be initialized with
+ * BARRIER_NULL beforehand, since only the file descriptors are set up here.
+ * Two eventfd objects are allocated for each barrier. If allocation fails, an
+ * error is returned.
+ *
+ * If this function fails, the barrier is reset to an invalid state so it is
+ * safe to call barrier_destroy() on the object regardless whether the
+ * initialization succeeded or not.
+ *
+ * The caller is responsible to destroy the object via barrier_destroy() before
+ * releasing the underlying memory.
+ *
+ * Returns: 0 on success, negative error code on failure.
+ */
+int barrier_create(Barrier *b) {
+ _unused_ _cleanup_(barrier_destroyp) Barrier *staging = b; /* presumably runs barrier_destroy() on b at each early return below — confirm against the _cleanup_ macro */
+
+ assert(b); /* note: b->barriers is not written anywhere in this function */
+
+ b->me = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
+ if (b->me < 0)
+ return -errno;
+
+ b->them = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
+ if (b->them < 0)
+ return -errno;
+
+ if (pipe2(b->pipe, O_CLOEXEC | O_NONBLOCK) < 0) /* barrier_read() polls one end of this pipe for POLLHUP */
+ return -errno;
+
+ staging = NULL; /* success: disarm the cleanup so the descriptors stay open */
+ return 0;
+}
+
+/**
+ * barrier_destroy() - Destroy a barrier object
+ * @b: barrier to destroy or NULL
+ *
+ * This destroys a barrier object that has previously been passed to
+ * barrier_create(). The object is released and reset to invalid
+ * state. Therefore, it is safe to call barrier_destroy() multiple
+ * times or even if barrier_create() failed. However, barrier must be
+ * always initialized with BARRIER_NULL.
+ *
+ * If @b is NULL, this is a no-op.
+ */
+Barrier* barrier_destroy(Barrier *b) { /* closes all descriptors of b and resets its counter; always returns NULL */
+ if (!b)
+ return NULL;
+
+ b->me = safe_close(b->me); /* each field takes safe_close()'s return value, presumably an invalid-fd marker */
+ b->them = safe_close(b->them);
+ safe_close_pair(b->pipe);
+ b->barriers = 0; /* same counter value as in BARRIER_NULL */
+ return NULL;
+}
+
+/**
+ * barrier_set_role() - Set the local role of the barrier
+ * @b: barrier to operate on
+ * @role: role to set on the barrier
+ *
+ * This sets the roles on a barrier object. This is needed to know
+ * which side of the barrier you're on. Usually, the parent creates
+ * the barrier via barrier_create() and then calls fork() or clone().
+ * Therefore, the FDs are duplicated and the child retains the same
+ * barrier object.
+ *
+ * Both sides need to call barrier_set_role() after fork() or clone()
+ * are done. If this is not done, barriers will not work correctly.
+ *
+ * Note that barriers could be supported without fork() or clone(). However,
+ * this is currently not needed so it hasn't been implemented.
+ */
+void barrier_set_role(Barrier *b, unsigned role) { /* closes the pipe end this side does not use; the child additionally swaps me/them */
+ assert(b);
+ assert(IN_SET(role, BARRIER_PARENT, BARRIER_CHILD));
+ /* make sure this is only called once */
+ assert(b->pipe[0] >= 0 && b->pipe[1] >= 0); /* a previous call would have closed one of the two ends */
+
+ if (role == BARRIER_PARENT)
+ b->pipe[1] = safe_close(b->pipe[1]); /* parent keeps pipe[0] */
+ else {
+ b->pipe[0] = safe_close(b->pipe[0]); /* child keeps pipe[1] */
+
+ /* swap me/them for children */
+ SWAP_TWO(b->me, b->them);
+ }
+}
+
+/* places barrier; returns false if we aborted, otherwise true */
+static bool barrier_write(Barrier *b, uint64_t buf) { /* buf is BARRIER_SINGLE or BARRIER_ABORTION in the callers visible in this file */
+ ssize_t len;
+
+ /* prevent new sync-points if we already aborted */
+ if (barrier_i_aborted(b))
+ return false;
+
+ assert(b->me >= 0);
+ do
+ len = write(b->me, &buf, sizeof(buf)); /* b->me is an eventfd, so this adds buf to its counter */
+ while (len < 0 && ERRNO_IS_TRANSIENT(errno)); /* retry for as long as the error is classified as transient */
+
+ if (len != sizeof(buf))
+ goto error;
+
+ /* lock if we aborted */
+ if (buf >= (uint64_t)BARRIER_ABORTION) {
+ if (barrier_they_aborted(b))
+ b->barriers = BARRIER_WE_ABORTED; /* they had aborted already, now both sides have */
+ else
+ b->barriers = BARRIER_I_ABORTED;
+ } else if (!barrier_is_aborted(b))
+ b->barriers += buf; /* local bookkeeping: one more barrier placed by us */
+
+ return !barrier_i_aborted(b);
+
+error:
+ /* If there is an unexpected error, we have to make this fatal. There
+ * is no way we can recover from sync-errors. Therefore, we close the
+ * pipe-ends and treat this as abortion. The other end will notice the
+ * pipe-close and treat it as abortion, too. */
+
+ safe_close_pair(b->pipe);
+ b->barriers = BARRIER_WE_ABORTED;
+ return false;
+}
+
+/* waits for barriers; returns false if they aborted, otherwise true */
+static bool barrier_read(Barrier *b, int64_t comp) { /* blocks until b->barriers <= comp, which also holds once an abort state at or below comp is stored */
+ if (barrier_they_aborted(b))
+ return false;
+
+ while (b->barriers > comp) {
+ struct pollfd pfd[2] = {
+ { .fd = b->pipe[0] >= 0 ? b->pipe[0] : b->pipe[1], /* whichever pipe end barrier_set_role() left open */
+ .events = POLLHUP },
+ { .fd = b->them,
+ .events = POLLIN }};
+ uint64_t buf;
+ int r;
+
+ r = ppoll_usec(pfd, ELEMENTSOF(pfd), USEC_INFINITY); /* no timeout */
+ if (r == -EINTR)
+ continue;
+ if (r < 0)
+ goto error;
+
+ if (pfd[1].revents) {
+ ssize_t len;
+
+ /* events on @them signal new data for us */
+ len = read(b->them, &buf, sizeof(buf));
+ if (len < 0 && ERRNO_IS_TRANSIENT(errno))
+ continue; /* nothing consumed, poll again */
+
+ if (len != sizeof(buf))
+ goto error;
+ } else if (pfd[0].revents & (POLLHUP | POLLERR | POLLNVAL))
+ /* POLLHUP on the pipe tells us the other side exited.
+ * We treat this as implicit abortion. But we only
+ * handle it if there's no event on the eventfd. This
+ * guarantees that exit-abortions do not overwrite real
+ * barriers. */
+ buf = BARRIER_ABORTION;
+ else
+ continue; /* no relevant event on either descriptor */
+
+ /* lock if they aborted */
+ if (buf >= (uint64_t)BARRIER_ABORTION) {
+ if (barrier_i_aborted(b))
+ b->barriers = BARRIER_WE_ABORTED; /* we had aborted already, now both sides have */
+ else
+ b->barriers = BARRIER_THEY_ABORTED;
+ } else if (!barrier_is_aborted(b))
+ b->barriers -= buf; /* their placed barriers cancel out ours */
+ }
+
+ return !barrier_they_aborted(b);
+
+error:
+ /* If there is an unexpected error, we have to make this fatal. There
+ * is no way we can recover from sync-errors. Therefore, we close the
+ * pipe-ends and treat this as abortion. The other end will notice the
+ * pipe-close and treat it as abortion, too. */
+
+ safe_close_pair(b->pipe);
+ b->barriers = BARRIER_WE_ABORTED;
+ return false;
+}
+
+/**
+ * barrier_place() - Place a new barrier
+ * @b: barrier object
+ *
+ * This places a new barrier on the barrier object. If either side already
+ * aborted, this is a no-op and returns "false". Otherwise, the barrier is
+ * placed and this returns "true".
+ *
+ * Returns: true if barrier was placed, false if either side aborted.
+ */
+bool barrier_place(Barrier *b) {
+ assert(b);
+
+ if (barrier_is_aborted(b))
+ return false;
+
+ barrier_write(b, BARRIER_SINGLE); /* return value is not checked here */
+ return true; /* returned even if barrier_write() failed; a failure leaves the barrier in an aborted state */
+}
+
+/**
+ * barrier_abort() - Abort the synchronization
+ * @b: barrier object to abort
+ *
+ * This aborts the barrier-synchronization. If barrier_abort() was already
+ * called on this side, this is a no-op. Otherwise, the barrier is put into the
+ * ABORT-state and will stay there. The other side is notified about the
+ * abortion. Any following attempt to place normal barriers or to wait on normal
+ * barriers will return immediately as "false".
+ *
+ * You can wait for the other side to call barrier_abort(), too. Use
+ * barrier_wait_abortion() for that.
+ *
+ * Returns: false if the other side already aborted, true otherwise.
+ */
+bool barrier_abort(Barrier *b) {
+ assert(b);
+
+ barrier_write(b, BARRIER_ABORTION); /* barrier_write() returns early without writing if this side already aborted */
+ return !barrier_they_aborted(b);
+}
+
+/**
+ * barrier_wait_next() - Wait for the next barrier of the other side
+ * @b: barrier to operate on
+ *
+ * This waits until the other side places its next barrier. This is independent
+ * of any barrier-links and just waits for any next barrier of the other side.
+ *
+ * If either side aborted, this returns false.
+ *
+ * Returns: false if either side aborted, true otherwise.
+ */
+bool barrier_wait_next(Barrier *b) {
+ assert(b);
+
+ if (barrier_is_aborted(b))
+ return false;
+
+ barrier_read(b, b->barriers - 1); /* wait until the counter has dropped by at least one from its current value */
+ return !barrier_is_aborted(b);
+}
+
+/**
+ * barrier_wait_abortion() - Wait for the other side to abort
+ * @b: barrier to operate on
+ *
+ * This waits until the other side called barrier_abort(). This can be called
+ * regardless whether the local side already called barrier_abort() or not.
+ *
+ * If the other side has already aborted, this returns immediately.
+ *
+ * Returns: false if the local side aborted, true otherwise.
+ */
+bool barrier_wait_abortion(Barrier *b) {
+ assert(b);
+
+ barrier_read(b, BARRIER_THEY_ABORTED); /* relies on WE < THEY < I: only the THEY_ABORTED and WE_ABORTED states end the wait loop */
+ return !barrier_i_aborted(b);
+}
+
+/**
+ * barrier_sync_next() - Wait for the other side to place a next linked barrier
+ * @b: barrier to operate on
+ *
+ * This is like barrier_wait_next() and waits for the other side to call
+ * barrier_place(). However, this only waits for linked barriers. That means, if
+ * the other side already placed more barriers than (or as much as) we did, this
+ * returns immediately instead of waiting.
+ *
+ * If either side aborted, this returns false.
+ *
+ * Returns: false if either side aborted, true otherwise.
+ */
+bool barrier_sync_next(Barrier *b) {
+ assert(b);
+
+ if (barrier_is_aborted(b))
+ return false;
+
+ barrier_read(b, MAX((int64_t)0, b->barriers - 1)); /* target is clamped at 0, so nothing is awaited when the counter is already <= 0 */
+ return !barrier_is_aborted(b);
+}
+
+/**
+ * barrier_sync() - Wait for the other side to place as many barriers as we did
+ * @b: barrier to operate on
+ *
+ * This is like barrier_sync_next() but waits for the other side to call
+ * barrier_place() as often as we did (in total). If they already placed as much
+ * as we did (or more), this returns immediately instead of waiting.
+ *
+ * If either side aborted, this returns false.
+ *
+ * Returns: false if either side aborted, true otherwise.
+ */
+bool barrier_sync(Barrier *b) {
+ assert(b);
+
+ if (barrier_is_aborted(b))
+ return false;
+
+ barrier_read(b, 0); /* wait until the counter is no longer positive */
+ return !barrier_is_aborted(b);
+}
diff --git a/src/shared/barrier.h b/src/shared/barrier.h
new file mode 100644
index 0000000..4ee2040
--- /dev/null
+++ b/src/shared/barrier.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <sys/types.h>
+
+#include "macro.h"
+
+/* See source file for an API description. */
+
+typedef struct Barrier Barrier;
+
+enum {
+ BARRIER_SINGLE = 1LL, /* value barrier_place() writes for one barrier */
+ BARRIER_ABORTION = INT64_MAX, /* value barrier_abort() writes; anything >= this is treated as an abort */
+
+ /* bias values to store state; keep @WE < @THEY < @I */
+ BARRIER_BIAS = INT64_MIN,
+ BARRIER_WE_ABORTED = BARRIER_BIAS + 1LL, /* both sides aborted; also set on unexpected I/O errors */
+ BARRIER_THEY_ABORTED = BARRIER_BIAS + 2LL, /* only the other side aborted */
+ BARRIER_I_ABORTED = BARRIER_BIAS + 3LL, /* only the local side aborted */
+};
+
+enum { /* roles accepted by barrier_set_role() */
+ BARRIER_PARENT,
+ BARRIER_CHILD,
+};
+
+struct Barrier {
+ int me; /* eventfd this side writes its barriers to */
+ int them; /* eventfd this side reads the other side's barriers from */
+ int pipe[2]; /* one end stays open per side after barrier_set_role(); polled for POLLHUP */
+ int64_t barriers; /* barriers placed by us minus those read from them, or one of the BARRIER_*_ABORTED states */
+};
+
+#define BARRIER_NULL {-EBADF, -EBADF, {-EBADF, -EBADF}, 0}
+
+int barrier_create(Barrier *obj);
+Barrier* barrier_destroy(Barrier *b);
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(Barrier*, barrier_destroy);
+
+void barrier_set_role(Barrier *b, unsigned role);
+
+bool barrier_place(Barrier *b);
+bool barrier_abort(Barrier *b);
+
+bool barrier_wait_next(Barrier *b);
+bool barrier_wait_abortion(Barrier *b);
+bool barrier_sync_next(Barrier *b);
+bool barrier_sync(Barrier *b);
+
+static inline bool barrier_i_aborted(Barrier *b) { /* true if the local side aborted, alone or together with the other side */
+ return IN_SET(b->barriers, BARRIER_I_ABORTED, BARRIER_WE_ABORTED);
+}
+
+static inline bool barrier_they_aborted(Barrier *b) { /* true if the other side aborted, alone or together with the local side */
+ return IN_SET(b->barriers, BARRIER_THEY_ABORTED, BARRIER_WE_ABORTED);
+}
+
+static inline bool barrier_we_aborted(Barrier *b) { /* true only in the combined "both sides aborted" state */
+ return b->barriers == BARRIER_WE_ABORTED;
+}
+
+static inline bool barrier_is_aborted(Barrier *b) { /* true in any of the three abort states */
+ return IN_SET(b->barriers,
+ BARRIER_I_ABORTED, BARRIER_THEY_ABORTED, BARRIER_WE_ABORTED);
+}
+
+static inline bool barrier_place_and_sync(Barrier *b) { /* places one barrier, then returns the result of barrier_sync() */
+ (void) barrier_place(b); /* result ignored: barrier_sync() returns false anyway if the barrier is aborted */
+ return barrier_sync(b);
+}
diff --git a/src/shared/base-filesystem.c b/src/shared/base-filesystem.c
new file mode 100644
index 0000000..569ef46
--- /dev/null
+++ b/src/shared/base-filesystem.c
@@ -0,0 +1,210 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <syslog.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "architecture.h"
+#include "base-filesystem.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "log.h"
+#include "macro.h"
+#include "nulstr-util.h"
+#include "path-util.h"
+#include "string-util.h"
+#include "umask-util.h"
+#include "user-util.h"
+
+typedef struct BaseFilesystem {
+ const char *dir; /* directory or symlink to create */
+ mode_t mode; /* mode passed to mkdirat() for directory entries; 0 in the symlink entries of the table */
+ const char *target; /* if non-NULL create as symlink to this target */
+ const char *exists; /* conditionalize this entry on existence of this file */
+ bool ignore_failure; /* if set, a creation failure is logged at debug level and the entry is skipped */
+} BaseFilesystem;
+
+static const BaseFilesystem table[] = { /* processed in order by base_filesystem_create_fd(); symlink targets are NUL-separated candidate lists */
+ { "bin", 0, "usr/bin\0", NULL },
+ { "lib", 0, "usr/lib\0", NULL },
+ { "root", 0750, NULL, NULL, true },
+ { "sbin", 0, "usr/sbin\0", NULL },
+ { "usr", 0755, NULL, NULL },
+ { "var", 0755, NULL, NULL },
+ { "etc", 0755, NULL, NULL },
+ { "proc", 0555, NULL, NULL, true },
+ { "sys", 0555, NULL, NULL, true },
+ { "dev", 0555, NULL, NULL, true },
+ { "run", 0555, NULL, NULL, true },
+ /* We don't add /tmp/ here for now (even though it's necessary for regular operation), because we
+ * want to support both cases where /tmp/ is a mount of its own (in which case we probably should set
+ * the mode to 1555, to indicate that no one should write to it, not even root) and when it's part of
+ * the rootfs (in which case we should set mode 1777), and we simply don't know what's right. */
+
+ /* Various architecture ABIs define the path to the dynamic loader via the /lib64/ subdirectory of
+ * the root directory. When booting from an otherwise empty root file system (where only /usr/ has
+ * been mounted into) it is thus necessary to create a symlink pointing to the right subdirectory of
+ * /usr/ first — otherwise we couldn't invoke any dynamic binary. Let's detect this case here, and
+ * create the symlink as needed should it be missing. We prefer doing this consistently with Debian's
+ * multiarch logic, but support Fedora-style and Arch-style multilib too. */
+#if defined(__aarch64__)
+ /* aarch64 ELF ABI actually says dynamic loader is in /lib/, but Fedora puts it in /lib64/ anyway and
+ * just symlinks /lib/ld-linux-aarch64.so.1 to ../lib64/ld-linux-aarch64.so.1. For this to work
+ * correctly, /lib64/ must be symlinked to /usr/lib64/. */
+ { "lib64", 0, "usr/lib/"LIB_ARCH_TUPLE"\0"
+ "usr/lib64\0"
+ "usr/lib\0", "ld-linux-aarch64.so.1" },
+# define KNOW_LIB64_DIRS 1
+#elif defined(__alpha__)
+#elif defined(__arc__) || defined(__tilegx__)
+#elif defined(__arm__)
+ /* No /lib64 on arm. The linker is /lib/ld-linux-armhf.so.3. */
+# define KNOW_LIB64_DIRS 1
+#elif defined(__i386__) || defined(__x86_64__)
+ { "lib64", 0, "usr/lib/"LIB_ARCH_TUPLE"\0"
+ "usr/lib64\0"
+ "usr/lib\0", "ld-linux-x86-64.so.2" },
+# define KNOW_LIB64_DIRS 1
+#elif defined(__ia64__)
+#elif defined(__loongarch_lp64)
+# define KNOW_LIB64_DIRS 1
+# if defined(__loongarch_double_float)
+ { "lib64", 0, "usr/lib/"LIB_ARCH_TUPLE"\0"
+ "usr/lib64\0"
+ "usr/lib\0", "ld-linux-loongarch-lp64d.so.1" },
+# elif defined(__loongarch_single_float)
+ { "lib64", 0, "usr/lib/"LIB_ARCH_TUPLE"\0"
+ "usr/lib64\0"
+ "usr/lib\0", "ld-linux-loongarch-lp64f.so.1" },
+# elif defined(__loongarch_soft_float)
+ { "lib64", 0, "usr/lib/"LIB_ARCH_TUPLE"\0"
+ "usr/lib64\0"
+ "usr/lib\0", "ld-linux-loongarch-lp64s.so.1" },
+# else
+# error "Unknown LoongArch ABI"
+# endif
+#elif defined(__m68k__)
+ /* No link needed. */
+# define KNOW_LIB64_DIRS 1
+#elif defined(_MIPS_SIM)
+# if _MIPS_SIM == _MIPS_SIM_ABI32
+# elif _MIPS_SIM == _MIPS_SIM_NABI32
+# elif _MIPS_SIM == _MIPS_SIM_ABI64
+# else
+# error "Unknown MIPS ABI"
+# endif
+#elif defined(__powerpc__)
+# if defined(__PPC64__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ { "lib64", 0, "usr/lib/"LIB_ARCH_TUPLE"\0"
+ "usr/lib64\0"
+ "usr/lib\0", "ld64.so.2" },
+# define KNOW_LIB64_DIRS 1
+# elif defined(__powerpc64__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+ /* powerpc64-linux-gnu */
+# else
+ /* powerpc-linux-gnu */
+# endif
+#elif defined(__riscv)
+# if __riscv_xlen == 32
+# elif __riscv_xlen == 64
+ /* Same situation as for aarch64 */
+ { "lib64", 0, "usr/lib/"LIB_ARCH_TUPLE"\0"
+ "usr/lib64\0"
+ "usr/lib\0", "ld-linux-riscv64-lp64d.so.1" },
+# define KNOW_LIB64_DIRS 1
+# else
+# error "Unknown RISC-V ABI"
+# endif
+#elif defined(__s390__)
+ /* s390-linux-gnu */
+#elif defined(__s390x__)
+ { "lib64", 0, "usr/lib/"LIB_ARCH_TUPLE"\0"
+ "usr/lib64\0"
+ "usr/lib\0", "ld-lsb-s390x.so.3" },
+# define KNOW_LIB64_DIRS 1
+#elif defined(__sparc__)
+#endif
+ /* gcc doesn't allow pragma to be used within constructs, hence log about this separately below */
+};
+
+#ifndef KNOW_LIB64_DIRS
+# pragma message "Please add an entry above specifying whether your architecture uses /lib64/, /lib32/, or no such links."
+#endif
+
+int base_filesystem_create_fd(int fd, const char *root, uid_t uid, gid_t gid) { /* creates the missing entries of table[] below fd; returns 0 or a negative error */
+ int r;
+
+ assert(fd >= 0);
+ assert(root);
+
+ /* The "root" parameter is decoration only – it's only used as part of log messages */
+
+ for (size_t i = 0; i < ELEMENTSOF(table); i++) {
+ if (faccessat(fd, table[i].dir, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
+ continue; /* entry exists already (symlinks are not followed), leave it alone */
+
+ if (table[i].target) { /* Create as symlink? */
+ const char *target = NULL;
+
+ /* check if one of the targets exists */
+ NULSTR_FOREACH(s, table[i].target) {
+ if (faccessat(fd, s, F_OK, AT_SYMLINK_NOFOLLOW) < 0)
+ continue;
+
+ /* check if a specific file exists at the target path */
+ if (table[i].exists) {
+ _cleanup_free_ char *p = NULL;
+
+ p = path_join(s, table[i].exists);
+ if (!p)
+ return log_oom();
+
+ if (faccessat(fd, p, F_OK, AT_SYMLINK_NOFOLLOW) < 0)
+ continue; /* marker file missing in this candidate, try the next one */
+ }
+
+ target = s; /* first candidate that passes all checks wins */
+ break;
+ }
+
+ if (!target)
+ continue; /* no usable candidate: skip the entry without an error */
+
+ r = RET_NERRNO(symlinkat(target, fd, table[i].dir));
+ } else {
+ /* Create as directory. */
+ WITH_UMASK(0000)
+ r = RET_NERRNO(mkdirat(fd, table[i].dir, table[i].mode));
+ }
+ if (r < 0) {
+ bool ignore = IN_SET(r, -EEXIST, -EROFS) || table[i].ignore_failure; /* these two errors are tolerated for every entry */
+ log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r,
+ "Failed to create %s/%s: %m", root, table[i].dir);
+ if (ignore)
+ continue;
+
+ return r;
+ }
+
+ if (uid_is_valid(uid) || gid_is_valid(gid)) /* only reached for entries that were created just now */
+ if (fchownat(fd, table[i].dir, uid, gid, AT_SYMLINK_NOFOLLOW) < 0)
+ return log_error_errno(errno, "Failed to chown %s/%s: %m", root, table[i].dir);
+ }
+
+ return 0;
+}
+
+int base_filesystem_create(const char *root, uid_t uid, gid_t gid) { /* opens root as a directory and hands the fd to base_filesystem_create_fd() */
+ _cleanup_close_ int fd = -EBADF;
+
+ fd = open(ASSERT_PTR(root), O_DIRECTORY|O_CLOEXEC); /* ASSERT_PTR presumably asserts that root is non-NULL — confirm against the macro */
+ if (fd < 0)
+ return log_error_errno(errno, "Failed to open root file system: %m");
+
+ return base_filesystem_create_fd(fd, root, uid, gid);
+}
diff --git a/src/shared/base-filesystem.h b/src/shared/base-filesystem.h
new file mode 100644
index 0000000..a1ccf45
--- /dev/null
+++ b/src/shared/base-filesystem.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <sys/types.h>
+
+int base_filesystem_create_fd(int fd, const char *root, uid_t uid, gid_t gid);
+int base_filesystem_create(const char *root, uid_t uid, gid_t gid);
diff --git a/src/shared/battery-util.c b/src/shared/battery-util.c
new file mode 100644
index 0000000..37b3f6a
--- /dev/null
+++ b/src/shared/battery-util.c
@@ -0,0 +1,283 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "sd-device.h"
+
+#include "device-private.h"
+#include "device-util.h"
+#include "string-util.h"
+#include "battery-util.h"
+
+/* Capacity, in percent, at or below which battery_is_discharging_and_low() treats a battery as low. */
+#define BATTERY_LOW_CAPACITY_LEVEL 5
+
+/* Checks whether a USB power supply acts as a power sink. All "typec" devices below the parent of
+ * 'device' are enumerated and their 'power_role' attribute is inspected.
+ *
+ * Returns > 0 if at least one port is in sink mode, or if no port reported source mode (sink mode is
+ * assumed then); 0 if only source mode ports were found; a negative error if the enumerator could not
+ * be set up or the parent device could not be determined. */
+static int device_is_power_sink(sd_device *device) {
+        _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *ports = NULL;
+        bool any_source = false, any_sink = false;
+        sd_device *parent;
+        int r;
+
+        assert(device);
+
+        /* A USB-C power supply device is in one of two power roles, source or sink. See
+         * https://docs.kernel.org/admin-guide/abi-testing.html#abi-file-testing-sysfs-class-typec */
+
+        r = sd_device_enumerator_new(&ports);
+        if (r < 0)
+                return r;
+
+        r = sd_device_enumerator_allow_uninitialized(ports);
+        if (r < 0)
+                return r;
+
+        r = sd_device_enumerator_add_match_subsystem(ports, "typec", true);
+        if (r < 0)
+                return r;
+
+        r = sd_device_get_parent(device, &parent);
+        if (r < 0)
+                return r;
+
+        r = sd_device_enumerator_add_match_parent(ports, parent);
+        if (r < 0)
+                return r;
+
+        FOREACH_DEVICE(ports, port) {
+                const char *role;
+
+                r = sd_device_get_sysattr_value(port, "power_role", &role);
+                if (r == -ENOENT) /* A missing attribute is not worth a log message. */
+                        continue;
+                if (r < 0) {
+                        log_device_debug_errno(port, r, "Failed to read 'power_role' sysfs attribute, ignoring: %m");
+                        continue;
+                }
+
+                /* The active role is the one enclosed in brackets. */
+                if (strstr(role, "[source]")) {
+                        any_source = true;
+                        log_device_debug(port, "The USB type-C port is in power source mode.");
+                } else if (strstr(role, "[sink]")) {
+                        any_sink = true;
+                        log_device_debug(port, "The USB type-C port is in power sink mode.");
+                }
+        }
+
+        if (any_sink) {
+                log_device_debug(device, "The USB type-C device has at least one port in power sink mode.");
+                return true;
+        }
+
+        if (any_source) {
+                log_device_debug(device, "All USB type-C ports are in power source mode.");
+                return false;
+        }
+
+        log_device_debug(device, "The USB type-C device has no port in power source mode, assuming the device is in power sink mode.");
+        return true;
+}
+
+/* Tells whether the battery 'd' should be regarded as currently powering this machine. Batteries with
+ * scope "Device", batteries reported as not present, and batteries whose status is anything other than
+ * "Discharging" yield false. A read failure alone never yields false: an unreadable 'scope' or
+ * 'present' attribute is ignored, and an unreadable 'status' is treated as discharging. */
+static bool battery_is_discharging(sd_device *d) {
+        const char *scope, *status;
+        int r;
+
+        assert(d);
+
+        r = sd_device_get_sysattr_value(d, "scope", &scope);
+        if (r >= 0) {
+                if (streq(scope, "Device")) {
+                        log_device_debug(d, "The power supply is a device battery, ignoring device.");
+                        return false;
+                }
+        } else if (r != -ENOENT)
+                log_device_debug_errno(d, r, "Failed to read 'scope' sysfs attribute, ignoring: %m");
+
+        r = device_get_sysattr_bool(d, "present");
+        if (r == 0) {
+                log_device_debug(d, "The battery is not present, ignoring the power supply.");
+                return false;
+        }
+        if (r < 0)
+                log_device_debug_errno(d, r, "Failed to read 'present' sysfs attribute, assuming the battery is present: %m");
+
+        /* The kernel reports one of "Unknown", "Charging", "Discharging", "Not charging", "Full". */
+        r = sd_device_get_sysattr_value(d, "status", &status);
+        if (r < 0) {
+                log_device_debug_errno(d, r, "Failed to read 'status' sysfs attribute, assuming the battery is discharging: %m");
+                return true;
+        }
+
+        if (streq(status, "Discharging"))
+                return true;
+
+        log_device_debug(d, "The battery status is '%s', assuming the battery is not used as a power source of this machine.", status);
+        return false;
+}
+
+/* Determines whether the system is running on external ("AC") power by looking at every device of the
+ * "power_supply" subsystem.
+ *
+ * Returns true if at least one non-battery supply reports being online, false if no such supply is
+ * online but at least one battery is discharging, and true if neither was found. A negative error is
+ * returned only if the device enumerator could not be set up. */
+int on_ac_power(void) {
+        _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *supplies = NULL;
+        bool ac_online = false, battery_discharging = false;
+        int r;
+
+        r = sd_device_enumerator_new(&supplies);
+        if (r < 0)
+                return r;
+
+        r = sd_device_enumerator_allow_uninitialized(supplies);
+        if (r < 0)
+                return r;
+
+        r = sd_device_enumerator_add_match_subsystem(supplies, "power_supply", true);
+        if (r < 0)
+                return r;
+
+        FOREACH_DEVICE(supplies, d) {
+                const char *type;
+
+                /* The power source types defined by the kernel are listed in
+                 * https://github.com/torvalds/linux/blob/4eef766b7d4d88f0b984781bc1bcb574a6eafdc7/include/linux/power_supply.h#L176
+                 * and the sysfs interface is described in
+                 * https://docs.kernel.org/admin-guide/abi-testing.html#abi-file-testing-sysfs-class-power */
+
+                r = sd_device_get_sysattr_value(d, "type", &type);
+                if (r < 0) {
+                        log_device_debug_errno(d, r, "Failed to read 'type' sysfs attribute, ignoring device: %m");
+                        continue;
+                }
+
+                /* A USB-C power supply in source mode is skipped. See issue #21988. */
+                if (streq(type, "USB")) {
+                        r = device_is_power_sink(d);
+                        if (r < 0) {
+                                log_device_debug_errno(d, r, "Failed to determine the current power role, ignoring device: %m");
+                                continue;
+                        }
+                        if (r == 0) {
+                                log_device_debug(d, "USB power supply is in source mode, ignoring device.");
+                                continue;
+                        }
+                }
+
+                /* Batteries only contribute the "discharging" flag and never count as online. */
+                if (streq(type, "Battery")) {
+                        if (battery_is_discharging(d)) {
+                                battery_discharging = true;
+                                log_device_debug(d, "The power supply is a battery and currently discharging.");
+                        }
+                        continue;
+                }
+
+                r = device_get_sysattr_unsigned(d, "online", NULL);
+                if (r < 0) {
+                        log_device_debug_errno(d, r, "Failed to query 'online' sysfs attribute, ignoring device: %m");
+                        continue;
+                }
+
+                /* At least 1 and 2 are defined as different types of 'online'. */
+                if (r > 0)
+                        ac_online = true;
+
+                log_device_debug(d, "The power supply is currently %s.", r > 0 ? "online" : "offline");
+        }
+
+        if (ac_online) {
+                log_debug("Found at least one online non-battery power supply, system is running on AC.");
+                return true;
+        }
+
+        if (battery_discharging) {
+                log_debug("Found at least one discharging battery and no online power sources, assuming system is running from battery.");
+                return false;
+        }
+
+        log_debug("No power supply reported online and no discharging battery found, assuming system is running on AC.");
+        return true;
+}
+
+/* Builds a device enumerator that matches batteries: devices of subsystem "power_supply" whose 'type'
+ * attribute is "Battery", whose 'present' attribute is "1" and whose 'scope' attribute is not "Device".
+ * On success the enumerator is stored in *ret and 0 is returned; otherwise a negative error is
+ * returned. */
+int battery_enumerator_new(sd_device_enumerator **ret) {
+        _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *batteries = NULL;
+        int r;
+
+        assert(ret);
+
+        r = sd_device_enumerator_new(&batteries);
+        if (r < 0)
+                return r;
+
+        r = sd_device_enumerator_add_match_subsystem(batteries, "power_supply", /* match = */ true);
+        if (r < 0)
+                return r;
+
+        r = sd_device_enumerator_allow_uninitialized(batteries);
+        if (r < 0)
+                return r;
+
+        /* Positive matches: it must be a battery, and it must be present. */
+        r = sd_device_enumerator_add_match_sysattr(batteries, "type", "Battery", /* match = */ true);
+        if (r < 0)
+                return r;
+
+        r = sd_device_enumerator_add_match_sysattr(batteries, "present", "1", /* match = */ true);
+        if (r < 0)
+                return r;
+
+        /* Negative match: leave out batteries with scope "Device". */
+        r = sd_device_enumerator_add_match_sysattr(batteries, "scope", "Device", /* match = */ false);
+        if (r < 0)
+                return r;
+
+        *ret = TAKE_PTR(batteries);
+        return 0;
+}
+
+/* Reads the 'capacity' attribute of the battery 'dev'. Returns the value if it lies within 0…100;
+ * otherwise the failure is logged at debug level and the result of the logging call is returned
+ * (synthetic ERANGE for an out-of-range value). */
+int battery_read_capacity_percentage(sd_device *dev) {
+        int percent, r;
+
+        assert(dev);
+
+        r = device_get_sysattr_int(dev, "capacity", &percent);
+        if (r < 0)
+                return log_device_debug_errno(dev, r, "Failed to read/parse POWER_SUPPLY_CAPACITY: %m");
+
+        if (percent >= 0 && percent <= 100)
+                return percent;
+
+        return log_device_debug_errno(dev, SYNTHETIC_ERRNO(ERANGE), "Invalid battery capacity: %d", percent);
+}
+
+/* Returns true if we are not on AC power, at least one battery has a capacity of
+ * BATTERY_LOW_CAPACITY_LEVEL percent or less, and no battery is above that threshold or unreadable.
+ * Returns false in all other cases, and a negative error if the battery enumerator cannot be set up. */
+int battery_is_discharging_and_low(void) {
+        _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *batteries = NULL;
+        bool any_unreadable = false, any_low = false;
+        int r;
+
+        /* We have not used battery capacity_level since value is set to full
+         * or Normal in case ACPI is not working properly. In case of no battery
+         * 0 will be returned and system will be suspended for 1st cycle then hibernated */
+
+        r = on_ac_power();
+        if (r > 0)
+                return false;
+        if (r < 0)
+                log_warning_errno(r, "Failed to check if the system is running on AC, assuming it is not: %m");
+
+        r = battery_enumerator_new(&batteries);
+        if (r < 0)
+                return log_error_errno(r, "Failed to initialize battery enumerator: %m");
+
+        FOREACH_DEVICE(batteries, dev) {
+                int level = battery_read_capacity_percentage(dev);
+
+                if (level < 0) {
+                        any_unreadable = true;
+                        continue;
+                }
+
+                if (level <= BATTERY_LOW_CAPACITY_LEVEL) {
+                        log_device_info(dev,
+                                        "Found battery with capacity below threshold (%d%% <= %d%%).",
+                                        level, BATTERY_LOW_CAPACITY_LEVEL);
+                        any_low = true;
+                        continue;
+                }
+
+                /* One sufficiently charged battery is enough to settle the question. */
+                log_device_full(dev,
+                                any_low ? LOG_INFO : LOG_DEBUG,
+                                "Found battery with capacity above threshold (%d%% > %d%%).",
+                                level, BATTERY_LOW_CAPACITY_LEVEL);
+                return false;
+        }
+
+        /* A battery whose state could not be read must not lead us to assume a low battery state. */
+        if (any_unreadable) {
+                log_notice("Found battery with unreadable state, assuming not in low battery state.");
+                return false;
+        }
+
+        /* With neither charged nor low batteries found this is false, i.e. not in low battery state. */
+        return any_low;
+}
diff --git a/src/shared/battery-util.h b/src/shared/battery-util.h
new file mode 100644
index 0000000..c58f30b
--- /dev/null
+++ b/src/shared/battery-util.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-device.h"
+
+/* Returns > 0 if the system runs on AC power (or that is assumed), 0 if it runs from battery, and a
+ * negative error if the power supply devices could not be enumerated. */
+int on_ac_power(void);
+
+/* Returns > 0 only if the system is not on AC power, at least one battery is at or below the low
+ * capacity threshold, and no battery is above it or unreadable. */
+int battery_is_discharging_and_low(void);
+
+/* Stores in *ret an enumerator matching present batteries whose scope is not "Device"; returns 0 on success. */
+int battery_enumerator_new(sd_device_enumerator **ret);
+/* Returns the capacity of 'dev' in percent (0…100), or a negative error if it is unreadable or out of range. */
+int battery_read_capacity_percentage(sd_device *dev);
diff --git a/src/shared/binfmt-util.c b/src/shared/binfmt-util.c
new file mode 100644
index 0000000..a261754
--- /dev/null
+++ b/src/shared/binfmt-util.c
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/stat.h>
+#include <sys/statvfs.h>
+#include <sys/vfs.h>
+
+#include "binfmt-util.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "fs-util.h"
+#include "missing_magic.h"
+#include "stat-util.h"
+
+/* Checks whether binfmt_misc is mounted and writable. Returns false if /proc/sys/fs/binfmt_misc does not
+ * exist, is not of file system type BINFMTFS_MAGIC, or fails the W_OK access check; true if all of
+ * these checks pass; and a negative error if opening the directory or checking the file system type
+ * failed. */
+int binfmt_mounted(void) {
+        _cleanup_close_ int dir_fd = -EBADF;
+        int r;
+
+        dir_fd = RET_NERRNO(open("/proc/sys/fs/binfmt_misc", O_CLOEXEC | O_DIRECTORY | O_PATH));
+        if (dir_fd < 0)
+                return dir_fd == -ENOENT ? false : dir_fd;
+
+        r = fd_is_fs_type(dir_fd, BINFMTFS_MAGIC);
+        if (r <= 0)
+                return r;
+
+        if (access_fd(dir_fd, W_OK) < 0)
+                return false;
+
+        return true;
+}
+
+/* Unregisters all binfmt_misc entries by writing "-1" to the 'status' file, provided binfmt_misc is
+ * mounted writable. Returns 0 if the entries were removed or nothing had to be done, and the result of
+ * log_warning_errno() if a step failed. */
+int disable_binfmt(void) {
+        int r;
+
+        /* Flush out all rules. This is important during shutdown to cover for rules using "F", since those
+         * might pin a file and thus block us from unmounting stuff cleanly.
+         *
+         * We are a bit careful here, since binfmt_misc might still be an autofs which we don't want to
+         * trigger. */
+
+        r = binfmt_mounted();
+        if (r < 0)
+                return log_warning_errno(r, "Failed to determine whether binfmt_misc is mounted: %m");
+
+        if (r > 0) {
+                r = write_string_file("/proc/sys/fs/binfmt_misc/status", "-1", WRITE_STRING_FILE_DISABLE_BUFFER);
+                if (r < 0)
+                        return log_warning_errno(r, "Failed to unregister binfmt_misc entries: %m");
+
+                log_debug("Unregistered all remaining binfmt_misc entries.");
+                return 0;
+        }
+
+        log_debug("binfmt_misc is not mounted in read-write mode, not detaching entries.");
+        return 0;
+}
diff --git a/src/shared/binfmt-util.h b/src/shared/binfmt-util.h
new file mode 100644
index 0000000..13f4548
--- /dev/null
+++ b/src/shared/binfmt-util.h
@@ -0,0 +1,5 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+/* Returns > 0 if binfmt_misc is mounted and writable, 0 if not, and a negative error on failure. */
+int binfmt_mounted(void);
+/* Unregisters all binfmt_misc entries if binfmt_misc is mounted writable; returns 0 or a negative error. */
+int disable_binfmt(void);
diff --git a/src/shared/bitmap.c b/src/shared/bitmap.c
new file mode 100644
index 0000000..6cf08b8
--- /dev/null
+++ b/src/shared/bitmap.c
@@ -0,0 +1,211 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "alloc-util.h"
+#include "bitmap.h"
+#include "hashmap.h"
+#include "macro.h"
+#include "memory-util.h"
+
+/* Bitmaps are only meant to store relatively small numbers
+ * (corresponding to, say, an enum), so it is ok to limit
+ * the max entry. 64k should be plenty. */
+#define BITMAPS_MAX_ENTRY 0xffff
+
+/* This indicates that we reached the end of the bitmap */
+#define BITMAP_END (UINT_MAX)
+
+/* Bits are stored in an array of 64-bit words. These helpers convert between a bit number and its
+ * position: the index of the word holding it (OFFSET) and the bit position within that word (REM). */
+#define BITMAP_NUM_TO_OFFSET(n) ((n) / (sizeof(uint64_t) * 8))
+#define BITMAP_NUM_TO_REM(n) ((n) % (sizeof(uint64_t) * 8))
+#define BITMAP_OFFSET_TO_NUM(offset, rem) ((offset) * sizeof(uint64_t) * 8 + (rem))
+
+/* Allocates a new, zero-initialized bitmap. Returns NULL if the allocation fails. */
+Bitmap* bitmap_new(void) {
+        Bitmap *fresh = new0(Bitmap, 1);
+
+        return fresh;
+}
+
+/* Returns a newly allocated bitmap holding a copy of the words of 'b', or NULL if an allocation fails.
+ * 'b' is dereferenced unconditionally, so it must not be NULL. */
+Bitmap* bitmap_copy(Bitmap *b) {
+        Bitmap *copy = bitmap_new();
+
+        if (!copy)
+                return NULL;
+
+        copy->bitmaps = newdup(uint64_t, b->bitmaps, b->n_bitmaps);
+        if (copy->bitmaps) {
+                copy->n_bitmaps = b->n_bitmaps;
+                return copy;
+        }
+
+        /* Copying the words failed, release the half-built object again. */
+        return mfree(copy);
+}
+
+/* Releases 'b' together with its word array. Accepts NULL and returns NULL in either case. */
+Bitmap* bitmap_free(Bitmap *b) {
+        if (b) {
+                free(b->bitmaps);
+                return mfree(b);
+        }
+
+        return NULL;
+}
+
+/* Makes sure *b points to a bitmap, allocating an empty one if it is NULL. Returns 0 on success
+ * (including when nothing had to be done) and -ENOMEM if the allocation fails. */
+int bitmap_ensure_allocated(Bitmap **b) {
+        assert(b);
+
+        if (!*b) {
+                Bitmap *fresh = bitmap_new();
+
+                if (!fresh)
+                        return -ENOMEM;
+
+                *b = fresh;
+        }
+
+        return 0;
+}
+
+/* Sets bit 'n' in 'b', growing the word array as needed. Returns 0 on success, -ERANGE if 'n' exceeds
+ * BITMAPS_MAX_ENTRY, and -ENOMEM if the array cannot be enlarged. */
+int bitmap_set(Bitmap *b, unsigned n) {
+        unsigned word;
+
+        assert(b);
+
+        /* Huge bitmaps are refused. */
+        if (n > BITMAPS_MAX_ENTRY)
+                return -ERANGE;
+
+        word = BITMAP_NUM_TO_OFFSET(n);
+
+        if (word >= b->n_bitmaps) {
+                if (!GREEDY_REALLOC0(b->bitmaps, word + 1))
+                        return -ENOMEM;
+
+                b->n_bitmaps = word + 1;
+        }
+
+        b->bitmaps[word] |= UINT64_C(1) << BITMAP_NUM_TO_REM(n);
+        return 0;
+}
+
+/* Clears bit 'n' in 'b'. A NULL bitmap or a bit beyond the allocated words is silently ignored. */
+void bitmap_unset(Bitmap *b, unsigned n) {
+        unsigned word;
+
+        if (!b)
+                return;
+
+        word = BITMAP_NUM_TO_OFFSET(n);
+
+        if (word < b->n_bitmaps)
+                b->bitmaps[word] &= ~(UINT64_C(1) << BITMAP_NUM_TO_REM(n));
+}
+
+/* Returns true if bit 'n' is set in 'b'. A NULL bitmap or a bit beyond the allocated words counts as
+ * not set. */
+bool bitmap_isset(const Bitmap *b, unsigned n) {
+        unsigned word;
+
+        if (!b)
+                return false;
+
+        word = BITMAP_NUM_TO_OFFSET(n);
+        if (word >= b->n_bitmaps)
+                return false;
+
+        return (b->bitmaps[word] & (UINT64_C(1) << BITMAP_NUM_TO_REM(n))) != 0;
+}
+
+/* Returns true if no bit is set in 'b'. A NULL bitmap counts as clear. */
+bool bitmap_isclear(const Bitmap *b) {
+        if (!b)
+                return true;
+
+        for (unsigned word = 0; word < b->n_bitmaps; word++) {
+                if (b->bitmaps[word] == 0)
+                        continue;
+
+                return false;
+        }
+
+        return true;
+}
+
+/* Drops all bits of 'b' by releasing its word array; the bitmap object itself stays valid. NULL is
+ * accepted and ignored. */
+void bitmap_clear(Bitmap *b) {
+        if (b) {
+                b->bitmaps = mfree(b->bitmaps);
+                b->n_bitmaps = 0;
+        }
+}
+
+/* Finds the next set bit of 'b', starting at the position recorded in the iterator 'i'. If one is
+ * found its number is stored in *n, the iterator is advanced past it and true is returned. Otherwise
+ * the iterator is marked as finished (BITMAP_END) and false is returned. A NULL bitmap yields false
+ * without touching the iterator. */
+bool bitmap_iterate(const Bitmap *b, Iterator *i, unsigned *n) {
+ uint64_t bitmask;
+ unsigned offset, rem;
+
+ assert(i);
+ assert(n);
+
+ if (!b || i->idx == BITMAP_END)
+ return false;
+
+ /* i->idx is the first bit number not yet examined; split it into word index and bit position. */
+ offset = BITMAP_NUM_TO_OFFSET(i->idx);
+ rem = BITMAP_NUM_TO_REM(i->idx);
+ bitmask = UINT64_C(1) << rem;
+
+ for (; offset < b->n_bitmaps; offset ++) {
+ /* Words that are entirely zero are skipped without looking at individual bits. */
+ if (b->bitmaps[offset]) {
+ /* The inner loop ends once the mask has been shifted out of the 64-bit word. */
+ for (; bitmask; bitmask <<= 1, rem ++) {
+ if (b->bitmaps[offset] & bitmask) {
+ *n = BITMAP_OFFSET_TO_NUM(offset, rem);
+ i->idx = *n + 1;
+
+ return true;
+ }
+ }
+ }
+
+ /* Every following word is scanned from its lowest bit. */
+ rem = 0;
+ bitmask = 1;
+ }
+
+ i->idx = BITMAP_END;
+
+ return false;
+}
+
+/* Compares two bitmaps by the bits they have set; the number of allocated words does not matter as
+ * long as the surplus words of the longer bitmap are all zero. Two NULL pointers compare equal, while
+ * a NULL pointer and a non-NULL bitmap compare unequal. */
+bool bitmap_equal(const Bitmap *a, const Bitmap *b) {
+        const Bitmap *longer;
+        size_t shared;
+
+        if (a == b)
+                return true;
+
+        /* The pointers differ, so at most one of them is NULL here. */
+        if (!a || !b)
+                return false;
+
+        shared = MIN(a->n_bitmaps, b->n_bitmaps);
+        if (memcmp_safe(a->bitmaps, b->bitmaps, sizeof(uint64_t) * shared) != 0)
+                return false;
+
+        longer = a->n_bitmaps > b->n_bitmaps ? a : b;
+        for (unsigned word = shared; word < longer->n_bitmaps; word++)
+                if (longer->bitmaps[word] != 0)
+                        return false;
+
+        return true;
+}
diff --git a/src/shared/bitmap.h b/src/shared/bitmap.h
new file mode 100644
index 0000000..e77e2e1
--- /dev/null
+++ b/src/shared/bitmap.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "hashmap.h"
+#include "macro.h"
+
+/* A growable set of small unsigned integers, stored as an array of 64-bit words. */
+typedef struct Bitmap {
+ uint64_t *bitmaps; /* the words holding the bits */
+ size_t n_bitmaps; /* number of words in 'bitmaps' */
+} Bitmap;
+
+/* Allocation and lifetime. bitmap_free() accepts NULL and always returns NULL. */
+Bitmap* bitmap_new(void);
+Bitmap* bitmap_copy(Bitmap *b);
+int bitmap_ensure_allocated(Bitmap **b);
+Bitmap* bitmap_free(Bitmap *b);
+
+/* Access to individual bits and to the bitmap as a whole. */
+int bitmap_set(Bitmap *b, unsigned n);
+void bitmap_unset(Bitmap *b, unsigned n);
+bool bitmap_isset(const Bitmap *b, unsigned n);
+bool bitmap_isclear(const Bitmap *b);
+void bitmap_clear(Bitmap *b);
+
+/* Stores the next set bit in *n and returns true, or returns false once the bitmap is exhausted. */
+bool bitmap_iterate(const Bitmap *b, Iterator *i, unsigned *n);
+
+bool bitmap_equal(const Bitmap *a, const Bitmap *b);
+
+/* Loops over all set bits of 'b', assigning each bit number to 'n'. Since &(n) is cast to unsigned*,
+ * 'n' has to be a variable that can be written through such a pointer. */
+#define _BITMAP_FOREACH(n, b, i) \
+ for (Iterator i = {}; bitmap_iterate((b), &i, (unsigned*)&(n)); )
+#define BITMAP_FOREACH(n, b) \
+ _BITMAP_FOREACH(n, b, UNIQ_T(i, UNIQ))
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(Bitmap*, bitmap_free);
+
+/* Variable attribute that applies bitmap_freep() via _cleanup_(). */
+#define _cleanup_bitmap_free_ _cleanup_(bitmap_freep)
diff --git a/src/shared/blkid-util.h b/src/shared/blkid-util.h
new file mode 100644
index 0000000..abc4b61
--- /dev/null
+++ b/src/shared/blkid-util.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#if HAVE_BLKID
+# include <blkid.h>
+
+# include "sd-id128.h"
+
+# include "macro.h"
+# include "string-util.h"
+
+/* Cleanup helper for blkid_probe objects based on blkid_free_probe(), with NULL as the "empty" value.
+ * NOTE(review): presumably this defines blkid_free_probep() for use with _cleanup_() — confirm against
+ * the macro definition. */
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(blkid_probe, blkid_free_probe, NULL);
+
+/* Parses the UUID string of partition 'p' into *ret. Returns -ENXIO if the partition has no UUID string
+ * or an empty one, otherwise the result of sd_id128_from_string(). */
+static inline int blkid_partition_get_uuid_id128(blkid_partition p, sd_id128_t *ret) {
+        const char *uuid;
+
+        assert(p);
+
+        uuid = blkid_partition_get_uuid(p);
+        if (!isempty(uuid))
+                return sd_id128_from_string(uuid, ret);
+
+        return -ENXIO;
+}
+
+/* Parses the type string of partition 'p' into *ret. Returns -ENXIO if the partition has no type string
+ * or an empty one, otherwise the result of sd_id128_from_string(). */
+static inline int blkid_partition_get_type_id128(blkid_partition p, sd_id128_t *ret) {
+        const char *type;
+
+        assert(p);
+
+        type = blkid_partition_get_type_string(p);
+        if (!isempty(type))
+                return sd_id128_from_string(type, ret);
+
+        return -ENXIO;
+}
+
+/* Define symbolic names for blkid_do_safeprobe() return values, since blkid only uses literal numbers. We
+ * prefix these symbolic definitions with underscores, to not invade libblkid's namespace needlessly.
+ * The return value of blkid_do_safeprobe() is meant to be compared directly against these constants;
+ * note that they are not listed in numeric order. */
+enum {
+ _BLKID_SAFEPROBE_FOUND = 0,
+ _BLKID_SAFEPROBE_NOT_FOUND = 1,
+ _BLKID_SAFEPROBE_AMBIGUOUS = -2,
+ _BLKID_SAFEPROBE_ERROR = -1,
+};
+
+#endif
diff --git a/src/shared/blockdev-util.c b/src/shared/blockdev-util.c
new file mode 100644
index 0000000..c906aec
--- /dev/null
+++ b/src/shared/blockdev-util.c
@@ -0,0 +1,828 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <linux/blkpg.h>
+#include <sys/file.h>
+#include <sys/ioctl.h>
+#include <sys/mount.h>
+#include <unistd.h>
+
+#include "sd-device.h"
+
+#include "alloc-util.h"
+#include "blockdev-util.h"
+#include "btrfs-util.h"
+#include "device-util.h"
+#include "devnum-util.h"
+#include "dirent-util.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "fs-util.h"
+#include "missing_magic.h"
+#include "parse-util.h"
+
+/* Determines the block device number belonging to 'fd' and stores it in *ret.
+ *
+ * If 'fd' refers to a block device node, its own device number is used. Otherwise, and only if
+ * BLOCK_DEVICE_LOOKUP_BACKING is set and 'fd' refers to a regular file or directory, the device backing
+ * the file system is used. Returns 0 on success, -ENOTBLK if no block device can be derived, or another
+ * negative error from fstat() or the btrfs lookup. */
+static int fd_get_devnum(int fd, BlockDeviceLookupFlag flags, dev_t *ret) {
+        struct stat st;
+        dev_t backing;
+        int r;
+
+        assert(fd >= 0);
+        assert(ret);
+
+        if (fstat(fd, &st) < 0)
+                return -errno;
+
+        if (S_ISBLK(st.st_mode)) {
+                *ret = st.st_rdev;
+                return 0;
+        }
+
+        if (!FLAGS_SET(flags, BLOCK_DEVICE_LOOKUP_BACKING))
+                return -ENOTBLK;
+
+        if (!S_ISREG(st.st_mode) && !S_ISDIR(st.st_mode))
+                return -ENOTBLK;
+
+        if (major(st.st_dev) != 0) {
+                *ret = st.st_dev;
+                return 0;
+        }
+
+        /* If major(st.st_dev) is zero, this might mean we are backed by btrfs, which needs special
+         * handling, to get the backing device node. */
+        r = btrfs_get_block_device_fd(fd, &backing);
+        if (r == -ENOTTY) /* not btrfs */
+                return -ENOTBLK;
+        if (r < 0)
+                return r;
+
+        *ret = backing;
+        return 0;
+}
+
+/* Returns > 0 if 'dev' is a block device of device type "disk", 0 if it is a block device of another
+ * type, -ENOTBLK if it does not belong to the "block" subsystem, and other negative errors if
+ * subsystem or device type cannot be queried. */
+int block_device_is_whole_disk(sd_device *dev) {
+        const char *subsystem, *devtype;
+        int r;
+
+        assert(dev);
+
+        r = sd_device_get_subsystem(dev, &subsystem);
+        if (r < 0)
+                return r;
+        if (!streq(subsystem, "block"))
+                return -ENOTBLK;
+
+        r = sd_device_get_devtype(dev, &devtype);
+        if (r < 0)
+                return r;
+
+        return streq(devtype, "disk");
+}
+
+/* Stores in *ret the whole-disk device for 'dev': 'dev' itself if it already is a whole disk, its
+ * parent otherwise. No reference is taken, hence the caller must not unref the returned object.
+ *
+ * Returns 0 on success, -ENODEV if the parent cannot be found (-ENOENT from the lookup), -ENXIO if the
+ * parent is not a whole disk either, or another negative error. */
+int block_device_get_whole_disk(sd_device *dev, sd_device **ret) {
+        sd_device *parent;
+        int r;
+
+        assert(dev);
+        assert(ret);
+
+        r = block_device_is_whole_disk(dev);
+        if (r < 0)
+                return r;
+        if (r > 0) {
+                *ret = dev;
+                return 0;
+        }
+
+        r = sd_device_get_parent(dev, &parent);
+        if (r == -ENOENT) /* Already removed? Let's return a recognizable error. */
+                return -ENODEV;
+        if (r < 0)
+                return r;
+
+        r = block_device_is_whole_disk(parent);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return -ENXIO;
+
+        *ret = parent;
+        return 0;
+}
+
+/* Finds the device that 'dev' is stacked on, by looking at its children below "slaves".
+ *
+ * Returns 1 and stores a new reference to the first backing device found in *ret. Returns -ENOENT if
+ * there is no usable backing device, and -ENOTUNIQ if the backing devices are located on different
+ * whole disks. */
+int block_device_get_originating(sd_device *dev, sd_device **ret) {
+ _cleanup_(sd_device_unrefp) sd_device *first_found = NULL;
+ const char *suffix;
+ dev_t devnum = 0; /* avoid false maybe-uninitialized warning */
+
+ /* For the specified block device tries to chase it through the layers, in case LUKS-style DM
+ * stacking is used, trying to find the next underlying layer. */
+
+ assert(dev);
+ assert(ret);
+
+ FOREACH_DEVICE_CHILD_WITH_SUFFIX(dev, child, suffix) {
+ sd_device *child_whole_disk;
+ dev_t n;
+
+ if (!path_startswith(suffix, "slaves"))
+ continue;
+
+ /* Children whose whole disk or device number cannot be determined are skipped. */
+ if (block_device_get_whole_disk(child, &child_whole_disk) < 0)
+ continue;
+
+ if (sd_device_get_devnum(child_whole_disk, &n) < 0)
+ continue;
+
+ /* Remember the first child itself, but the device number of its whole disk for the
+ * comparison further down. */
+ if (!first_found) {
+ first_found = sd_device_ref(child);
+ devnum = n;
+ continue;
+ }
+
+ /* We found a device backed by multiple other devices. We don't really support automatic
+ * discovery on such setups, with the exception of dm-verity partitions. In this case there
+ * are two backing devices: the data partition and the hash partition. We are fine with such
+ * setups, however, only if both partitions are on the same physical device. Hence, let's
+ * verify this by iterating over every node in the 'slaves/' directory and comparing them with
+ * the first that gets returned by readdir(), to ensure they all point to the same device. */
+ if (n != devnum)
+ return -ENOTUNIQ;
+ }
+
+ if (!first_found)
+ return -ENOENT;
+
+ *ret = TAKE_PTR(first_found);
+ return 1; /* found */
+}
+
+/* Creates an sd_device object for the block device associated with 'fd' and stores a reference to it
+ * in *ret. 'flags' selects how the device is looked up:
+ *
+ * BLOCK_DEVICE_LOOKUP_BACKING is passed on to fd_get_devnum(),
+ * BLOCK_DEVICE_LOOKUP_ORIGINATING switches to the originating device, if there is one,
+ * BLOCK_DEVICE_LOOKUP_WHOLE_DISK returns the whole disk instead of the device itself.
+ *
+ * Returns 0 on success, a negative error otherwise. */
+int block_device_new_from_fd(int fd, BlockDeviceLookupFlag flags, sd_device **ret) {
+ _cleanup_(sd_device_unrefp) sd_device *dev = NULL;
+ dev_t devnum;
+ int r;
+
+ assert(fd >= 0);
+ assert(ret);
+
+ r = fd_get_devnum(fd, flags, &devnum);
+ if (r < 0)
+ return r;
+
+ r = sd_device_new_from_devnum(&dev, 'b', devnum);
+ if (r < 0)
+ return r;
+
+ if (FLAGS_SET(flags, BLOCK_DEVICE_LOOKUP_ORIGINATING)) {
+ _cleanup_(sd_device_unrefp) sd_device *dev_origin = NULL;
+ sd_device *dev_whole_disk;
+
+ r = block_device_get_whole_disk(dev, &dev_whole_disk);
+ if (r < 0)
+ return r;
+
+ /* -ENOENT means there is no originating device, in which case 'dev' is kept as it is. */
+ r = block_device_get_originating(dev_whole_disk, &dev_origin);
+ if (r < 0 && r != -ENOENT)
+ return r;
+ /* NOTE(review): presumably this drops the reference held in 'dev' and moves 'dev_origin'
+ * into it — confirm against the helper's definition. */
+ if (r > 0)
+ device_unref_and_replace(dev, dev_origin);
+ }
+
+ if (FLAGS_SET(flags, BLOCK_DEVICE_LOOKUP_WHOLE_DISK)) {
+ sd_device *dev_whole_disk;
+
+ r = block_device_get_whole_disk(dev, &dev_whole_disk);
+ if (r < 0)
+ return r;
+
+ /* block_device_get_whole_disk() returns a borrowed pointer, hence take a reference of our own. */
+ *ret = sd_device_ref(dev_whole_disk);
+ return 0;
+ }
+
+ *ret = sd_device_ref(dev);
+ return 0;
+}
+
+/* Path-based variant of block_device_new_from_fd(): opens 'path' with O_PATH and passes the descriptor
+ * on. Returns -errno if the open fails, otherwise the result of block_device_new_from_fd(). */
+int block_device_new_from_path(const char *path, BlockDeviceLookupFlag flags, sd_device **ret) {
+        _cleanup_close_ int path_fd = -EBADF;
+
+        assert(path);
+        assert(ret);
+
+        path_fd = open(path, O_CLOEXEC|O_PATH);
+        if (path_fd >= 0)
+                return block_device_new_from_fd(path_fd, flags, ret);
+
+        return -errno;
+}
+
+/* Determines the whole-disk device number for block device 'd' by inspecting sysfs.
+ *
+ * Returns 0 and stores 'd' itself in *ret if the device has a "queue" entry. Returns 1 and stores the
+ * parent's device number if 'd' is a partition whose parent has a "queue" entry. Returns -ENODEV if the
+ * major number of 'd' is zero, and other negative errors if a sysfs lookup fails. */
+int block_get_whole_disk(dev_t d, dev_t *ret) {
+        char sysfs[SYS_BLOCK_PATH_MAX("/partition")];
+        _cleanup_free_ char *line = NULL;
+        dev_t parent;
+        int r;
+
+        assert(ret);
+
+        if (major(d) == 0)
+                return -ENODEV;
+
+        /* A device that has a queue is good enough for us. */
+        xsprintf_sys_block_path(sysfs, "/queue", d);
+        if (access(sysfs, F_OK) >= 0) {
+                *ret = d;
+                return 0;
+        }
+        if (errno != ENOENT)
+                return -errno;
+
+        /* No queue: this has to be a partition for us to continue. */
+        xsprintf_sys_block_path(sysfs, "/partition", d);
+        if (access(sysfs, F_OK) < 0)
+                return -errno;
+
+        /* Read the dev_t of the parent device. */
+        xsprintf_sys_block_path(sysfs, "/../dev", d);
+        r = read_one_line_file(sysfs, &line);
+        if (r < 0)
+                return r;
+
+        r = parse_devnum(line, &parent);
+        if (r < 0)
+                return r;
+
+        /* Only accept the parent if it has a queue as well. */
+        xsprintf_sys_block_path(sysfs, "/queue", parent);
+        if (access(sysfs, F_OK) < 0)
+                return -errno;
+
+        *ret = parent;
+        return 1;
+}
+
+/* Gets the block device directly backing the file system that 'fd' lives on. If the block device is
+ * encrypted, this is the device mapper block device.
+ *
+ * Returns 1 with the device number in *ret if one was found, 0 with *ret set to 0 if the btrfs lookup
+ * reports -ENOTTY (i.e. not btrfs), and otherwise the result of fstat() (as -errno) or of the btrfs
+ * lookup. */
+int get_block_device_fd(int fd, dev_t *ret) {
+        struct stat st;
+        int r;
+
+        assert(fd >= 0);
+        assert(ret);
+
+        if (fstat(fd, &st) != 0)
+                return -errno;
+
+        if (major(st.st_dev) == 0) {
+                /* A zero major number: ask btrfs for the backing device. */
+                r = btrfs_get_block_device_fd(fd, ret);
+                if (r > 0)
+                        return 1;
+                if (r != -ENOTTY) /* not btrfs */
+                        return r;
+
+                *ret = 0;
+                return 0;
+        }
+
+        *ret = st.st_dev;
+        return 1;
+}
+
+/* Path-based variant of get_block_device_fd(). 'path' is opened read-only with O_NOFOLLOW; -errno is
+ * returned if that fails, otherwise the result of get_block_device_fd(). */
+int get_block_device(const char *path, dev_t *ret) {
+        _cleanup_close_ int path_fd = -EBADF;
+
+        assert(path);
+        assert(ret);
+
+        path_fd = open(path, O_RDONLY|O_NOFOLLOW|O_CLOEXEC);
+        if (path_fd >= 0)
+                return get_block_device_fd(path_fd, ret);
+
+        return -errno;
+}
+
+/* dev_t based variant of block_device_get_originating(): looks up the block device 'dt', determines its
+ * originating device and stores the device number of the latter in *ret. Returns a negative error if
+ * any step fails, otherwise the result of sd_device_get_devnum(). */
+int block_get_originating(dev_t dt, dev_t *ret) {
+        _cleanup_(sd_device_unrefp) sd_device *dev = NULL;
+        _cleanup_(sd_device_unrefp) sd_device *origin = NULL;
+        int r;
+
+        assert(ret);
+
+        r = sd_device_new_from_devnum(&dev, 'b', dt);
+        if (r >= 0)
+                r = block_device_get_originating(dev, &origin);
+        if (r < 0)
+                return r;
+
+        return sd_device_get_devnum(origin, ret);
+}
+
+/* Gets the backing block device for the file system 'fd' lives on, and additionally tries to replace it
+ * by its originating device, which handles LUKS encrypted file systems. A failure to find the
+ * originating device is logged at debug level and otherwise ignored.
+ *
+ * Returns 1 if a device was found, and otherwise the zero or negative result of
+ * get_block_device_fd(). */
+int get_block_device_harder_fd(int fd, dev_t *ret) {
+        int r;
+
+        assert(fd >= 0);
+        assert(ret);
+
+        r = get_block_device_fd(fd, ret);
+        if (r <= 0)
+                return r;
+
+        /* *ret serves as both input and output here. */
+        r = block_get_originating(*ret, ret);
+        if (r < 0)
+                log_debug_errno(r, "Failed to chase block device, ignoring: %m");
+
+        return 1;
+}
+
+/* Path-based variant of get_block_device_harder_fd(). 'path' is opened read-only with O_NOFOLLOW;
+ * -errno is returned if that fails, otherwise the result of get_block_device_harder_fd(). */
+int get_block_device_harder(const char *path, dev_t *ret) {
+        _cleanup_close_ int path_fd = -EBADF;
+
+        assert(path);
+        assert(ret);
+
+        path_fd = open(path, O_RDONLY|O_NOFOLLOW|O_CLOEXEC);
+        if (path_fd >= 0)
+                return get_block_device_harder_fd(path_fd, ret);
+
+        return -errno;
+}
+
+/* Takes a BSD file lock on the whole block device that 'devt' belongs to, as described in
+ * https://systemd.io/BLOCK_DEVICE_LOCKING. 'operation' is passed to flock() as is.
+ *
+ * Returns the file descriptor holding the lock on success, a negative error otherwise. */
+int lock_whole_block_device(dev_t devt, int operation) {
+        _cleanup_close_ int lock_fd = -EBADF;
+        dev_t whole_devt;
+        int r;
+
+        r = block_get_whole_disk(devt, &whole_devt);
+        if (r < 0)
+                return r;
+
+        lock_fd = device_open_from_devnum(S_IFBLK, whole_devt, O_RDONLY|O_CLOEXEC|O_NONBLOCK, NULL);
+        if (lock_fd < 0)
+                return lock_fd;
+
+        if (flock(lock_fd, operation) < 0)
+                return -errno;
+
+        /* Hand the descriptor over to the caller instead of closing it on return. */
+        return TAKE_FD(lock_fd);
+}
+
+/* Checks whether partition scanning is enabled on the block device 'fd' refers to, by reading the
+ * hexadecimal 'capability' attribute from sysfs and testing the GENHD_FL_NO_PART_SCAN bit.
+ *
+ * Returns true if the bit is not set, false if it is set or the attribute does not exist, -ENOTBLK if
+ * 'fd' is not a block device, and other negative errors on failure. */
+int blockdev_partscan_enabled(int fd) {
+        _cleanup_free_ char *attr_path = NULL, *line = NULL;
+        unsigned long long capability;
+        struct stat st;
+        int r;
+
+        if (fstat(fd, &st) < 0)
+                return -errno;
+
+        if (!S_ISBLK(st.st_mode))
+                return -ENOTBLK;
+
+        if (asprintf(&attr_path, "/sys/dev/block/%u:%u/capability", major(st.st_rdev), minor(st.st_rdev)) < 0)
+                return -ENOMEM;
+
+        r = read_one_line_file(attr_path, &line);
+        if (r == -ENOENT)
+                /* If the capability file doesn't exist then we are most likely looking at a partition
+                 * block device, not the whole block device. And that means we have no partition
+                 * scanning on for it (we do for its parent, but not for the partition itself). */
+                return false;
+        if (r < 0)
+                return r;
+
+        r = safe_atollu_full(line, 16, &capability);
+        if (r < 0)
+                return r;
+
+#ifndef GENHD_FL_NO_PART_SCAN
+#define GENHD_FL_NO_PART_SCAN (0x0200)
+#endif
+
+        return !FLAGS_SET(capability, GENHD_FL_NO_PART_SCAN);
+}
+
+/* Recursively checks whether the block device at 'sysfs_path' is a dm-crypt device or is stacked
+ * exclusively on top of devices for which that holds. 'depth_left' bounds the recursion depth.
+ *
+ * Returns true if the device itself is dm-crypt, or if it has underlying devices and all of them are
+ * encrypted. Returns false if it has no underlying devices or at least one that is not encrypted.
+ * Returns -EINVAL once 'depth_left' reaches zero, and other negative errors on failure. */
+static int blockdev_is_encrypted(const char *sysfs_path, unsigned depth_left) {
+ _cleanup_free_ char *p = NULL, *uuids = NULL;
+ _cleanup_closedir_ DIR *d = NULL;
+ int r, found_encrypted = false;
+
+ assert(sysfs_path);
+
+ if (depth_left == 0)
+ return -EINVAL;
+
+ p = path_join(sysfs_path, "dm/uuid");
+ if (!p)
+ return -ENOMEM;
+
+ /* A missing "dm/uuid" file is not an error, we then go on to look at the underlying devices. */
+ r = read_one_line_file(p, &uuids);
+ if (r != -ENOENT) {
+ if (r < 0)
+ return r;
+
+ /* The DM device's uuid attribute is prefixed with "CRYPT-" if this is a dm-crypt device. */
+ if (startswith(uuids, "CRYPT-"))
+ return true;
+ }
+
+ /* Not a dm-crypt device itself. But maybe it is on top of one? Follow the links in the "slaves/"
+ * subdir. */
+
+ p = mfree(p);
+ p = path_join(sysfs_path, "slaves");
+ if (!p)
+ return -ENOMEM;
+
+ d = opendir(p);
+ if (!d) {
+ if (errno == ENOENT) /* Doesn't have underlying devices */
+ return false;
+
+ return -errno;
+ }
+
+ for (;;) {
+ _cleanup_free_ char *q = NULL;
+ struct dirent *de;
+
+ /* errno is reset first, so that a NULL result can be told apart: end of directory vs. error. */
+ errno = 0;
+ de = readdir_no_dot(d);
+ if (!de) {
+ if (errno != 0)
+ return -errno;
+
+ break; /* No more underlying devices */
+ }
+
+ q = path_join(p, de->d_name);
+ if (!q)
+ return -ENOMEM;
+
+ r = blockdev_is_encrypted(q, depth_left - 1);
+ if (r < 0)
+ return r;
+ if (r == 0) /* we found one that is not encrypted? then propagate that immediately */
+ return false;
+
+ found_encrypted = true;
+ }
+
+ /* Still false if the "slaves" directory turned out to be empty. */
+ return found_encrypted;
+}
+
+/* Checks whether the block device backing the file system of 'fd' is encrypted. Returns false if there
+ * is no backing block device, a negative error if the lookup fails, and otherwise the result of
+ * blockdev_is_encrypted(). */
+int fd_is_encrypted(int fd) {
+        char sysfs[SYS_BLOCK_PATH_MAX(NULL)];
+        dev_t backing;
+        int r;
+
+        r = get_block_device_fd(fd, &backing);
+        if (r < 0)
+                return r;
+        if (r == 0) /* doesn't have a block device */
+                return false;
+
+        xsprintf_sys_block_path(sysfs, NULL, backing);
+
+        /* The depth limit of 10 is a safety net against runaway recursion. */
+        return blockdev_is_encrypted(sysfs, 10 /* safety net: maximum recursion depth */);
+}
+
+/* Checks whether the block device backing the file system of 'path' is encrypted. Returns false if
+ * there is no backing block device, a negative error if the lookup fails, and otherwise the result of
+ * blockdev_is_encrypted(). */
+int path_is_encrypted(const char *path) {
+        char sysfs[SYS_BLOCK_PATH_MAX(NULL)];
+        dev_t backing;
+        int r;
+
+        r = get_block_device(path, &backing);
+        if (r < 0)
+                return r;
+        if (r == 0) /* doesn't have a block device */
+                return false;
+
+        xsprintf_sys_block_path(sysfs, NULL, backing);
+
+        /* The depth limit of 10 is a safety net against runaway recursion. */
+        return blockdev_is_encrypted(sysfs, 10 /* safety net: maximum recursion depth */);
+}
+
+/* Determines the whole-disk device number for 'fd'. If 'backing' is true, 'fd' may also refer to a
+ * regular file or directory, in which case the device backing its file system is used. Returns a
+ * negative error if the device number cannot be determined, otherwise the result of
+ * block_get_whole_disk(). */
+int fd_get_whole_disk(int fd, bool backing, dev_t *ret) {
+        BlockDeviceLookupFlag flags = backing ? BLOCK_DEVICE_LOOKUP_BACKING : 0;
+        dev_t devt;
+        int r;
+
+        assert(fd >= 0);
+        assert(ret);
+
+        r = fd_get_devnum(fd, flags, &devt);
+        if (r < 0)
+                return r;
+
+        return block_get_whole_disk(devt, ret);
+}
+
+/* Path-based variant of fd_get_whole_disk(): opens 'path' with O_PATH and passes the descriptor on
+ * together with 'backing' and 'ret'. Returns -errno if the open fails, otherwise the result of
+ * fd_get_whole_disk(). */
+int path_get_whole_disk(const char *path, bool backing, dev_t *ret) {
+        _cleanup_close_ int fd = -EBADF;
+
+        /* Validate the arguments up front, like the other path-based wrappers in this file do, rather
+         * than passing a NULL path on to open(). */
+        assert(path);
+        assert(ret);
+
+        fd = open(path, O_CLOEXEC|O_PATH);
+        if (fd < 0)
+                return -errno;
+
+        return fd_get_whole_disk(fd, backing, ret);
+}
+
+/* Asks the kernel, via the BLKPG ioctl on 'fd', to add partition number 'nr' named 'name', with the
+ * given 'start' and 'size'. Returns -EINVAL if 'name' does not fit into the kernel structure,
+ * otherwise 0 or the negative errno of the ioctl. */
+int block_device_add_partition(
+                int fd,
+                const char *name,
+                int nr,
+                uint64_t start,
+                uint64_t size) {
+
+        assert(fd >= 0);
+        assert(name);
+        assert(nr > 0);
+
+        struct blkpg_partition partition = {
+                .pno = nr,
+                .start = start,
+                .length = size,
+        };
+
+        /* The name has to fit including its terminating NUL byte. */
+        if (strlen(name) >= sizeof(partition.devname))
+                return -EINVAL;
+
+        strcpy(partition.devname, name);
+
+        struct blkpg_ioctl_arg arg = {
+                .op = BLKPG_ADD_PARTITION,
+                .data = &partition,
+                .datalen = sizeof(partition),
+        };
+
+        return RET_NERRNO(ioctl(fd, BLKPG, &arg));
+}
+
+/* Asks the kernel, via the BLKPG ioctl on 'fd', to remove partition number 'nr' named 'name'. Returns
+ * -EINVAL if 'name' does not fit into the kernel structure, otherwise 0 or the negative errno of the
+ * ioctl. */
+int block_device_remove_partition(
+                int fd,
+                const char *name,
+                int nr) {
+
+        assert(fd >= 0);
+        assert(name);
+        assert(nr > 0);
+
+        struct blkpg_partition partition = {
+                .pno = nr,
+        };
+
+        /* The name has to fit including its terminating NUL byte. */
+        if (strlen(name) >= sizeof(partition.devname))
+                return -EINVAL;
+
+        strcpy(partition.devname, name);
+
+        struct blkpg_ioctl_arg arg = {
+                .op = BLKPG_DEL_PARTITION,
+                .data = &partition,
+                .datalen = sizeof(partition),
+        };
+
+        return RET_NERRNO(ioctl(fd, BLKPG, &arg));
+}
+
+/* Asks the kernel, via the BLKPG ioctl on 'fd', to resize partition number 'nr' to the given 'start'
+ * and 'size'. Returns 0 on success or the negative errno of the ioctl. */
+int block_device_resize_partition(
+                int fd,
+                int nr,
+                uint64_t start,
+                uint64_t size) {
+
+        assert(fd >= 0);
+        assert(nr > 0);
+
+        struct blkpg_partition partition = {
+                .pno = nr,
+                .start = start,
+                .length = size,
+        };
+        struct blkpg_ioctl_arg arg = {
+                .op = BLKPG_RESIZE_PARTITION,
+                .data = &partition,
+                .datalen = sizeof(partition),
+        };
+
+        return RET_NERRNO(ioctl(fd, BLKPG, &arg));
+}
+
+/* Builds an enumerator matching the partitions of the whole-disk block device 'dev': devices of
+ * subsystem "block" with DEVTYPE "partition" that have 'dev' as parent and whose sysname starts with
+ * the sysname of 'dev'.
+ *
+ * On success the enumerator is stored in *ret and 0 is returned. Returns -ENXIO if 'dev' is not a
+ * whole disk, and other negative errors on failure. */
+int partition_enumerator_new(sd_device *dev, sd_device_enumerator **ret) {
+        _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *partitions = NULL;
+        const char *sysname, *pattern;
+        int r;
+
+        assert(dev);
+        assert(ret);
+
+        /* We insist on a "whole" device and refuse to work on a partition block device. */
+        r = block_device_is_whole_disk(dev);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return -ENXIO; /* return a recognizable error */
+
+        r = sd_device_enumerator_new(&partitions);
+        if (r < 0)
+                return r;
+
+        r = sd_device_enumerator_allow_uninitialized(partitions);
+        if (r < 0)
+                return r;
+
+        r = sd_device_enumerator_add_match_parent(partitions, dev);
+        if (r < 0)
+                return r;
+
+        r = sd_device_get_sysname(dev, &sysname);
+        if (r < 0)
+                return r;
+
+        /* Matching on the sysname as well is for safety, and hopefully also helps performance. */
+        pattern = strjoina(sysname, "*");
+        r = sd_device_enumerator_add_match_sysname(partitions, pattern);
+        if (r < 0)
+                return r;
+
+        r = sd_device_enumerator_add_match_subsystem(partitions, "block", /* match = */ true);
+        if (r < 0)
+                return r;
+
+        r = sd_device_enumerator_add_match_property(partitions, "DEVTYPE", "partition");
+        if (r < 0)
+                return r;
+
+        *ret = TAKE_PTR(partitions);
+        return 0;
+}
+
+/* Removes all partitions of a whole-disk block device from the kernel. The device is given as 'dev', as
+ * 'fd', or both; whichever is missing is derived from the other.
+ *
+ * Failures to remove individual partitions are logged at debug level and do not stop the loop. Returns
+ * the first such error if there was one, otherwise > 0 if partitions were found and 0 if there were
+ * none. Failures during setup or while querying a partition are returned right away. */
+int block_device_remove_all_partitions(sd_device *dev, int fd) {
+ _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL;
+ _cleanup_(sd_device_unrefp) sd_device *dev_unref = NULL;
+ _cleanup_close_ int fd_close = -EBADF;
+ bool has_partitions = false;
+ int r, k = 0;
+
+ assert(dev || fd >= 0);
+
+ /* No device object passed in? Then create one from the fd, to be released again on return. */
+ if (!dev) {
+ r = block_device_new_from_fd(fd, 0, &dev_unref);
+ if (r < 0)
+ return r;
+
+ dev = dev_unref;
+ }
+
+ r = partition_enumerator_new(dev, &e);
+ if (r < 0)
+ return r;
+
+ /* No fd passed in? Then open the device ourselves, to be closed again on return. */
+ if (fd < 0) {
+ fd_close = sd_device_open(dev, O_CLOEXEC|O_NONBLOCK|O_NOCTTY|O_RDONLY);
+ if (fd_close < 0)
+ return fd_close;
+
+ fd = fd_close;
+ }
+
+ FOREACH_DEVICE(e, part) {
+ const char *v, *devname;
+ int nr;
+
+ has_partitions = true;
+
+ r = sd_device_get_devname(part, &devname);
+ if (r < 0)
+ return r;
+
+ /* The partition number is taken from the PARTN property. */
+ r = sd_device_get_property_value(part, "PARTN", &v);
+ if (r < 0)
+ return r;
+
+ r = safe_atoi(v, &nr);
+ if (r < 0)
+ return r;
+
+ r = btrfs_forget_device(devname);
+ if (r < 0 && r != -ENOENT)
+ log_debug_errno(r, "Failed to forget btrfs device %s, ignoring: %m", devname);
+
+ r = block_device_remove_partition(fd, devname, nr);
+ if (r == -ENODEV) {
+ log_debug("Kernel removed partition %s before us, ignoring", devname);
+ continue;
+ }
+ if (r < 0) {
+ log_debug_errno(r, "Failed to remove partition %s: %m", devname);
+ /* Only the first error is remembered. */
+ k = k < 0 ? k : r;
+ continue;
+ }
+
+ log_debug("Removed partition %s", devname);
+ }
+
+ return k < 0 ? k : has_partitions;
+}
+
+/* Tells whether the whole-disk block device 'dev' currently has partitions: > 0 if the partition
+ * enumerator yields at least one device, 0 if it yields none, negative errno-style error if the
+ * enumerator cannot be set up. */
+int block_device_has_partitions(sd_device *dev) {
+        _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *partitions = NULL;
+        int r;
+
+        assert(dev);
+
+        r = partition_enumerator_new(dev, &partitions);
+        if (r < 0)
+                return r;
+
+        /* A single hit is enough, no need to walk the whole list */
+        return sd_device_enumerator_get_device_first(partitions) != NULL;
+}
+
+/* Opens 'dev', takes a non-blocking exclusive flock() on it and issues BLKRRPART so that the partition
+ * table is read again. This only succeeds if none of the devices is busy. Returns 0 on success, a
+ * negative errno-style error otherwise. */
+int blockdev_reread_partition_table(sd_device *dev) {
+        assert(dev);
+
+        _cleanup_close_ int device_fd = sd_device_open(dev, O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
+        if (device_fd < 0)
+                return device_fd;
+
+        /* Lock first, then re-read; errno of whichever step failed is reported */
+        if (flock(device_fd, LOCK_EX|LOCK_NB) >= 0 &&
+            ioctl(device_fd, BLKRRPART, 0) >= 0)
+                return 0;
+
+        return -errno;
+}
+
+/* Queries the sector size of the block device 'fd' with the BLKSSZGET ioctl and stores it in *ret.
+ * Returns 0 on success, -errno if the ioctl fails, and -EIO if the reported value is not positive. */
+int blockdev_get_sector_size(int fd, uint32_t *ret) {
+        int sector_size = 0;
+
+        assert(fd >= 0);
+        assert(ret);
+
+        if (ioctl(fd, BLKSSZGET, &sector_size) < 0)
+                return -errno;
+
+        /* The variable starts out as zero, hence a positive value proves the kernel filled it in */
+        if (sector_size > 0) {
+                *ret = sector_size;
+                return 0;
+        }
+
+        return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Block device reported invalid sector size %i.", sector_size);
+}
+
+/* Determines the block device backing the root file system.
+ *
+ * 'level' is the log level used when reporting failures. 'ret' may be NULL if the caller is only
+ * interested in the return value. */
+int blockdev_get_root(int level, dev_t *ret) {
+ _cleanup_free_ char *p = NULL;
+ dev_t devno;
+ int r;
+
+ /* Returns the device node backing the root file system. Traces through
+ * dm-crypt/dm-verity/... Returns > 0 and the devno of the device on success. If there's no block
+ * device (or multiple) returns 0 and a devno of 0. Failure otherwise.
+ *
+ * If the root mount has been replaced by some form of volatile file system (overlayfs), the original
+ * root block device node is symlinked in /run/systemd/volatile-root. Let's read that here. */
+ r = readlink_malloc("/run/systemd/volatile-root", &p);
+ if (r == -ENOENT) { /* volatile-root not found */
+ r = get_block_device_harder("/", &devno);
+ if (r == -EUCLEAN)
+ return btrfs_log_dev_root(level, r, "root file system");
+ if (r < 0)
+ return log_full_errno(level, r, "Failed to determine block device of root file system: %m");
+ if (r == 0) { /* Not backed by a single block device. (Could be NFS or so, or could be multi-device RAID or so) */
+ /* Second attempt: use the device backing /usr instead */
+ r = get_block_device_harder("/usr", &devno);
+ if (r == -EUCLEAN)
+ return btrfs_log_dev_root(level, r, "/usr");
+ if (r < 0)
+ return log_full_errno(level, r, "Failed to determine block device of /usr/ file system: %m");
+ if (r == 0) { /* /usr/ not backed by single block device, either. */
+ log_debug("Neither root nor /usr/ file system are on a (single) block device.");
+
+ if (ret)
+ *ret = 0;
+
+ return 0;
+ }
+ }
+ } else if (r < 0)
+ return log_full_errno(level, r, "Failed to read symlink /run/systemd/volatile-root: %m");
+ else {
+ /* The symlink exists: derive node type and device number from its target path */
+ mode_t m;
+ r = device_path_parse_major_minor(p, &m, &devno);
+ if (r < 0)
+ return log_full_errno(level, r, "Failed to parse major/minor device node: %m");
+ if (!S_ISBLK(m))
+ return log_full_errno(level, SYNTHETIC_ERRNO(ENOTBLK), "Volatile root device is of wrong type.");
+ }
+
+ /* Every path that gets here has set devno */
+ if (ret)
+ *ret = devno;
+
+ return 1;
+}
diff --git a/src/shared/blockdev-util.h b/src/shared/blockdev-util.h
new file mode 100644
index 0000000..954a23d
--- /dev/null
+++ b/src/shared/blockdev-util.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <sys/types.h>
+
+#include "sd-device.h"
+
+#include "macro.h"
+#include "stdio-util.h"
+#include "string-util.h"
+
+#define SYS_BLOCK_PATH_MAX(suffix) \
+ (STRLEN("/sys/dev/block/") + DECIMAL_STR_MAX(dev_t) + 1 + DECIMAL_STR_MAX(dev_t) + strlen_ptr(suffix))
+#define xsprintf_sys_block_path(buf, suffix, devno) \
+ xsprintf(buf, "/sys/dev/block/%u:%u%s", major(devno), minor(devno), strempty(suffix))
+
+typedef enum BlockDeviceLookupFlag {
+ BLOCK_DEVICE_LOOKUP_WHOLE_DISK = 1 << 0, /* whole block device, e.g. sda, nvme0n1, or loop0. */
+ BLOCK_DEVICE_LOOKUP_BACKING = 1 << 1, /* fd may be regular file or directory on file system, in
+ * which case backing block device is determined. */
+ BLOCK_DEVICE_LOOKUP_ORIGINATING = 1 << 2, /* Try to find the underlying layer device for stacked
+ * block device, e.g. LUKS-style DM. */
+} BlockDeviceLookupFlag;
+
+int block_device_new_from_fd(int fd, BlockDeviceLookupFlag flag, sd_device **ret);
+int block_device_new_from_path(const char *path, BlockDeviceLookupFlag flag, sd_device **ret);
+
+int block_device_is_whole_disk(sd_device *dev);
+int block_device_get_whole_disk(sd_device *dev, sd_device **ret);
+int block_device_get_originating(sd_device *dev, sd_device **ret);
+
+int block_get_whole_disk(dev_t d, dev_t *ret);
+int block_get_originating(dev_t d, dev_t *ret);
+
+int get_block_device_fd(int fd, dev_t *ret);
+int get_block_device(const char *path, dev_t *dev);
+
+int get_block_device_harder_fd(int fd, dev_t *dev);
+int get_block_device_harder(const char *path, dev_t *dev);
+
+int lock_whole_block_device(dev_t devt, int operation);
+
+int blockdev_partscan_enabled(int fd);
+
+int fd_is_encrypted(int fd);
+int path_is_encrypted(const char *path);
+
+int fd_get_whole_disk(int fd, bool backing, dev_t *ret);
+int path_get_whole_disk(const char *path, bool backing, dev_t *ret);
+
+int block_device_add_partition(int fd, const char *name, int nr, uint64_t start, uint64_t size);
+int block_device_remove_partition(int fd, const char *name, int nr);
+int block_device_resize_partition(int fd, int nr, uint64_t start, uint64_t size);
+int partition_enumerator_new(sd_device *dev, sd_device_enumerator **ret);
+int block_device_remove_all_partitions(sd_device *dev, int fd);
+int block_device_has_partitions(sd_device *dev);
+int blockdev_reread_partition_table(sd_device *dev);
+
+int blockdev_get_sector_size(int fd, uint32_t *ret);
+
+int blockdev_get_root(int level, dev_t *ret);
diff --git a/src/shared/bond-util.c b/src/shared/bond-util.c
new file mode 100644
index 0000000..e04b201
--- /dev/null
+++ b/src/shared/bond-util.c
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "bond-util.h"
+#include "string-table.h"
+
+static const char* const bond_mode_table[_NETDEV_BOND_MODE_MAX] = {
+ [NETDEV_BOND_MODE_BALANCE_RR] = "balance-rr",
+ [NETDEV_BOND_MODE_ACTIVE_BACKUP] = "active-backup",
+ [NETDEV_BOND_MODE_BALANCE_XOR] = "balance-xor",
+ [NETDEV_BOND_MODE_BROADCAST] = "broadcast",
+ [NETDEV_BOND_MODE_802_3AD] = "802.3ad",
+ [NETDEV_BOND_MODE_BALANCE_TLB] = "balance-tlb",
+ [NETDEV_BOND_MODE_BALANCE_ALB] = "balance-alb",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(bond_mode, BondMode);
+
+static const char* const bond_xmit_hash_policy_table[_NETDEV_BOND_XMIT_HASH_POLICY_MAX] = {
+ [NETDEV_BOND_XMIT_HASH_POLICY_LAYER2] = "layer2",
+ [NETDEV_BOND_XMIT_HASH_POLICY_LAYER34] = "layer3+4",
+ [NETDEV_BOND_XMIT_HASH_POLICY_LAYER23] = "layer2+3",
+ [NETDEV_BOND_XMIT_HASH_POLICY_ENCAP23] = "encap2+3",
+ [NETDEV_BOND_XMIT_HASH_POLICY_ENCAP34] = "encap3+4",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(bond_xmit_hash_policy, BondXmitHashPolicy);
+
+static const char* const bond_lacp_rate_table[_NETDEV_BOND_LACP_RATE_MAX] = {
+ [NETDEV_BOND_LACP_RATE_SLOW] = "slow",
+ [NETDEV_BOND_LACP_RATE_FAST] = "fast",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(bond_lacp_rate, BondLacpRate);
+
+static const char* const bond_ad_select_table[_NETDEV_BOND_AD_SELECT_MAX] = {
+ [NETDEV_BOND_AD_SELECT_STABLE] = "stable",
+ [NETDEV_BOND_AD_SELECT_BANDWIDTH] = "bandwidth",
+ [NETDEV_BOND_AD_SELECT_COUNT] = "count",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(bond_ad_select, BondAdSelect);
+
+static const char* const bond_fail_over_mac_table[_NETDEV_BOND_FAIL_OVER_MAC_MAX] = {
+ [NETDEV_BOND_FAIL_OVER_MAC_NONE] = "none",
+ [NETDEV_BOND_FAIL_OVER_MAC_ACTIVE] = "active",
+ [NETDEV_BOND_FAIL_OVER_MAC_FOLLOW] = "follow",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(bond_fail_over_mac, BondFailOverMac);
+
+static const char *const bond_arp_validate_table[_NETDEV_BOND_ARP_VALIDATE_MAX] = {
+ [NETDEV_BOND_ARP_VALIDATE_NONE] = "none",
+ [NETDEV_BOND_ARP_VALIDATE_ACTIVE]= "active",
+ [NETDEV_BOND_ARP_VALIDATE_BACKUP]= "backup",
+ [NETDEV_BOND_ARP_VALIDATE_ALL]= "all",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(bond_arp_validate, BondArpValidate);
+
+static const char *const bond_arp_all_targets_table[_NETDEV_BOND_ARP_ALL_TARGETS_MAX] = {
+ [NETDEV_BOND_ARP_ALL_TARGETS_ANY] = "any",
+ [NETDEV_BOND_ARP_ALL_TARGETS_ALL] = "all",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(bond_arp_all_targets, BondArpAllTargets);
+
+static const char *const bond_primary_reselect_table[_NETDEV_BOND_PRIMARY_RESELECT_MAX] = {
+ [NETDEV_BOND_PRIMARY_RESELECT_ALWAYS] = "always",
+ [NETDEV_BOND_PRIMARY_RESELECT_BETTER]= "better",
+ [NETDEV_BOND_PRIMARY_RESELECT_FAILURE]= "failure",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(bond_primary_reselect, BondPrimaryReselect);
diff --git a/src/shared/bond-util.h b/src/shared/bond-util.h
new file mode 100644
index 0000000..9e693b1
--- /dev/null
+++ b/src/shared/bond-util.h
@@ -0,0 +1,106 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <netinet/in.h>
+#include <linux/if_bonding.h>
+
+#include "macro.h"
+
+/*
+ * Maximum number of targets supported by the kernel for a single
+ * bond netdev.
+ */
+#define NETDEV_BOND_ARP_TARGETS_MAX 16
+
+typedef enum BondMode {
+ NETDEV_BOND_MODE_BALANCE_RR = BOND_MODE_ROUNDROBIN,
+ NETDEV_BOND_MODE_ACTIVE_BACKUP = BOND_MODE_ACTIVEBACKUP,
+ NETDEV_BOND_MODE_BALANCE_XOR = BOND_MODE_XOR,
+ NETDEV_BOND_MODE_BROADCAST = BOND_MODE_BROADCAST,
+ NETDEV_BOND_MODE_802_3AD = BOND_MODE_8023AD,
+ NETDEV_BOND_MODE_BALANCE_TLB = BOND_MODE_TLB,
+ NETDEV_BOND_MODE_BALANCE_ALB = BOND_MODE_ALB,
+ _NETDEV_BOND_MODE_MAX,
+ _NETDEV_BOND_MODE_INVALID = -EINVAL,
+} BondMode;
+
+typedef enum BondXmitHashPolicy {
+ NETDEV_BOND_XMIT_HASH_POLICY_LAYER2 = BOND_XMIT_POLICY_LAYER2,
+ NETDEV_BOND_XMIT_HASH_POLICY_LAYER34 = BOND_XMIT_POLICY_LAYER34,
+ NETDEV_BOND_XMIT_HASH_POLICY_LAYER23 = BOND_XMIT_POLICY_LAYER23,
+ NETDEV_BOND_XMIT_HASH_POLICY_ENCAP23 = BOND_XMIT_POLICY_ENCAP23,
+ NETDEV_BOND_XMIT_HASH_POLICY_ENCAP34 = BOND_XMIT_POLICY_ENCAP34,
+ _NETDEV_BOND_XMIT_HASH_POLICY_MAX,
+ _NETDEV_BOND_XMIT_HASH_POLICY_INVALID = -EINVAL,
+} BondXmitHashPolicy;
+
+typedef enum BondLacpRate {
+ NETDEV_BOND_LACP_RATE_SLOW,
+ NETDEV_BOND_LACP_RATE_FAST,
+ _NETDEV_BOND_LACP_RATE_MAX,
+ _NETDEV_BOND_LACP_RATE_INVALID = -EINVAL,
+} BondLacpRate;
+
+typedef enum BondAdSelect {
+ NETDEV_BOND_AD_SELECT_STABLE,
+ NETDEV_BOND_AD_SELECT_BANDWIDTH,
+ NETDEV_BOND_AD_SELECT_COUNT,
+ _NETDEV_BOND_AD_SELECT_MAX,
+ _NETDEV_BOND_AD_SELECT_INVALID = -EINVAL,
+} BondAdSelect;
+
+typedef enum BondFailOverMac {
+ NETDEV_BOND_FAIL_OVER_MAC_NONE,
+ NETDEV_BOND_FAIL_OVER_MAC_ACTIVE,
+ NETDEV_BOND_FAIL_OVER_MAC_FOLLOW,
+ _NETDEV_BOND_FAIL_OVER_MAC_MAX,
+ _NETDEV_BOND_FAIL_OVER_MAC_INVALID = -EINVAL,
+} BondFailOverMac;
+
+typedef enum BondArpValidate {
+ NETDEV_BOND_ARP_VALIDATE_NONE,
+ NETDEV_BOND_ARP_VALIDATE_ACTIVE,
+ NETDEV_BOND_ARP_VALIDATE_BACKUP,
+ NETDEV_BOND_ARP_VALIDATE_ALL,
+ _NETDEV_BOND_ARP_VALIDATE_MAX,
+ _NETDEV_BOND_ARP_VALIDATE_INVALID = -EINVAL,
+} BondArpValidate;
+
+typedef enum BondArpAllTargets {
+ NETDEV_BOND_ARP_ALL_TARGETS_ANY,
+ NETDEV_BOND_ARP_ALL_TARGETS_ALL,
+ _NETDEV_BOND_ARP_ALL_TARGETS_MAX,
+ _NETDEV_BOND_ARP_ALL_TARGETS_INVALID = -EINVAL,
+} BondArpAllTargets;
+
+typedef enum BondPrimaryReselect {
+ NETDEV_BOND_PRIMARY_RESELECT_ALWAYS,
+ NETDEV_BOND_PRIMARY_RESELECT_BETTER,
+ NETDEV_BOND_PRIMARY_RESELECT_FAILURE,
+ _NETDEV_BOND_PRIMARY_RESELECT_MAX,
+ _NETDEV_BOND_PRIMARY_RESELECT_INVALID = -EINVAL,
+} BondPrimaryReselect;
+
+const char *bond_mode_to_string(BondMode d) _const_;
+BondMode bond_mode_from_string(const char *d) _pure_;
+
+const char *bond_xmit_hash_policy_to_string(BondXmitHashPolicy d) _const_;
+BondXmitHashPolicy bond_xmit_hash_policy_from_string(const char *d) _pure_;
+
+const char *bond_lacp_rate_to_string(BondLacpRate d) _const_;
+BondLacpRate bond_lacp_rate_from_string(const char *d) _pure_;
+
+const char *bond_fail_over_mac_to_string(BondFailOverMac d) _const_;
+BondFailOverMac bond_fail_over_mac_from_string(const char *d) _pure_;
+
+const char *bond_ad_select_to_string(BondAdSelect d) _const_;
+BondAdSelect bond_ad_select_from_string(const char *d) _pure_;
+
+const char *bond_arp_validate_to_string(BondArpValidate d) _const_;
+BondArpValidate bond_arp_validate_from_string(const char *d) _pure_;
+
+const char *bond_arp_all_targets_to_string(BondArpAllTargets d) _const_;
+BondArpAllTargets bond_arp_all_targets_from_string(const char *d) _pure_;
+
+const char *bond_primary_reselect_to_string(BondPrimaryReselect d) _const_;
+BondPrimaryReselect bond_primary_reselect_from_string(const char *d) _pure_;
diff --git a/src/shared/boot-entry.c b/src/shared/boot-entry.c
new file mode 100644
index 0000000..e726073
--- /dev/null
+++ b/src/shared/boot-entry.c
@@ -0,0 +1,273 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "boot-entry.h"
+#include "chase.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "id128-util.h"
+#include "os-util.h"
+#include "path-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "utf8.h"
+
+/* A string is accepted as entry token only if it passes all three checks: utf8_is_valid(),
+ * string_is_safe() and filename_is_valid(). */
+bool boot_entry_token_valid(const char *p) {
+        if (!utf8_is_valid(p))
+                return false;
+
+        if (!string_is_safe(p))
+                return false;
+
+        return filename_is_valid(p);
+}
+
+/* Tries to read the entry token from the file "entry-token" below 'etc_kernel', with the path resolved
+ * relative to the directory fd 'rfd' (CHASE_AT_RESOLVE_IN_ROOT).
+ *
+ * Returns 1 if a valid token was read; in that case *token receives the newly allocated string and *type
+ * is set to BOOT_ENTRY_TOKEN_LITERAL. Returns 0, leaving both untouched, if 'etc_kernel' is NULL, the
+ * file does not exist, the line read is empty, or the token is not valid. Returns a negative errno-style
+ * error (after logging) on any other failure. */
+static int entry_token_load(int rfd, const char *etc_kernel, BootEntryTokenType *type, char **token) {
+ _cleanup_free_ char *buf = NULL, *p = NULL;
+ _cleanup_fclose_ FILE *f = NULL;
+ int r;
+
+ assert(rfd >= 0 || rfd == AT_FDCWD);
+ assert(type);
+ assert(*type == BOOT_ENTRY_TOKEN_AUTO);
+ assert(token);
+
+ if (!etc_kernel)
+ return 0;
+
+ p = path_join(etc_kernel, "entry-token");
+ if (!p)
+ return log_oom();
+
+ r = chase_and_fopenat_unlocked(rfd, p, CHASE_AT_RESOLVE_IN_ROOT, "re", NULL, &f);
+ if (r == -ENOENT)
+ return 0;
+ if (r < 0)
+ return log_error_errno(r, "Failed to chase and open '%s': %m", p);
+
+ /* Only a single line is read, limited to NAME_MAX */
+ r = read_line(f, NAME_MAX, &buf);
+ if (r < 0)
+ return log_error_errno(r, "Failed to read %s: %m", p);
+
+ if (isempty(buf))
+ return 0;
+
+ /* An invalid token in the file is not fatal, the caller simply continues with its other sources */
+ if (!boot_entry_token_valid(buf)) {
+ log_debug("Invalid entry token specified in %s, ignoring.", p);
+ return 0;
+ }
+
+ *token = TAKE_PTR(buf);
+ *type = BOOT_ENTRY_TOKEN_LITERAL;
+ return 1;
+}
+
+/* Uses the string form of 'machine_id' as entry token. Returns 1 with *token (newly allocated) and *type
+ * (BOOT_ENTRY_TOKEN_MACHINE_ID) set, 0 without touching either if the ID is null, or the result of
+ * log_oom() if the string cannot be duplicated. */
+static int entry_token_from_machine_id(sd_id128_t machine_id, BootEntryTokenType *type, char **token) {
+        assert(type);
+        assert(IN_SET(*type, BOOT_ENTRY_TOKEN_AUTO, BOOT_ENTRY_TOKEN_MACHINE_ID));
+        assert(token);
+
+        if (sd_id128_is_null(machine_id))
+                return 0;
+
+        char *formatted = strdup(SD_ID128_TO_STRING(machine_id));
+        if (!formatted)
+                return log_oom();
+
+        *type = BOOT_ENTRY_TOKEN_MACHINE_ID;
+        *token = formatted;
+        return 1;
+}
+
+/* Derives the entry token from fields read via parse_os_release_at() relative to 'rfd'.
+ *
+ * Which fields are requested depends on *type: BOOT_ENTRY_TOKEN_AUTO asks for both IMAGE_ID and ID,
+ * BOOT_ENTRY_TOKEN_OS_IMAGE_ID only for IMAGE_ID, BOOT_ENTRY_TOKEN_OS_ID only for ID. If both are
+ * available and valid, IMAGE_ID wins.
+ *
+ * Returns 1 with *token (newly allocated) and *type set to the kind of field that was used. Returns 0 if
+ * parse_os_release_at() reports -ENOENT or if no non-empty, valid field was found. Returns a negative
+ * errno-style error (after logging) otherwise. */
+static int entry_token_from_os_release(int rfd, BootEntryTokenType *type, char **token) {
+ _cleanup_free_ char *id = NULL, *image_id = NULL;
+ int r;
+
+ assert(rfd >= 0 || rfd == AT_FDCWD);
+ assert(type);
+ assert(IN_SET(*type, BOOT_ENTRY_TOKEN_AUTO, BOOT_ENTRY_TOKEN_OS_IMAGE_ID, BOOT_ENTRY_TOKEN_OS_ID));
+ assert(token);
+
+ switch (*type) {
+ case BOOT_ENTRY_TOKEN_AUTO:
+ r = parse_os_release_at(rfd,
+ "IMAGE_ID", &image_id,
+ "ID", &id);
+ break;
+
+ case BOOT_ENTRY_TOKEN_OS_IMAGE_ID:
+ r = parse_os_release_at(rfd, "IMAGE_ID", &image_id);
+ break;
+
+ case BOOT_ENTRY_TOKEN_OS_ID:
+ r = parse_os_release_at(rfd, "ID", &id);
+ break;
+
+ default:
+ assert_not_reached();
+ }
+ if (r == -ENOENT)
+ return 0;
+ if (r < 0)
+ return log_error_errno(r, "Failed to load /etc/os-release: %m");
+
+ /* A field that was not requested above is still NULL here and hence skipped by isempty() */
+ if (!isempty(image_id) && boot_entry_token_valid(image_id)) {
+ *token = TAKE_PTR(image_id);
+ *type = BOOT_ENTRY_TOKEN_OS_IMAGE_ID;
+ return 1;
+ }
+
+ if (!isempty(id) && boot_entry_token_valid(id)) {
+ *token = TAKE_PTR(id);
+ *type = BOOT_ENTRY_TOKEN_OS_ID;
+ return 1;
+ }
+
+ return 0;
+}
+
+/* Makes sure *token is set, determining it according to *type if it is still NULL.
+ *
+ * 'rfd' is the directory fd (or AT_FDCWD) the helper functions resolve paths against, 'etc_kernel' the
+ * directory in which the "entry-token" file is looked up (may be NULL to skip that source).
+ *
+ * Returns 0 if *token was already set on entry, > 0 if a token was determined (with *type updated to the
+ * source that provided it), and a negative errno-style error otherwise; -EINVAL if the requested source
+ * provides no token. */
+int boot_entry_token_ensure_at(
+ int rfd,
+ const char *etc_kernel,
+ sd_id128_t machine_id,
+ bool machine_id_is_random,
+ BootEntryTokenType *type,
+ char **token) {
+
+ int r;
+
+ assert(rfd >= 0 || rfd == AT_FDCWD);
+ assert(type);
+ assert(token);
+
+ if (*token)
+ return 0; /* Already set. */
+
+ switch (*type) {
+
+ case BOOT_ENTRY_TOKEN_AUTO:
+ /* Order of preference: the entry-token file, then the machine ID unless it is a random one,
+ * then os-release, and only as last resort a random machine ID. Each helper returns 0 if it
+ * has nothing to offer, in which case the next one is tried. */
+ r = entry_token_load(rfd, etc_kernel, type, token);
+ if (r != 0)
+ return r;
+
+ if (!machine_id_is_random) {
+ r = entry_token_from_machine_id(machine_id, type, token);
+ if (r != 0)
+ return r;
+ }
+
+ r = entry_token_from_os_release(rfd, type, token);
+ if (r != 0)
+ return r;
+
+ if (machine_id_is_random) {
+ r = entry_token_from_machine_id(machine_id, type, token);
+ if (r != 0)
+ return r;
+ }
+
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+ "No machine ID set, and /etc/os-release carries no ID=/IMAGE_ID= fields.");
+
+ case BOOT_ENTRY_TOKEN_MACHINE_ID:
+ r = entry_token_from_machine_id(machine_id, type, token);
+ if (r != 0)
+ return r;
+
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "No machine ID set.");
+
+ case BOOT_ENTRY_TOKEN_OS_IMAGE_ID:
+ r = entry_token_from_os_release(rfd, type, token);
+ if (r != 0)
+ return r;
+
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+ "IMAGE_ID= field not set in /etc/os-release.");
+
+ case BOOT_ENTRY_TOKEN_OS_ID:
+ r = entry_token_from_os_release(rfd, type, token);
+ if (r != 0)
+ return r;
+
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+ "ID= field not set in /etc/os-release.");
+
+ case BOOT_ENTRY_TOKEN_LITERAL:
+ /* In this case, the token should be already set by the user input. */
+ return -EINVAL;
+
+ default:
+ assert_not_reached();
+ }
+}
+
+/* Path-based front-end for boot_entry_token_ensure_at(): opens 'root' (after empty_to_root()) as an
+ * O_PATH directory fd and forwards all other arguments unchanged. Returns 0 right away if *token is
+ * already set, and -errno if the directory cannot be opened. */
+int boot_entry_token_ensure(
+                const char *root,
+                const char *etc_kernel,
+                sd_id128_t machine_id,
+                bool machine_id_is_random,
+                BootEntryTokenType *type,
+                char **token) {
+
+        assert(token);
+
+        /* Nothing to do if the caller already has a token, don't even open the root directory then */
+        if (*token)
+                return 0;
+
+        _cleanup_close_ int root_fd = open(empty_to_root(root), O_CLOEXEC | O_DIRECTORY | O_PATH);
+        if (root_fd < 0)
+                return -errno;
+
+        return boot_entry_token_ensure_at(
+                        root_fd,
+                        etc_kernel,
+                        machine_id,
+                        machine_id_is_random,
+                        type,
+                        token);
+}
+
+/* Parses the argument of an --entry-token= style command line option.
+ *
+ * Accepted values for 's' are "auto", "machine-id", "os-image-id", "os-id" and "literal:<token>". For the
+ * first four only *type is set and any previous *token is released. For "literal:" the part after the
+ * prefix is validated with boot_entry_token_valid() and copied into *token.
+ *
+ * Returns a negative errno-style error (after logging) if 's' is not recognized, the literal is invalid,
+ * or the copy fails; a non-negative value on success. */
+int parse_boot_entry_token_type(const char *s, BootEntryTokenType *type, char **token) {
+        assert(s);
+        assert(type);
+        assert(token);
+
+        /*
+         * This function is intended to be used in command line parsers, to handle tokens that are passed in.
+         *
+         * NOTE THAT THIS WILL FREE THE PREVIOUS ARGUMENT POINTER ON SUCCESS!
+         * Hence, do not pass in uninitialized pointers.
+         */
+
+        /* "auto" is a regular member of BootEntryTokenType and is what boot_entry_token_type_to_string()
+         * returns for BOOT_ENTRY_TOKEN_AUTO, hence accept it here too so that the name round-trips. */
+        if (streq(s, "auto")) {
+                *type = BOOT_ENTRY_TOKEN_AUTO;
+                *token = mfree(*token);
+                return 0;
+        }
+
+        if (streq(s, "machine-id")) {
+                *type = BOOT_ENTRY_TOKEN_MACHINE_ID;
+                *token = mfree(*token);
+                return 0;
+        }
+
+        if (streq(s, "os-image-id")) {
+                *type = BOOT_ENTRY_TOKEN_OS_IMAGE_ID;
+                *token = mfree(*token);
+                return 0;
+        }
+
+        if (streq(s, "os-id")) {
+                *type = BOOT_ENTRY_TOKEN_OS_ID;
+                *token = mfree(*token);
+                return 0;
+        }
+
+        const char *e = startswith(s, "literal:");
+        if (e) {
+                if (!boot_entry_token_valid(e))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "Invalid entry token literal is specified for --entry-token=.");
+
+                *type = BOOT_ENTRY_TOKEN_LITERAL;
+                return free_and_strdup_warn(token, e);
+        }
+
+        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                               "Unexpected parameter for --entry-token=: %s", s);
+}
+
+static const char *const boot_entry_token_type_table[] = {
+ [BOOT_ENTRY_TOKEN_MACHINE_ID] = "machine-id",
+ [BOOT_ENTRY_TOKEN_OS_IMAGE_ID] = "os-image-id",
+ [BOOT_ENTRY_TOKEN_OS_ID] = "os-id",
+ [BOOT_ENTRY_TOKEN_LITERAL] = "literal",
+ [BOOT_ENTRY_TOKEN_AUTO] = "auto",
+};
+
+DEFINE_STRING_TABLE_LOOKUP_TO_STRING(boot_entry_token_type, BootEntryTokenType);
diff --git a/src/shared/boot-entry.h b/src/shared/boot-entry.h
new file mode 100644
index 0000000..f3a6f28
--- /dev/null
+++ b/src/shared/boot-entry.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "sd-id128.h"
+
+typedef enum BootEntryTokenType {
+ BOOT_ENTRY_TOKEN_MACHINE_ID,
+ BOOT_ENTRY_TOKEN_OS_IMAGE_ID,
+ BOOT_ENTRY_TOKEN_OS_ID,
+ BOOT_ENTRY_TOKEN_LITERAL,
+ BOOT_ENTRY_TOKEN_AUTO,
+} BootEntryTokenType;
+
+bool boot_entry_token_valid(const char *p);
+
+int boot_entry_token_ensure(
+ const char *root,
+ const char *etc_kernel, /* will be prefixed with root, typically /etc/kernel. */
+ sd_id128_t machine_id,
+ bool machine_id_is_random,
+ BootEntryTokenType *type, /* input and output */
+ char **token); /* output, but do not pass uninitialized value. */
+int boot_entry_token_ensure_at(
+ int rfd,
+ const char *etc_kernel,
+ sd_id128_t machine_id,
+ bool machine_id_is_random,
+ BootEntryTokenType *type,
+ char **token);
+
+int parse_boot_entry_token_type(const char *s, BootEntryTokenType *type, char **token);
+
+const char* boot_entry_token_type_to_string(BootEntryTokenType t);
diff --git a/src/shared/boot-timestamps.c b/src/shared/boot-timestamps.c
new file mode 100644
index 0000000..e49bd8f
--- /dev/null
+++ b/src/shared/boot-timestamps.c
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "acpi-fpdt.h"
+#include "boot-timestamps.h"
+#include "efi-loader.h"
+#include "macro.h"
+#include "time-util.h"
+
+/* Fills in 'firmware' and 'loader' from the two usec values reported by acpi_get_boot_usec() or, if that
+ * fails, by efi_loader_get_boot_usec().
+ *
+ * 'n' is the reference timestamp to calculate the realtime values from; if NULL the current time is
+ * used. Returns 0 on success, or the negative error of efi_loader_get_boot_usec() if neither source is
+ * available. */
+int boot_timestamps(const dual_timestamp *n, dual_timestamp *firmware, dual_timestamp *loader) {
+ usec_t x = 0, y = 0, a;
+ int r;
+ dual_timestamp _n;
+
+ assert(firmware);
+ assert(loader);
+
+ if (!n) {
+ dual_timestamp_now(&_n);
+ n = &_n;
+ }
+
+ /* ACPI is the preferred source, the EFI loader information is only the fallback */
+ r = acpi_get_boot_usec(&x, &y);
+ if (r < 0) {
+ r = efi_loader_get_boot_usec(&x, &y);
+ if (r < 0)
+ return r;
+ }
+
+ /* Let's convert this to timestamps where the firmware
+ * began/loader began working. To make this more confusing:
+ * since usec_t is unsigned and the kernel's monotonic clock
+ * begins at kernel initialization we'll actually initialize
+ * the monotonic timestamps here as negative of the actual
+ * value. */
+
+ firmware->monotonic = y;
+ loader->monotonic = y - x;
+
+ /* The realtime values are clamped to 0 instead of being allowed to wrap around */
+ a = n->monotonic + firmware->monotonic;
+ firmware->realtime = n->realtime > a ? n->realtime - a : 0;
+
+ a = n->monotonic + loader->monotonic;
+ loader->realtime = n->realtime > a ? n->realtime - a : 0;
+
+ return 0;
+}
diff --git a/src/shared/boot-timestamps.h b/src/shared/boot-timestamps.h
new file mode 100644
index 0000000..55b7ad1
--- /dev/null
+++ b/src/shared/boot-timestamps.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <time-util.h>
+
+int boot_timestamps(const dual_timestamp *n, dual_timestamp *firmware, dual_timestamp *loader);
diff --git a/src/shared/bootspec.c b/src/shared/bootspec.c
new file mode 100644
index 0000000..f4b2fdc
--- /dev/null
+++ b/src/shared/bootspec.c
@@ -0,0 +1,1434 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <unistd.h>
+
+#include "bootspec-fundamental.h"
+#include "bootspec.h"
+#include "chase.h"
+#include "conf-files.h"
+#include "devnum-util.h"
+#include "dirent-util.h"
+#include "efi-loader.h"
+#include "env-file.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "find-esp.h"
+#include "path-util.h"
+#include "pe-binary.h"
+#include "pretty-print.h"
+#include "recurse-dir.h"
+#include "sort-util.h"
+#include "stat-util.h"
+#include "string-table.h"
+#include "strv.h"
+#include "terminal-util.h"
+#include "unaligned.h"
+
+static const char* const boot_entry_type_table[_BOOT_ENTRY_TYPE_MAX] = {
+ [BOOT_ENTRY_CONF] = "Boot Loader Specification Type #1 (.conf)",
+ [BOOT_ENTRY_UNIFIED] = "Boot Loader Specification Type #2 (.efi)",
+ [BOOT_ENTRY_LOADER] = "Reported by Boot Loader",
+ [BOOT_ENTRY_LOADER_AUTO] = "Automatic",
+};
+
+DEFINE_STRING_TABLE_LOOKUP_TO_STRING(boot_entry_type, BootEntryType);
+
+static const char* const boot_entry_type_json_table[_BOOT_ENTRY_TYPE_MAX] = {
+ [BOOT_ENTRY_CONF] = "type1",
+ [BOOT_ENTRY_UNIFIED] = "type2",
+ [BOOT_ENTRY_LOADER] = "loader",
+ [BOOT_ENTRY_LOADER_AUTO] = "auto",
+};
+
+DEFINE_STRING_TABLE_LOOKUP_TO_STRING(boot_entry_type_json, BootEntryType);
+
+/* Releases everything a BootEntry owns. The structure itself is not freed, since entries are embedded
+ * in arrays or live on the stack. */
+static void boot_entry_free(BootEntry *entry) {
+        assert(entry);
+
+        /* String lists */
+        strv_free(entry->options);
+        strv_free(entry->initrd);
+        strv_free(entry->device_tree_overlay);
+
+        /* Identification and location of the entry */
+        free(entry->id);
+        free(entry->id_old);
+        free(entry->path);
+        free(entry->root);
+
+        /* Descriptive fields */
+        free(entry->title);
+        free(entry->show_title);
+        free(entry->sort_key);
+        free(entry->version);
+        free(entry->machine_id);
+        free(entry->architecture);
+
+        /* Referenced files */
+        free(entry->kernel);
+        free(entry->efi);
+        free(entry->device_tree);
+}
+
+/* Normalizes the path 'p' that was found in field 'field' of a boot entry file.
+ *
+ * 'fname' and 'line' are only used to locate the warning that is logged when the path is rejected.
+ *
+ * Returns 1 and the newly allocated, normalized absolute path in *ret on success. Returns 0 and sets
+ * *ret to NULL if the path is refused (trailing slash, or not normalized after simplification).
+ * Returns -ENOMEM, leaving *ret untouched, if memory allocation fails. */
+static int mangle_path(
+ const char *fname,
+ unsigned line,
+ const char *field,
+ const char *p,
+ char **ret) {
+
+ _cleanup_free_ char *c = NULL;
+
+ assert(field);
+ assert(p);
+ assert(ret);
+
+ /* Spec leaves open if prefixed with "/" or not, let's normalize that */
+ if (path_is_absolute(p))
+ c = strdup(p);
+ else
+ c = strjoin("/", p);
+ if (!c)
+ return -ENOMEM;
+
+ /* We only reference files, never directories */
+ if (endswith(c, "/")) {
+ log_syntax(NULL, LOG_WARNING, fname, line, 0, "Path in field '%s' has trailing slash, ignoring: %s", field, c);
+ *ret = NULL;
+ return 0;
+ }
+
+ /* Remove duplicate "/" */
+ path_simplify(c);
+
+ /* No ".." or "." or so */
+ if (!path_is_normalized(c)) {
+ log_syntax(NULL, LOG_WARNING, fname, line, 0, "Path in field '%s' is not normalized, ignoring: %s", field, c);
+ *ret = NULL;
+ return 0;
+ }
+
+ *ret = TAKE_PTR(c);
+ return 1;
+}
+
+/* Handles a field that carries exactly one path: runs 'p' through mangle_path() and, if it is accepted,
+ * replaces *s with the result. A refused path (0) or an error (< 0) from mangle_path() is returned as is
+ * and leaves *s alone. */
+static int parse_path_one(
+                const char *fname,
+                unsigned line,
+                const char *field,
+                char **s,
+                const char *p) {
+
+        _cleanup_free_ char *mangled = NULL;
+        int r;
+
+        assert(field);
+        assert(s);
+        assert(p);
+
+        r = mangle_path(fname, line, field, p, &mangled);
+        if (r > 0)
+                return free_and_replace(*s, mangled);
+
+        return r;
+}
+
+/* Handles a field that may be specified multiple times, one path each: runs 'p' through mangle_path()
+ * and, if it is accepted, appends the result to the string list *s. A refused path (0) or an error (< 0)
+ * from mangle_path() is returned as is. */
+static int parse_path_strv(
+                const char *fname,
+                unsigned line,
+                const char *field,
+                char ***s,
+                const char *p) {
+
+        char *mangled;
+        int r;
+
+        assert(field);
+        assert(s);
+        assert(p);
+
+        r = mangle_path(fname, line, field, p, &mangled);
+        if (r > 0)
+                return strv_consume(s, mangled); /* hands the string over to the list */
+
+        return r;
+}
+
+/* Handles a field that carries several paths in a single value: splits 'p' with strv_split(p, NULL),
+ * runs every item through mangle_path() and appends the accepted ones to the string list *s. Items
+ * refused by mangle_path() are skipped.
+ *
+ * Returns the result of strv_extend_strv() on success, a negative errno-style error otherwise. */
+static int parse_path_many(
+ const char *fname,
+ unsigned line,
+ const char *field,
+ char ***s,
+ const char *p) {
+
+ _cleanup_strv_free_ char **l = NULL, **f = NULL;
+ int r;
+
+ /* NOTE(review): NULL separators presumably select the default whitespace set — confirm against strv_split() */
+ l = strv_split(p, NULL);
+ if (!l)
+ return -ENOMEM;
+
+ STRV_FOREACH(i, l) {
+ char *c;
+
+ r = mangle_path(fname, line, field, *i, &c);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ continue;
+
+ r = strv_consume(&f, c);
+ if (r < 0)
+ return r;
+ }
+
+ /* Collect into the temporary list first, so that *s is only touched once all items were processed */
+ return strv_extend_strv(s, f, /* filter_duplicates= */ false);
+}
+
+/* Parses the run of decimal digits at the beginning of *p as a tries counter.
+ *
+ * Returns 1, stores the value in *ret and advances *p past the digits on success. Returns 0 with *ret
+ * set to UINT_MAX and *p untouched if *p does not begin with a digit. Returns a negative errno-style
+ * error (after logging, mentioning 'fname') if the digits cannot be parsed or exceed INT_MAX. */
+static int parse_tries(const char *fname, const char **p, unsigned *ret) {
+ _cleanup_free_ char *d = NULL;
+ unsigned tries;
+ size_t n;
+ int r;
+
+ assert(fname);
+ assert(p);
+ assert(*p);
+ assert(ret);
+
+ n = strspn(*p, DIGITS);
+ if (n == 0) {
+ *ret = UINT_MAX;
+ return 0;
+ }
+
+ /* Copy just the digits, so that the parser sees a NUL terminated number */
+ d = strndup(*p, n);
+ if (!d)
+ return log_oom();
+
+ r = safe_atou_full(d, 10, &tries);
+ if (r >= 0 && tries > INT_MAX) /* sd-boot allows INT_MAX, let's use the same limit */
+ r = -ERANGE;
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse tries counter of filename '%s': %m", fname);
+
+ *p = *p + n;
+ *ret = tries;
+ return 1;
+}
+
+/* Splits a boot counting suffix off a file name of the form "<name>+<left>[-<done>]<suffix>", where
+ * <suffix> starts at the last dot of the name.
+ *
+ * For example "foo+3-1.conf" yields the stripped name "foo.conf", 3 tries left and 1 try done. If the
+ * name carries no well-formed counter, *ret_stripped receives a plain copy of 'fname' and both counters
+ * are set to UINT_MAX. If only <left> is present, *ret_tries_done is UINT_MAX.
+ *
+ * Returns 0 on success (with or without counter), a negative errno-style error on failure. */
+int boot_filename_extract_tries(
+ const char *fname,
+ char **ret_stripped,
+ unsigned *ret_tries_left,
+ unsigned *ret_tries_done) {
+
+ unsigned tries_left = UINT_MAX, tries_done = UINT_MAX;
+ _cleanup_free_ char *stripped = NULL;
+ const char *p, *suffix, *m;
+ int r;
+
+ assert(fname);
+ assert(ret_stripped);
+ assert(ret_tries_left);
+ assert(ret_tries_done);
+
+ /* Be liberal with suffix, only insist on a dot. After all we want to cover any capitalization here
+ * (vfat is case insensitive after all), and at least .efi and .conf as suffix. */
+ suffix = strrchr(fname, '.');
+ if (!suffix)
+ goto nothing;
+
+ /* Look for the last '+' in front of the suffix; 'm' keeps pointing at it, 'p' walks on */
+ p = m = memrchr(fname, '+', suffix - fname);
+ if (!p)
+ goto nothing;
+ p++;
+
+ r = parse_tries(fname, &p, &tries_left);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ goto nothing;
+
+ if (*p == '-') {
+ p++;
+
+ r = parse_tries(fname, &p, &tries_done);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ goto nothing;
+ }
+
+ /* The counter must extend right up to the suffix, otherwise this is not a counter at all */
+ if (p != suffix)
+ goto nothing;
+
+ /* Stitch the part before the '+' and the suffix together */
+ stripped = strndup(fname, m - fname);
+ if (!stripped)
+ return log_oom();
+
+ if (!strextend(&stripped, suffix))
+ return log_oom();
+
+ *ret_stripped = TAKE_PTR(stripped);
+ *ret_tries_left = tries_left;
+ *ret_tries_done = tries_done;
+
+ return 0;
+
+nothing:
+ stripped = strdup(fname);
+ if (!stripped)
+ return log_oom();
+
+ *ret_stripped = TAKE_PTR(stripped);
+ *ret_tries_left = *ret_tries_done = UINT_MAX;
+ return 0;
+}
+
+/* Loads a Type #1 boot menu entry from the specified FILE* object.
+ *
+ * 'fname' is the name of the entry file inside directory 'dir'; it must end in ".conf" (any
+ * capitalization) and may carry a boot counting suffix, which is split off into tries_left/tries_done.
+ * 'root' is copied into the entry verbatim.
+ *
+ * On success 0 is returned and *entry is overwritten with the parsed entry, which the caller then owns.
+ * On failure a negative errno-style error is returned and *entry is left untouched. Lines that cannot be
+ * parsed, fields without value and unknown fields are only warned about and skipped. */
+static int boot_entry_load_type1(
+                FILE *f,
+                const char *root,
+                const char *dir,
+                const char *fname,
+                BootEntry *entry) {
+
+        _cleanup_(boot_entry_free) BootEntry tmp = BOOT_ENTRY_INIT(BOOT_ENTRY_CONF);
+        char *c;
+        int r;
+
+        assert(f);
+        assert(root);
+        assert(dir);
+        assert(fname);
+        assert(entry);
+
+        r = boot_filename_extract_tries(fname, &tmp.id, &tmp.tries_left, &tmp.tries_done);
+        if (r < 0)
+                return r;
+
+        if (!efi_loader_entry_name_valid(tmp.id))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid loader entry name: %s", fname);
+
+        c = endswith_no_case(tmp.id, ".conf");
+        if (!c)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid loader entry file suffix: %s", fname);
+
+        tmp.id_old = strndup(tmp.id, c - tmp.id); /* Without .conf suffix */
+        if (!tmp.id_old)
+                return log_oom();
+
+        tmp.path = path_join(dir, fname);
+        if (!tmp.path)
+                return log_oom();
+
+        tmp.root = strdup(root);
+        if (!tmp.root)
+                return log_oom();
+
+        /* 'line' is the 1-based number of the line read in the current iteration. It is advanced in the
+         * loop header, i.e. only after the line has been fully processed (also on 'continue'), so that
+         * every message below names the line it is actually about. */
+        for (unsigned line = 1;; line++) {
+                _cleanup_free_ char *buf = NULL, *field = NULL;
+
+                r = read_stripped_line(f, LONG_LINE_MAX, &buf);
+                if (r == 0)
+                        break;
+                if (r == -ENOBUFS)
+                        return log_syntax(NULL, LOG_ERR, tmp.path, line, r, "Line too long.");
+                if (r < 0)
+                        return log_syntax(NULL, LOG_ERR, tmp.path, line, r, "Error while reading: %m");
+
+                /* Skip comments and empty lines */
+                if (IN_SET(buf[0], '#', '\0'))
+                        continue;
+
+                /* The first word is the field name, whatever follows is its value */
+                const char *p = buf;
+                r = extract_first_word(&p, &field, NULL, 0);
+                if (r < 0) {
+                        log_syntax(NULL, LOG_WARNING, tmp.path, line, r, "Failed to parse, ignoring line: %m");
+                        continue;
+                }
+                if (r == 0) {
+                        log_syntax(NULL, LOG_WARNING, tmp.path, line, 0, "Bad syntax, ignoring line.");
+                        continue;
+                }
+
+                if (isempty(p)) {
+                        /* Some fields can reasonably have an empty value. In other cases warn. */
+                        if (!STR_IN_SET(field, "options", "devicetree-overlay"))
+                                log_syntax(NULL, LOG_WARNING, tmp.path, line, 0, "Field '%s' without value, ignoring line.", field);
+
+                        continue;
+                }
+
+                if (streq(field, "title"))
+                        r = free_and_strdup(&tmp.title, p);
+                else if (streq(field, "sort-key"))
+                        r = free_and_strdup(&tmp.sort_key, p);
+                else if (streq(field, "version"))
+                        r = free_and_strdup(&tmp.version, p);
+                else if (streq(field, "machine-id"))
+                        r = free_and_strdup(&tmp.machine_id, p);
+                else if (streq(field, "architecture"))
+                        r = free_and_strdup(&tmp.architecture, p);
+                else if (streq(field, "options"))
+                        r = strv_extend(&tmp.options, p);
+                else if (streq(field, "linux"))
+                        r = parse_path_one(tmp.path, line, field, &tmp.kernel, p);
+                else if (streq(field, "efi"))
+                        r = parse_path_one(tmp.path, line, field, &tmp.efi, p);
+                else if (streq(field, "initrd"))
+                        r = parse_path_strv(tmp.path, line, field, &tmp.initrd, p);
+                else if (streq(field, "devicetree"))
+                        r = parse_path_one(tmp.path, line, field, &tmp.device_tree, p);
+                else if (streq(field, "devicetree-overlay"))
+                        r = parse_path_many(tmp.path, line, field, &tmp.device_tree_overlay, p);
+                else {
+                        log_syntax(NULL, LOG_WARNING, tmp.path, line, 0, "Unknown line '%s', ignoring.", field);
+                        continue;
+                }
+                if (r < 0)
+                        return log_syntax(NULL, LOG_ERR, tmp.path, line, r, "Error while parsing: %m");
+        }
+
+        /* Everything parsed fine: move the entry to the caller, the cleanup handler now sees an empty one */
+        *entry = TAKE_STRUCT(tmp);
+        return 0;
+}
+
+/* Parses a single Boot Loader Specification Type #1 entry from the already opened stream 'f' and, if
+ * that works, appends it to config->entries. 'root', 'dir' and 'fname' are handed through unchanged to
+ * boot_entry_load_type1(). Returns 0 on success and a negative value on failure, in which case
+ * config->n_entries is left untouched. */
+int boot_config_load_type1(
+                BootConfig *config,
+                FILE *f,
+                const char *root,
+                const char *dir,
+                const char *fname) {
+
+        BootEntry *slot;
+        int r;
+
+        assert(config);
+        assert(f);
+        assert(root);
+        assert(dir);
+        assert(fname);
+
+        /* Grow the array first, so that the entry can be parsed straight into its final slot. */
+        if (!GREEDY_REALLOC0(config->entries, config->n_entries + 1))
+                return log_oom();
+
+        slot = config->entries + config->n_entries;
+
+        r = boot_entry_load_type1(f, root, dir, fname, slot);
+        if (r < 0)
+                return r;
+
+        /* Only a successfully parsed entry is counted. */
+        config->n_entries++;
+        return 0;
+}
+
+/* Releases everything a BootConfig owns: the loader.conf strings, the three entry "pointer" strings,
+ * every element of the entries array plus the array itself, and the set of inodes seen so far. The
+ * BootConfig structure itself is not freed. */
+void boot_config_free(BootConfig *config) {
+        assert(config);
+
+        /* All plain heap strings of the structure, none of which depend on each other. */
+        char *const owned_strings[] = {
+                config->default_pattern,
+                config->timeout,
+                config->editor,
+                config->auto_entries,
+                config->auto_firmware,
+                config->console_mode,
+                config->beep,
+                config->entry_oneshot,
+                config->entry_default,
+                config->entry_selected,
+        };
+
+        for (size_t k = 0; k < sizeof(owned_strings) / sizeof(owned_strings[0]); k++)
+                free(owned_strings[k]);
+
+        for (size_t idx = 0; idx != config->n_entries; idx++)
+                boot_entry_free(&config->entries[idx]);
+        free(config->entries);
+
+        set_free(config->inodes_seen);
+}
+
+/* Parses loader.conf style configuration from the stream 'file' into 'config'. 'path' is only used to
+ * prefix log messages.
+ *
+ * Every line that is neither empty nor a '#' comment is split into a field name and the remaining
+ * value. Known fields replace the corresponding string in 'config'. Unknown fields, fields without a
+ * value and unparsable lines are logged as warnings and skipped.
+ *
+ * Returns 1 once the end of the stream is reached. If reading a line or storing a value fails, the
+ * value returned by log_syntax() for that failure is returned instead. */
+int boot_loader_read_conf(BootConfig *config, FILE *file, const char *path) {
+        unsigned line = 0;
+        int r;
+
+        assert(config);
+        assert(file);
+        assert(path);
+
+        for (;;) {
+                _cleanup_free_ char *buf = NULL, *field = NULL;
+
+                /* 'line' is the 1-based number of the line we are about to read. Bumping it before
+                 * the read (instead of after it) makes read errors and the syntax warnings further
+                 * down refer to the same, correct line. */
+                line++;
+
+                r = read_stripped_line(file, LONG_LINE_MAX, &buf);
+                if (r == 0)
+                        break;
+                if (r == -ENOBUFS)
+                        return log_syntax(NULL, LOG_ERR, path, line, r, "Line too long.");
+                if (r < 0)
+                        return log_syntax(NULL, LOG_ERR, path, line, r, "Error while reading: %m");
+
+                if (IN_SET(buf[0], '#', '\0'))
+                        continue;
+
+                const char *p = buf;
+                r = extract_first_word(&p, &field, NULL, 0);
+                if (r < 0) {
+                        log_syntax(NULL, LOG_WARNING, path, line, r, "Failed to parse, ignoring line: %m");
+                        continue;
+                }
+                if (r == 0) {
+                        log_syntax(NULL, LOG_WARNING, path, line, 0, "Bad syntax, ignoring line.");
+                        continue;
+                }
+                if (isempty(p)) {
+                        log_syntax(NULL, LOG_WARNING, path, line, 0, "Field '%s' without value, ignoring line.", field);
+                        continue;
+                }
+
+                if (streq(field, "default"))
+                        r = free_and_strdup(&config->default_pattern, p);
+                else if (streq(field, "timeout"))
+                        r = free_and_strdup(&config->timeout, p);
+                else if (streq(field, "editor"))
+                        r = free_and_strdup(&config->editor, p);
+                else if (streq(field, "auto-entries"))
+                        r = free_and_strdup(&config->auto_entries, p);
+                else if (streq(field, "auto-firmware"))
+                        r = free_and_strdup(&config->auto_firmware, p);
+                else if (streq(field, "console-mode"))
+                        r = free_and_strdup(&config->console_mode, p);
+                else if (streq(field, "random-seed-mode")) {
+                        /* Nothing is stored for this field, hence there is no result to check. */
+                        log_syntax(NULL, LOG_WARNING, path, line, 0, "'random-seed-mode' has been deprecated, ignoring.");
+                        continue;
+                } else if (streq(field, "beep"))
+                        r = free_and_strdup(&config->beep, p);
+                else {
+                        log_syntax(NULL, LOG_WARNING, path, line, 0, "Unknown line '%s', ignoring.", field);
+                        continue;
+                }
+                if (r < 0)
+                        return log_syntax(NULL, LOG_ERR, path, line, r, "Error while parsing: %m");
+        }
+
+        return 1;
+}
+
+/* Opens 'path' below 'root' (refusing symlinks on the way) and feeds it to boot_loader_read_conf().
+ * A missing file is not an error: 0 is returned in that case. Any other open failure is logged and
+ * returned; otherwise the result of boot_loader_read_conf() is passed on. */
+static int boot_loader_read_conf_path(BootConfig *config, const char *root, const char *path) {
+        _cleanup_free_ char *resolved = NULL;
+        _cleanup_fclose_ FILE *stream = NULL;
+        int r;
+
+        assert(config);
+        assert(path);
+
+        r = chase_and_fopen_unlocked(path, root, CHASE_PREFIX_ROOT|CHASE_PROHIBIT_SYMLINKS, "re", &resolved, &stream);
+        if (r >= 0)
+                return boot_loader_read_conf(config, stream, resolved);
+
+        if (r != -ENOENT)
+                return log_error_errno(r, "Failed to open '%s/%s': %m", root, path);
+
+        return 0;
+}
+
+/* Ordering function for boot entries, used for sorting the entries array.
+ *
+ * Entries that carry a sort key order before entries that do not. Two entries with a sort key are
+ * compared by sort key, then by machine ID, then by version in inverse strverscmp_improved() order.
+ * As the last resort (and as the only criterion if neither entry has a sort key) the IDs are compared,
+ * again in inverse strverscmp_improved() order. */
+static int boot_entry_compare(const BootEntry *a, const BootEntry *b) {
+        int r;
+
+        assert(a);
+        assert(b);
+
+        r = CMP(!a->sort_key, !b->sort_key);
+        if (r != 0)
+                return r;
+
+        /* From here on either both entries have a sort key or neither has one, hence checking one
+         * of them is sufficient. */
+        if (a->sort_key) {
+                r = strcmp(a->sort_key, b->sort_key);
+                if (r == 0)
+                        r = strcmp_ptr(a->machine_id, b->machine_id);
+                if (r == 0)
+                        r = -strverscmp_improved(a->version, b->version);
+                if (r != 0)
+                        return r;
+        }
+
+        return -strverscmp_improved(a->id, b->id);
+}
+
+/* Decides whether the file open at 'fd' should be processed. 'fname' is only used for logging.
+ *
+ * Returns true if the file is a regular file that was not encountered before; it is then remembered in
+ * config->inodes_seen. Returns false if it is not a regular file or was already seen. Returns a
+ * negative value if fstat() or a memory allocation fails. */
+static int config_check_inode_relevant_and_unseen(BootConfig *config, int fd, const char *fname) {
+        _cleanup_free_ char *stat_copy = NULL;
+        struct stat st;
+
+        assert(config);
+        assert(fd >= 0);
+        assert(fname);
+
+        /* Because of the mess around /efi/ vs. /boot/ vs. /boot/efi/ people might have these
+         * directories (or subdirectories of them) symlinked or bind mounted, and we might end up
+         * iterating through some directories multiple times. To be on the safe side we track the
+         * inodes we already processed and ignore the ones seen before. This should be robust against
+         * any form of symlinking or bind mounting, and effectively suppresses such duplicates. */
+
+        if (fstat(fd, &st) < 0)
+                return log_error_errno(errno, "Failed to stat('%s'): %m", fname);
+
+        bool is_regular = S_ISREG(st.st_mode);
+        if (!is_regular) {
+                log_debug("File '%s' is not a regular file, ignoring.", fname);
+                return false;
+        }
+
+        bool known = set_contains(config->inodes_seen, &st);
+        if (known) {
+                log_debug("Inode '%s' already seen before, ignoring.", fname);
+                return false;
+        }
+
+        /* The set keeps its own heap copy of the stat data; ownership moves to the set below. */
+        stat_copy = memdup(&st, sizeof(st));
+        if (!stat_copy)
+                return log_oom();
+
+        if (set_ensure_consume(&config->inodes_seen, &inode_hash_ops, TAKE_PTR(stat_copy)) < 0)
+                return log_oom();
+
+        return true;
+}
+
+/* Scans the directory 'dir' below 'root' for Type #1 entry files (names ending in ".conf", compared
+ * case-insensitively) and loads each of them into 'config' via boot_config_load_type1().
+ *
+ * Returns 0 on success, which includes the case that the directory does not exist. Returns a negative
+ * value if the directory cannot be opened or read, if the inode check fails, or if loading an entry
+ * runs out of memory. Files that cannot be opened or parsed are skipped. */
+static int boot_entries_find_type1(
+ BootConfig *config,
+ const char *root,
+ const char *dir) {
+
+ _cleanup_free_ DirectoryEntries *dentries = NULL;
+ _cleanup_free_ char *full = NULL;
+ _cleanup_close_ int dir_fd = -EBADF;
+ int r;
+
+ assert(config);
+ assert(root);
+ assert(dir);
+
+ /* 'full' receives the resolved path of the directory; it is used for log messages and as the
+ * directory part of each entry's path below. */
+ dir_fd = chase_and_open(dir, root, CHASE_PREFIX_ROOT|CHASE_PROHIBIT_SYMLINKS, O_DIRECTORY|O_CLOEXEC, &full);
+ if (dir_fd == -ENOENT)
+ return 0;
+ if (dir_fd < 0)
+ return log_error_errno(dir_fd, "Failed to open '%s/%s': %m", root, dir);
+
+ r = readdir_all(dir_fd, RECURSE_DIR_IGNORE_DOT, &dentries);
+ if (r < 0)
+ return log_error_errno(r, "Failed to read directory '%s': %m", full);
+
+ for (size_t i = 0; i < dentries->n_entries; i++) {
+ const struct dirent *de = dentries->entries[i];
+ _cleanup_fclose_ FILE *f = NULL;
+
+ if (!dirent_is_file(de))
+ continue;
+
+ if (!endswith_no_case(de->d_name, ".conf"))
+ continue;
+
+ /* A file we cannot open only costs us this one entry, not the whole scan. */
+ r = xfopenat(dir_fd, de->d_name, "re", O_NOFOLLOW|O_NOCTTY, &f);
+ if (r < 0) {
+ log_warning_errno(r, "Failed to open %s/%s, ignoring: %m", full, de->d_name);
+ continue;
+ }
+
+ r = config_check_inode_relevant_and_unseen(config, fileno(f), de->d_name);
+ if (r < 0)
+ return r;
+ if (r == 0) /* inode already seen or otherwise not relevant */
+ continue;
+
+ r = boot_config_load_type1(config, f, root, full, de->d_name);
+ if (r == -ENOMEM) /* ignore all other errors */
+ return r;
+ }
+
+ return 0;
+}
+
+/* Builds a BootEntry of type BOOT_ENTRY_UNIFIED for the unified kernel image at 'path'.
+ *
+ * 'path' must be located below 'root', otherwise -EINVAL is returned. 'osrelease' is the os-release
+ * text taken from the image; title, version and sort key are derived from it. 'cmdline' becomes the
+ * single element of the entry's options, with leading and trailing whitespace removed. 'cmdline' is
+ * not asserted; presumably it may be NULL for images without a command line section (TODO confirm that
+ * skip_leading_chars() and strv_new() accept NULL).
+ *
+ * On success the finished entry is moved into *ret and 0 is returned. On failure a negative value is
+ * returned and *ret is not written. */
+static int boot_entry_load_unified(
+ const char *root,
+ const char *path,
+ const char *osrelease,
+ const char *cmdline,
+ BootEntry *ret) {
+
+ _cleanup_free_ char *fname = NULL, *os_pretty_name = NULL, *os_image_id = NULL, *os_name = NULL, *os_id = NULL,
+ *os_image_version = NULL, *os_version = NULL, *os_version_id = NULL, *os_build_id = NULL;
+ _cleanup_(boot_entry_free) BootEntry tmp = BOOT_ENTRY_INIT(BOOT_ENTRY_UNIFIED);
+ const char *k, *good_name, *good_version, *good_sort_key;
+ _cleanup_fclose_ FILE *f = NULL;
+ int r;
+
+ assert(root);
+ assert(path);
+ assert(osrelease);
+
+ /* 'k' is the part of 'path' that follows 'root'; it is turned into the kernel path further down. */
+ k = path_startswith(path, root);
+ if (!k)
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Path is not below root: %s", path);
+
+ /* Wrap the in-memory os-release text in a stream, so that it can be parsed like a file. */
+ f = fmemopen_unlocked((void*) osrelease, strlen(osrelease), "r");
+ if (!f)
+ return log_error_errno(errno, "Failed to open os-release buffer: %m");
+
+ r = parse_env_file(f, "os-release",
+ "PRETTY_NAME", &os_pretty_name,
+ "IMAGE_ID", &os_image_id,
+ "NAME", &os_name,
+ "ID", &os_id,
+ "IMAGE_VERSION", &os_image_version,
+ "VERSION", &os_version,
+ "VERSION_ID", &os_version_id,
+ "BUILD_ID", &os_build_id);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse os-release data from unified kernel image %s: %m", path);
+
+ /* good_name, good_version and good_sort_key are copied with strdup() below, so the entry does not
+ * keep references to the os_* strings freed at the end of this function. */
+ if (!bootspec_pick_name_version_sort_key(
+ os_pretty_name,
+ os_image_id,
+ os_name,
+ os_id,
+ os_image_version,
+ os_version,
+ os_version_id,
+ os_build_id,
+ &good_name,
+ &good_version,
+ &good_sort_key))
+ return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Missing fields in os-release data from unified kernel image %s, refusing.", path);
+
+ r = path_extract_filename(path, &fname);
+ if (r < 0)
+ return log_error_errno(r, "Failed to extract file name from '%s': %m", path);
+
+ /* Derives the entry ID and the tries counters from the file name. */
+ r = boot_filename_extract_tries(fname, &tmp.id, &tmp.tries_left, &tmp.tries_done);
+ if (r < 0)
+ return r;
+
+ if (!efi_loader_entry_name_valid(tmp.id))
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid loader entry name: %s", tmp.id);
+
+ /* The old-style ID is "<ID>-<VERSION_ID>" from os-release, if both are available. */
+ if (os_id && os_version_id) {
+ tmp.id_old = strjoin(os_id, "-", os_version_id);
+ if (!tmp.id_old)
+ return log_oom();
+ }
+
+ tmp.path = strdup(path);
+ if (!tmp.path)
+ return log_oom();
+
+ tmp.root = strdup(root);
+ if (!tmp.root)
+ return log_oom();
+
+ tmp.kernel = path_make_absolute(k, "/");
+ if (!tmp.kernel)
+ return log_oom();
+
+ tmp.options = strv_new(skip_leading_chars(cmdline, WHITESPACE));
+ if (!tmp.options)
+ return log_oom();
+
+ delete_trailing_chars(tmp.options[0], WHITESPACE);
+
+ tmp.title = strdup(good_name);
+ if (!tmp.title)
+ return log_oom();
+
+ if (good_sort_key) {
+ tmp.sort_key = strdup(good_sort_key);
+ if (!tmp.sort_key)
+ return log_oom();
+ }
+
+ if (good_version) {
+ tmp.version = strdup(good_version);
+ if (!tmp.version)
+ return log_oom();
+ }
+
+ /* Hand the fully populated entry over to the caller; 'tmp' no longer owns anything afterwards. */
+ *ret = TAKE_STRUCT(tmp);
+ return 0;
+}
+
+/* Upper bound for the size of a PE section we are willing to load into memory. Sections we are not
+ * interested in may be larger; the limit only applies to the ones read below. */
+#define PE_SECTION_SIZE_MAX (4U*1024U*1024U)
+
+/* Reads the ".osrel" and ".cmdline" sections of the PE file open at 'fd'. 'path' is only used for log
+ * messages. The file must qualify as a UKI according to pe_is_uki().
+ *
+ * On success 0 is returned and the section contents are handed to the caller through 'ret_osrelease'
+ * and 'ret_cmdline' (either of which may be NULL if the caller is not interested). The ".osrel"
+ * section is mandatory; a ".cmdline" read failing with -ENXIO is tolerated. On failure a warning is
+ * logged and a negative value is returned. */
+static int find_sections(
+                int fd,
+                const char *path,
+                char **ret_osrelease,
+                char **ret_cmdline) {
+
+        _cleanup_free_ IMAGE_SECTION_HEADER *section_table = NULL;
+        _cleanup_free_ IMAGE_DOS_HEADER *dos = NULL;
+        _cleanup_free_ char *osrel_data = NULL, *cmdline_data = NULL;
+        _cleanup_free_ PeHeader *pe = NULL;
+        int r;
+
+        assert(fd >= 0);
+        assert(path);
+
+        r = pe_load_headers(fd, &dos, &pe);
+        if (r < 0)
+                return log_warning_errno(r, "Failed to parse PE file '%s': %m", path);
+
+        r = pe_load_sections(fd, dos, pe, &section_table);
+        if (r < 0)
+                return log_warning_errno(r, "Failed to parse PE sections of '%s': %m", path);
+
+        if (!pe_is_uki(pe, section_table))
+                return log_warning_errno(SYNTHETIC_ERRNO(EBADMSG), "Parsed PE file '%s' is not a UKI.", path);
+
+        r = pe_read_section_data(fd, pe, section_table, ".osrel", PE_SECTION_SIZE_MAX, (void**) &osrel_data, NULL);
+        if (r < 0)
+                return log_warning_errno(r, "Failed to read .osrel section of '%s': %m", path);
+
+        r = pe_read_section_data(fd, pe, section_table, ".cmdline", PE_SECTION_SIZE_MAX, (void**) &cmdline_data, NULL);
+        if (r < 0) {
+                /* cmdline is optional */
+                if (r != -ENXIO)
+                        return log_warning_errno(r, "Failed to read .cmdline section of '%s': %m", path);
+        }
+
+        if (ret_osrelease)
+                *ret_osrelease = TAKE_PTR(osrel_data);
+        if (ret_cmdline)
+                *ret_cmdline = TAKE_PTR(cmdline_data);
+
+        return 0;
+}
+
+/* Scans the directory 'dir' below 'root' for unified kernel images (names ending in ".efi", compared
+ * case-insensitively) and adds a BootEntry for each usable one to 'config'.
+ *
+ * Returns 0 on success, which includes the case that the directory does not exist. Returns a negative
+ * value if the directory cannot be opened or read, if the inode check fails, or on allocation
+ * failure. Files that cannot be opened, whose sections cannot be read, or for which no entry can be
+ * built are skipped. */
+static int boot_entries_find_unified(
+ BootConfig *config,
+ const char *root,
+ const char *dir) {
+
+ _cleanup_closedir_ DIR *d = NULL;
+ _cleanup_free_ char *full = NULL;
+ int r;
+
+ assert(config);
+ assert(dir);
+
+ r = chase_and_opendir(dir, root, CHASE_PREFIX_ROOT|CHASE_PROHIBIT_SYMLINKS, &full, &d);
+ if (r == -ENOENT)
+ return 0;
+ if (r < 0)
+ return log_error_errno(r, "Failed to open '%s/%s': %m", root, dir);
+
+ FOREACH_DIRENT(de, d, return log_error_errno(errno, "Failed to read %s: %m", full)) {
+ _cleanup_free_ char *j = NULL, *osrelease = NULL, *cmdline = NULL;
+ _cleanup_close_ int fd = -EBADF;
+
+ if (!dirent_is_file(de))
+ continue;
+
+ if (!endswith_no_case(de->d_name, ".efi"))
+ continue;
+
+ /* Reserve the slot up front; it is only counted (n_entries++) once the entry was loaded
+ * successfully at the end of this iteration. */
+ if (!GREEDY_REALLOC0(config->entries, config->n_entries + 1))
+ return log_oom();
+
+ fd = openat(dirfd(d), de->d_name, O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOFOLLOW|O_NOCTTY);
+ if (fd < 0) {
+ log_warning_errno(errno, "Failed to open %s/%s, ignoring: %m", full, de->d_name);
+ continue;
+ }
+
+ r = config_check_inode_relevant_and_unseen(config, fd, de->d_name);
+ if (r < 0)
+ return r;
+ if (r == 0) /* inode already seen or otherwise not relevant */
+ continue;
+
+ j = path_join(full, de->d_name);
+ if (!j)
+ return log_oom();
+
+ /* find_sections() already logged a warning if it failed, hence just move on. */
+ if (find_sections(fd, j, &osrelease, &cmdline) < 0)
+ continue;
+
+ r = boot_entry_load_unified(root, j, osrelease, cmdline, config->entries + config->n_entries);
+ if (r < 0)
+ continue;
+
+ config->n_entries++;
+ }
+
+ return 0;
+}
+
+/* Marks every entry whose displayed title (as returned by boot_entry_title()) is shared with at least
+ * one other entry. arr[] must have room for n_entries elements; it is fully rewritten. Returns true
+ * if at least one clash was found. */
+static bool find_nonunique(const BootEntry *entries, size_t n_entries, bool arr[]) {
+        bool any_clash = false;
+
+        assert(entries || n_entries == 0);
+        assert(arr || n_entries == 0);
+
+        for (size_t k = 0; k < n_entries; k++)
+                arr[k] = false;
+
+        /* Title equality is symmetric, hence it is sufficient to look at every unordered pair once
+         * and flag both of its members. */
+        for (size_t i = 0; i + 1 < n_entries; i++) {
+                const char *title_i = boot_entry_title(entries + i);
+
+                for (size_t j = i + 1; j < n_entries; j++) {
+                        if (!streq(title_i, boot_entry_title(entries + j)))
+                                continue;
+
+                        arr[i] = arr[j] = true;
+                        any_clash = true;
+                }
+        }
+
+        return any_clash;
+}
+
+/* Replaces e->show_title with "<currently displayed title> (<suffix>)". Returns 0 on success and
+ * -ENOMEM if the new string cannot be allocated. */
+static int boot_entry_extend_show_title(BootEntry *e, const char *suffix) {
+        char *combined;
+
+        if (asprintf(&combined, "%s (%s)", boot_entry_title(e), suffix) < 0)
+                return -ENOMEM;
+
+        free_and_replace(e->show_title, combined);
+        return 0;
+}
+
+/* Makes the displayed titles of the entries distinguishable. In up to three rounds, entries whose title
+ * still clashes with another one get a suffix appended: first the version (if set), then the machine
+ * ID (if set), finally the entry ID. Processing stops as soon as no clash is left. Returns 0 on
+ * success and -ENOMEM on allocation failure. */
+static int boot_entries_uniquify(BootEntry *entries, size_t n_entries) {
+        _cleanup_free_ bool *clash = NULL;
+        int r;
+
+        assert(entries || n_entries == 0);
+
+        if (n_entries == 0)
+                return 0;
+
+        clash = new(bool, n_entries);
+        if (!clash)
+                return -ENOMEM;
+
+        /* Find _all_ non-unique titles */
+        if (!find_nonunique(entries, n_entries, clash))
+                return 0;
+
+        /* Round 1: add version to non-unique titles */
+        for (size_t i = 0; i < n_entries; i++) {
+                if (!clash[i] || !entries[i].version)
+                        continue;
+
+                r = boot_entry_extend_show_title(entries + i, entries[i].version);
+                if (r < 0)
+                        return r;
+        }
+
+        if (!find_nonunique(entries, n_entries, clash))
+                return 0;
+
+        /* Round 2: add machine-id to non-unique titles */
+        for (size_t i = 0; i < n_entries; i++) {
+                if (!clash[i] || !entries[i].machine_id)
+                        continue;
+
+                r = boot_entry_extend_show_title(entries + i, entries[i].machine_id);
+                if (r < 0)
+                        return r;
+        }
+
+        if (!find_nonunique(entries, n_entries, clash))
+                return 0;
+
+        /* Round 3: add file name to non-unique titles */
+        for (size_t i = 0; i < n_entries; i++) {
+                if (!clash[i])
+                        continue;
+
+                r = boot_entry_extend_show_title(entries + i, entries[i].id);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+/* Looks up the first entry whose ID matches 'id', which is interpreted as a case-insensitive fnmatch()
+ * pattern. The special value "@saved" (in any capitalization) stands for config->entry_selected; any
+ * other value starting with '@' matches nothing. Returns the index of the entry, or -1 if there is no
+ * match or 'id' is NULL. */
+static int boot_config_find(const BootConfig *config, const char *id) {
+        const char *pattern = id;
+
+        assert(config);
+
+        if (!pattern)
+                return -1;
+
+        if (pattern[0] == '@') {
+                if (!strcaseeq(pattern, "@saved") || !config->entry_selected)
+                        return -1;
+
+                pattern = config->entry_selected;
+        }
+
+        for (size_t idx = 0; idx < config->n_entries; idx++) {
+                if (fnmatch(pattern, config->entries[idx].id, FNM_CASEFOLD) != 0)
+                        continue;
+
+                return (int) idx;
+        }
+
+        return -1;
+}
+
+/* Determines the index of the default boot entry. The candidates are tried in this order: the entry
+ * named by LoaderEntryOneShot, the one named by LoaderEntryDefault, the one matching the "default"
+ * pattern from loader.conf, and finally simply the first entry. Returns -1 if there are no entries. */
+static int boot_entries_select_default(const BootConfig *config) {
+        int idx;
+
+        assert(config);
+        assert(config->entries || config->n_entries == 0);
+
+        if (config->n_entries == 0) {
+                log_debug("Found no default boot entry :(");
+                return -1; /* -1 means "no default" */
+        }
+
+        /* boot_config_find() returns -1 for a NULL id, hence unset fields need no special casing. */
+
+        idx = boot_config_find(config, config->entry_oneshot);
+        if (idx >= 0) {
+                log_debug("Found default: id \"%s\" is matched by LoaderEntryOneShot",
+                          config->entries[idx].id);
+                return idx;
+        }
+
+        idx = boot_config_find(config, config->entry_default);
+        if (idx >= 0) {
+                log_debug("Found default: id \"%s\" is matched by LoaderEntryDefault",
+                          config->entries[idx].id);
+                return idx;
+        }
+
+        idx = boot_config_find(config, config->default_pattern);
+        if (idx >= 0) {
+                log_debug("Found default: id \"%s\" is matched by pattern \"%s\"",
+                          config->entries[idx].id, config->default_pattern);
+                return idx;
+        }
+
+        log_debug("Found default: first entry \"%s\"", config->entries[0].id);
+        return 0;
+}
+
+/* Returns the index of the entry matched by config->entry_selected, or -1 if that field is unset,
+ * there are no entries, or nothing matches. */
+static int boot_entries_select_selected(const BootConfig *config) {
+        assert(config);
+        assert(config->entries || config->n_entries == 0);
+
+        if (config->entry_selected && config->n_entries > 0)
+                return boot_config_find(config, config->entry_selected);
+
+        return -1;
+}
+
+/* Loads the three "pointers" to boot loader entries (LoaderEntryOneShot, LoaderEntryDefault and
+ * LoaderEntrySelected) from their EFI variables into 'config'.
+ *
+ * Returns 0 without doing anything if 'skip_efivars' is set or is_efi_boot() is false. Returns 1 after
+ * the variables were queried, and the result of log_oom() if memory runs out. Variables that are
+ * missing or empty (-ENOENT, -ENODATA) are silently skipped; other read failures only cause a
+ * warning. */
+static int boot_load_efi_entry_pointers(BootConfig *config, bool skip_efivars) {
+        int r;
+
+        assert(config);
+
+        if (skip_efivars)
+                return 0;
+        if (!is_efi_boot())
+                return 0;
+
+        r = efi_get_variable_string(EFI_LOADER_VARIABLE(LoaderEntryOneShot), &config->entry_oneshot);
+        if (r == -ENOMEM)
+                return log_oom();
+        if (r < 0 && r != -ENOENT && r != -ENODATA)
+                log_warning_errno(r, "Failed to read EFI variable \"LoaderEntryOneShot\", ignoring: %m");
+
+        r = efi_get_variable_string(EFI_LOADER_VARIABLE(LoaderEntryDefault), &config->entry_default);
+        if (r == -ENOMEM)
+                return log_oom();
+        if (r < 0 && r != -ENOENT && r != -ENODATA)
+                log_warning_errno(r, "Failed to read EFI variable \"LoaderEntryDefault\", ignoring: %m");
+
+        r = efi_get_variable_string(EFI_LOADER_VARIABLE(LoaderEntrySelected), &config->entry_selected);
+        if (r == -ENOMEM)
+                return log_oom();
+        if (r < 0 && r != -ENOENT && r != -ENODATA)
+                log_warning_errno(r, "Failed to read EFI variable \"LoaderEntrySelected\", ignoring: %m");
+
+        return 1;
+}
+
+/* Fills in config->default_entry and config->selected_entry. Unless 'skip_efivars' is set, the entry
+ * "pointers" are first read from their EFI variables. Returns 0 on success and a negative value if
+ * reading those variables failed fatally. */
+int boot_config_select_special_entries(BootConfig *config, bool skip_efivars) {
+        int r;
+
+        assert(config);
+
+        r = boot_load_efi_entry_pointers(config, skip_efivars);
+        if (r < 0)
+                return r;
+
+        /* Both lookups report "none" as -1, which is stored as is. */
+        config->default_entry = boot_entries_select_default(config);
+        config->selected_entry = boot_entries_select_selected(config);
+
+        return 0;
+}
+
+/* Finishes a BootConfig after all entries were collected: sorts the entries with
+ * boot_entry_compare() and then makes their displayed titles unique. Returns 0 on success and a
+ * negative value (after logging) if making the titles unique fails. */
+int boot_config_finalize(BootConfig *config) {
+        int r;
+
+        /* Catch a NULL 'config' here, like the other public entry points of this file do, rather
+         * than crashing on the first dereference below. */
+        assert(config);
+
+        typesafe_qsort(config->entries, config->n_entries, boot_entry_compare);
+
+        r = boot_entries_uniquify(config->entries, config->n_entries);
+        if (r < 0)
+                return log_error_errno(r, "Failed to uniquify boot entries: %m");
+
+        return 0;
+}
+
+/* Collects the Type #1 entries below "/loader/entries" and the unified kernel images below
+ * "/EFI/Linux/" of the directory tree rooted at 'root'. Returns a negative value on failure. */
+static int boot_config_load_entries_below(BootConfig *config, const char *root) {
+        int r;
+
+        r = boot_entries_find_type1(config, root, "/loader/entries");
+        if (r < 0)
+                return r;
+
+        return boot_entries_find_unified(config, root, "/EFI/Linux/");
+}
+
+/* Loads the boot loader configuration and all boot entries from the given ESP and XBOOTLDR paths,
+ * either of which may be NULL. loader.conf is read from the ESP only; entries are collected from both.
+ * Finally the configuration is passed to boot_config_finalize(), whose result is returned. Any earlier
+ * failure is returned as a negative value right away. */
+int boot_config_load(
+                BootConfig *config,
+                const char *esp_path,
+                const char *xbootldr_path) {
+
+        int r;
+
+        assert(config);
+
+        if (esp_path) {
+                r = boot_loader_read_conf_path(config, esp_path, "/loader/loader.conf");
+                if (r < 0)
+                        return r;
+
+                r = boot_config_load_entries_below(config, esp_path);
+                if (r < 0)
+                        return r;
+        }
+
+        if (xbootldr_path) {
+                r = boot_config_load_entries_below(config, xbootldr_path);
+                if (r < 0)
+                        return r;
+        }
+
+        return boot_config_finalize(config);
+}
+
+/* Like boot_config_load(), but locates the ESP and XBOOTLDR partitions itself. The two override
+ * parameters may be NULL; they are passed on to find_esp_and_warn() and find_xbootldr_and_warn()
+ * respectively. Returns the result of boot_config_load(), or a negative value if locating the
+ * partitions fails. */
+int boot_config_load_auto(
+ BootConfig *config,
+ const char *override_esp_path,
+ const char *override_xbootldr_path) {
+
+ _cleanup_free_ char *esp_where = NULL, *xbootldr_where = NULL;
+ dev_t esp_devid = 0, xbootldr_devid = 0;
+ int r;
+
+ assert(config);
+
+ /* This function is similar to boot_config_load(), however we automatically search for the
+ * ESP and the XBOOTLDR partition unless it is explicitly specified. Also, if the user did not pass
+ * an ESP or XBOOTLDR path directly, let's see if /run/boot-loader-entries/ exists. If so, let's
+ * read data from there, as if it was an ESP (i.e. loading both entries and loader.conf data from
+ * it). This allows other boot loaders to pass boot loader entry information to our tools if they
+ * want to. */
+
+ if (!override_esp_path && !override_xbootldr_path) {
+ if (access("/run/boot-loader-entries/", F_OK) >= 0)
+ return boot_config_load(config, "/run/boot-loader-entries/", NULL);
+
+ /* access() failed: only a missing directory lets us fall through to the regular lookup. */
+ if (errno != ENOENT)
+ return log_error_errno(errno,
+ "Failed to determine whether /run/boot-loader-entries/ exists: %m");
+ }
+
+ r = find_esp_and_warn(NULL, override_esp_path, /* unprivileged_mode= */ false, &esp_where, NULL, NULL, NULL, NULL, &esp_devid);
+ if (r < 0) /* we don't log about ENOKEY here, but propagate it, leaving it to the caller to log */
+ return r;
+
+ r = find_xbootldr_and_warn(NULL, override_xbootldr_path, /* unprivileged_mode= */ false, &xbootldr_where, NULL, &xbootldr_devid);
+ if (r < 0 && r != -ENOKEY)
+ return r; /* It's fine if the XBOOTLDR partition doesn't exist, hence we ignore ENOKEY here */
+
+ /* If both paths were found and carry the same device number, suppress the xbootldr path, so that
+ * the same location is not scanned twice */
+ if (esp_where && xbootldr_where && devnum_set_and_equal(esp_devid, xbootldr_devid))
+ xbootldr_where = mfree(xbootldr_where);
+
+ return boot_config_load(config, esp_where, xbootldr_where);
+}
+
+/* Merges the list of entry IDs reported by the boot loader ('found_by_loader', a string vector) into
+ * 'config'. Entries that are already known are merely flagged as reported_by_loader. Unknown IDs are
+ * appended as new entries of type BOOT_ENTRY_LOADER_AUTO (ID starts with "auto-") or BOOT_ENTRY_LOADER
+ * (anything else). If 'only_auto' is true, unknown IDs not starting with "auto-" are skipped. Returns
+ * 0 on success and the result of log_oom() on allocation failure. */
+int boot_config_augment_from_loader(
+ BootConfig *config,
+ char **found_by_loader,
+ bool only_auto) {
+
+ /* Alternating list of entry ID and title, terminated by NULL. */
+ static const char *const title_table[] = {
+ /* Pretty names for a few well-known automatically discovered entries. */
+ "auto-osx", "macOS",
+ "auto-windows", "Windows Boot Manager",
+ "auto-efi-shell", "EFI Shell",
+ "auto-efi-default", "EFI Default Loader",
+ "auto-poweroff", "Power Off The System",
+ "auto-reboot", "Reboot The System",
+ "auto-reboot-to-firmware-setup", "Reboot Into Firmware Interface",
+ NULL,
+ };
+
+ assert(config);
+
+ /* Let's add the entries discovered by the boot loader to the end of our list, unless they are
+ * already included there. */
+
+ STRV_FOREACH(i, found_by_loader) {
+ BootEntry *existing;
+ _cleanup_free_ char *c = NULL, *t = NULL, *p = NULL;
+
+ existing = boot_config_find_entry(config, *i);
+ if (existing) {
+ existing->reported_by_loader = true;
+ continue;
+ }
+
+ if (only_auto && !startswith(*i, "auto-"))
+ continue;
+
+ c = strdup(*i);
+ if (!c)
+ return log_oom();
+
+ /* 't' stays NULL (i.e. the entry gets no title) if the ID is not in the table. */
+ STRV_FOREACH_PAIR(a, b, title_table)
+ if (streq(*a, *i)) {
+ t = strdup(*b);
+ if (!t)
+ return log_oom();
+ break;
+ }
+
+ /* Such entries have no file of their own; the path of the LoaderEntries EFI variable is
+ * recorded as their source instead. */
+ p = strdup(EFIVAR_PATH(EFI_LOADER_VARIABLE(LoaderEntries)));
+ if (!p)
+ return log_oom();
+
+ if (!GREEDY_REALLOC0(config->entries, config->n_entries + 1))
+ return log_oom();
+
+ /* Ownership of the three strings moves into the new array element. */
+ config->entries[config->n_entries++] = (BootEntry) {
+ .type = startswith(*i, "auto-") ? BOOT_ENTRY_LOADER_AUTO : BOOT_ENTRY_LOADER,
+ .id = TAKE_PTR(c),
+ .title = TAKE_PTR(t),
+ .path = TAKE_PTR(p),
+ .reported_by_loader = true,
+ .tries_left = UINT_MAX,
+ .tries_done = UINT_MAX,
+ };
+ }
+
+ return 0;
+}
+
+/* Returns the first entry whose ID or old-style ID equals 'id' (compared case-insensitively), or NULL
+ * if there is none. */
+BootEntry* boot_config_find_entry(BootConfig *config, const char *id) {
+        assert(config);
+        assert(id);
+
+        for (size_t idx = 0; idx < config->n_entries; idx++) {
+                BootEntry *candidate = &config->entries[idx];
+
+                if (strcaseeq_ptr(candidate->id, id))
+                        return candidate;
+                if (strcaseeq_ptr(candidate->id_old, id))
+                        return candidate;
+        }
+
+        return NULL;
+}
+
+/* Prints one line for the file 'p' referenced by a boot entry, prefixed by the field name 'field' (or
+ * by blanks if 'field' is NULL, as used for continuation lines). Whether the file is accessible below
+ * 'root' is checked; if not, the file name is highlighted and followed by the error. The first such
+ * error (as a negative value) is stored in *ret_status, provided that still is 0. */
+static void boot_entry_file_list(
+                const char *field,
+                const char *root,
+                const char *p,
+                int *ret_status) {
+
+        assert(p);
+        assert(ret_status);
+
+        int access_result = chase_and_access(p, root, CHASE_PREFIX_ROOT|CHASE_PROHIBIT_SYMLINKS, F_OK, NULL);
+        const char *separator = field ? ":" : " ";
+
+        /* Note that this shows two '/' between the root and the file. This is intentional to highlight (in
+         * the absence of color support) to the user that the boot loader is only interested in the second
+         * part of the file. */
+        printf("%13s%s %s%s/%s", strempty(field), separator, ansi_grey(), root, ansi_normal());
+
+        if (access_result >= 0) {
+                printf("%s\n", p);
+                return;
+        }
+
+        /* Make %m below expand to the access error. */
+        errno = -access_result;
+        printf("%s%s%s (%m)\n", ansi_highlight_red(), p, ansi_normal());
+
+        if (*ret_status == 0)
+                *ret_status = access_result;
+}
+
+/* Prints a human readable description of the boot entry 'e' to standard output.
+ *
+ * 'show_as_default' and 'show_as_selected' append "(default)" and "(selected)" markers to the title
+ * line. If 'show_reported' is true, entries of type BOOT_ENTRY_LOADER are marked "(reported/absent)",
+ * and entries that were not reported by the loader (and are not of type BOOT_ENTRY_LOADER_AUTO) are
+ * marked "(not reported/new)". */
+int show_boot_entry(
+ const BootEntry *e,
+ bool show_as_default,
+ bool show_as_selected,
+ bool show_reported) {
+
+ int status = 0;
+
+ /* Returns 0 on success, negative on processing error, and positive if something is wrong with the
+ boot entry itself. */
+
+ assert(e);
+
+ printf(" type: %s\n",
+ boot_entry_type_to_string(e->type));
+
+ printf(" title: %s%s%s",
+ ansi_highlight(), boot_entry_title(e), ansi_normal());
+
+ if (show_as_default)
+ printf(" %s(default)%s",
+ ansi_highlight_green(), ansi_normal());
+
+ if (show_as_selected)
+ printf(" %s(selected)%s",
+ ansi_highlight_magenta(), ansi_normal());
+
+ if (show_reported) {
+ if (e->type == BOOT_ENTRY_LOADER)
+ printf(" %s(reported/absent)%s",
+ ansi_highlight_red(), ansi_normal());
+ else if (!e->reported_by_loader && e->type != BOOT_ENTRY_LOADER_AUTO)
+ printf(" %s(not reported/new)%s",
+ ansi_highlight_green(), ansi_normal());
+ }
+
+ putchar('\n');
+
+ if (e->id)
+ printf(" id: %s\n", e->id);
+ if (e->path) {
+ _cleanup_free_ char *text = NULL, *link = NULL;
+
+ /* If the path is below the entry's root, show root and remainder visually separated. */
+ const char *p = e->root ? path_startswith(e->path, e->root) : NULL;
+ if (p) {
+ text = strjoin(ansi_grey(), e->root, "/", ansi_normal(), "/", p);
+ if (!text)
+ return log_oom();
+ }
+
+ /* Let's urlify the link to make it easy to view in an editor, but only if it is a text
+ * file. Unified images are PE binaries, and EFI variables are not pure text either. */
+ if (e->type == BOOT_ENTRY_CONF)
+ (void) terminal_urlify_path(e->path, text, &link);
+
+ printf(" source: %s\n", link ?: text ?: e->path);
+ }
+ /* UINT_MAX marks a counter as unset, see BOOT_ENTRY_INIT(). */
+ if (e->tries_left != UINT_MAX) {
+ printf(" tries: %u left", e->tries_left);
+
+ if (e->tries_done != UINT_MAX)
+ printf("; %u done\n", e->tries_done);
+ else
+ printf("\n");
+ }
+
+ if (e->sort_key)
+ printf(" sort-key: %s\n", e->sort_key);
+ if (e->version)
+ printf(" version: %s\n", e->version);
+ if (e->machine_id)
+ printf(" machine-id: %s\n", e->machine_id);
+ if (e->architecture)
+ printf(" architecture: %s\n", e->architecture);
+ if (e->kernel)
+ boot_entry_file_list("linux", e->root, e->kernel, &status);
+ if (e->efi)
+ boot_entry_file_list("efi", e->root, e->efi, &status);
+
+ /* Only the first element of a list gets the field name, the others are continuation lines. */
+ STRV_FOREACH(s, e->initrd)
+ boot_entry_file_list(s == e->initrd ? "initrd" : NULL,
+ e->root,
+ *s,
+ &status);
+
+ if (!strv_isempty(e->options)) {
+ _cleanup_free_ char *t = NULL, *t2 = NULL;
+ _cleanup_strv_free_ char **ts = NULL;
+
+ /* Join all options, then re-split at newlines so that each line can be printed with the
+ * separator used in the second join. */
+ t = strv_join(e->options, " ");
+ if (!t)
+ return log_oom();
+
+ ts = strv_split_newlines(t);
+ if (!ts)
+ return log_oom();
+
+ t2 = strv_join(ts, "\n ");
+ if (!t2)
+ return log_oom();
+
+ printf(" options: %s\n", t2);
+ }
+
+ if (e->device_tree)
+ boot_entry_file_list("devicetree", e->root, e->device_tree, &status);
+
+ STRV_FOREACH(s, e->device_tree_overlay)
+ boot_entry_file_list(s == e->device_tree_overlay ? "devicetree-overlay" : NULL,
+ e->root,
+ *s,
+ &status);
+
+ /* 'status' holds the first negative file access error (or 0); flip the sign to report it as
+ * the positive "something is wrong with the entry" result. */
+ return -status;
+}
+
+/* Prints all entries of 'config' to standard output. If 'json_format' does not have JSON_FORMAT_OFF
+ * set, a JSON array with one object per entry is dumped. Otherwise each entry is printed in human
+ * readable form through show_boot_entry(), with an empty line between entries. Returns 0 on success
+ * and a negative value on failure. */
+int show_boot_entries(const BootConfig *config, JsonFormatFlags json_format) {
+ int r;
+
+ assert(config);
+
+ if (!FLAGS_SET(json_format, JSON_FORMAT_OFF)) {
+ _cleanup_(json_variant_unrefp) JsonVariant *array = NULL;
+
+ for (size_t i = 0; i < config->n_entries; i++) {
+ _cleanup_free_ char *opts = NULL;
+ const BootEntry *e = config->entries + i;
+ _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+
+ /* The options are emitted as one space-separated string, not as an array. */
+ if (!strv_isempty(e->options)) {
+ opts = strv_join(e->options, " ");
+ if (!opts)
+ return log_oom();
+ }
+
+ r = json_variant_merge_objectb(
+ &v, JSON_BUILD_OBJECT(
+ JSON_BUILD_PAIR("type", JSON_BUILD_STRING(boot_entry_type_json_to_string(e->type))),
+ JSON_BUILD_PAIR_CONDITION(e->id, "id", JSON_BUILD_STRING(e->id)),
+ JSON_BUILD_PAIR_CONDITION(e->path, "path", JSON_BUILD_STRING(e->path)),
+ JSON_BUILD_PAIR_CONDITION(e->root, "root", JSON_BUILD_STRING(e->root)),
+ JSON_BUILD_PAIR_CONDITION(e->title, "title", JSON_BUILD_STRING(e->title)),
+ JSON_BUILD_PAIR_CONDITION(boot_entry_title(e), "showTitle", JSON_BUILD_STRING(boot_entry_title(e))),
+ JSON_BUILD_PAIR_CONDITION(e->sort_key, "sortKey", JSON_BUILD_STRING(e->sort_key)),
+ JSON_BUILD_PAIR_CONDITION(e->version, "version", JSON_BUILD_STRING(e->version)),
+ JSON_BUILD_PAIR_CONDITION(e->machine_id, "machineId", JSON_BUILD_STRING(e->machine_id)),
+ JSON_BUILD_PAIR_CONDITION(e->architecture, "architecture", JSON_BUILD_STRING(e->architecture)),
+ JSON_BUILD_PAIR_CONDITION(opts, "options", JSON_BUILD_STRING(opts)),
+ JSON_BUILD_PAIR_CONDITION(e->kernel, "linux", JSON_BUILD_STRING(e->kernel)),
+ JSON_BUILD_PAIR_CONDITION(e->efi, "efi", JSON_BUILD_STRING(e->efi)),
+ JSON_BUILD_PAIR_CONDITION(!strv_isempty(e->initrd), "initrd", JSON_BUILD_STRV(e->initrd)),
+ JSON_BUILD_PAIR_CONDITION(e->device_tree, "devicetree", JSON_BUILD_STRING(e->device_tree)),
+ JSON_BUILD_PAIR_CONDITION(!strv_isempty(e->device_tree_overlay), "devicetreeOverlay", JSON_BUILD_STRV(e->device_tree_overlay))));
+ if (r < 0)
+ return log_oom();
+
+ /* Sanitizers (only memory sanitizer?) do not like function call with too many
+ * arguments and trigger false positive warnings. Let's not add too many json objects
+ * at once. */
+ r = json_variant_merge_objectb(
+ &v, JSON_BUILD_OBJECT(
+ JSON_BUILD_PAIR("isReported", JSON_BUILD_BOOLEAN(e->reported_by_loader)),
+ JSON_BUILD_PAIR_CONDITION(e->tries_left != UINT_MAX, "triesLeft", JSON_BUILD_UNSIGNED(e->tries_left)),
+ JSON_BUILD_PAIR_CONDITION(e->tries_done != UINT_MAX, "triesDone", JSON_BUILD_UNSIGNED(e->tries_done)),
+ JSON_BUILD_PAIR_CONDITION(config->default_entry >= 0, "isDefault", JSON_BUILD_BOOLEAN(i == (size_t) config->default_entry)),
+ JSON_BUILD_PAIR_CONDITION(config->selected_entry >= 0, "isSelected", JSON_BUILD_BOOLEAN(i == (size_t) config->selected_entry))));
+
+ if (r < 0)
+ return log_oom();
+
+ r = json_variant_append_array(&array, v);
+ if (r < 0)
+ return log_oom();
+ }
+
+ json_variant_dump(array, json_format | JSON_FORMAT_EMPTY_ARRAY, NULL, NULL);
+
+ } else {
+ for (size_t n = 0; n < config->n_entries; n++) {
+ /* A default_entry/selected_entry of -1 ("none") converts to SIZE_MAX here and
+ * hence never equals a valid index. Positive results of show_boot_entry()
+ * (problems with the entry itself) do not abort the listing. */
+ r = show_boot_entry(
+ config->entries + n,
+ /* show_as_default= */ n == (size_t) config->default_entry,
+ /* show_as_selected= */ n == (size_t) config->selected_entry,
+ /* show_reported= */ true);
+ if (r < 0)
+ return r;
+
+ if (n+1 < config->n_entries)
+ putchar('\n');
+ }
+ }
+
+ return 0;
+}
diff --git a/src/shared/bootspec.h b/src/shared/bootspec.h
new file mode 100644
index 0000000..ddd149e
--- /dev/null
+++ b/src/shared/bootspec.h
@@ -0,0 +1,129 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#pragma once
+
+#include <errno.h>
+#include <inttypes.h>
+#include <stdbool.h>
+#include <sys/types.h>
+
+#include "json.h"
+#include "set.h"
+#include "string-util.h"
+
+ /* Identifies the source a boot menu entry was obtained from. */
+ typedef enum BootEntryType {
+ BOOT_ENTRY_CONF, /* Boot Loader Specification Type #1 entries: *.conf files */
+ BOOT_ENTRY_UNIFIED, /* Boot Loader Specification Type #2 entries: *.efi files */
+ BOOT_ENTRY_LOADER, /* Additional entries augmented from LoaderEntries EFI variable (regular entries) */
+ BOOT_ENTRY_LOADER_AUTO, /* Additional entries augmented from LoaderEntries EFI variable (special "automatic" entries) */
+ _BOOT_ENTRY_TYPE_MAX, /* Number of types listed above; not a type itself */
+ _BOOT_ENTRY_TYPE_INVALID = -EINVAL, /* Negative value, distinct from every valid type */
+ } BootEntryType;
+
+ /* A single boot menu entry. Initialize with BOOT_ENTRY_INIT() so that the tries counters start out as
+ * "unset". NOTE(review): the string fields are presumably heap-allocated and owned by the entry — confirm
+ * against the code that frees entries. */
+ typedef struct BootEntry {
+ BootEntryType type;
+ bool reported_by_loader; /* Shown as "isReported" in the JSON output */
+ char *id; /* This is the file basename (including extension!) */
+ char *id_old; /* Old-style ID, for deduplication purposes. */
+ char *path; /* This is the full path to the drop-in file */
+ char *root; /* The root path in which the drop-in was found, i.e. to which 'kernel', 'efi' and 'initrd' are relative */
+ char *title;
+ char *show_title; /* Preferred over 'title' and 'id' by boot_entry_title() */
+ char *sort_key;
+ char *version;
+ char *machine_id;
+ char *architecture;
+ char **options;
+ char *kernel; /* linux is #defined to 1, yikes! */
+ char *efi;
+ char **initrd;
+ char *device_tree;
+ char **device_tree_overlay;
+ unsigned tries_left; /* UINT_MAX if unset */
+ unsigned tries_done; /* UINT_MAX if unset */
+ } BootEntry;
+
+ /* Initializer for a BootEntry of type 't'. The tries counters are set to UINT_MAX (i.e. unset); all fields
+ * not named here are zero-initialized, as usual for designated initializers. */
+ #define BOOT_ENTRY_INIT(t) \
+ { \
+ .type = (t), \
+ .tries_left = UINT_MAX, \
+ .tries_done = UINT_MAX, \
+ }
+
+ /* The boot loader configuration together with the list of discovered boot entries. Initialize with
+ * BOOT_CONFIG_NULL so that the default/selected indexes start out as "none". */
+ typedef struct BootConfig {
+ char *default_pattern;
+ char *timeout;
+ char *editor;
+ char *auto_entries;
+ char *auto_firmware;
+ char *console_mode;
+ char *beep;
+
+ char *entry_oneshot;
+ char *entry_default;
+ char *entry_selected;
+
+ /* Array of n_entries boot entries */
+ BootEntry *entries;
+ size_t n_entries;
+
+ /* Indexes into entries[], or negative if there is no default/selected entry */
+ ssize_t default_entry;
+ ssize_t selected_entry;
+
+ Set *inodes_seen;
+ } BootConfig;
+
+ /* Initializer for an empty BootConfig: no entries, and neither a default nor a selected entry (both
+ * indexes are negative). All other fields are zero-initialized. */
+ #define BOOT_CONFIG_NULL \
+ { \
+ .default_entry = -1, \
+ .selected_entry = -1, \
+ }
+
+const char* boot_entry_type_to_string(BootEntryType);
+const char* boot_entry_type_json_to_string(BootEntryType);
+
+BootEntry* boot_config_find_entry(BootConfig *config, const char *id);
+
+ /* Returns the entry that default_entry points at, or NULL if default_entry is negative, i.e. no default
+  * entry is set. */
+ static inline const BootEntry* boot_config_default_entry(const BootConfig *config) {
+         assert(config);
+
+         ssize_t idx = config->default_entry;
+
+         if (idx >= 0) {
+                 /* A non-negative index must always point into the array */
+                 assert((size_t) idx < config->n_entries);
+                 return &config->entries[idx];
+         }
+
+         return NULL;
+ }
+
+void boot_config_free(BootConfig *config);
+
+int boot_loader_read_conf(BootConfig *config, FILE *file, const char *path);
+
+int boot_config_load_type1(
+ BootConfig *config,
+ FILE *f,
+ const char *root,
+ const char *dir,
+ const char *id);
+
+int boot_config_finalize(BootConfig *config);
+int boot_config_load(BootConfig *config, const char *esp_path, const char *xbootldr_path);
+int boot_config_load_auto(BootConfig *config, const char *override_esp_path, const char *override_xbootldr_path);
+int boot_config_augment_from_loader(BootConfig *config, char **list, bool only_auto);
+
+int boot_config_select_special_entries(BootConfig *config, bool skip_efivars);
+
+ /* Returns the string to show for an entry: show_title if set, otherwise title, otherwise the id. At
+  * least one of them must be set, which is asserted. */
+ static inline const char* boot_entry_title(const BootEntry *entry) {
+         const char *t;
+
+         assert(entry);
+
+         t = entry->show_title;
+         if (!t)
+                 t = entry->title;
+         if (!t)
+                 t = entry->id;
+
+         return ASSERT_PTR(t);
+ }
+
+int show_boot_entry(
+ const BootEntry *e,
+ bool show_as_default,
+ bool show_as_selected,
+ bool show_reported);
+int show_boot_entries(
+ const BootConfig *config,
+ JsonFormatFlags json_format);
+
+int boot_filename_extract_tries(const char *fname, char **ret_stripped, unsigned *ret_tries_left, unsigned *ret_tries_done);
diff --git a/src/shared/bpf-compat.h b/src/shared/bpf-compat.h
new file mode 100644
index 0000000..9ccb7d8
--- /dev/null
+++ b/src/shared/bpf-compat.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+/* libbpf has been moving quickly.
+ * They added new symbols in the 0.x versions and shortly after removed
+ * deprecated symbols in 1.0.
+ * We only need bpf_map_create and libbpf_probe_bpf_prog_type so we work
+ * around the incompatibility here by:
+ * - declaring both symbols, and looking for either depending on the libbpf
+ * so version we found
+ * - having helpers that automatically use the appropriate version behind the
+ * new API for easy cleanup later
+ *
+ * The advantage of doing this instead of only looking for the symbols declared at
+ * compile time is that we can then load either the old or the new symbols at runtime
+ * regardless of the version we were compiled with */
+
+
+/* declare the struct for libbpf <= 0.6.0 -- it causes no harm on newer versions */
+struct bpf_map_create_opts;
+
+/* new symbols available from 0.7.0.
+ * We need the symbols here:
+ * - after bpf_map_create_opts struct has been defined for older libbpf
+ * - before the compat static inline helpers that use them.
+ * When removing this file move these back to bpf-dlopen.h */
+extern int (*sym_bpf_map_create)(enum bpf_map_type, const char *, __u32, __u32, __u32, const struct bpf_map_create_opts *);
+extern int (*sym_libbpf_probe_bpf_prog_type)(enum bpf_prog_type, const void *);
+
+/* compat symbols removed in libbpf 1.0 */
+extern int (*sym_bpf_create_map)(enum bpf_map_type, int key_size, int value_size, int max_entries, __u32 map_flags);
+extern bool (*sym_bpf_probe_prog_type)(enum bpf_prog_type, __u32);
+
+/* helpers to use the available variant behind new API */
+ /* Creates a BPF map through whichever libbpf entry point was resolved: bpf_map_create() if available,
+  * otherwise the old bpf_create_map(). */
+ static inline int compat_bpf_map_create(enum bpf_map_type map_type,
+                 const char *map_name,
+                 __u32 key_size,
+                 __u32 value_size,
+                 __u32 max_entries,
+                 const struct bpf_map_create_opts *opts) {
+
+         if (!sym_bpf_map_create)
+                 /* The old API takes neither a name nor an opts struct. The flags argument would be
+                  * opts->map_flags, but opts is always NULL for us so skip build dependency on the type. */
+                 return sym_bpf_create_map(map_type, key_size, value_size, max_entries, 0);
+
+         return sym_bpf_map_create(map_type, map_name, key_size, value_size, max_entries, opts);
+ }
+
+ /* Probes for support of a BPF program type through whichever libbpf entry point was resolved:
+  * libbpf_probe_bpf_prog_type() if available, otherwise the old bpf_probe_prog_type() with ifindex 0. */
+ static inline int compat_libbpf_probe_bpf_prog_type(enum bpf_prog_type prog_type, const void *opts) {
+         return sym_libbpf_probe_bpf_prog_type
+                 ? sym_libbpf_probe_bpf_prog_type(prog_type, opts)
+                 : sym_bpf_probe_prog_type(prog_type, 0);
+ }
diff --git a/src/shared/bpf-dlopen.c b/src/shared/bpf-dlopen.c
new file mode 100644
index 0000000..15301ae
--- /dev/null
+++ b/src/shared/bpf-dlopen.c
@@ -0,0 +1,146 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "dlfcn-util.h"
+#include "bpf-dlopen.h"
+#include "log.h"
+#include "strv.h"
+
+#if HAVE_LIBBPF
+
+ /* libbpf changed types of function prototypes around, so we need to disable some type checking for older
+ * libbpf. We consider everything older than 0.7 too old for accurate type checks.
+ *
+ * MODERN_LIBBPF ends up as 1 only if the libbpf headers provide __LIBBPF_CURRENT_VERSION_GEQ() and it
+ * reports version 0.7 or newer, and as 0 otherwise, so that it can always be used in a plain #if. */
+ #if defined(__LIBBPF_CURRENT_VERSION_GEQ)
+ #if __LIBBPF_CURRENT_VERSION_GEQ(0, 7)
+ #define MODERN_LIBBPF 1
+ #endif
+ #endif
+ #if !defined(MODERN_LIBBPF)
+ #define MODERN_LIBBPF 0
+ #endif
+
+ /* Pointers to the libbpf functions we use. They are file-scope objects without initializer and hence
+ * start out as NULL; dlopen_bpf() below passes them to dlsym_many_or_warn() for resolution. */
+ struct bpf_link* (*sym_bpf_program__attach_cgroup)(const struct bpf_program *, int);
+ struct bpf_link* (*sym_bpf_program__attach_lsm)(const struct bpf_program *);
+ int (*sym_bpf_link__fd)(const struct bpf_link *);
+ int (*sym_bpf_link__destroy)(struct bpf_link *);
+ int (*sym_bpf_map__fd)(const struct bpf_map *);
+ const char* (*sym_bpf_map__name)(const struct bpf_map *);
+ int (*sym_bpf_map_create)(enum bpf_map_type, const char *, __u32, __u32, __u32, const struct bpf_map_create_opts *);
+ int (*sym_bpf_map__set_max_entries)(struct bpf_map *, __u32);
+ int (*sym_bpf_map_update_elem)(int, const void *, const void *, __u64);
+ int (*sym_bpf_map_delete_elem)(int, const void *);
+ int (*sym_bpf_map__set_inner_map_fd)(struct bpf_map *, int);
+ int (*sym_bpf_object__open_skeleton)(struct bpf_object_skeleton *, const struct bpf_object_open_opts *);
+ int (*sym_bpf_object__load_skeleton)(struct bpf_object_skeleton *);
+ int (*sym_bpf_object__attach_skeleton)(struct bpf_object_skeleton *);
+ void (*sym_bpf_object__detach_skeleton)(struct bpf_object_skeleton *);
+ void (*sym_bpf_object__destroy_skeleton)(struct bpf_object_skeleton *);
+ int (*sym_libbpf_probe_bpf_prog_type)(enum bpf_prog_type, const void *);
+ const char* (*sym_bpf_program__name)(const struct bpf_program *);
+ libbpf_print_fn_t (*sym_libbpf_set_print)(libbpf_print_fn_t);
+ long (*sym_libbpf_get_error)(const void *);
+
+ /* compat symbols removed in libbpf 1.0 */
+ int (*sym_bpf_create_map)(enum bpf_map_type, int key_size, int value_size, int max_entries, __u32 map_flags);
+ bool (*sym_bpf_probe_prog_type)(enum bpf_prog_type, __u32);
+
+ /* Print callback installed into libbpf via libbpf_set_print(): forwards libbpf's log messages to our own
+  * logging at LOG_DEBUG level. */
+ _printf_(2,0)
+ static int bpf_print_func(enum libbpf_print_level level, const char *fmt, va_list ap) {
+         bool suppress = false;
+
+ #if !LOG_TRACE
+         /* libbpf logs a lot of details at its debug level, which we don't need to see. */
+         suppress = level == LIBBPF_DEBUG;
+ #endif
+         if (suppress)
+                 return 0;
+
+         /* All other levels are downgraded to LOG_DEBUG.
+          *
+          * errno is used here, on the assumption that if the log message uses %m, errno will be set to
+          * something useful. Otherwise, it shouldn't matter, we may pass 0 or some bogus value. */
+         return log_internalv(LOG_DEBUG, errno, NULL, 0, NULL, fmt, ap);
+ }
+
+ /* Loads libbpf at runtime and resolves the sym_* function pointers defined above.
+  *
+  * libbpf.so.1 is tried first, with libbpf.so.0 as fallback. Depending on which one is found, either the
+  * symbols available from 0.7.0 or the compat symbols removed in 1.0 are resolved; the remaining symbols
+  * are looked up the same way for both.
+  *
+  * Returns a negative errno-style value on failure: -EOPNOTSUPP if neither library can be opened, or
+  * whatever dlsym_many_or_warn() reports if symbols cannot be resolved. Otherwise the result of the last
+  * dlsym_many_or_warn() call is returned. */
+ int dlopen_bpf(void) {
+         void *dl;
+         int r;
+
+         DISABLE_WARNING_DEPRECATED_DECLARATIONS;
+
+         dl = dlopen("libbpf.so.1", RTLD_LAZY);
+         if (!dl) {
+                 /* libbpf < 1.0.0 (we rely on 0.1.0+) provides most symbols we care about, but
+                  * unfortunately not all until 0.7.0. See bpf-compat.h for more details.
+                  * Once we consider we can assume 0.7+ is present we can just use the same symbol
+                  * list for both files, and when we assume 1.0+ is present we can remove this dlopen */
+                 dl = dlopen("libbpf.so.0", RTLD_LAZY);
+                 if (!dl)
+                         return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                                "neither libbpf.so.1 nor libbpf.so.0 are installed: %s", dlerror());
+
+                 /* symbols deprecated in 1.0 we use as compat */
+                 r = dlsym_many_or_warn(
+                                 dl, LOG_DEBUG,
+ #if MODERN_LIBBPF
+                                 /* Don't exist anymore in new libbpf, hence cannot type check them */
+                                 DLSYM_ARG_FORCE(bpf_create_map),
+                                 DLSYM_ARG_FORCE(bpf_probe_prog_type));
+ #else
+                                 DLSYM_ARG(bpf_create_map),
+                                 DLSYM_ARG(bpf_probe_prog_type));
+ #endif
+         } else {
+                 /* symbols available from 0.7.0 */
+                 r = dlsym_many_or_warn(
+                                 dl, LOG_DEBUG,
+ #if MODERN_LIBBPF
+                                 DLSYM_ARG(bpf_map_create),
+                                 DLSYM_ARG(libbpf_probe_bpf_prog_type)
+ #else
+                                 /* These symbols did not exist in old libbpf, hence we cannot type check them */
+                                 DLSYM_ARG_FORCE(bpf_map_create),
+                                 DLSYM_ARG_FORCE(libbpf_probe_bpf_prog_type)
+ #endif
+                 );
+         }
+         /* Don't let the lookup below overwrite a failure from the version-specific lookup above: without
+          * those symbols the compat helpers in bpf-compat.h would call through a NULL pointer. */
+         if (r < 0)
+                 return r;
+
+         r = dlsym_many_or_warn(
+                         dl, LOG_DEBUG,
+                         DLSYM_ARG(bpf_link__destroy),
+                         DLSYM_ARG(bpf_link__fd),
+                         DLSYM_ARG(bpf_map__fd),
+                         DLSYM_ARG(bpf_map__name),
+                         DLSYM_ARG(bpf_map__set_max_entries),
+                         DLSYM_ARG(bpf_map_update_elem),
+                         DLSYM_ARG(bpf_map_delete_elem),
+                         DLSYM_ARG(bpf_map__set_inner_map_fd),
+                         DLSYM_ARG(bpf_object__open_skeleton),
+                         DLSYM_ARG(bpf_object__load_skeleton),
+                         DLSYM_ARG(bpf_object__attach_skeleton),
+                         DLSYM_ARG(bpf_object__detach_skeleton),
+                         DLSYM_ARG(bpf_object__destroy_skeleton),
+ #if MODERN_LIBBPF
+                         DLSYM_ARG(bpf_program__attach_cgroup),
+                         DLSYM_ARG(bpf_program__attach_lsm),
+ #else
+                         /* libbpf added a "const" to function parameters where it should not have, ignore this type incompatibility */
+                         DLSYM_ARG_FORCE(bpf_program__attach_cgroup),
+                         DLSYM_ARG_FORCE(bpf_program__attach_lsm),
+ #endif
+                         DLSYM_ARG(bpf_program__name),
+                         DLSYM_ARG(libbpf_set_print),
+                         DLSYM_ARG(libbpf_get_error));
+         if (r < 0)
+                 return r;
+
+         /* We set the print helper unconditionally. Otherwise libbpf will emit log messages that are not
+          * useful to us. */
+         (void) sym_libbpf_set_print(bpf_print_func);
+
+         REENABLE_WARNING;
+
+         return r;
+ }
+
+#else
+
+ /* Variant used when built without libbpf support (HAVE_LIBBPF is off): there is nothing to load, hence
+ * log at debug level and fail with -EOPNOTSUPP. */
+ int dlopen_bpf(void) {
+ return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "libbpf support is not compiled in.");
+ }
+#endif
diff --git a/src/shared/bpf-dlopen.h b/src/shared/bpf-dlopen.h
new file mode 100644
index 0000000..0750abc
--- /dev/null
+++ b/src/shared/bpf-dlopen.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#if HAVE_LIBBPF
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include "bpf-compat.h"
+
+extern struct bpf_link* (*sym_bpf_program__attach_cgroup)(const struct bpf_program *, int);
+extern struct bpf_link* (*sym_bpf_program__attach_lsm)(const struct bpf_program *);
+extern int (*sym_bpf_link__fd)(const struct bpf_link *);
+extern int (*sym_bpf_link__destroy)(struct bpf_link *);
+extern int (*sym_bpf_map__fd)(const struct bpf_map *);
+extern const char* (*sym_bpf_map__name)(const struct bpf_map *);
+extern int (*sym_bpf_map__set_max_entries)(struct bpf_map *, __u32);
+extern int (*sym_bpf_map_update_elem)(int, const void *, const void *, __u64);
+extern int (*sym_bpf_map_delete_elem)(int, const void *);
+extern int (*sym_bpf_map__set_inner_map_fd)(struct bpf_map *, int);
+/* The *_skeleton APIs are autogenerated by bpftool, the targets can be found
+ * in ./build/src/core/bpf/socket_bind/socket-bind.skel.h */
+extern int (*sym_bpf_object__open_skeleton)(struct bpf_object_skeleton *, const struct bpf_object_open_opts *);
+extern int (*sym_bpf_object__load_skeleton)(struct bpf_object_skeleton *);
+extern int (*sym_bpf_object__attach_skeleton)(struct bpf_object_skeleton *);
+extern void (*sym_bpf_object__detach_skeleton)(struct bpf_object_skeleton *);
+extern void (*sym_bpf_object__destroy_skeleton)(struct bpf_object_skeleton *);
+extern const char* (*sym_bpf_program__name)(const struct bpf_program *);
+extern libbpf_print_fn_t (*sym_libbpf_set_print)(libbpf_print_fn_t);
+extern long (*sym_libbpf_get_error)(const void *);
+
+#endif
+
+int dlopen_bpf(void);
diff --git a/src/shared/bpf-link.c b/src/shared/bpf-link.c
new file mode 100644
index 0000000..fea49b2
--- /dev/null
+++ b/src/shared/bpf-link.c
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "bpf-dlopen.h"
+#include "bpf-link.h"
+#include "serialize.h"
+
+ /* Checks whether 'prog' can be attached via bpf_link, by attempting an attachment to a deliberately
+  * invalid cgroup fd and inspecting the resulting error. Returns false if libbpf cannot be loaded. */
+ bool bpf_can_link_program(struct bpf_program *prog) {
+         _cleanup_(bpf_link_freep) struct bpf_link *probe = NULL;
+         long err;
+
+         assert(prog);
+
+         if (dlopen_bpf() < 0)
+                 return false;
+
+         /* Pass invalid cgroup fd intentionally. */
+         probe = sym_bpf_program__attach_cgroup(prog, /*cgroup_fd=*/-1);
+         err = sym_libbpf_get_error(probe);
+
+         /* EBADF indicates that bpf_link is supported by kernel. */
+         return err == -EBADF;
+ }
+
+ /* Serializes the fd behind 'link' under 'key' via serialize_fd(). Returns -ENOENT if there is no link,
+  * -EINVAL if libbpf_get_error() reports an error for it, and the result of serialize_fd() otherwise. */
+ int bpf_serialize_link(FILE *f, FDSet *fds, const char *key, struct bpf_link *link) {
+         int link_fd;
+
+         assert(key);
+
+         if (!link)
+                 return -ENOENT;
+         if (sym_libbpf_get_error(link) != 0)
+                 return -EINVAL;
+
+         link_fd = sym_bpf_link__fd(link);
+         return serialize_fd(f, fds, key, link_fd);
+ }
+
+ /* Destroys a bpf_link (if any) and always returns NULL, so that the result can be assigned back to the
+  * variable that held the link. */
+ struct bpf_link *bpf_link_free(struct bpf_link *link) {
+         /* If libbpf wasn't dlopen()ed, sym_bpf_link__destroy might be unresolved (NULL), so let's not try to
+          * call it if link is NULL. link might also be a non-null "error pointer", but such a value can only
+          * originate from a call to libbpf, but that means that libbpf is available, and we can let
+          * bpf_link__destroy() handle it. */
+         if (!link)
+                 return NULL;
+
+         (void) sym_bpf_link__destroy(link);
+         return NULL;
+ }
diff --git a/src/shared/bpf-link.h b/src/shared/bpf-link.h
new file mode 100644
index 0000000..38aa080
--- /dev/null
+++ b/src/shared/bpf-link.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#pragma once
+
+#include <bpf/libbpf.h>
+#include <stdio.h>
+
+#include "fdset.h"
+#include "macro.h"
+
+bool bpf_can_link_program(struct bpf_program *prog);
+
+int bpf_serialize_link(FILE *f, FDSet *fds, const char *key, struct bpf_link *link);
+
+struct bpf_link *bpf_link_free(struct bpf_link *p);
+DEFINE_TRIVIAL_CLEANUP_FUNC(struct bpf_link *, bpf_link_free);
diff --git a/src/shared/bpf-program.c b/src/shared/bpf-program.c
new file mode 100644
index 0000000..bbdd4f6
--- /dev/null
+++ b/src/shared/bpf-program.c
@@ -0,0 +1,513 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "bpf-program.h"
+#include "errno-util.h"
+#include "escape.h"
+#include "fd-util.h"
+#include "memory-util.h"
+#include "missing_syscall.h"
+#include "path-util.h"
+#include "serialize.h"
+#include "string-table.h"
+
+ /* Maps the cgroup-related BPF attach types to the short names used when (de)serializing attachments.
+ * Attach types that are not listed here have no name, i.e. their slot is NULL. */
+ static const char *const bpf_cgroup_attach_type_table[__MAX_BPF_ATTACH_TYPE] = {
+ [BPF_CGROUP_INET_INGRESS] = "ingress",
+ [BPF_CGROUP_INET_EGRESS] = "egress",
+ [BPF_CGROUP_INET_SOCK_CREATE] = "sock_create",
+ [BPF_CGROUP_SOCK_OPS] = "sock_ops",
+ [BPF_CGROUP_DEVICE] = "device",
+ [BPF_CGROUP_INET4_BIND] = "bind4",
+ [BPF_CGROUP_INET6_BIND] = "bind6",
+ [BPF_CGROUP_INET4_CONNECT] = "connect4",
+ [BPF_CGROUP_INET6_CONNECT] = "connect6",
+ [BPF_CGROUP_INET4_POST_BIND] = "post_bind4",
+ [BPF_CGROUP_INET6_POST_BIND] = "post_bind6",
+ [BPF_CGROUP_UDP4_SENDMSG] = "sendmsg4",
+ [BPF_CGROUP_UDP6_SENDMSG] = "sendmsg6",
+ [BPF_CGROUP_SYSCTL] = "sysctl",
+ [BPF_CGROUP_UDP4_RECVMSG] = "recvmsg4",
+ [BPF_CGROUP_UDP6_RECVMSG] = "recvmsg6",
+ [BPF_CGROUP_GETSOCKOPT] = "getsockopt",
+ [BPF_CGROUP_SETSOCKOPT] = "setsockopt",
+ };
+
+DEFINE_STRING_TABLE_LOOKUP(bpf_cgroup_attach_type, int);
+
+DEFINE_HASH_OPS_WITH_KEY_DESTRUCTOR(bpf_program_hash_ops, void, trivial_hash_func, trivial_compare_func, bpf_program_free);
+
+ /* Detaches (if attached), closes and frees a BPFProgram. Accepts NULL. Always returns NULL, so that the
+  * result can be assigned back to the variable that held the program. */
+ BPFProgram *bpf_program_free(BPFProgram *p) {
+         if (p) {
+                 /* Unfortunately, the kernel currently doesn't implicitly detach BPF programs from their
+                  * cgroups when the last fd to the BPF program is closed. This has nasty side-effects since
+                  * this means that abnormally terminated programs that attached one of their BPF programs to
+                  * a cgroup will leave this program pinned for good with zero chance of recovery, until the
+                  * cgroup is removed. This is particularly problematic if the cgroup in question is the root
+                  * cgroup (or any other cgroup belonging to a service that cannot be restarted during
+                  * operation, such as dbus), as the memory for the BPF program can only be reclaimed through
+                  * a reboot. To counter this, we track closely to which cgroup a program was attached to and
+                  * will detach it on our own whenever we close the BPF fd. */
+                 (void) bpf_program_cgroup_detach(p);
+
+                 safe_close(p->kernel_fd);
+
+                 free(p->attached_path);
+                 free(p->instructions);
+                 free(p->prog_name);
+                 free(p);
+         }
+
+         return NULL;
+ }
+
+ /* Queries the kernel for information about the BPF program behind 'prog_fd' via BPF_OBJ_GET_INFO_BY_FD.
+  *
+  * struct bpf_prog_info info must be initialized by the caller since its value is both input and output
+  * for the BPF_OBJ_GET_INFO_BY_FD syscall. Returns a negative errno on failure. */
+ static int bpf_program_get_info_by_fd(int prog_fd, struct bpf_prog_info *info, uint32_t info_len) {
+         union bpf_attr attr;
+         int r;
+
+         /* Explicitly memset to zero since some compilers may produce non-zero-initialized padding when
+          * structured initialization is used.
+          * Refer to https://github.com/systemd/systemd/issues/18164
+          */
+         zero(attr);
+         attr.info.info = PTR_TO_UINT64(info);
+         attr.info.info_len = info_len;
+         attr.info.bpf_fd = prog_fd;
+
+         r = RET_NERRNO(bpf(BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)));
+         return r;
+ }
+
+ /* Allocates a new BPFProgram of the given type that has neither instructions nor a kernel fd yet.
+  * 'prog_name' is optional; if given it must be shorter than BPF_OBJ_NAME_LEN, otherwise -ENAMETOOLONG is
+  * returned. Returns 0 on success and stores the new object in *ret, or -ENOMEM. */
+ int bpf_program_new(uint32_t prog_type, const char *prog_name, BPFProgram **ret) {
+         _cleanup_(bpf_program_freep) BPFProgram *prog = NULL;
+         _cleanup_free_ char *name_copy = NULL;
+
+         if (prog_name && strlen(prog_name) >= BPF_OBJ_NAME_LEN)
+                 return -ENAMETOOLONG;
+
+         if (prog_name) {
+                 name_copy = strdup(prog_name);
+                 if (!name_copy)
+                         return -ENOMEM;
+         }
+
+         prog = new(BPFProgram, 1);
+         if (!prog)
+                 return -ENOMEM;
+
+         /* The compound literal zeroes everything we don't mention explicitly */
+         *prog = (BPFProgram) {
+                 .kernel_fd = -EBADF,
+                 .prog_type = prog_type,
+                 .prog_name = TAKE_PTR(name_copy),
+         };
+
+         *ret = TAKE_PTR(prog);
+         return 0;
+ }
+
+ /* Creates a BPFProgram object for a program pinned at 'path' in bpffs: opens the pinned object and then
+  * asks the kernel for its program type. Returns 0 on success and stores the new object in *ret, or a
+  * negative errno on failure. */
+ int bpf_program_new_from_bpffs_path(const char *path, BPFProgram **ret) {
+         _cleanup_(bpf_program_freep) BPFProgram *prog = NULL;
+         struct bpf_prog_info info = {};
+         int r;
+
+         assert(path);
+         assert(ret);
+
+         prog = new(BPFProgram, 1);
+         if (!prog)
+                 return -ENOMEM;
+
+         *prog = (BPFProgram) {
+                 .kernel_fd = -EBADF,
+                 .prog_type = BPF_PROG_TYPE_UNSPEC,
+         };
+
+         r = bpf_program_load_from_bpf_fs(prog, path);
+         if (r < 0)
+                 return r;
+
+         /* Now that we have an fd, replace the placeholder type by what the kernel reports */
+         r = bpf_program_get_info_by_fd(prog->kernel_fd, &info, sizeof(info));
+         if (r < 0)
+                 return r;
+         prog->prog_type = info.type;
+
+         *ret = TAKE_PTR(prog);
+         return 0;
+ }
+
+
+ /* Appends 'count' instructions to the program's instruction buffer. Returns -EBUSY if the program
+  * already has a kernel fd, -ENOMEM if the buffer cannot be grown, 0 on success. */
+ int bpf_program_add_instructions(BPFProgram *p, const struct bpf_insn *instructions, size_t count) {
+         size_t n_old;
+
+         assert(p);
+
+         /* don't allow modification after we uploaded things to the kernel */
+         if (p->kernel_fd >= 0)
+                 return -EBUSY;
+
+         n_old = p->n_instructions;
+
+         if (!GREEDY_REALLOC(p->instructions, n_old + count))
+                 return -ENOMEM;
+
+         memcpy(&p->instructions[n_old], instructions, count * sizeof(struct bpf_insn));
+         p->n_instructions = n_old + count;
+
+         return 0;
+ }
+
+ /* Uploads the program's instructions to the kernel via BPF_PROG_LOAD and stores the resulting fd in
+  * p->kernel_fd. If 'log_buf' is non-NULL it is handed to the kernel as log buffer of 'log_size' bytes.
+  * Does nothing (besides zeroing the log buffer) if the program already has a kernel fd. Returns 0 on
+  * success, negative errno on failure. */
+ int bpf_program_load_kernel(BPFProgram *p, char *log_buf, size_t log_size) {
+         union bpf_attr attr;
+
+         assert(p);
+
+         /* make this idempotent */
+         if (p->kernel_fd >= 0) {
+                 memzero(log_buf, log_size);
+                 return 0;
+         }
+
+         // FIXME: Clang doesn't 0-pad with structured initialization, causing
+         // the kernel to reject the bpf_attr as invalid. See:
+         // https://github.com/torvalds/linux/blob/v5.9/kernel/bpf/syscall.c#L65
+         // Ideally it should behave like GCC, so that we can remove these workarounds.
+         zero(attr);
+
+         /* What to load */
+         attr.prog_type = p->prog_type;
+         attr.insn_cnt = p->n_instructions;
+         attr.insns = PTR_TO_UINT64(p->instructions);
+         attr.license = PTR_TO_UINT64("GPL");
+
+         /* Where to log to; logging is only enabled if a buffer was passed */
+         attr.log_level = log_buf ? 1 : 0;
+         attr.log_buf = PTR_TO_UINT64(log_buf);
+         attr.log_size = log_size;
+
+         /* attr was zeroed above, hence copying one byte less than the field size keeps it terminated */
+         if (p->prog_name)
+                 strncpy(attr.prog_name, p->prog_name, BPF_OBJ_NAME_LEN - 1);
+
+         p->kernel_fd = bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
+
+         return p->kernel_fd < 0 ? -errno : 0;
+ }
+
+ /* Opens the object pinned at 'path' via BPF_OBJ_GET and stores the resulting fd in p->kernel_fd.
+  * Returns -EBUSY if the program already has a kernel fd, negative errno if the syscall fails, 0 on
+  * success. */
+ int bpf_program_load_from_bpf_fs(BPFProgram *p, const char *path) {
+         union bpf_attr attr;
+
+         assert(p);
+
+         /* don't overwrite an assembled or loaded program */
+         if (p->kernel_fd >= 0)
+                 return -EBUSY;
+
+         zero(attr);
+         attr.pathname = PTR_TO_UINT64(path);
+
+         p->kernel_fd = bpf(BPF_OBJ_GET, &attr, sizeof(attr));
+
+         return p->kernel_fd < 0 ? -errno : 0;
+ }
+
+ /* Attaches the program to the cgroup directory at 'path' with the given attach type and flags, loading
+  * it into the kernel first if necessary. Only 0, BPF_F_ALLOW_OVERRIDE and BPF_F_ALLOW_MULTI are accepted
+  * as flags (-EINVAL otherwise). A program can only be attached to one place at a time: trying to attach
+  * it elsewhere, or with a different type or flags, fails with -EBUSY. On success the attachment is
+  * recorded in 'p' and 0 is returned. */
+ int bpf_program_cgroup_attach(BPFProgram *p, int type, const char *path, uint32_t flags) {
+         _cleanup_free_ char *path_copy = NULL;
+         _cleanup_close_ int cgroup_fd = -EBADF;
+         union bpf_attr attr;
+         int r;
+
+         assert(p);
+         assert(type >= 0);
+         assert(path);
+
+         if (!IN_SET(flags, 0, BPF_F_ALLOW_OVERRIDE, BPF_F_ALLOW_MULTI))
+                 return -EINVAL;
+
+         /* We need to track which cgroup the program is attached to, and we can only track one attachment,
+          * hence let's refuse this early. */
+         if (p->attached_path) {
+                 if (!path_equal(p->attached_path, path) ||
+                     p->attached_type != type ||
+                     p->attached_flags != flags)
+                         return -EBUSY;
+
+                 /* Here's a shortcut: if we previously attached this program already, then we don't have to
+                  * do so again. Well, with one exception: if we are in BPF_F_ALLOW_OVERRIDE mode then someone
+                  * else might have replaced our program since the last time, hence let's reattach it again,
+                  * just to be safe. In flags == 0 mode this is not an issue since nobody else can replace our
+                  * program in that case, and in flags == BPF_F_ALLOW_MULTI mode any other's program would be
+                  * installed in addition to ours hence ours would remain in effect. */
+                 if (flags != BPF_F_ALLOW_OVERRIDE)
+                         return 0;
+         }
+
+         /* Ensure we have a kernel object for this. */
+         r = bpf_program_load_kernel(p, NULL, 0);
+         if (r < 0)
+                 return r;
+
+         /* Prepare the copy of the path first, so that nothing can fail anymore after the attachment */
+         path_copy = strdup(path);
+         if (!path_copy)
+                 return -ENOMEM;
+
+         cgroup_fd = open(path, O_DIRECTORY|O_RDONLY|O_CLOEXEC);
+         if (cgroup_fd < 0)
+                 return -errno;
+
+         zero(attr);
+         attr.attach_bpf_fd = p->kernel_fd;
+         attr.target_fd = cgroup_fd;
+         attr.attach_type = type;
+         attr.attach_flags = flags;
+
+         if (bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)) < 0)
+                 return -errno;
+
+         /* Remember what we attached to, so that we can detach again later */
+         free_and_replace(p->attached_path, path_copy);
+         p->attached_flags = flags;
+         p->attached_type = type;
+
+         return 0;
+ }
+
+ /* Detaches the program from the cgroup it was attached to. Returns -EUNATCH if it isn't recorded as
+  * attached, a negative errno if opening the cgroup or detaching fails, and 0 on success, in which case
+  * the recorded attachment path is dropped. */
+ int bpf_program_cgroup_detach(BPFProgram *p) {
+         _cleanup_close_ int cgroup_fd = -EBADF;
+
+         assert(p);
+
+         if (!p->attached_path)
+                 return -EUNATCH;
+
+         cgroup_fd = open(p->attached_path, O_DIRECTORY|O_RDONLY|O_CLOEXEC);
+         if (cgroup_fd >= 0) {
+                 union bpf_attr attr;
+
+                 zero(attr);
+                 attr.attach_bpf_fd = p->kernel_fd;
+                 attr.target_fd = cgroup_fd;
+                 attr.attach_type = p->attached_type;
+
+                 if (bpf(BPF_PROG_DETACH, &attr, sizeof(attr)) < 0)
+                         return -errno;
+
+         } else if (errno != ENOENT)
+                 return -errno;
+
+         /* If we get here with ENOENT the cgroup does not exist anymore. Then we don't have to explicitly
+          * detach, it got detached implicitly by the removal, hence don't complain */
+
+         p->attached_path = mfree(p->attached_path);
+
+         return 0;
+ }
+
+ /* Creates a BPF map via BPF_MAP_CREATE. Returns what the syscall returns on success, or a negative errno
+  * on failure. 'name' is copied into the map name in sanitized and possibly truncated form. */
+ int bpf_map_new(
+                 const char *name,
+                 enum bpf_map_type type,
+                 size_t key_size,
+                 size_t value_size,
+                 size_t max_entries,
+                 uint32_t flags) {
+
+         union bpf_attr attr;
+
+         zero(attr);
+         attr.map_type = type;
+         attr.map_flags = flags;
+         attr.key_size = key_size;
+         attr.value_size = value_size;
+         attr.max_entries = max_entries;
+
+         /* The map name is primarily informational for debugging purposes, and typically too short
+          * to carry the full unit name, hence we employ a trivial lossy escaping to make it fit
+          * (truncation + only alphanumerical, "." and "_" are allowed as per
+          * https://www.kernel.org/doc/html/next/bpf/maps.html#usage-notes) */
+         for (size_t i = 0; i + 1 < sizeof(attr.map_name) && name[i] != '\0'; i++) {
+                 char c = name[i];
+
+                 attr.map_name[i] = strchr(ALPHANUMERICAL ".", c) ? c : '_';
+         }
+
+         return RET_NERRNO(bpf(BPF_MAP_CREATE, &attr, sizeof(attr)));
+ }
+
+ /* Stores 'value' under 'key' in the BPF map referenced by 'fd', via BPF_MAP_UPDATE_ELEM. Returns a
+  * negative errno on failure. */
+ int bpf_map_update_element(int fd, const void *key, void *value) {
+         union bpf_attr attr;
+         int r;
+
+         /* Zero everything first, then fill in the fields this command uses */
+         zero(attr);
+         attr.value = PTR_TO_UINT64(value);
+         attr.key = PTR_TO_UINT64(key);
+         attr.map_fd = fd;
+
+         r = RET_NERRNO(bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)));
+         return r;
+ }
+
+ /* Looks up 'key' in the BPF map referenced by 'fd' via BPF_MAP_LOOKUP_ELEM, passing 'value' as the
+  * buffer for the result. Returns a negative errno on failure. */
+ int bpf_map_lookup_element(int fd, const void *key, void *value) {
+         union bpf_attr attr;
+         int r;
+
+         /* Zero everything first, then fill in the fields this command uses */
+         zero(attr);
+         attr.value = PTR_TO_UINT64(value);
+         attr.key = PTR_TO_UINT64(key);
+         attr.map_fd = fd;
+
+         r = RET_NERRNO(bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)));
+         return r;
+ }
+
+ /* Pins the BPF object referenced by 'prog_fd' at 'bpffs_path' via BPF_OBJ_PIN. Returns a negative errno
+  * on failure. */
+ int bpf_program_pin(int prog_fd, const char *bpffs_path) {
+         union bpf_attr attr;
+         int r;
+
+         zero(attr);
+         attr.bpf_fd = prog_fd;
+         attr.pathname = PTR_TO_UINT64((void *) bpffs_path);
+
+         r = RET_NERRNO(bpf(BPF_OBJ_PIN, &attr, sizeof(attr)));
+         return r;
+ }
+
+ /* Determines the kernel-assigned ID of the BPF program referenced by 'prog_fd'. Returns 0 on success
+  * and stores the ID in *ret_id; returns a negative errno if the kernel query fails, in which case
+  * *ret_id is left untouched. */
+ int bpf_program_get_id_by_fd(int prog_fd, uint32_t *ret_id) {
+         /* Must be zero-initialized: the kernel treats the structure as both input and output */
+         struct bpf_prog_info info = {};
+         int r;
+
+         assert(ret_id);
+
+         r = bpf_program_get_info_by_fd(prog_fd, &info, sizeof(info));
+         if (r < 0)
+                 return r;
+
+         *ret_id = info.id;
+
+         return 0;
+ }
+
+ /* Serializes the cgroup attachment of 'p' under 'key' as "<fd> <attach type> <escaped cgroup path>",
+  * where the fd is a duplicate of the program's kernel fd that is added to 'fds'. Does nothing and returns
+  * 0 if 'p' is NULL or not attached. On success the program forgets that it is attached. */
+ int bpf_program_serialize_attachment(
+                 FILE *f,
+                 FDSet *fds,
+                 const char *key,
+                 BPFProgram *p) {
+
+         _cleanup_free_ char *escaped_path = NULL;
+         int fd_copy, r;
+
+         if (!p)
+                 return 0;
+         if (!p->attached_path)
+                 return 0;
+
+         assert(p->kernel_fd >= 0);
+
+         escaped_path = cescape(p->attached_path);
+         if (!escaped_path)
+                 return -ENOMEM;
+
+         fd_copy = fdset_put_dup(fds, p->kernel_fd);
+         if (fd_copy < 0)
+                 return log_error_errno(fd_copy, "Failed to add BPF kernel fd to serialize: %m");
+
+         r = serialize_item_format(
+                         f,
+                         key,
+                         "%i %s %s",
+                         fd_copy,
+                         bpf_cgroup_attach_type_to_string(p->attached_type),
+                         escaped_path);
+         if (r < 0)
+                 return r;
+
+         /* After serialization, let's forget the fact that this program is attached. The attachment — if you
+          * so will — is now 'owned' by the serialization, and not us anymore. Why does that matter? Because
+          * of BPF's less-than-ideal lifecycle handling: to detach a program from a cgroup we have to
+          * explicitly do so, it's not done implicitly on close(). Now, since we are serializing here we don't
+          * want the program to be detached while freeing things, so that the attachment can be retained after
+          * deserializing again. bpf_program_free() implicitly detaches things, if attached_path is non-NULL,
+          * hence we set it to NULL here. */
+         p->attached_path = mfree(p->attached_path);
+
+         return 0;
+ }
+
+ /* Serializes the attachment of every program in 'set' under the same 'key'. Stops at and returns the
+  * first error; returns 0 if all programs were processed. */
+ int bpf_program_serialize_attachment_set(FILE *f, FDSet *fds, const char *key, Set *set) {
+         BPFProgram *prog;
+
+         SET_FOREACH(prog, set) {
+                 int r;
+
+                 r = bpf_program_serialize_attachment(f, fds, key, prog);
+                 if (r < 0)
+                         return r;
+         }
+
+         return 0;
+ }
+
+ /* Parses a serialized attachment of the form "<fd> <attach type> <escaped cgroup path>" from 'v', takes
+  * the fd out of 'fds', and builds a BPFProgram object from it that is recorded as attached to the given
+  * cgroup.
+  *
+  * On success 0 is returned and the new object is stored in *bpfp; an object previously stored there is
+  * freed first. On failure a negative errno-style value is returned and *bpfp is left untouched: -EINVAL
+  * if a field is missing, otherwise the error of the step that failed.
+  *
+  * Note that the attach flags are not part of the serialization, hence attached_flags of the new object
+  * is 0, and the program type is not known, hence it is set to BPF_PROG_TYPE_UNSPEC. */
+ int bpf_program_deserialize_attachment(const char *v, FDSet *fds, BPFProgram **bpfp) {
+         _cleanup_free_ char *sfd = NULL, *sat = NULL, *unescaped = NULL;
+         _cleanup_(bpf_program_freep) BPFProgram *p = NULL;
+         _cleanup_close_ int fd = -EBADF;
+         ssize_t l;
+         int ifd, at, r;
+
+         assert(v);
+         assert(bpfp);
+
+         /* Extract first word: the fd number */
+         r = extract_first_word(&v, &sfd, NULL, 0);
+         if (r < 0)
+                 return r;
+         if (r == 0)
+                 return -EINVAL;
+
+         /* Propagate the parse error itself: 'r' is positive at this point, and returning it would make an
+          * unparsable fd look like success to the caller. */
+         ifd = parse_fd(sfd);
+         if (ifd < 0)
+                 return ifd;
+
+         /* Extract second word: the attach type */
+         r = extract_first_word(&v, &sat, NULL, 0);
+         if (r < 0)
+                 return r;
+         if (r == 0)
+                 return -EINVAL;
+
+         at = bpf_cgroup_attach_type_from_string(sat);
+         if (at < 0)
+                 return at;
+
+         /* The rest is the path */
+         if (isempty(v))
+                 return -EINVAL;
+
+         l = cunescape(v, 0, &unescaped);
+         if (l < 0)
+                 return l;
+
+         /* From here on the fd is ours, and is closed automatically on failure */
+         fd = fdset_remove(fds, ifd);
+         if (fd < 0)
+                 return fd;
+
+         p = new(BPFProgram, 1);
+         if (!p)
+                 return -ENOMEM;
+
+         *p = (BPFProgram) {
+                 .kernel_fd = TAKE_FD(fd),
+                 .prog_type = BPF_PROG_TYPE_UNSPEC,
+                 .attached_path = TAKE_PTR(unescaped),
+                 .attached_type = at,
+         };
+
+         /* Replace whatever the caller had before; bpf_program_free() accepts NULL */
+         bpf_program_free(*bpfp);
+
+         *bpfp = TAKE_PTR(p);
+         return 0;
+ }
+
+ /* Deserializes one attachment from 'v' and adds the resulting program to the set *bpfsetp via
+  * set_ensure_consume(). Returns 0 on success, a negative errno-style value on failure. */
+ int bpf_program_deserialize_attachment_set(const char *v, FDSet *fds, Set **bpfsetp) {
+         BPFProgram *prog = NULL;
+         int r;
+
+         assert(v);
+         assert(bpfsetp);
+
+         r = bpf_program_deserialize_attachment(v, fds, &prog);
+         if (r < 0)
+                 return r;
+
+         /* The program is handed over to set_ensure_consume() here, hence no cleanup on our side */
+         r = set_ensure_consume(bpfsetp, &bpf_program_hash_ops, prog);
+
+         return r < 0 ? r : 0;
+ }
diff --git a/src/shared/bpf-program.h b/src/shared/bpf-program.h
new file mode 100644
index 0000000..0e0b666
--- /dev/null
+++ b/src/shared/bpf-program.h
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <linux/bpf.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <sys/syscall.h>
+
+#include "fdset.h"
+#include "list.h"
+#include "macro.h"
+
+typedef struct BPFProgram BPFProgram;
+
+/* This encapsulates three different concepts: the loaded BPF program, the BPF code, and the attachment to a
+ * cgroup. Typically our BPF programs go through all three stages: we build the code, we load it, and finally
+ * we attach it, but it might happen that we operate with programs that aren't loaded or aren't attached, or
+ * where we don't have the code. */
+ struct BPFProgram {
+ /* The loaded BPF program, if loaded. kernel_fd is negative as long as it is not loaded. */
+ int kernel_fd;
+ uint32_t prog_type;
+ char *prog_name;
+
+ /* The code of the BPF program, if known */
+ size_t n_instructions;
+ struct bpf_insn *instructions;
+
+ /* The cgroup path the program is attached to, if it is attached. If non-NULL bpf_program_free()
+ * will detach on destruction. */
+ char *attached_path;
+ int attached_type;
+ uint32_t attached_flags;
+ };
+
+int bpf_program_new(uint32_t prog_type, const char *prog_name, BPFProgram **ret);
+int bpf_program_new_from_bpffs_path(const char *path, BPFProgram **ret);
+BPFProgram *bpf_program_free(BPFProgram *p);
+
+int bpf_program_add_instructions(BPFProgram *p, const struct bpf_insn *insn, size_t count);
+int bpf_program_load_kernel(BPFProgram *p, char *log_buf, size_t log_size);
+int bpf_program_load_from_bpf_fs(BPFProgram *p, const char *path);
+
+int bpf_program_cgroup_attach(BPFProgram *p, int type, const char *path, uint32_t flags);
+int bpf_program_cgroup_detach(BPFProgram *p);
+
+int bpf_program_pin(int prog_fd, const char *bpffs_path);
+int bpf_program_get_id_by_fd(int prog_fd, uint32_t *ret_id);
+
+int bpf_program_serialize_attachment(FILE *f, FDSet *fds, const char *key, BPFProgram *p);
+int bpf_program_serialize_attachment_set(FILE *f, FDSet *fds, const char *key, Set *set);
+int bpf_program_deserialize_attachment(const char *v, FDSet *fds, BPFProgram **bpfp);
+int bpf_program_deserialize_attachment_set(const char *v, FDSet *fds, Set **bpfsetp);
+
+extern const struct hash_ops bpf_program_hash_ops;
+
+int bpf_map_new(const char *name, enum bpf_map_type type, size_t key_size, size_t value_size,
+ size_t max_entries, uint32_t flags);
+int bpf_map_update_element(int fd, const void *key, void *value);
+int bpf_map_lookup_element(int fd, const void *key, void *value);
+
+int bpf_cgroup_attach_type_from_string(const char *str) _pure_;
+const char *bpf_cgroup_attach_type_to_string(int attach_type) _const_;
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(BPFProgram*, bpf_program_free);
diff --git a/src/shared/bridge-util.c b/src/shared/bridge-util.c
new file mode 100644
index 0000000..e1a8bcb
--- /dev/null
+++ b/src/shared/bridge-util.c
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "bridge-util.h"
+#include "string-table.h"
+
+ /* Maps every BridgeState value to its textual name. Each enum value below _NETDEV_BRIDGE_STATE_MAX needs
+  * an entry here: a missing designator leaves a NULL slot in the array, so that state would have no
+  * string representation. */
+ static const char* const bridge_state_table[_NETDEV_BRIDGE_STATE_MAX] = {
+         [NETDEV_BRIDGE_STATE_DISABLED]   = "disabled",
+         [NETDEV_BRIDGE_STATE_LISTENING]  = "listening",
+         [NETDEV_BRIDGE_STATE_LEARNING]   = "learning",
+         [NETDEV_BRIDGE_STATE_FORWARDING] = "forwarding",
+         [NETDEV_BRIDGE_STATE_BLOCKING]   = "blocking",
+ };
+
+ /* Expands to the string lookup helpers for BridgeState, backed by the table above (presumably the
+  * bridge_state_to_string()/bridge_state_from_string() pair declared in bridge-util.h). */
+ DEFINE_STRING_TABLE_LOOKUP(bridge_state, BridgeState);
diff --git a/src/shared/bridge-util.h b/src/shared/bridge-util.h
new file mode 100644
index 0000000..a60891c
--- /dev/null
+++ b/src/shared/bridge-util.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <netinet/in.h>
+#include <linux/if_bridge.h>
+
+#include "conf-parser.h"
+
+ /* States of a bridge port. Every named state is defined as the BR_STATE_* constant of the same name, so
+  * the numeric values are whatever those constants are (presumably the kernel definitions from
+  * <linux/if_bridge.h>, included above — TODO confirm). */
+ typedef enum BridgeState {
+ NETDEV_BRIDGE_STATE_DISABLED = BR_STATE_DISABLED,
+ NETDEV_BRIDGE_STATE_LISTENING = BR_STATE_LISTENING,
+ NETDEV_BRIDGE_STATE_LEARNING = BR_STATE_LEARNING,
+ NETDEV_BRIDGE_STATE_FORWARDING = BR_STATE_FORWARDING,
+ NETDEV_BRIDGE_STATE_BLOCKING = BR_STATE_BLOCKING,
+ _NETDEV_BRIDGE_STATE_MAX, /* one more than the preceding enumerator; used as array size for the name table */
+ _NETDEV_BRIDGE_STATE_INVALID = -EINVAL, /* negative sentinel, cannot collide with a valid state */
+ } BridgeState;
+
+const char *bridge_state_to_string(BridgeState d) _const_;
+BridgeState bridge_state_from_string(const char *d) _pure_;
diff --git a/src/shared/btrfs-util.c b/src/shared/btrfs-util.c
new file mode 100644
index 0000000..b3e4b50
--- /dev/null
+++ b/src/shared/btrfs-util.c
@@ -0,0 +1,2164 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <linux/btrfs_tree.h>
+#include <linux/fs.h>
+#include <linux/loop.h>
+#include <linux/magic.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/file.h>
+#include <sys/ioctl.h>
+#include <sys/sysmacros.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "blockdev-util.h"
+#include "btrfs-util.h"
+#include "chase.h"
+#include "chattr-util.h"
+#include "copy.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "fs-util.h"
+#include "io-util.h"
+#include "macro.h"
+#include "path-util.h"
+#include "rm-rf.h"
+#include "smack-util.h"
+#include "sparse-endian.h"
+#include "stat-util.h"
+#include "string-util.h"
+#include "time-util.h"
+
+/* WARNING: Be careful with file system ioctls! When we get an fd, we
+ * need to make sure it either refers to only a regular file or
+ * directory, or that it is located on btrfs, before invoking any
+ * btrfs ioctls. The ioctl numbers are reused by some device drivers
+ * (such as DRM), and hence might have bad effects when invoked on
+ * device nodes (that reference drivers) rather than fds to normal
+ * files or directories. */
+
+ /* Checks whether the inode referenced by dir_fd/path is a btrfs subvolume.
+  *
+  * Returns 0 if the inode does not pass btrfs_might_be_subvol(), otherwise whatever is_fs_type_at()
+  * reports for BTRFS_SUPER_MAGIC, and a negative errno if the inode cannot be stat()ed. */
+ int btrfs_is_subvol_at(int dir_fd, const char *path) {
+         struct stat sb;
+         int stat_flags;
+
+         assert(dir_fd >= 0 || dir_fd == AT_FDCWD);
+
+         /* Without a path we look at dir_fd itself rather than at an entry below it. */
+         stat_flags = isempty(path) ? AT_EMPTY_PATH : 0;
+         if (fstatat(dir_fd, strempty(path), &sb, stat_flags) < 0)
+                 return -errno;
+
+         /* On btrfs subvolumes always have the inode 256, hence only inodes that pass this cheap test
+          * need the file system type check. */
+         if (btrfs_might_be_subvol(&sb))
+                 return is_fs_type_at(dir_fd, path, BTRFS_SUPER_MAGIC);
+
+         return 0;
+ }
+
+ /* Sets (b=true) or clears (b=false) the BTRFS_SUBVOL_RDONLY flag of the subvolume at dir_fd/path.
+  *
+  * Returns 0 if the flag already had the requested value or was updated, -EINVAL if the directory does
+  * not pass btrfs_might_be_subvol(), and a negative errno on other failures. */
+ int btrfs_subvol_set_read_only_at(int dir_fd, const char *path, bool b) {
+         _cleanup_close_ int subvol_fd = -EBADF;
+         uint64_t old_flags, new_flags;
+         struct stat sb;
+
+         assert(dir_fd >= 0 || dir_fd == AT_FDCWD);
+
+         subvol_fd = xopenat(dir_fd, path, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY, /* xopen_flags = */ 0, /* mode = */ 0);
+         if (subvol_fd < 0)
+                 return subvol_fd;
+
+         if (fstat(subvol_fd, &sb) < 0)
+                 return -errno;
+         if (!btrfs_might_be_subvol(&sb))
+                 return -EINVAL;
+
+         if (ioctl(subvol_fd, BTRFS_IOC_SUBVOL_GETFLAGS, &old_flags) < 0)
+                 return -errno;
+
+         /* Only issue the SETFLAGS ioctl if the flag word actually changes. */
+         new_flags = UPDATE_FLAG(old_flags, BTRFS_SUBVOL_RDONLY, b);
+         if (new_flags != old_flags)
+                 return RET_NERRNO(ioctl(subvol_fd, BTRFS_IOC_SUBVOL_SETFLAGS, &new_flags));
+
+         return 0;
+ }
+
+ /* Queries the BTRFS_SUBVOL_RDONLY flag of the subvolume referenced by fd.
+  *
+  * Returns 1 if the flag is set, 0 if it is not, -EINVAL if fd does not pass btrfs_might_be_subvol(),
+  * and a negative errno if fstat() or the ioctl fails. */
+ int btrfs_subvol_get_read_only_fd(int fd) {
+         struct stat sb;
+         uint64_t subvol_flags;
+
+         assert(fd >= 0);
+
+         if (fstat(fd, &sb) < 0)
+                 return -errno;
+         if (!btrfs_might_be_subvol(&sb))
+                 return -EINVAL;
+
+         if (ioctl(fd, BTRFS_IOC_SUBVOL_GETFLAGS, &subvol_flags) < 0)
+                 return -errno;
+
+         return (subvol_flags & BTRFS_SUBVOL_RDONLY) != 0;
+ }
+
+ /* Determines the block device backing the btrfs file system that dir_fd/path lives on.
+  *
+  * Returns 1 and stores the device number in *ret on success. Returns 0 with *ret set to 0 if the file
+  * system does not consist of exactly one device. Returns -ENOTTY if the path is not on btrfs, -EUCLEAN
+  * if the kernel only reports "/dev/root", -ENOTBLK/-ENODEV if the reported node is unusable or no
+  * device is found, and a negative errno on other failures. */
+ int btrfs_get_block_device_at(int dir_fd, const char *path, dev_t *ret) {
+         struct btrfs_ioctl_fs_info_args fs_info = {};
+         _cleanup_close_ int fs_fd = -EBADF;
+         int r;
+
+         assert(dir_fd >= 0 || dir_fd == AT_FDCWD);
+         assert(path);
+         assert(ret);
+
+         fs_fd = xopenat(dir_fd, path, O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY, /* xopen_flags = */ 0, /* mode = */ 0);
+         if (fs_fd < 0)
+                 return fs_fd;
+
+         r = fd_is_fs_type(fs_fd, BTRFS_SUPER_MAGIC);
+         if (r < 0)
+                 return r;
+         if (r == 0)
+                 return -ENOTTY;
+
+         if (ioctl(fs_fd, BTRFS_IOC_FS_INFO, &fs_info) < 0)
+                 return -errno;
+
+         /* We won't do this for btrfs RAID */
+         if (fs_info.num_devices != 1) {
+                 *ret = 0;
+                 return 0;
+         }
+
+         /* Device IDs may be sparse, hence probe all of them up to the maximum and skip the gaps. */
+         for (uint64_t devid = 1; devid <= fs_info.max_id; devid++) {
+                 struct btrfs_ioctl_dev_info_args dev_info = {
+                         .devid = devid,
+                 };
+                 const char *node;
+                 struct stat sb;
+
+                 if (ioctl(fs_fd, BTRFS_IOC_DEV_INFO, &dev_info) < 0) {
+                         if (errno != ENODEV)
+                                 return -errno;
+
+                         continue;
+                 }
+
+                 node = (const char*) dev_info.path;
+
+                 /* For the root fs — when no initrd is involved — btrfs returns /dev/root on any kernels
+                  * from the past few years. That sucks, as we have no API to determine the actual root
+                  * then. Let's return a recognizable error for this case, so that the caller can maybe
+                  * print a nice message about this.
+                  *
+                  * https://bugzilla.kernel.org/show_bug.cgi?id=89721 */
+                 if (path_equal(node, "/dev/root"))
+                         return -EUCLEAN;
+
+                 if (stat(node, &sb) < 0)
+                         return -errno;
+                 if (!S_ISBLK(sb.st_mode))
+                         return -ENOTBLK;
+                 if (major(sb.st_rdev) == 0)
+                         return -ENODEV;
+
+                 *ret = sb.st_rdev;
+                 return 1;
+         }
+
+         return -ENODEV;
+ }
+
+ /* Looks up the ID of the subvolume that fd is located in and stores it in *ret.
+  *
+  * Returns 0 on success, -ENOTTY if fd is not on btrfs, and a negative errno on other failures. */
+ int btrfs_subvol_get_id_fd(int fd, uint64_t *ret) {
+         struct btrfs_ioctl_ino_lookup_args lookup = {
+                 .objectid = BTRFS_FIRST_FREE_OBJECTID,
+         };
+         int is_btrfs;
+
+         assert(fd >= 0);
+         assert(ret);
+
+         /* Never fire btrfs ioctls at something that is not btrfs. */
+         is_btrfs = fd_is_fs_type(fd, BTRFS_SUPER_MAGIC);
+         if (is_btrfs < 0)
+                 return is_btrfs;
+         if (is_btrfs == 0)
+                 return -ENOTTY;
+
+         if (ioctl(fd, BTRFS_IOC_INO_LOOKUP, &lookup) < 0)
+                 return -errno;
+
+         /* The ioctl reports the tree the inode belongs to in .treeid. */
+         *ret = lookup.treeid;
+         return 0;
+ }
+
+ /* Like btrfs_subvol_get_id_fd(), but operates on the entry "subvol" below directory fd. The entry is
+  * opened with O_NOFOLLOW. Returns a negative errno if it cannot be opened. */
+ int btrfs_subvol_get_id(int fd, const char *subvol, uint64_t *ret) {
+         _cleanup_close_ int entry_fd = -EBADF;
+
+         assert(fd >= 0);
+         assert(ret);
+
+         entry_fd = openat(fd, subvol, O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW);
+         if (entry_fd >= 0)
+                 return btrfs_subvol_get_id_fd(entry_fd, ret);
+
+         return -errno;
+ }
+
+ /* Advances the lower bound of the search key in *args by one.
+  *
+  * Returns true if the key was incremented, false if it was already at its maximum and cannot be
+  * advanced any further. */
+ static bool btrfs_ioctl_search_args_inc(struct btrfs_ioctl_search_args *args) {
+         assert(args);
+
+         /* The objectid, type and offset together make up the btrfs key, which is considered a single
+          * 136 bit integer when comparing (objectid is the most significant part, offset the least
+          * significant one). This call increases that integer by one, carrying over from one component
+          * into the next when a component overflows. */
+
+         if (args->key.min_offset < UINT64_MAX) {
+                 args->key.min_offset++;
+                 return true;
+         }
+
+         /* The type is only 8 bit wide in the key, even if the field used to pass it is wider. */
+         if (args->key.min_type < UINT8_MAX) {
+                 args->key.min_type++;
+                 args->key.min_offset = 0;
+                 return true;
+         }
+
+         if (args->key.min_objectid < UINT64_MAX) {
+                 args->key.min_objectid++;
+                 args->key.min_offset = 0;
+                 args->key.min_type = 0;
+                 return true;
+         }
+
+         return false;
+ }
+
+ /* Moves the lower bound of the search key in *args to the key of the item described by *h. */
+ static void btrfs_ioctl_search_args_set(struct btrfs_ioctl_search_args *args, const struct btrfs_ioctl_search_header *h) {
+         struct btrfs_ioctl_search_key *key;
+
+         assert(args);
+         assert(h);
+
+         key = &args->key;
+         key->min_objectid = h->objectid;
+         key->min_type = h->type;
+         key->min_offset = h->offset;
+ }
+
+ /* Compares the lower bound of the search key in *args with its upper bound, component by component in
+  * the order objectid, type, offset. Returns the CMP() result of the first component that differs, or
+  * that of the offsets if objectid and type are both equal. */
+ static int btrfs_ioctl_search_args_compare(const struct btrfs_ioctl_search_args *args) {
+         int c;
+
+         assert(args);
+
+         c = CMP(args->key.min_objectid, args->key.max_objectid);
+         if (c == 0)
+                 c = CMP(args->key.min_type, args->key.max_type);
+         if (c == 0)
+                 c = CMP(args->key.min_offset, args->key.max_offset);
+
+         return c;
+ }
+
+typedef struct BtrfsForeachIterator {
+ const void *p;
+ size_t i;
+} BtrfsForeachIterator;
+
+/* Iterates through a series of struct btrfs_file_extent_item elements. They are unfortunately not aligned,
+ * hence we copy out the header from them */
+#define FOREACH_BTRFS_IOCTL_SEARCH_HEADER(sh, body, args) \
+ for (BtrfsForeachIterator iterator = { \
+ .p = ({ \
+ memcpy(&(sh), (args).buf, sizeof(struct btrfs_ioctl_search_header)); \
+ (body) = (const void*) ((const uint8_t*) (args).buf + sizeof(struct btrfs_ioctl_search_header)); \
+ (args).buf; \
+ }), \
+ }; \
+ iterator.i < (args).key.nr_items; \
+ iterator.i++, \
+ memcpy(&(sh), iterator.p = (const uint8_t*) iterator.p + sizeof(struct btrfs_ioctl_search_header) + (sh).len, sizeof(struct btrfs_ioctl_search_header)), \
+ (body) = (const void*) ((const uint8_t*) iterator.p + sizeof(struct btrfs_ioctl_search_header)))
+
+ /* Reads creation time, read-only flag and UUIDs of a subvolume from its root item and stores them in
+  * *ret. If subvol_id is 0 the subvolume fd is located in is used.
+  *
+  * Returns 0 on success, -ENODATA if no suitable root item was found, -ENOTTY if fd is not on btrfs, and
+  * a negative errno on other failures. */
+ int btrfs_subvol_get_info_fd(int fd, uint64_t subvol_id, BtrfsSubvolInfo *ret) {
+         struct btrfs_ioctl_search_args search = {
+                 /* Tree of tree roots */
+                 .key.tree_id = BTRFS_ROOT_TREE_OBJECTID,
+
+                 /* Look precisely for the subvolume items */
+                 .key.min_type = BTRFS_ROOT_ITEM_KEY,
+                 .key.max_type = BTRFS_ROOT_ITEM_KEY,
+
+                 .key.min_offset = 0,
+                 .key.max_offset = UINT64_MAX,
+
+                 /* No restrictions on the other components */
+                 .key.min_transid = 0,
+                 .key.max_transid = UINT64_MAX,
+         };
+         int r;
+
+         assert(fd >= 0);
+         assert(ret);
+
+         if (subvol_id != 0) {
+                 /* The ID lookup below would check the file system type for us, without it we have to
+                  * do so ourselves. */
+                 r = fd_is_fs_type(fd, BTRFS_SUPER_MAGIC);
+                 if (r < 0)
+                         return r;
+                 if (r == 0)
+                         return -ENOTTY;
+         } else {
+                 r = btrfs_subvol_get_id_fd(fd, &subvol_id);
+                 if (r < 0)
+                         return r;
+         }
+
+         search.key.min_objectid = search.key.max_objectid = subvol_id;
+
+         while (btrfs_ioctl_search_args_compare(&search) <= 0) {
+                 struct btrfs_ioctl_search_header sh;
+                 const void *body;
+
+                 search.key.nr_items = 256;
+                 if (ioctl(fd, BTRFS_IOC_TREE_SEARCH, &search) < 0)
+                         return -errno;
+
+                 if (search.key.nr_items <= 0)
+                         break;
+
+                 FOREACH_BTRFS_IOCTL_SEARCH_HEADER(sh, body, search) {
+                         const struct btrfs_root_item *root_item;
+
+                         /* Make sure we start the next search at least from this entry */
+                         btrfs_ioctl_search_args_set(&search, &sh);
+
+                         if (sh.objectid != subvol_id || sh.type != BTRFS_ROOT_ITEM_KEY)
+                                 continue;
+
+                         /* Older versions of the struct lacked the otime setting */
+                         if (sh.len < offsetof(struct btrfs_root_item, otime) + sizeof(struct btrfs_timespec))
+                                 continue;
+
+                         root_item = body;
+
+                         ret->otime = (usec_t) le64toh(root_item->otime.sec) * USEC_PER_SEC +
+                                 (usec_t) le32toh(root_item->otime.nsec) / NSEC_PER_USEC;
+
+                         ret->subvol_id = subvol_id;
+                         ret->read_only = le64toh(root_item->flags) & BTRFS_ROOT_SUBVOL_RDONLY;
+
+                         assert_cc(sizeof(root_item->uuid) == sizeof(ret->uuid));
+                         memcpy(&ret->uuid, root_item->uuid, sizeof(ret->uuid));
+                         memcpy(&ret->parent_uuid, root_item->parent_uuid, sizeof(ret->parent_uuid));
+
+                         /* The first matching item is all we need. */
+                         return 0;
+                 }
+
+                 /* Increase search key by one, to read the next item, if we can. */
+                 if (!btrfs_ioctl_search_args_inc(&search))
+                         break;
+         }
+
+         return -ENODATA;
+ }
+
+ /* Reads usage and limit data of a qgroup from the quota tree and stores it in *ret. If qgroupid is 0
+  * the ID of the subvolume fd is located in is used as qgroup ID.
+  *
+  * Returns 0 if an info item, a limit item or both were found; the fields belonging to an item that was
+  * not found are set to UINT64_MAX. Returns -ENODATA if neither item was found (which includes the case
+  * where the search fails with ENOENT), -ENOTTY if fd is not on btrfs, and a negative errno on other
+  * failures. */
+ int btrfs_qgroup_get_quota_fd(int fd, uint64_t qgroupid, BtrfsQuotaInfo *ret) {
+
+ struct btrfs_ioctl_search_args args = {
+ /* Tree of quota items */
+ .key.tree_id = BTRFS_QUOTA_TREE_OBJECTID,
+
+ /* The object ID is always 0 */
+ .key.min_objectid = 0,
+ .key.max_objectid = 0,
+
+ /* Look precisely for the quota items */
+ .key.min_type = BTRFS_QGROUP_STATUS_KEY,
+ .key.max_type = BTRFS_QGROUP_LIMIT_KEY,
+
+ /* No restrictions on the other components */
+ .key.min_transid = 0,
+ .key.max_transid = UINT64_MAX,
+ };
+
+ bool found_info = false, found_limit = false;
+ int r;
+
+ assert(fd >= 0);
+ assert(ret);
+
+ if (qgroupid == 0) {
+ r = btrfs_subvol_get_id_fd(fd, &qgroupid);
+ if (r < 0)
+ return r;
+ } else {
+ /* btrfs_subvol_get_id_fd() checks the file system type itself, in this branch we have to */
+ r = fd_is_fs_type(fd, BTRFS_SUPER_MAGIC);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return -ENOTTY;
+ }
+
+ /* The qgroup ID is stored in the offset component of the key */
+ args.key.min_offset = args.key.max_offset = qgroupid;
+
+ /* Search repeatedly until the lower bound of the key has moved past the upper bound */
+ while (btrfs_ioctl_search_args_compare(&args) <= 0) {
+ struct btrfs_ioctl_search_header sh;
+ const void *body;
+
+ /* nr_items is in/out: request up to 256 items, the number actually returned is read back below */
+ args.key.nr_items = 256;
+ if (ioctl(fd, BTRFS_IOC_TREE_SEARCH, &args) < 0) {
+ if (errno == ENOENT) /* quota tree is missing: quota disabled */
+ break;
+
+ return -errno;
+ }
+
+ if (args.key.nr_items <= 0)
+ break;
+
+ FOREACH_BTRFS_IOCTL_SEARCH_HEADER(sh, body, args) {
+
+ /* Make sure we start the next search at least from this entry */
+ btrfs_ioctl_search_args_set(&args, &sh);
+
+ /* The type range searched is wider than the two item types handled below, and items of other
+ * qgroups may show up too, hence filter */
+ if (sh.objectid != 0)
+ continue;
+ if (sh.offset != qgroupid)
+ continue;
+
+ if (sh.type == BTRFS_QGROUP_INFO_KEY) {
+ const struct btrfs_qgroup_info_item *qii = body;
+
+ ret->referenced = le64toh(qii->rfer);
+ ret->exclusive = le64toh(qii->excl);
+
+ found_info = true;
+
+ } else if (sh.type == BTRFS_QGROUP_LIMIT_KEY) {
+ const struct btrfs_qgroup_limit_item *qli = body;
+
+ /* A limit only applies if the matching flag is set, otherwise report "no limit" */
+ if (le64toh(qli->flags) & BTRFS_QGROUP_LIMIT_MAX_RFER)
+ ret->referenced_max = le64toh(qli->max_rfer);
+ else
+ ret->referenced_max = UINT64_MAX;
+
+ if (le64toh(qli->flags) & BTRFS_QGROUP_LIMIT_MAX_EXCL)
+ ret->exclusive_max = le64toh(qli->max_excl);
+ else
+ ret->exclusive_max = UINT64_MAX;
+
+ found_limit = true;
+ }
+
+ /* Stop early as soon as both items have been seen */
+ if (found_info && found_limit)
+ goto finish;
+ }
+
+ /* Increase search key by one, to read the next item, if we can. */
+ if (!btrfs_ioctl_search_args_inc(&args))
+ break;
+ }
+
+ finish:
+ if (!found_limit && !found_info)
+ return -ENODATA;
+
+ /* Fill in the fields of whichever item was not found */
+ if (!found_info) {
+ ret->referenced = UINT64_MAX;
+ ret->exclusive = UINT64_MAX;
+ }
+
+ if (!found_limit) {
+ ret->referenced_max = UINT64_MAX;
+ ret->exclusive_max = UINT64_MAX;
+ }
+
+ return 0;
+ }
+
+ /* Path based wrapper around btrfs_qgroup_get_quota_fd(). The path is opened with O_NOFOLLOW. Returns a
+  * negative errno if it cannot be opened. */
+ int btrfs_qgroup_get_quota(const char *path, uint64_t qgroupid, BtrfsQuotaInfo *ret) {
+         _cleanup_close_ int path_fd = -EBADF;
+
+         path_fd = open(path, O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW);
+         if (path_fd >= 0)
+                 return btrfs_qgroup_get_quota_fd(path_fd, qgroupid, ret);
+
+         return -errno;
+ }
+
+ /* Finds the "subtree" qgroup of a subvolume, i.e. among the qgroups returned by
+  * btrfs_qgroup_find_parents() the one with the lowest level whose id part equals the subvolume ID. If
+  * subvol_id is 0 the subvolume fd is located in is used.
+  *
+  * This only works for subvolumes that have been prepared with btrfs_subvol_auto_qgroup_fd() with
+  * insert_intermediary_qgroup=true (or equivalent). For others the leaf qgroup is returned instead.
+  *
+  * Returns 1 if a subtree qgroup was found and stored in *ret, 0 if the leaf qgroup was stored in *ret
+  * instead, -EINVAL if the specified ID is not a leaf (level 0) qgroup ID, and a negative errno on other
+  * failures. */
+ int btrfs_subvol_find_subtree_qgroup(int fd, uint64_t subvol_id, uint64_t *ret) {
+         uint64_t best_level = UINT64_MAX, best_qgroupid = 0, level;
+         _cleanup_free_ uint64_t *parents = NULL;
+         int n_parents, r;
+
+         assert(fd >= 0);
+         assert(ret);
+
+         if (subvol_id == 0) {
+                 r = btrfs_subvol_get_id_fd(fd, &subvol_id);
+                 if (r < 0)
+                         return r;
+         }
+
+         /* Input must be a leaf qgroup */
+         r = btrfs_qgroupid_split(subvol_id, &level, NULL);
+         if (r < 0)
+                 return r;
+         if (level != 0)
+                 return -EINVAL;
+
+         n_parents = btrfs_qgroup_find_parents(fd, subvol_id, &parents);
+         if (n_parents < 0)
+                 return n_parents;
+
+         for (int k = 0; k < n_parents; k++) {
+                 uint64_t parent_id;
+
+                 r = btrfs_qgroupid_split(parents[k], &level, &parent_id);
+                 if (r < 0)
+                         return r;
+
+                 /* Only qgroups sharing our id part are candidates; among those keep the lowest level.
+                  * UINT64_MAX marks "nothing picked yet". */
+                 if (parent_id == subvol_id && (best_level == UINT64_MAX || level < best_level)) {
+                         best_qgroupid = parents[k];
+                         best_level = level;
+                 }
+         }
+
+         if (best_level != UINT64_MAX) {
+                 *ret = best_qgroupid;
+                 return 1;
+         }
+
+         /* No suitable higher-level qgroup found, let's return the leaf qgroup instead, and indicate
+          * that with the return value. */
+         *ret = subvol_id;
+         return 0;
+ }
+
+ /* Determines the quota data of the qgroup with the lowest level that shares the id part with the
+  * specified subvolume. This is useful for determining the quota data for entire subvolume subtrees, as
+  * long as the subtrees have been set up with btrfs_subvol_auto_qgroup_fd() or in a compatible way.
+  *
+  * Returns what btrfs_qgroup_get_quota_fd() returns for the qgroup picked, or the negative error of the
+  * qgroup lookup. */
+ int btrfs_subvol_get_subtree_quota_fd(int fd, uint64_t subvol_id, BtrfsQuotaInfo *ret) {
+         uint64_t subtree_qgroup;
+         int r;
+
+         assert(fd >= 0);
+         assert(ret);
+
+         r = btrfs_subvol_find_subtree_qgroup(fd, subvol_id, &subtree_qgroup);
+         if (r >= 0)
+                 r = btrfs_qgroup_get_quota_fd(fd, subtree_qgroup, ret);
+
+         return r;
+ }
+
+ /* Path based wrapper around btrfs_subvol_get_subtree_quota_fd(). The path is opened with O_NOFOLLOW.
+  * Returns a negative errno if it cannot be opened. */
+ int btrfs_subvol_get_subtree_quota(const char *path, uint64_t subvol_id, BtrfsQuotaInfo *ret) {
+         _cleanup_close_ int path_fd = -EBADF;
+
+         path_fd = open(path, O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW);
+         if (path_fd >= 0)
+                 return btrfs_subvol_get_subtree_quota_fd(path_fd, subvol_id, ret);
+
+         return -errno;
+ }
+
+ /* Issues BTRFS_IOC_DEFRAG on fd, after fd_verify_regular() accepted it. Returns the negative result of
+  * that check if it fails, otherwise the ioctl result converted by RET_NERRNO(). */
+ int btrfs_defrag_fd(int fd) {
+         int verdict;
+
+         assert(fd >= 0);
+
+         verdict = fd_verify_regular(fd);
+         if (verdict >= 0)
+                 return RET_NERRNO(ioctl(fd, BTRFS_IOC_DEFRAG, NULL));
+
+         return verdict;
+ }
+
+ /* Path based wrapper around btrfs_defrag_fd(). The file is opened read-write with O_NOFOLLOW. Returns
+  * a negative errno if it cannot be opened. */
+ int btrfs_defrag(const char *p) {
+         _cleanup_close_ int file_fd = -EBADF;
+
+         file_fd = open(p, O_RDWR|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW);
+         if (file_fd >= 0)
+                 return btrfs_defrag_fd(file_fd);
+
+         return -errno;
+ }
+
+ /* Enables (b=true) or disables (b=false) quota on the btrfs file system fd is located on. Returns
+  * -ENOTTY if fd is not on btrfs, otherwise the ioctl result converted by RET_NERRNO(). */
+ int btrfs_quota_enable_fd(int fd, bool b) {
+         struct btrfs_ioctl_quota_ctl_args ctl = {};
+         int is_btrfs;
+
+         assert(fd >= 0);
+
+         is_btrfs = fd_is_fs_type(fd, BTRFS_SUPER_MAGIC);
+         if (is_btrfs < 0)
+                 return is_btrfs;
+         if (is_btrfs == 0)
+                 return -ENOTTY;
+
+         if (b)
+                 ctl.cmd = BTRFS_QUOTA_CTL_ENABLE;
+         else
+                 ctl.cmd = BTRFS_QUOTA_CTL_DISABLE;
+
+         return RET_NERRNO(ioctl(fd, BTRFS_IOC_QUOTA_CTL, &ctl));
+ }
+
+ /* Path based wrapper around btrfs_quota_enable_fd(). The path is opened with O_NOFOLLOW. Returns a
+  * negative errno if it cannot be opened. */
+ int btrfs_quota_enable(const char *path, bool b) {
+         _cleanup_close_ int path_fd = -EBADF;
+
+         path_fd = open(path, O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW);
+         if (path_fd >= 0)
+                 return btrfs_quota_enable_fd(path_fd, b);
+
+         return -errno;
+ }
+
+ /* Sets the "referenced" limit of a qgroup. If qgroupid is 0 the ID of the subvolume fd is located in is
+  * used as qgroup ID.
+  *
+  * While the ioctl fails with EBUSY it is retried up to ten times, waiting for a running quota rescan
+  * before each retry. Returns 0 on success, -ENOTTY if fd is not on btrfs, and a negative errno on other
+  * failures. */
+ int btrfs_qgroup_set_limit_fd(int fd, uint64_t qgroupid, uint64_t referenced_max) {
+         struct btrfs_ioctl_qgroup_limit_args limit_args = {
+                 .lim.max_rfer = referenced_max,
+                 .lim.flags = BTRFS_QGROUP_LIMIT_MAX_RFER,
+         };
+         unsigned n_retries = 0;
+         int r;
+
+         assert(fd >= 0);
+
+         if (qgroupid != 0) {
+                 /* The ID lookup below would check the file system type for us, without it we have to
+                  * do so ourselves. */
+                 r = fd_is_fs_type(fd, BTRFS_SUPER_MAGIC);
+                 if (r < 0)
+                         return r;
+                 if (r == 0)
+                         return -ENOTTY;
+         } else {
+                 r = btrfs_subvol_get_id_fd(fd, &qgroupid);
+                 if (r < 0)
+                         return r;
+         }
+
+         limit_args.qgroupid = qgroupid;
+
+         while (ioctl(fd, BTRFS_IOC_QGROUP_LIMIT, &limit_args) < 0) {
+                 if (errno != EBUSY || n_retries >= 10)
+                         return -errno;
+
+                 (void) btrfs_quota_scan_wait(fd);
+                 n_retries++;
+         }
+
+         return 0;
+ }
+
+ /* Path based wrapper around btrfs_qgroup_set_limit_fd(). The path is opened with O_NOFOLLOW. Returns a
+  * negative errno if it cannot be opened. */
+ int btrfs_qgroup_set_limit(const char *path, uint64_t qgroupid, uint64_t referenced_max) {
+         _cleanup_close_ int path_fd = -EBADF;
+
+         path_fd = open(path, O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW);
+         if (path_fd >= 0)
+                 return btrfs_qgroup_set_limit_fd(path_fd, qgroupid, referenced_max);
+
+         return -errno;
+ }
+
+ /* Sets the "referenced" limit on the qgroup btrfs_subvol_find_subtree_qgroup() picks for the specified
+  * subvolume. Returns the negative error of that lookup if it fails, otherwise the result of
+  * btrfs_qgroup_set_limit_fd(). */
+ int btrfs_subvol_set_subtree_quota_limit_fd(int fd, uint64_t subvol_id, uint64_t referenced_max) {
+         uint64_t subtree_qgroup;
+         int r;
+
+         assert(fd >= 0);
+
+         r = btrfs_subvol_find_subtree_qgroup(fd, subvol_id, &subtree_qgroup);
+         if (r >= 0)
+                 r = btrfs_qgroup_set_limit_fd(fd, subtree_qgroup, referenced_max);
+
+         return r;
+ }
+
+ /* Path based wrapper around btrfs_subvol_set_subtree_quota_limit_fd(). The path is opened with
+  * O_NOFOLLOW. Returns a negative errno if it cannot be opened. */
+ int btrfs_subvol_set_subtree_quota_limit(const char *path, uint64_t subvol_id, uint64_t referenced_max) {
+         _cleanup_close_ int path_fd = -EBADF;
+
+         path_fd = open(path, O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW);
+         if (path_fd >= 0)
+                 return btrfs_subvol_set_subtree_quota_limit_fd(path_fd, subvol_id, referenced_max);
+
+         return -errno;
+ }
+
+ /* Combines a qgroup level and an id into a single qgroup ID: the level goes into the bits from
+  * BTRFS_QGROUP_LEVEL_SHIFT upwards, the id into the bits below. Returns 0 on success and -EINVAL if
+  * either value does not fit into its part. */
+ int btrfs_qgroupid_make(uint64_t level, uint64_t id, uint64_t *ret) {
+         const uint64_t level_limit = UINT64_C(1) << (64 - BTRFS_QGROUP_LEVEL_SHIFT);
+         const uint64_t id_limit = UINT64_C(1) << BTRFS_QGROUP_LEVEL_SHIFT;
+
+         assert(ret);
+
+         if (level >= level_limit)
+                 return -EINVAL;
+         if (id >= id_limit)
+                 return -EINVAL;
+
+         *ret = id | (level << BTRFS_QGROUP_LEVEL_SHIFT);
+         return 0;
+ }
+
+ /* Splits a qgroup ID into its level (the bits from BTRFS_QGROUP_LEVEL_SHIFT upwards) and its id (the
+  * bits below). Either output pointer may be NULL, but not both. Always returns 0. */
+ int btrfs_qgroupid_split(uint64_t qgroupid, uint64_t *level, uint64_t *id) {
+         const uint64_t id_mask = (UINT64_C(1) << BTRFS_QGROUP_LEVEL_SHIFT) - 1;
+
+         assert(level || id);
+
+         if (level)
+                 *level = qgroupid >> BTRFS_QGROUP_LEVEL_SHIFT;
+         if (id)
+                 *id = qgroupid & id_mask;
+
+         return 0;
+ }
+
+ /* Creates (b=true) or destroys (b=false) the qgroup with the specified ID.
+  *
+  * While the ioctl fails with EBUSY it is retried up to ten times, waiting for a running quota rescan
+  * before each retry. Returns 0 on success, -ENOTTY if fd is not on btrfs, -ENOTCONN if the ioctl fails
+  * with EINVAL or ENOTCONN, and a negative errno on other failures. */
+ static int qgroup_create_or_destroy(int fd, bool b, uint64_t qgroupid) {
+         struct btrfs_ioctl_qgroup_create_args create_args = {
+                 .create = b,
+                 .qgroupid = qgroupid,
+         };
+         unsigned n_retries = 0;
+         int is_btrfs;
+
+         is_btrfs = fd_is_fs_type(fd, BTRFS_SUPER_MAGIC);
+         if (is_btrfs < 0)
+                 return is_btrfs;
+         if (is_btrfs == 0)
+                 return -ENOTTY;
+
+         while (ioctl(fd, BTRFS_IOC_QGROUP_CREATE, &create_args) < 0) {
+
+                 /* On old kernels if quota is not enabled, we get EINVAL. On newer kernels we get
+                  * ENOTCONN. Let's always convert this to ENOTCONN to make this recognizable
+                  * everywhere the same way. */
+                 if (IN_SET(errno, EINVAL, ENOTCONN))
+                         return -ENOTCONN;
+
+                 if (errno != EBUSY || n_retries >= 10)
+                         return -errno;
+
+                 (void) btrfs_quota_scan_wait(fd);
+                 n_retries++;
+         }
+
+         return 0;
+ }
+
+ /* Creates the qgroup with the specified ID; see qgroup_create_or_destroy() for the return values. */
+ int btrfs_qgroup_create(int fd, uint64_t qgroupid) {
+         return qgroup_create_or_destroy(fd, /* b = */ true, qgroupid);
+ }
+
+ /* Destroys the qgroup with the specified ID; see qgroup_create_or_destroy() for the return values. */
+ int btrfs_qgroup_destroy(int fd, uint64_t qgroupid) {
+         return qgroup_create_or_destroy(fd, /* b = */ false, qgroupid);
+ }
+
+ /* Destroys the specified qgroup, but unassigns it from all its parents first. Also, it recursively
+  * destroys all qgroups it is assigned to that have the same id part of the qgroupid as the specified
+  * group. Failures of those recursive destructions are ignored.
+  *
+  * Returns the result of destroying the specified qgroup itself, or a negative error if looking up the
+  * parents or unassigning from one of them fails. */
+ int btrfs_qgroup_destroy_recursive(int fd, uint64_t qgroupid) {
+         _cleanup_free_ uint64_t *parents = NULL;
+         uint64_t own_id;
+         int n_parents, r;
+
+         r = btrfs_qgroupid_split(qgroupid, NULL, &own_id);
+         if (r < 0)
+                 return r;
+
+         n_parents = btrfs_qgroup_find_parents(fd, qgroupid, &parents);
+         if (n_parents < 0)
+                 return n_parents;
+
+         for (int k = 0; k < n_parents; k++) {
+                 uint64_t parent_id;
+
+                 r = btrfs_qgroupid_split(parents[k], NULL, &parent_id);
+                 if (r < 0)
+                         return r;
+
+                 r = btrfs_qgroup_unassign(fd, qgroupid, parents[k]);
+                 if (r < 0)
+                         return r;
+
+                 /* The parent qgroupid shares the same id part with us? If so, destroy it too. */
+                 if (parent_id == own_id)
+                         (void) btrfs_qgroup_destroy_recursive(fd, parents[k]);
+         }
+
+         return btrfs_qgroup_destroy(fd, qgroupid);
+ }
+
+ /* Issues BTRFS_IOC_QUOTA_RESCAN on fd with zero-initialized arguments. Returns the ioctl result
+  * converted by RET_NERRNO(). */
+ int btrfs_quota_scan_start(int fd) {
+         struct btrfs_ioctl_quota_rescan_args rescan = {};
+         int r;
+
+         assert(fd >= 0);
+
+         r = RET_NERRNO(ioctl(fd, BTRFS_IOC_QUOTA_RESCAN, &rescan));
+         return r;
+ }
+
+ /* Issues BTRFS_IOC_QUOTA_RESCAN_WAIT on fd. Returns the ioctl result converted by RET_NERRNO(). */
+ int btrfs_quota_scan_wait(int fd) {
+         int r;
+
+         assert(fd >= 0);
+
+         r = RET_NERRNO(ioctl(fd, BTRFS_IOC_QUOTA_RESCAN_WAIT));
+         return r;
+ }
+
+ /* Queries the quota rescan status of fd. Returns 1 if the status reports any flag set, 0 if it reports
+  * none, and a negative errno if the ioctl fails. */
+ int btrfs_quota_scan_ongoing(int fd) {
+         struct btrfs_ioctl_quota_rescan_args status = {};
+
+         assert(fd >= 0);
+
+         if (ioctl(fd, BTRFS_IOC_QUOTA_RESCAN_STATUS, &status) < 0)
+                 return -errno;
+
+         return status.flags != 0;
+ }
+
+ /* Assigns (b=true) qgroup "child" to qgroup "parent", or removes that assignment (b=false).
+  *
+  * While the ioctl fails with EBUSY it is retried up to ten times, waiting for a running quota rescan
+  * before each retry. Returns 0 on plain success, 1 if the ioctl returned a positive value (in which
+  * case a rescan was requested here, ignoring failures), -ENOTTY if fd is not on btrfs, and a negative
+  * errno on other failures. */
+ static int qgroup_assign_or_unassign(int fd, bool b, uint64_t child, uint64_t parent) {
+         struct btrfs_ioctl_qgroup_assign_args assign_args = {
+                 .assign = b,
+                 .src = child,
+                 .dst = parent,
+         };
+         unsigned n_retries = 0;
+         int r;
+
+         r = fd_is_fs_type(fd, BTRFS_SUPER_MAGIC);
+         if (r < 0)
+                 return r;
+         if (r == 0)
+                 return -ENOTTY;
+
+         for (;;) {
+                 r = ioctl(fd, BTRFS_IOC_QGROUP_ASSIGN, &assign_args);
+                 if (r >= 0)
+                         break;
+
+                 if (errno != EBUSY || n_retries >= 10)
+                         return -errno;
+
+                 (void) btrfs_quota_scan_wait(fd);
+                 n_retries++;
+         }
+
+         /* If the return value is > 0, we need to request a rescan */
+         if (r > 0) {
+                 (void) btrfs_quota_scan_start(fd);
+                 return 1;
+         }
+
+         return 0;
+ }
+
+ /* Assigns qgroup "child" to qgroup "parent"; see qgroup_assign_or_unassign() for the return values. */
+ int btrfs_qgroup_assign(int fd, uint64_t child, uint64_t parent) {
+         return qgroup_assign_or_unassign(fd, /* b = */ true, child, parent);
+ }
+
+ /* Removes qgroup "child" from qgroup "parent"; see qgroup_assign_or_unassign() for the return values. */
+ int btrfs_qgroup_unassign(int fd, uint64_t child, uint64_t parent) {
+         return qgroup_assign_or_unassign(fd, /* b = */ false, child, parent);
+ }
+
+ /* Removes the subvolume named "subvolume" located in the directory fd refers to, and destroys its
+  * qgroups (best effort). If subvol_id is 0 the ID of the subvolume is looked up first.
+  *
+  * If the subvolume cannot be removed because it is not empty and BTRFS_REMOVE_RECURSIVE is set in
+  * flags, all child subvolumes are removed recursively first, then the removal is retried. The
+  * subvolume is marked writable before the first child is removed.
+  *
+  * Returns 0 on success, -EINVAL if fd is not a directory, -ENOTTY if the entry is not a btrfs
+  * subvolume, -ENOMEM if memory is short, and a negative errno on other failures. */
+ static int subvol_remove_children(int fd, const char *subvolume, uint64_t subvol_id, BtrfsRemoveFlags flags) {
+         struct btrfs_ioctl_search_args args = {
+                 .key.tree_id = BTRFS_ROOT_TREE_OBJECTID,
+
+                 .key.min_objectid = BTRFS_FIRST_FREE_OBJECTID,
+                 .key.max_objectid = BTRFS_LAST_FREE_OBJECTID,
+
+                 .key.min_type = BTRFS_ROOT_BACKREF_KEY,
+                 .key.max_type = BTRFS_ROOT_BACKREF_KEY,
+
+                 .key.min_transid = 0,
+                 .key.max_transid = UINT64_MAX,
+         };
+
+         struct btrfs_ioctl_vol_args vol_args = {};
+         _cleanup_close_ int subvol_fd = -EBADF;
+         struct stat st;
+         bool made_writable = false;
+         int r;
+
+         assert(fd >= 0);
+         assert(subvolume);
+
+         if (fstat(fd, &st) < 0)
+                 return -errno;
+
+         if (!S_ISDIR(st.st_mode))
+                 return -EINVAL;
+
+         subvol_fd = openat(fd, subvolume, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY|O_NOFOLLOW);
+         if (subvol_fd < 0)
+                 return -errno;
+
+         /* Let's check if this is actually a subvolume. Note that this is mostly redundant, as BTRFS_IOC_SNAP_DESTROY
+          * would fail anyway if it is not. However, it's a good thing to check this ahead of time so that we can return
+          * ENOTTY unconditionally in this case. This is different from the ioctl() which will return EPERM/EACCES if we
+          * don't have the privileges to remove subvolumes, regardless if the specified directory is actually a
+          * subvolume or not. In order to make it easy for callers to cover the "this is not a btrfs subvolume" case
+          * let's prefer ENOTTY over EPERM/EACCES though. */
+         r = btrfs_is_subvol_fd(subvol_fd);
+         if (r < 0)
+                 return r;
+         if (r == 0) /* Not a btrfs subvolume */
+                 return -ENOTTY;
+
+         if (subvol_id == 0) {
+                 r = btrfs_subvol_get_id_fd(subvol_fd, &subvol_id);
+                 if (r < 0)
+                         return r;
+         }
+
+         /* First, try to remove the subvolume. If it happens to be
+          * already empty, this will just work. vol_args is zero-initialized and we copy one byte less than
+          * the buffer size, hence the name always remains NUL terminated. */
+         strncpy(vol_args.name, subvolume, sizeof(vol_args.name)-1);
+         if (ioctl(fd, BTRFS_IOC_SNAP_DESTROY, &vol_args) >= 0) {
+                 (void) btrfs_qgroup_destroy_recursive(fd, subvol_id); /* for the leaf subvolumes, the qgroup id is identical to the subvol id */
+                 return 0;
+         }
+         if (!(flags & BTRFS_REMOVE_RECURSIVE) || errno != ENOTEMPTY)
+                 return -errno;
+
+         /* OK, the subvolume is not empty, let's look for child
+          * subvolumes, and remove them, first */
+
+         args.key.min_offset = args.key.max_offset = subvol_id;
+
+         while (btrfs_ioctl_search_args_compare(&args) <= 0) {
+                 struct btrfs_ioctl_search_header sh;
+                 const void *body;
+
+                 args.key.nr_items = 256;
+                 if (ioctl(fd, BTRFS_IOC_TREE_SEARCH, &args) < 0)
+                         return -errno;
+
+                 if (args.key.nr_items <= 0)
+                         break;
+
+                 FOREACH_BTRFS_IOCTL_SEARCH_HEADER(sh, body, args) {
+                         _cleanup_free_ char *p = NULL;
+
+                         /* Make sure we start the next search at least from this entry */
+                         btrfs_ioctl_search_args_set(&args, &sh);
+
+                         if (sh.type != BTRFS_ROOT_BACKREF_KEY)
+                                 continue;
+                         if (sh.offset != subvol_id)
+                                 continue;
+
+                         /* The name of the child subvolume follows the fixed-size part of the item. Note that
+                          * name_len is a 16 bit little-endian field, hence it must be converted with le16toh():
+                          * a 64 bit conversion yields a bogus length on big-endian machines. */
+                         const struct btrfs_root_ref *ref = body;
+                         p = memdup_suffix0((const uint8_t*) ref + sizeof(struct btrfs_root_ref), le16toh(ref->name_len));
+                         if (!p)
+                                 return -ENOMEM;
+
+                         /* Resolve the directory the child subvolume is located in, relative to our subvolume.
+                          * dirid comes in on-disk (little-endian) byte order, the ioctl wants host byte order. */
+                         struct btrfs_ioctl_ino_lookup_args ino_args = {
+                                 .treeid = subvol_id,
+                                 .objectid = le64toh(ref->dirid),
+                         };
+
+                         if (ioctl(fd, BTRFS_IOC_INO_LOOKUP, &ino_args) < 0)
+                                 return -errno;
+
+                         /* Children cannot be removed from a read-only subvolume, hence drop the flag once
+                          * before removing the first one. */
+                         if (!made_writable) {
+                                 r = btrfs_subvol_set_read_only_fd(subvol_fd, false);
+                                 if (r < 0)
+                                         return r;
+
+                                 made_writable = true;
+                         }
+
+                         if (isempty(ino_args.name))
+                                 /* Subvolume is in the top-level
+                                  * directory of the subvolume. */
+                                 r = subvol_remove_children(subvol_fd, p, sh.objectid, flags);
+                         else {
+                                 _cleanup_close_ int child_fd = -EBADF;
+
+                                 /* Subvolume is somewhere further down,
+                                  * hence we need to open the
+                                  * containing directory first */
+
+                                 child_fd = openat(subvol_fd, ino_args.name, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY|O_NOFOLLOW);
+                                 if (child_fd < 0)
+                                         return -errno;
+
+                                 r = subvol_remove_children(child_fd, p, sh.objectid, flags);
+                         }
+                         if (r < 0)
+                                 return r;
+                 }
+
+                 /* Increase search key by one, to read the next item, if we can. */
+                 if (!btrfs_ioctl_search_args_inc(&args))
+                         break;
+         }
+
+         /* OK, the child subvolumes should all be gone now, let's try
+          * again to remove the subvolume */
+         if (ioctl(fd, BTRFS_IOC_SNAP_DESTROY, &vol_args) < 0)
+                 return -errno;
+
+         (void) btrfs_qgroup_destroy_recursive(fd, subvol_id);
+         return 0;
+ }
+
+ /* Removes the subvolume at dir_fd/path. The parent directory is opened via chase_and_openat(), the
+  * last path component is validated with btrfs_validate_subvolume_name(), and the actual work is
+  * delegated to subvol_remove_children(). Returns the negative error of whichever step fails. */
+ int btrfs_subvol_remove_at(int dir_fd, const char *path, BtrfsRemoveFlags flags) {
+         _cleanup_free_ char *name = NULL;
+         _cleanup_close_ int parent_fd = -EBADF;
+         int r;
+
+         assert(path);
+
+         parent_fd = chase_and_openat(dir_fd, path, CHASE_PARENT|CHASE_EXTRACT_FILENAME, O_CLOEXEC, &name);
+         if (parent_fd < 0)
+                 return parent_fd;
+
+         r = btrfs_validate_subvolume_name(name);
+         if (r >= 0)
+                 r = subvol_remove_children(parent_fd, name, 0, flags);
+
+         return r;
+ }
+
+ /* Copies the limits of qgroup old_qgroupid over to qgroup new_qgroupid.
+  *
+  * Returns 1 if a limit item was found for the old qgroup and applied to the new one, 0 if none was
+  * found (which includes the case where the search fails with ENOENT), -ENOTTY if fd is not on btrfs,
+  * and a negative errno on other failures. While setting the limit fails with EBUSY it is retried up to
+  * ten times, waiting for a running quota rescan before each retry. */
+ int btrfs_qgroup_copy_limits(int fd, uint64_t old_qgroupid, uint64_t new_qgroupid) {
+
+ struct btrfs_ioctl_search_args args = {
+ /* Tree of quota items */
+ .key.tree_id = BTRFS_QUOTA_TREE_OBJECTID,
+
+ /* The object ID is always 0 */
+ .key.min_objectid = 0,
+ .key.max_objectid = 0,
+
+ /* Look precisely for the quota items */
+ .key.min_type = BTRFS_QGROUP_LIMIT_KEY,
+ .key.max_type = BTRFS_QGROUP_LIMIT_KEY,
+
+ /* For our qgroup */
+ .key.min_offset = old_qgroupid,
+ .key.max_offset = old_qgroupid,
+
+ /* No restrictions on the other components */
+ .key.min_transid = 0,
+ .key.max_transid = UINT64_MAX,
+ };
+
+ int r;
+
+ r = fd_is_fs_type(fd, BTRFS_SUPER_MAGIC);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return -ENOTTY;
+
+ /* Search repeatedly until the lower bound of the key has moved past the upper bound */
+ while (btrfs_ioctl_search_args_compare(&args) <= 0) {
+ struct btrfs_ioctl_search_header sh;
+ const void *body;
+
+ /* nr_items is in/out: request up to 256 items, the number actually returned is read back below */
+ args.key.nr_items = 256;
+ if (ioctl(fd, BTRFS_IOC_TREE_SEARCH, &args) < 0) {
+ if (errno == ENOENT) /* quota tree missing: quota is not enabled, hence nothing to copy */
+ break;
+
+ return -errno;
+ }
+
+ if (args.key.nr_items <= 0)
+ break;
+
+ FOREACH_BTRFS_IOCTL_SEARCH_HEADER(sh, body, args) {
+ struct btrfs_ioctl_qgroup_limit_args qargs;
+ unsigned c;
+
+ /* Make sure we start the next search at least from this entry */
+ btrfs_ioctl_search_args_set(&args, &sh);
+
+ /* Skip anything that is not the limit item of the old qgroup */
+ if (sh.objectid != 0)
+ continue;
+ if (sh.type != BTRFS_QGROUP_LIMIT_KEY)
+ continue;
+ if (sh.offset != old_qgroupid)
+ continue;
+
+ /* We found the entry, now copy things over. */
+
+ /* The item is in little-endian byte order, the ioctl arguments are in host byte order. Only the
+ * four limit flags are carried over. */
+ const struct btrfs_qgroup_limit_item *qli = body;
+ qargs = (struct btrfs_ioctl_qgroup_limit_args) {
+ .qgroupid = new_qgroupid,
+
+ .lim.max_rfer = le64toh(qli->max_rfer),
+ .lim.max_excl = le64toh(qli->max_excl),
+ .lim.rsv_rfer = le64toh(qli->rsv_rfer),
+ .lim.rsv_excl = le64toh(qli->rsv_excl),
+
+ .lim.flags = le64toh(qli->flags) & (BTRFS_QGROUP_LIMIT_MAX_RFER|
+ BTRFS_QGROUP_LIMIT_MAX_EXCL|
+ BTRFS_QGROUP_LIMIT_RSV_RFER|
+ BTRFS_QGROUP_LIMIT_RSV_EXCL),
+ };
+
+ /* Retry on EBUSY, up to ten times, after waiting for a running rescan */
+ for (c = 0;; c++) {
+ if (ioctl(fd, BTRFS_IOC_QGROUP_LIMIT, &qargs) < 0) {
+ if (errno == EBUSY && c < 10) {
+ (void) btrfs_quota_scan_wait(fd);
+ continue;
+ }
+ return -errno;
+ }
+
+ break;
+ }
+
+ /* The first matching item is the only one we copy */
+ return 1;
+ }
+
+ /* Increase search key by one, to read the next item, if we can. */
+ if (!btrfs_ioctl_search_args_inc(&args))
+ break;
+ }
+
+ return 0;
+ }
+
+ /* Copies a reduced form of quota information from the old to the new subvolume: if the old subvolume
+  * was a member of a qgroup with its own id part, or shared a qgroup with its parent subvolume,
+  * btrfs_subvol_auto_qgroup_fd() is invoked for the new subvolume to set up something similar.
+  *
+  * Returns 0 if there is nothing to copy, otherwise the result of btrfs_subvol_auto_qgroup_fd(), or a
+  * negative error if one of the lookups fails. */
+ static int copy_quota_hierarchy(int fd, uint64_t old_subvol_id, uint64_t new_subvol_id) {
+         _cleanup_free_ uint64_t *own_qgroups = NULL, *parent_qgroups = NULL;
+         bool shares_with_parent = false, has_intermediary = false;
+         int n_own, n_parent = 0, r;
+         uint64_t parent_subvol_id;
+
+         assert(fd >= 0);
+
+         n_own = btrfs_qgroup_find_parents(fd, old_subvol_id, &own_qgroups);
+         if (n_own <= 0) /* Nothing to copy */
+                 return n_own;
+
+         /* -ENXIO means we have no parent, in which case there are no parent qgroups to compare with. */
+         r = btrfs_subvol_get_parent(fd, old_subvol_id, &parent_subvol_id);
+         if (r < 0 && r != -ENXIO)
+                 return r;
+         if (r >= 0) {
+                 n_parent = btrfs_qgroup_find_parents(fd, parent_subvol_id, &parent_qgroups);
+                 if (n_parent < 0)
+                         return n_parent;
+         }
+
+         for (int k = 0; k < n_own; k++) {
+                 uint64_t id;
+
+                 r = btrfs_qgroupid_split(own_qgroups[k], NULL, &id);
+                 if (r < 0)
+                         return r;
+
+                 if (id == old_subvol_id) {
+                         /* The old subvolume was member of a qgroup that had the same id, but a
+                          * different level as itself. Let's set up something similar in the
+                          * destination. */
+                         has_intermediary = true;
+                         break;
+                 }
+
+                 for (int m = 0; m < n_parent; m++)
+                         if (parent_qgroups[m] == own_qgroups[k])
+                                 /* The old subvolume shared a common parent qgroup with its parent
+                                  * subvolume. Let's set up something similar in the destination. */
+                                 shares_with_parent = true;
+         }
+
+         if (has_intermediary || shares_with_parent)
+                 return btrfs_subvol_auto_qgroup_fd(fd, new_subvol_id, has_intermediary);
+
+         return 0;
+ }
+
+ /* Copies the limits of the leaf qgroup and, if both subvolumes have one, of the subtree qgroup from
+  * the old to the new subvolume.
+  *
+  * Returns 1 if any limits were copied, 0 if none were, and a negative error on failure. */
+ static int copy_subtree_quota_limits(int fd, uint64_t old_subvol, uint64_t new_subvol) {
+         uint64_t old_subtree, new_subtree;
+         int leaf_copied, r;
+
+         /* First copy the leaf limits */
+         r = btrfs_qgroup_copy_limits(fd, old_subvol, new_subvol);
+         if (r < 0)
+                 return r;
+         leaf_copied = r > 0 ? 1 : 0;
+
+         /* Then, try to copy the subtree limits, if there are any. A lookup result of 0 means only the
+          * leaf qgroup exists, which we handled above already. */
+         r = btrfs_subvol_find_subtree_qgroup(fd, old_subvol, &old_subtree);
+         if (r <= 0)
+                 return r < 0 ? r : leaf_copied;
+
+         r = btrfs_subvol_find_subtree_qgroup(fd, new_subvol, &new_subtree);
+         if (r <= 0)
+                 return r < 0 ? r : leaf_copied;
+
+         r = btrfs_qgroup_copy_limits(fd, old_subtree, new_subtree);
+         if (r == 0)
+                 return leaf_copied;
+
+         return r;
+ }
+
+/* Snapshots the subvolume open as old_fd into the directory new_fd under the name 'subvolume'. If
+ * BTRFS_SNAPSHOT_RECURSIVE is set, every subvolume nested below the source is snapshotted into the
+ * corresponding place of the new snapshot too. If BTRFS_SNAPSHOT_QUOTA is set, the qgroup hierarchy and
+ * limits of the source are replicated on a best-effort basis (failures of that step are ignored).
+ * old_subvol_id may be 0, in which case the ID is looked up via old_fd when it is needed.
+ *
+ * Returns a negative errno-style error on failure. On success returns an fd to the new snapshot that holds
+ * an exclusive BSD lock if BTRFS_SNAPSHOT_LOCK_BSD is set, and 0 otherwise. */
+static int subvol_snapshot_children(
+                int old_fd,
+                int new_fd,
+                const char *subvolume,
+                uint64_t old_subvol_id,
+                BtrfsSnapshotFlags flags) {
+
+        struct btrfs_ioctl_search_args args = {
+                .key.tree_id = BTRFS_ROOT_TREE_OBJECTID,
+
+                .key.min_objectid = BTRFS_FIRST_FREE_OBJECTID,
+                .key.max_objectid = BTRFS_LAST_FREE_OBJECTID,
+
+                .key.min_type = BTRFS_ROOT_BACKREF_KEY,
+                .key.max_type = BTRFS_ROOT_BACKREF_KEY,
+
+                .key.min_transid = 0,
+                .key.max_transid = UINT64_MAX,
+        };
+
+        struct btrfs_ioctl_vol_args_v2 vol_args = {
+                .flags = flags & BTRFS_SNAPSHOT_READ_ONLY ? BTRFS_SUBVOL_RDONLY : 0,
+                .fd = old_fd,
+        };
+        _cleanup_close_ int subvolume_fd = -EBADF;
+        uint64_t new_subvol_id;
+        int r;
+
+        assert(old_fd >= 0);
+        assert(new_fd >= 0);
+        assert(subvolume);
+
+        /* vol_args is zero-initialized and at most sizeof()-1 bytes are copied, hence the name always
+         * remains NUL-terminated. */
+        strncpy(vol_args.name, subvolume, sizeof(vol_args.name)-1);
+
+        if (ioctl(new_fd, BTRFS_IOC_SNAP_CREATE_V2, &vol_args) < 0)
+                return -errno;
+
+        if (FLAGS_SET(flags, BTRFS_SNAPSHOT_LOCK_BSD)) {
+                subvolume_fd = xopenat_lock(new_fd, subvolume,
+                                            O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY|O_NOFOLLOW,
+                                            /* xopen_flags = */ 0,
+                                            /* mode = */ 0,
+                                            LOCK_BSD,
+                                            LOCK_EX);
+                if (subvolume_fd < 0)
+                        return subvolume_fd;
+
+                /* Verify that what we opened and locked really is a subvolume */
+                r = btrfs_is_subvol_fd(subvolume_fd);
+                if (r < 0)
+                        return r;
+                if (r == 0)
+                        return -EEXIST;
+        }
+
+        /* Neither children nor quota to take care of? Then we are done already. */
+        if (!(flags & BTRFS_SNAPSHOT_RECURSIVE) &&
+            !(flags & BTRFS_SNAPSHOT_QUOTA))
+                return flags & BTRFS_SNAPSHOT_LOCK_BSD ? TAKE_FD(subvolume_fd) : 0;
+
+        if (old_subvol_id == 0) {
+                r = btrfs_subvol_get_id_fd(old_fd, &old_subvol_id);
+                if (r < 0)
+                        return r;
+        }
+
+        r = btrfs_subvol_get_id(new_fd, vol_args.name, &new_subvol_id);
+        if (r < 0)
+                return r;
+
+        if (flags & BTRFS_SNAPSHOT_QUOTA)
+                (void) copy_quota_hierarchy(new_fd, old_subvol_id, new_subvol_id);
+
+        if (!(flags & BTRFS_SNAPSHOT_RECURSIVE)) {
+
+                if (flags & BTRFS_SNAPSHOT_QUOTA)
+                        (void) copy_subtree_quota_limits(new_fd, old_subvol_id, new_subvol_id);
+
+                return flags & BTRFS_SNAPSHOT_LOCK_BSD ? TAKE_FD(subvolume_fd) : 0;
+        }
+
+        /* Look for the back references whose offset is the source subvolume, i.e. its children */
+        args.key.min_offset = args.key.max_offset = old_subvol_id;
+
+        while (btrfs_ioctl_search_args_compare(&args) <= 0) {
+                struct btrfs_ioctl_search_header sh;
+                const void *body;
+
+                args.key.nr_items = 256;
+                if (ioctl(old_fd, BTRFS_IOC_TREE_SEARCH, &args) < 0)
+                        return -errno;
+
+                if (args.key.nr_items <= 0)
+                        break;
+
+                FOREACH_BTRFS_IOCTL_SEARCH_HEADER(sh, body, args) {
+                        _cleanup_free_ char *p = NULL, *c = NULL, *np = NULL;
+                        _cleanup_close_ int old_child_fd = -EBADF, new_child_fd = -EBADF;
+
+                        btrfs_ioctl_search_args_set(&args, &sh);
+
+                        if (sh.type != BTRFS_ROOT_BACKREF_KEY)
+                                continue;
+
+                        /* Avoid finding the source subvolume a second time */
+                        if (sh.offset != old_subvol_id)
+                                continue;
+
+                        /* Avoid running into loops if the new subvolume is below the old one. */
+                        if (sh.objectid == new_subvol_id)
+                                continue;
+
+                        /* The name of the child directly follows the btrfs_root_ref structure. Note that
+                         * name_len is a 16-bit little-endian field, hence it must be converted with
+                         * le16toh(): a 64-bit conversion yields a bogus length on big-endian hosts. */
+                        const struct btrfs_root_ref *ref = body;
+                        p = memdup_suffix0((char*) ref + sizeof(struct btrfs_root_ref), le16toh(ref->name_len));
+                        if (!p)
+                                return -ENOMEM;
+
+                        /* dirid is stored little-endian in the item, the ioctl wants host byte order */
+                        struct btrfs_ioctl_ino_lookup_args ino_args = {
+                                .treeid = old_subvol_id,
+                                .objectid = le64toh(ref->dirid),
+                        };
+
+                        if (ioctl(old_fd, BTRFS_IOC_INO_LOOKUP, &ino_args) < 0)
+                                return -errno;
+
+                        /* Path of the child subvolume, relative to the source subvolume */
+                        c = path_join(ino_args.name, p);
+                        if (!c)
+                                return -ENOMEM;
+
+                        old_child_fd = openat(old_fd, c, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY|O_NOFOLLOW);
+                        if (old_child_fd < 0)
+                                return -errno;
+
+                        /* Directory inside the new snapshot that is supposed to contain the child */
+                        np = path_join(subvolume, ino_args.name);
+                        if (!np)
+                                return -ENOMEM;
+
+                        new_child_fd = openat(new_fd, np, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY|O_NOFOLLOW);
+                        if (new_child_fd < 0)
+                                return -errno;
+
+                        if (flags & BTRFS_SNAPSHOT_READ_ONLY) {
+                                /* If the snapshot is read-only we need to mark it writable temporarily, to
+                                 * put the subsnapshot into place. */
+
+                                if (subvolume_fd < 0) {
+                                        subvolume_fd = openat(new_fd, subvolume, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY|O_NOFOLLOW);
+                                        if (subvolume_fd < 0)
+                                                return -errno;
+                                }
+
+                                r = btrfs_subvol_set_read_only_fd(subvolume_fd, false);
+                                if (r < 0)
+                                        return r;
+                        }
+
+                        /* When btrfs clones the subvolumes, child subvolumes appear as empty
+                         * directories. Remove them, so that we can create a new snapshot in their place */
+                        if (unlinkat(new_child_fd, p, AT_REMOVEDIR) < 0) {
+                                int k = -errno;
+
+                                if (flags & BTRFS_SNAPSHOT_READ_ONLY)
+                                        (void) btrfs_subvol_set_read_only_fd(subvolume_fd, true);
+
+                                return k;
+                        }
+
+                        /* Recurse into the child. Fallback copying and locking only apply to the top level. */
+                        r = subvol_snapshot_children(old_child_fd, new_child_fd, p, sh.objectid,
+                                                     flags & ~(BTRFS_SNAPSHOT_FALLBACK_COPY|BTRFS_SNAPSHOT_LOCK_BSD));
+
+                        /* Restore the readonly flag */
+                        if (flags & BTRFS_SNAPSHOT_READ_ONLY) {
+                                int k;
+
+                                k = btrfs_subvol_set_read_only_fd(subvolume_fd, true);
+                                if (r >= 0 && k < 0)
+                                        return k;
+                        }
+
+                        if (r < 0)
+                                return r;
+                }
+
+                /* Increase search key by one, to read the next item, if we can. */
+                if (!btrfs_ioctl_search_args_inc(&args))
+                        break;
+        }
+
+        if (flags & BTRFS_SNAPSHOT_QUOTA)
+                (void) copy_subtree_quota_limits(new_fd, old_subvol_id, new_subvol_id);
+
+        return flags & BTRFS_SNAPSHOT_LOCK_BSD ? TAKE_FD(subvolume_fd) : 0;
+}
+
+int btrfs_subvol_snapshot_at_full(
+ int dir_fdf,
+ const char *from,
+ int dir_fdt,
+ const char *to,
+ BtrfsSnapshotFlags flags,
+ copy_progress_path_t progress_path,
+ copy_progress_bytes_t progress_bytes,
+ void *userdata) {
+ /* Creates a snapshot of 'from' (resolved relative to dir_fdf) at 'to' (resolved relative to dir_fdt).
+ * If 'from' is a subvolume the work is delegated to subvol_snapshot_children(). Otherwise -EISDIR is
+ * returned, unless BTRFS_SNAPSHOT_FALLBACK_COPY is set, in which case a new subvolume (or, with
+ * BTRFS_SNAPSHOT_FALLBACK_DIRECTORY and no subvolume support, a plain directory) is created and
+ * filled via copy_directory_at_full(). progress_path, progress_bytes and userdata are only passed
+ * on to that copy operation.
+ *
+ * Returns a negative errno-style error on failure. On success returns a BSD-locked fd of the new
+ * subvolume/directory if BTRFS_SNAPSHOT_LOCK_BSD is set, and 0 otherwise. */
+ _cleanup_free_ char *subvolume = NULL;
+ _cleanup_close_ int old_fd = -EBADF, new_fd = -EBADF, subvolume_fd = -EBADF;
+ int r;
+
+ assert(dir_fdf >= 0 || dir_fdf == AT_FDCWD);
+ assert(dir_fdt >= 0 || dir_fdt == AT_FDCWD);
+ assert(to);
+
+ old_fd = xopenat(dir_fdf, from, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY, /* xopen_flags = */ 0, /* mode = */ 0);
+ if (old_fd < 0)
+ return old_fd;
+ /* new_fd is used below as the directory the snapshot is created in, 'subvolume' as its name there */
+ new_fd = chase_and_openat(dir_fdt, to, CHASE_PARENT|CHASE_EXTRACT_FILENAME, O_CLOEXEC, &subvolume);
+ if (new_fd < 0)
+ return new_fd;
+
+ r = btrfs_validate_subvolume_name(subvolume);
+ if (r < 0)
+ return r;
+ /* r == 0 below means: the source is not a subvolume, only the copy fallback can help */
+ r = btrfs_is_subvol_at(dir_fdf, from);
+ if (r < 0)
+ return r;
+ if (r == 0) {
+ bool plain_directory = false;
+
+ /* If the source isn't a proper subvolume, fail unless fallback is requested */
+ if (!(flags & BTRFS_SNAPSHOT_FALLBACK_COPY))
+ return -EISDIR;
+
+ r = btrfs_subvol_make(new_fd, subvolume);
+ if (r < 0) {
+ if (ERRNO_IS_NOT_SUPPORTED(r) && (flags & BTRFS_SNAPSHOT_FALLBACK_DIRECTORY)) {
+ /* If the destination doesn't support subvolumes, then use a plain directory, if that's requested. */
+ if (mkdirat(new_fd, subvolume, 0755) < 0)
+ return -errno;
+
+ plain_directory = true;
+ } else
+ return r;
+ }
+
+ if (FLAGS_SET(flags, BTRFS_SNAPSHOT_LOCK_BSD)) {
+ subvolume_fd = xopenat_lock(new_fd, subvolume,
+ O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY|O_NOFOLLOW,
+ /* xopen_flags = */ 0,
+ /* mode = */ 0,
+ LOCK_BSD,
+ LOCK_EX);
+ if (subvolume_fd < 0)
+ return subvolume_fd;
+ /* Only a real subvolume can be verified as such; a fallback directory is taken as is */
+ if (!plain_directory) {
+ r = btrfs_is_subvol_fd(subvolume_fd);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return -EEXIST;
+ }
+ }
+ /* Fill the freshly created subvolume/directory with the contents of the source */
+ r = copy_directory_at_full(
+ dir_fdf, from,
+ new_fd, subvolume,
+ COPY_MERGE_EMPTY|
+ COPY_REFLINK|
+ COPY_SAME_MOUNT|
+ COPY_HARDLINKS|
+ COPY_ALL_XATTRS|
+ (FLAGS_SET(flags, BTRFS_SNAPSHOT_SIGINT) ? COPY_SIGINT : 0)|
+ (FLAGS_SET(flags, BTRFS_SNAPSHOT_SIGTERM) ? COPY_SIGTERM : 0),
+ progress_path,
+ progress_bytes,
+ userdata);
+ if (r < 0)
+ goto fallback_fail;
+
+ if (flags & BTRFS_SNAPSHOT_READ_ONLY) {
+
+ if (plain_directory) {
+ /* Plain directories have no recursive read-only flag, but something pretty close to
+ * it: the IMMUTABLE bit. Let's use this here, if this is requested. */
+
+ if (flags & BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE)
+ (void) chattr_at(new_fd, subvolume, FS_IMMUTABLE_FL, FS_IMMUTABLE_FL, NULL);
+ } else {
+ r = btrfs_subvol_set_read_only_at(new_fd, subvolume, true);
+ if (r < 0)
+ goto fallback_fail;
+ }
+ }
+
+ return flags & BTRFS_SNAPSHOT_LOCK_BSD ? TAKE_FD(subvolume_fd) : 0;
+ /* Error path of the fallback copy: remove again what was created at the destination (best effort) */
+ fallback_fail:
+ (void) rm_rf_at(new_fd, subvolume, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
+ return r;
+ }
+ /* The source is a real subvolume. Passing 0 as ID makes the callee look it up via old_fd if needed. */
+ return subvol_snapshot_children(old_fd, new_fd, subvolume, 0, flags);
+}
+
+int btrfs_qgroup_find_parents(int fd, uint64_t qgroupid, uint64_t **ret) {
+ /* Collects the qgroups the qgroup 'qgroupid' is related to, by searching the quota tree for relation
+ * items whose objectid is 'qgroupid'. If qgroupid is 0, the subvolume ID of fd is used instead.
+ *
+ * On success returns the number of qgroups found and stores a newly allocated array of their IDs in
+ * *ret, which the caller has to free. If none are found (or the quota tree does not exist) 0 is
+ * returned and *ret is set to NULL. Returns a negative errno-style error on failure, in particular
+ * -ENOTTY if an explicit qgroupid was passed but fd does not refer to btrfs. */
+ struct btrfs_ioctl_search_args args = {
+ /* Tree of quota items */
+ .key.tree_id = BTRFS_QUOTA_TREE_OBJECTID,
+
+ /* Look precisely for the quota relation items */
+ .key.min_type = BTRFS_QGROUP_RELATION_KEY,
+ .key.max_type = BTRFS_QGROUP_RELATION_KEY,
+
+ /* No restrictions on the other components */
+ .key.min_offset = 0,
+ .key.max_offset = UINT64_MAX,
+
+ .key.min_transid = 0,
+ .key.max_transid = UINT64_MAX,
+ };
+
+ _cleanup_free_ uint64_t *items = NULL;
+ size_t n_items = 0;
+ int r;
+
+ assert(fd >= 0);
+ assert(ret);
+
+ if (qgroupid == 0) {
+ r = btrfs_subvol_get_id_fd(fd, &qgroupid);
+ if (r < 0)
+ return r;
+ } else {
+ r = fd_is_fs_type(fd, BTRFS_SUPER_MAGIC);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return -ENOTTY;
+ }
+ /* Only the relation items of this very qgroup are of interest */
+ args.key.min_objectid = args.key.max_objectid = qgroupid;
+
+ while (btrfs_ioctl_search_args_compare(&args) <= 0) {
+ struct btrfs_ioctl_search_header sh;
+ _unused_ const void *body;
+
+ args.key.nr_items = 256;
+ if (ioctl(fd, BTRFS_IOC_TREE_SEARCH, &args) < 0) {
+ if (errno == ENOENT) /* quota tree missing: quota is disabled */
+ break;
+
+ return -errno;
+ }
+
+ if (args.key.nr_items <= 0)
+ break;
+
+ FOREACH_BTRFS_IOCTL_SEARCH_HEADER(sh, body, args) {
+
+ /* Make sure we start the next search at least from this entry */
+ btrfs_ioctl_search_args_set(&args, &sh);
+
+ if (sh.type != BTRFS_QGROUP_RELATION_KEY)
+ continue;
+ if (sh.offset < sh.objectid) /* NOTE(review): presumably skips the reverse (parent-to-child) direction of a relation - confirm */
+ continue;
+ if (sh.objectid != qgroupid)
+ continue;
+
+ if (!GREEDY_REALLOC(items, n_items+1))
+ return -ENOMEM;
+ /* The offset of the relation item is the ID of the other qgroup */
+ items[n_items++] = sh.offset;
+ }
+
+ /* Increase search key by one, to read the next item, if we can. */
+ if (!btrfs_ioctl_search_args_inc(&args))
+ break;
+ }
+
+ if (n_items <= 0) {
+ *ret = NULL;
+ return 0;
+ }
+ /* Pass ownership of the array to the caller */
+ *ret = TAKE_PTR(items);
+
+ return (int) n_items;
+}
+
+int btrfs_subvol_auto_qgroup_fd(int fd, uint64_t subvol_id, bool insert_intermediary_qgroup) {
+ _cleanup_free_ uint64_t *qgroups = NULL;
+ _cleanup_close_ int real_fd = -EBADF;
+ uint64_t parent_subvol;
+ bool changed = false;
+ int n = 0, r;
+
+ assert(fd >= 0);
+
+ /*
+ * Sets up the specified subvolume's qgroup automatically in
+ * one of two ways:
+ *
+ * If insert_intermediary_qgroup is false, the subvolume's
+ * leaf qgroup will be assigned to the same parent qgroups as
+ * the subvolume's parent subvolume.
+ *
+ * If insert_intermediary_qgroup is true a new intermediary
+ * higher-level qgroup is created, with a higher level number,
+ * but reusing the id of the subvolume. The level number is
+ * picked as one smaller than the lowest level qgroup the
+ * parent subvolume is a member of. If the parent subvolume's
+ * leaf qgroup is assigned to no higher-level qgroup a new
+ * qgroup of level 255 is created instead. Either way, the new
+ * qgroup is then assigned to the parent's higher-level
+ * qgroup, and the subvolume itself is assigned to it.
+ *
+ * If the subvolume is already assigned to a higher level
+ * qgroup, no operation is executed.
+ *
+ * Effectively this means: regardless if
+ * insert_intermediary_qgroup is true or not, after this
+ * function is invoked the subvolume will be accounted within
+ * the same qgroups as the parent. However, if it is true, it
+ * will also get its own higher-level qgroup, which may in
+ * turn be used by subvolumes created beneath this subvolume
+ * later on.
+ *
+ * This hence defines a simple default qgroup setup for
+ * subvolumes, as long as this function is invoked on each
+ * created subvolume: each subvolume is always accounting
+ * together with its immediate parents. Optionally, if
+ * insert_intermediary_qgroup is true, it will also get a
+ * qgroup that then includes all its own child subvolumes.
+ *
+ * If subvol_id is 0, the subvolume fd refers to is used; in
+ * that case -ENOTTY is returned if fd is not a subvolume.
+ *
+ * Returns > 0 if a qgroup was created or assigned, 0 if
+ * nothing had to be changed, and a negative errno-style error
+ * on failure (-EBUSY if no level is left for an intermediary
+ * qgroup).
+ */
+
+ /* Turn this into a proper fd, if it is currently O_PATH */
+ fd = fd_reopen_condition(fd, O_RDONLY|O_CLOEXEC, O_PATH, &real_fd);
+ if (fd < 0)
+ return fd;
+
+ if (subvol_id == 0) {
+ r = btrfs_is_subvol_fd(fd);
+ if (r < 0)
+ return r;
+ if (!r)
+ return -ENOTTY;
+
+ r = btrfs_subvol_get_id_fd(fd, &subvol_id);
+ if (r < 0)
+ return r;
+ }
+
+ n = btrfs_qgroup_find_parents(fd, subvol_id, &qgroups);
+ if (n < 0)
+ return n;
+ if (n > 0) /* already parent qgroups set up, let's bail */
+ return 0;
+
+ qgroups = mfree(qgroups);
+ /* From here on 'qgroups' and 'n' describe the qgroups of the parent subvolume, if there is one */
+ r = btrfs_subvol_get_parent(fd, subvol_id, &parent_subvol);
+ if (r == -ENXIO)
+ /* No parent, hence no qgroup memberships */
+ n = 0;
+ else if (r < 0)
+ return r;
+ else {
+ n = btrfs_qgroup_find_parents(fd, parent_subvol, &qgroups);
+ if (n < 0)
+ return n;
+ }
+
+ if (insert_intermediary_qgroup) {
+ uint64_t lowest = 256, new_qgroupid;
+ bool created = false;
+
+ /* Determine the lowest qgroup that the parent
+ * subvolume is assigned to. */
+
+ for (int i = 0; i < n; i++) {
+ uint64_t level;
+
+ r = btrfs_qgroupid_split(qgroups[i], &level, NULL);
+ if (r < 0)
+ return r;
+
+ if (level < lowest)
+ lowest = level;
+ }
+
+ if (lowest <= 1) /* There are no levels left we could use to insert an intermediary qgroup at */
+ return -EBUSY;
+
+ r = btrfs_qgroupid_make(lowest - 1, subvol_id, &new_qgroupid);
+ if (r < 0)
+ return r;
+
+ /* Create the new intermediary group, unless it already exists */
+ r = btrfs_qgroup_create(fd, new_qgroupid);
+ if (r < 0 && r != -EEXIST)
+ return r;
+ if (r >= 0)
+ changed = created = true;
+ /* Make the new qgroup a member of all qgroups of the parent; on failure remove it again, but only if we created it */
+ for (int i = 0; i < n; i++) {
+ r = btrfs_qgroup_assign(fd, new_qgroupid, qgroups[i]);
+ if (r < 0 && r != -EEXIST) {
+ if (created)
+ (void) btrfs_qgroup_destroy_recursive(fd, new_qgroupid);
+
+ return r;
+ }
+ if (r >= 0)
+ changed = true;
+ }
+ /* Finally, make the subvolume itself a member of the new qgroup */
+ r = btrfs_qgroup_assign(fd, subvol_id, new_qgroupid);
+ if (r < 0 && r != -EEXIST) {
+ if (created)
+ (void) btrfs_qgroup_destroy_recursive(fd, new_qgroupid);
+ return r;
+ }
+ if (r >= 0)
+ changed = true;
+
+ } else {
+ int i;
+
+ /* Assign our subvolume to all the same qgroups as the parent */
+
+ for (i = 0; i < n; i++) {
+ r = btrfs_qgroup_assign(fd, subvol_id, qgroups[i]);
+ if (r < 0 && r != -EEXIST)
+ return r;
+ if (r >= 0)
+ changed = true;
+ }
+ }
+
+ return changed;
+}
+
+int btrfs_subvol_auto_qgroup(const char *path, uint64_t subvol_id, bool create_intermediary_qgroup) {
+        /* Path-based front end of btrfs_subvol_auto_qgroup_fd(): opens 'path' as a directory and hands the
+         * resulting fd on. Returns -errno if the directory cannot be opened, otherwise whatever the
+         * fd-based variant returns. */
+        _cleanup_close_ int dir_fd = open(path, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY);
+
+        return dir_fd < 0 ? -errno : btrfs_subvol_auto_qgroup_fd(dir_fd, subvol_id, create_intermediary_qgroup);
+}
+
+/* Determines the ID of the subvolume that contains the subvolume 'subvol_id', by looking up its back
+ * reference item in the tree of tree roots. If subvol_id is 0 the subvolume fd refers to is used.
+ *
+ * Returns 0 and stores the parent's ID in *ret on success. Returns -ENXIO if no back reference is found
+ * (i.e. there is no parent), -ENOTTY if an explicit subvol_id was passed but fd does not refer to btrfs,
+ * or another negative errno-style error on failure. */
+int btrfs_subvol_get_parent(int fd, uint64_t subvol_id, uint64_t *ret) {
+
+        struct btrfs_ioctl_search_args args = {
+                /* Tree of tree roots */
+                .key.tree_id = BTRFS_ROOT_TREE_OBJECTID,
+
+                /* Look precisely for the subvolume items */
+                .key.min_type = BTRFS_ROOT_BACKREF_KEY,
+                .key.max_type = BTRFS_ROOT_BACKREF_KEY,
+
+                /* No restrictions on the other components */
+                .key.min_offset = 0,
+                .key.max_offset = UINT64_MAX,
+
+                .key.min_transid = 0,
+                .key.max_transid = UINT64_MAX,
+        };
+        int r;
+
+        assert(fd >= 0);
+        assert(ret);
+
+        if (subvol_id == 0) {
+                r = btrfs_subvol_get_id_fd(fd, &subvol_id);
+                if (r < 0)
+                        return r;
+        } else {
+                r = fd_is_fs_type(fd, BTRFS_SUPER_MAGIC);
+                if (r < 0)
+                        return r;
+                if (r == 0)
+                        return -ENOTTY;
+        }
+
+        args.key.min_objectid = args.key.max_objectid = subvol_id;
+
+        while (btrfs_ioctl_search_args_compare(&args) <= 0) {
+                struct btrfs_ioctl_search_header sh;
+                _unused_ const void *body = NULL;
+
+                args.key.nr_items = 256;
+                if (ioctl(fd, BTRFS_IOC_TREE_SEARCH, &args) < 0)
+                        return negative_errno();
+
+                if (args.key.nr_items <= 0)
+                        break;
+
+                FOREACH_BTRFS_IOCTL_SEARCH_HEADER(sh, body, args) {
+
+                        /* Make sure we start the next search at least from this entry. Without this (and
+                         * the increment below) a batch consisting only of entries we skip would be
+                         * requested over and over again. */
+                        btrfs_ioctl_search_args_set(&args, &sh);
+
+                        if (sh.type != BTRFS_ROOT_BACKREF_KEY)
+                                continue;
+                        if (sh.objectid != subvol_id)
+                                continue;
+
+                        /* The offset of the back reference item is the ID of the parent subvolume */
+                        *ret = sh.offset;
+                        return 0;
+                }
+
+                /* Increase search key by one, to read the next item, if we can. */
+                if (!btrfs_ioctl_search_args_inc(&args))
+                        break;
+        }
+
+        return -ENXIO;
+}
+
+int btrfs_forget_device(const char *path) {
+        struct btrfs_ioctl_vol_args args = {};
+        _cleanup_close_ int ctl_fd = -EBADF;
+        size_t len;
+
+        /* Issues BTRFS_IOC_FORGET_DEV for the device 'path' on /dev/btrfs-control. Returns -E2BIG if the
+         * path is longer than BTRFS_PATH_NAME_MAX, -errno if the control device cannot be opened, and
+         * otherwise the result of the ioctl, with failures turned into negative errno values. */
+
+        assert(path);
+
+        len = strlen(path);
+        if (len > BTRFS_PATH_NAME_MAX)
+                return -E2BIG;
+
+        /* args is zero-initialized, hence copying only the string bytes leaves the name NUL-terminated */
+        memcpy(args.name, path, len);
+
+        ctl_fd = open("/dev/btrfs-control", O_RDWR|O_CLOEXEC);
+        if (ctl_fd < 0)
+                return -errno;
+
+        return RET_NERRNO(ioctl(ctl_fd, BTRFS_IOC_FORGET_DEV, &args));
+}
+
+typedef struct BtrfsStripe { /* One stripe of a chunk, as read by btrfs_read_chunk_tree_fd() */
+ uint64_t devid; /* btrfs_stripe.devid of the chunk item, in host byte order */
+ uint64_t offset; /* btrfs_stripe.offset of the chunk item, in host byte order; base of the physical offset calculation */
+} BtrfsStripe;
+
+typedef struct BtrfsChunk { /* One chunk item of the chunk tree, as read by btrfs_read_chunk_tree_fd() */
+ uint64_t offset; /* offset of the item's search key; compared against logical addresses when looking up a chunk */
+ uint64_t length; /* btrfs_chunk.length, in host byte order */
+ uint64_t type; /* btrfs_chunk.type, in host byte order; checked against BTRFS_BLOCK_GROUP_PROFILE_MASK */
+ /* Array of n_stripes entries, owned by the chunk and released by btrfs_chunk_free() */
+ BtrfsStripe *stripes;
+ uint16_t n_stripes;
+ uint64_t stripe_len; /* btrfs_chunk.stripe_len, in host byte order */
+} BtrfsChunk;
+
+typedef struct BtrfsChunkTree { /* All chunks found by btrfs_read_chunk_tree_fd(); released with btrfs_chunk_tree_done() */
+ BtrfsChunk **chunks; /* n_chunks entries, in the order the tree search returned them; looked up by bisection on 'offset' */
+ size_t n_chunks;
+} BtrfsChunkTree;
+
+/* Releases a chunk together with its stripe array. Accepts NULL and always returns NULL, so that the result
+ * can be assigned back to the pointer that was freed. */
+static BtrfsChunk* btrfs_chunk_free(BtrfsChunk *chunk) {
+        if (chunk) {
+                free(chunk->stripes);
+                free(chunk);
+        }
+
+        return NULL;
+}
+
+/* Provides btrfs_chunk_freep(), for use with _cleanup_() */
+DEFINE_TRIVIAL_CLEANUP_FUNC(BtrfsChunk*, btrfs_chunk_free);
+
+/* Releases everything a BtrfsChunkTree owns: each chunk, then the pointer array. The structure itself is
+ * not freed. */
+static void btrfs_chunk_tree_done(BtrfsChunkTree *tree) {
+        assert(tree);
+
+        if (tree->chunks)
+                for (size_t k = 0; k < tree->n_chunks; k++)
+                        btrfs_chunk_free(tree->chunks[k]);
+
+        free(tree->chunks);
+}
+
+static int btrfs_read_chunk_tree_fd(int fd, BtrfsChunkTree *ret) {
+ /* Reads all chunk items of the chunk tree of the file system fd refers to, and converts them (including
+ * their stripes) into a BtrfsChunkTree in host byte order. On success returns 0 and moves the result
+ * into *ret; the caller then has to release it with btrfs_chunk_tree_done(). Returns -errno if the
+ * search ioctl fails and -ENOMEM if memory runs out. */
+ struct btrfs_ioctl_search_args search_args = {
+ .key.tree_id = BTRFS_CHUNK_TREE_OBJECTID,
+
+ .key.min_type = BTRFS_CHUNK_ITEM_KEY,
+ .key.max_type = BTRFS_CHUNK_ITEM_KEY,
+
+ .key.min_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+ .key.max_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+
+ .key.min_offset = 0,
+ .key.max_offset = UINT64_MAX,
+
+ .key.min_transid = 0,
+ .key.max_transid = UINT64_MAX,
+ };
+
+ _cleanup_(btrfs_chunk_tree_done) BtrfsChunkTree tree = {};
+
+ assert(fd >= 0);
+ assert(ret);
+
+ while (btrfs_ioctl_search_args_compare(&search_args) <= 0) {
+ struct btrfs_ioctl_search_header sh;
+ const void *body;
+
+ search_args.key.nr_items = 256;
+
+ if (ioctl(fd, BTRFS_IOC_TREE_SEARCH, &search_args) < 0)
+ return -errno;
+
+ if (search_args.key.nr_items == 0)
+ break;
+
+ FOREACH_BTRFS_IOCTL_SEARCH_HEADER(sh, body, search_args) {
+ _cleanup_(btrfs_chunk_freep) BtrfsChunk *chunk = NULL;
+ /* Make sure the next search starts at least from this entry */
+ btrfs_ioctl_search_args_set(&search_args, &sh);
+
+ if (sh.objectid != BTRFS_FIRST_CHUNK_TREE_OBJECTID)
+ continue;
+ if (sh.type != BTRFS_CHUNK_ITEM_KEY)
+ continue;
+
+ chunk = new(BtrfsChunk, 1);
+ if (!chunk)
+ return -ENOMEM;
+
+ const struct btrfs_chunk *item = body;
+ *chunk = (BtrfsChunk) {
+ .offset = sh.offset,
+ .length = le64toh(item->length),
+ .type = le64toh(item->type),
+ .n_stripes = le16toh(item->num_stripes),
+ .stripe_len = le64toh(item->stripe_len),
+ };
+
+ chunk->stripes = new(BtrfsStripe, chunk->n_stripes);
+ if (!chunk->stripes)
+ return -ENOMEM;
+ /* 'item->stripe' is the first stripe, the remaining ones are read from directly behind it */
+ for (size_t j = 0; j < chunk->n_stripes; j++) {
+ const struct btrfs_stripe *stripe = &item->stripe + j;
+
+ chunk->stripes[j] = (BtrfsStripe) {
+ .devid = le64toh(stripe->devid),
+ .offset = le64toh(stripe->offset),
+ };
+ }
+
+ if (!GREEDY_REALLOC(tree.chunks, tree.n_chunks + 1))
+ return -ENOMEM;
+ /* The tree owns the chunk from here on */
+ tree.chunks[tree.n_chunks++] = TAKE_PTR(chunk);
+ }
+ /* Advance the search key by one, stop if that is not possible */
+ if (!btrfs_ioctl_search_args_inc(&search_args))
+ break;
+ }
+
+ *ret = TAKE_STRUCT(tree);
+ return 0;
+}
+
+/* Looks up, by bisection, the chunk whose range [offset, offset + length) contains the logical address
+ * 'logical'. This relies on tree->chunks being ordered by offset. Returns NULL if no chunk matches. */
+static BtrfsChunk* btrfs_find_chunk_from_logical_address(const BtrfsChunkTree *tree, uint64_t logical) {
+        size_t lo, hi;
+
+        assert(tree);
+        assert(tree->chunks || tree->n_chunks == 0);
+
+        /* Candidates live in the half-open index range [lo, hi); an empty tree never enters the loop */
+        lo = 0;
+        hi = tree->n_chunks;
+
+        while (lo < hi) {
+                size_t mid = lo + (hi - 1 - lo) / 2;
+                BtrfsChunk *candidate = tree->chunks[mid];
+
+                if (logical < candidate->offset)
+                        hi = mid;
+                else if (logical >= candidate->offset + candidate->length)
+                        lo = mid + 1;
+                else
+                        return candidate;
+        }
+
+        return NULL;
+}
+
+/* Checks whether the file fd refers to has the FS_NOCOW_FL attribute set and FS_COMPR_FL unset. Returns > 0
+ * if so, 0 if not, -ENOTTY if fd is not on btrfs, or another negative errno-style error on failure. */
+static int btrfs_is_nocow_fd(int fd) {
+        unsigned attrs;
+        int r;
+
+        assert(fd >= 0);
+
+        r = fd_is_fs_type(fd, BTRFS_SUPER_MAGIC);
+        if (r <= 0)
+                return r < 0 ? r : -ENOTTY;
+
+        r = read_attr_fd(fd, &attrs);
+        if (r < 0)
+                return r;
+
+        if (FLAGS_SET(attrs, FS_COMPR_FL))
+                return false;
+
+        return FLAGS_SET(attrs, FS_NOCOW_FL);
+}
+
+/* Determines the physical offset (on the underlying device) of the first data extent of the regular file fd
+ * refers to. This only works for files with CoW disabled and without compression, whose extent is neither
+ * compressed nor encrypted nor otherwise encoded, and lives in a chunk without a block group profile.
+ *
+ * Returns 0 and stores the offset in *ret on success. Returns -EINVAL if one of the above requirements is
+ * not met or the metadata read is implausible, -ENODATA if the file has no extent, or another negative
+ * errno-style error on failure. */
+int btrfs_get_file_physical_offset_fd(int fd, uint64_t *ret) {
+
+        struct btrfs_ioctl_search_args search_args = {
+                .key.min_type = BTRFS_EXTENT_DATA_KEY,
+                .key.max_type = BTRFS_EXTENT_DATA_KEY,
+
+                .key.min_offset = 0,
+                .key.max_offset = UINT64_MAX,
+
+                .key.min_transid = 0,
+                .key.max_transid = UINT64_MAX,
+        };
+
+        _cleanup_(btrfs_chunk_tree_done) BtrfsChunkTree tree = {};
+        uint64_t subvol_id;
+        struct stat st;
+        int r;
+
+        assert(fd >= 0);
+        assert(ret);
+
+        if (fstat(fd, &st) < 0)
+                return -errno;
+
+        r = stat_verify_regular(&st);
+        if (r < 0)
+                return r;
+
+        r = btrfs_is_nocow_fd(fd);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Cannot get physical address for btrfs extent: CoW enabled");
+
+        r = btrfs_subvol_get_id_fd(fd, &subvol_id);
+        if (r < 0)
+                return r;
+
+        r = btrfs_read_chunk_tree_fd(fd, &tree);
+        if (r < 0)
+                return r;
+
+        /* Search the tree of the file's subvolume for the extent items of the file's inode */
+        search_args.key.tree_id = subvol_id;
+        search_args.key.min_objectid = search_args.key.max_objectid = st.st_ino;
+
+        while (btrfs_ioctl_search_args_compare(&search_args) <= 0) {
+                struct btrfs_ioctl_search_header sh;
+                const void *body;
+
+                search_args.key.nr_items = 256;
+
+                if (ioctl(fd, BTRFS_IOC_TREE_SEARCH, &search_args) < 0)
+                        return -errno;
+
+                if (search_args.key.nr_items == 0)
+                        break;
+
+                FOREACH_BTRFS_IOCTL_SEARCH_HEADER(sh, body, search_args) {
+                        uint64_t logical_offset;
+                        BtrfsChunk *chunk;
+
+                        btrfs_ioctl_search_args_set(&search_args, &sh);
+
+                        if (sh.type != BTRFS_EXTENT_DATA_KEY)
+                                continue;
+
+                        if (sh.objectid != st.st_ino)
+                                continue;
+
+                        /* The first matching extent item decides: every path below returns */
+                        const struct btrfs_file_extent_item *item = body;
+                        if (!IN_SET(item->type, BTRFS_FILE_EXTENT_REG, BTRFS_FILE_EXTENT_PREALLOC))
+                                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Cannot get physical address for btrfs extent: invalid type %" PRIu8,
+                                                       item->type);
+
+                        if (item->compression != 0 || item->encryption != 0 || item->other_encoding != 0)
+                                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Cannot get physical address for btrfs extent: has incompatible property");
+
+                        logical_offset = le64toh(item->disk_bytenr);
+                        if (logical_offset == 0)
+                                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Cannot get physical address for btrfs extent: failed to get logical offset");
+
+                        chunk = btrfs_find_chunk_from_logical_address(&tree, logical_offset);
+                        if (!chunk)
+                                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Cannot get physical address for btrfs extent: no matching chunk found");
+
+                        if ((chunk->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0)
+                                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Cannot get physical address for btrfs extent: unsupported profile");
+
+                        /* The stripe count and length stem from file system metadata, not from our own
+                         * code, hence refuse implausible values gracefully instead of asserting on them:
+                         * they are used as divisors and for indexing the stripe array below. */
+                        if (chunk->n_stripes == 0 || chunk->stripe_len == 0)
+                                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Cannot get physical address for btrfs extent: chunk has invalid stripe layout");
+
+                        uint64_t relative_chunk, relative_stripe, stripe_nr;
+                        uint16_t stripe_index;
+
+                        /* Guaranteed by btrfs_find_chunk_from_logical_address() */
+                        assert(logical_offset >= chunk->offset);
+
+                        /* Split the offset inside the chunk into a stripe number and an offset inside
+                         * that stripe, then map it to the stripe's place on the device. */
+                        relative_chunk = logical_offset - chunk->offset;
+                        stripe_nr = relative_chunk / chunk->stripe_len;
+                        relative_stripe = relative_chunk - stripe_nr * chunk->stripe_len;
+                        stripe_index = stripe_nr % chunk->n_stripes;
+
+                        *ret = chunk->stripes[stripe_index].offset +
+                                stripe_nr / chunk->n_stripes * chunk->stripe_len +
+                                relative_stripe;
+
+                        return 0;
+                }
+
+                if (!btrfs_ioctl_search_args_inc(&search_args))
+                        break;
+        }
+
+        return -ENODATA;
+}
diff --git a/src/shared/btrfs-util.h b/src/shared/btrfs-util.h
new file mode 100644
index 0000000..cd80903
--- /dev/null
+++ b/src/shared/btrfs-util.h
@@ -0,0 +1,149 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <sys/types.h>
+
+#include "sd-id128.h"
+
+#include "btrfs.h"
+#include "copy.h"
+#include "time-util.h"
+
+typedef struct BtrfsSubvolInfo { /* Subvolume metadata; the 'info' parameter type of btrfs_subvol_get_info_fd() */
+ uint64_t subvol_id;
+ usec_t otime; /* NOTE(review): presumably the creation time of the subvolume - confirm against btrfs_subvol_get_info_fd() */
+ /* NOTE(review): presumably the UUID of the subvolume and of the subvolume it was snapshotted from - confirm */
+ sd_id128_t uuid;
+ sd_id128_t parent_uuid;
+
+ bool read_only;
+} BtrfsSubvolInfo;
+
+typedef struct BtrfsQuotaInfo { /* Quota data of a qgroup; the 'quota' parameter type of the btrfs_*_get_quota*() calls */
+ uint64_t referenced;
+ uint64_t exclusive;
+ uint64_t referenced_max; /* NOTE(review): presumably the limit matching 'referenced' - confirm against the implementation */
+ uint64_t exclusive_max; /* NOTE(review): presumably the limit matching 'exclusive' - confirm against the implementation */
+} BtrfsQuotaInfo;
+
+typedef enum BtrfsSnapshotFlags { /* Flags for btrfs_subvol_snapshot_at_full() and btrfs_subvol_snapshot_at() */
+ BTRFS_SNAPSHOT_FALLBACK_COPY = 1 << 0, /* If the source isn't a subvolume, reflink everything */
+ BTRFS_SNAPSHOT_READ_ONLY = 1 << 1, /* Make the resulting snapshot read-only */
+ BTRFS_SNAPSHOT_RECURSIVE = 1 << 2, /* Also snapshot the subvolumes nested below the source */
+ BTRFS_SNAPSHOT_QUOTA = 1 << 3, /* Copy the quota group hierarchy and limits of the source too (best effort) */
+ BTRFS_SNAPSHOT_FALLBACK_DIRECTORY = 1 << 4, /* If the destination doesn't support subvolumes, reflink/copy instead */
+ BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE = 1 << 5, /* When we can't create a subvolume, use the FS_IMMUTABLE attribute for indicating read-only */
+ BTRFS_SNAPSHOT_SIGINT = 1 << 6, /* Check for SIGINT regularly, and return EINTR if seen */
+ BTRFS_SNAPSHOT_SIGTERM = 1 << 7, /* Ditto, but for SIGTERM */
+ BTRFS_SNAPSHOT_LOCK_BSD = 1 << 8, /* Return a BSD exclusively locked file descriptor referring to snapshot subvolume/directory. */
+} BtrfsSnapshotFlags;
+
+typedef enum BtrfsRemoveFlags { /* Flags for btrfs_subvol_remove_at() and btrfs_subvol_remove() */
+ BTRFS_REMOVE_RECURSIVE = 1 << 0, /* NOTE(review): presumably also removes nested subvolumes - confirm against btrfs_subvol_remove_at() */
+ BTRFS_REMOVE_QUOTA = 1 << 1, /* NOTE(review): presumably also removes the associated quota groups - confirm */
+} BtrfsRemoveFlags;
+ /* The wrappers forward to btrfs_is_subvol_at(): the fd variant passes a NULL path, the path variant AT_FDCWD */
+int btrfs_is_subvol_at(int dir_fd, const char *path);
+static inline int btrfs_is_subvol_fd(int fd) {
+ return btrfs_is_subvol_at(fd, NULL);
+}
+static inline int btrfs_is_subvol(const char *path) {
+ return btrfs_is_subvol_at(AT_FDCWD, path);
+}
+ /* The wrappers forward to btrfs_get_block_device_at(): the path variant passes AT_FDCWD, the fd variant an empty path */
+int btrfs_get_block_device_at(int dir_fd, const char *path, dev_t *ret);
+static inline int btrfs_get_block_device(const char *path, dev_t *ret) {
+ return btrfs_get_block_device_at(AT_FDCWD, path, ret);
+}
+static inline int btrfs_get_block_device_fd(int fd, dev_t *ret) {
+ return btrfs_get_block_device_at(fd, "", ret);
+}
+ /* Defragmentation, available in an fd-based and a path-based variant */
+int btrfs_defrag_fd(int fd);
+int btrfs_defrag(const char *p);
+ /* NOTE(review): 'b' presumably selects between enabling and disabling quota - confirm against the implementation */
+int btrfs_quota_enable_fd(int fd, bool b);
+int btrfs_quota_enable(const char *path, bool b);
+ /* Quota scan control, operating on an fd */
+int btrfs_quota_scan_start(int fd);
+int btrfs_quota_scan_wait(int fd);
+int btrfs_quota_scan_ongoing(int fd);
+ /* btrfs_subvol_snapshot_at() is btrfs_subvol_snapshot_at_full() with both progress callbacks and userdata set to NULL */
+int btrfs_subvol_snapshot_at_full(int dir_fdf, const char *from, int dir_fdt, const char *to, BtrfsSnapshotFlags flags, copy_progress_path_t progress_path, copy_progress_bytes_t progress_bytes, void *userdata);
+static inline int btrfs_subvol_snapshot_at(int dir_fdf, const char *from, int dir_fdt, const char *to, BtrfsSnapshotFlags flags) {
+ return btrfs_subvol_snapshot_at_full(dir_fdf, from, dir_fdt, to, flags, NULL, NULL, NULL);
+}
+ /* btrfs_subvol_remove() is btrfs_subvol_remove_at() with AT_FDCWD as directory */
+int btrfs_subvol_remove_at(int dir_fd, const char *path, BtrfsRemoveFlags flags);
+static inline int btrfs_subvol_remove(const char *path, BtrfsRemoveFlags flags) {
+ return btrfs_subvol_remove_at(AT_FDCWD, path, flags);
+}
+ /* The wrappers forward to btrfs_subvol_set_read_only_at(): the fd variant passes a NULL path, the path variant AT_FDCWD */
+int btrfs_subvol_set_read_only_at(int dir_fd, const char *path, bool b);
+static inline int btrfs_subvol_set_read_only_fd(int fd, bool b) {
+ return btrfs_subvol_set_read_only_at(fd, NULL, b);
+}
+static inline int btrfs_subvol_set_read_only(const char *path, bool b) {
+ return btrfs_subvol_set_read_only_at(AT_FDCWD, path, b);
+}
+ /* Counterpart for querying the flag of the subvolume an fd refers to */
+int btrfs_subvol_get_read_only_fd(int fd);
+ /* btrfs_subvol_get_parent(): a subvol_id of 0 means the subvolume fd refers to; returns -ENXIO if there is no parent */
+int btrfs_subvol_get_id(int fd, const char *subvolume, uint64_t *ret);
+int btrfs_subvol_get_id_fd(int fd, uint64_t *ret);
+int btrfs_subvol_get_parent(int fd, uint64_t subvol_id, uint64_t *ret);
+
+int btrfs_subvol_get_info_fd(int fd, uint64_t subvol_id, BtrfsSubvolInfo *info);
+ /* Callers in btrfs-util.c treat a return value of 0 as "no subtree qgroup found" */
+int btrfs_subvol_find_subtree_qgroup(int fd, uint64_t subvol_id, uint64_t *ret);
+
+int btrfs_subvol_get_subtree_quota(const char *path, uint64_t subvol_id, BtrfsQuotaInfo *quota);
+int btrfs_subvol_get_subtree_quota_fd(int fd, uint64_t subvol_id, BtrfsQuotaInfo *quota);
+
+int btrfs_subvol_set_subtree_quota_limit(const char *path, uint64_t subvol_id, uint64_t referenced_max);
+int btrfs_subvol_set_subtree_quota_limit_fd(int fd, uint64_t subvol_id, uint64_t referenced_max);
+
+/* Set up a default qgroup membership for a subvolume, modelled after its parent subvolume. If the boolean
+ * is true an intermediary higher-level qgroup is inserted for the subvolume as well. Both return > 0 if
+ * something was changed, 0 if not, and a negative errno-style error on failure. The parameter is named
+ * like in the definition of the function. */
+int btrfs_subvol_auto_qgroup_fd(int fd, uint64_t subvol_id, bool insert_intermediary_qgroup);
+int btrfs_subvol_auto_qgroup(const char *path, uint64_t subvol_id, bool create_intermediary_qgroup);
+ /* A qgroup ID consists of a level and an ID part: these combine and split the two */
+int btrfs_qgroupid_make(uint64_t level, uint64_t id, uint64_t *ret);
+int btrfs_qgroupid_split(uint64_t qgroupid, uint64_t *level, uint64_t *id);
+ /* Callers in btrfs-util.c tolerate -EEXIST from btrfs_qgroup_create() as "exists already" */
+int btrfs_qgroup_create(int fd, uint64_t qgroupid);
+int btrfs_qgroup_destroy(int fd, uint64_t qgroupid);
+int btrfs_qgroup_destroy_recursive(int fd, uint64_t qgroupid);
+
+int btrfs_qgroup_set_limit_fd(int fd, uint64_t qgroupid, uint64_t referenced_max);
+int btrfs_qgroup_set_limit(const char *path, uint64_t qgroupid, uint64_t referenced_max);
+ /* Callers in btrfs-util.c treat a positive return value as "limits were changed" */
+int btrfs_qgroup_copy_limits(int fd, uint64_t old_qgroupid, uint64_t new_qgroupid);
+
+int btrfs_qgroup_assign(int fd, uint64_t child, uint64_t parent);
+int btrfs_qgroup_unassign(int fd, uint64_t child, uint64_t parent);
+ /* Returns the number of qgroups found and a newly allocated array of their IDs in *ret (NULL if there are none); a qgroupid of 0 means the subvolume fd refers to */
+int btrfs_qgroup_find_parents(int fd, uint64_t qgroupid, uint64_t **ret);
+
+int btrfs_qgroup_get_quota_fd(int fd, uint64_t qgroupid, BtrfsQuotaInfo *quota);
+int btrfs_qgroup_get_quota(const char *path, uint64_t qgroupid, BtrfsQuotaInfo *quota);
+ /* Logs, at log level 'level' and with error 'ret', that btrfs reports /dev/root as backing device of the file system behind 'p'. Returns what log_full_errno() evaluates to. */
+static inline int btrfs_log_dev_root(int level, int ret, const char *p) {
+ return log_full_errno(level, ret,
+ "File system behind %s is reported by btrfs to be backed by pseudo-device /dev/root, which is not a valid userspace accessible device node. "
+ "Cannot determine correct backing block device.", p);
+}
+
+static inline bool btrfs_might_be_subvol(const struct stat *st) {
+        /* Tells whether the specified 'struct stat' could belong to a btrfs subvolume, i.e. describes a
+         * directory with inode number 256. A NULL pointer never qualifies. This is only a necessary
+         * condition: for a final decision, combine it with an fstatfs() check that verifies the file
+         * system actually is btrfs. */
+
+        return st && S_ISDIR(st->st_mode) && st->st_ino == 256;
+}
+ /* Issues BTRFS_IOC_FORGET_DEV for the device 'path' via /dev/btrfs-control; -E2BIG if the path is longer than BTRFS_PATH_NAME_MAX */
+int btrfs_forget_device(const char *path);
+ /* Physical offset of the first extent of a regular file with CoW disabled; 0 on success, -ENODATA if the file has no extent */
+int btrfs_get_file_physical_offset_fd(int fd, uint64_t *ret);
diff --git a/src/shared/bus-get-properties.c b/src/shared/bus-get-properties.c
new file mode 100644
index 0000000..53e5d6b
--- /dev/null
+++ b/src/shared/bus-get-properties.c
@@ -0,0 +1,166 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "bus-get-properties.h"
+#include "rlimit-util.h"
+#include "stdio-util.h"
+#include "string-util.h"
+
+int bus_property_get_bool(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        /* Property getter for a C 'bool' behind userdata: the value is widened to an int first, which is
+         * then appended to the reply as D-Bus boolean ('b'). */
+        const bool *flag = userdata;
+        int value = *flag;
+
+        return sd_bus_message_append_basic(reply, 'b', &value);
+}
+
+int bus_property_set_bool(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *value,
+                void *userdata,
+                sd_bus_error *error) {
+
+        /* Property setter for a C 'bool' behind userdata: reads a D-Bus boolean ('b') from the message
+         * into an int and stores it as bool. Returns 0 on success, or the error of the read operation,
+         * in which case the target is left untouched. */
+        bool *target = userdata;
+        int parsed;
+        int r = sd_bus_message_read(value, "b", &parsed);
+
+        if (r < 0)
+                return r;
+
+        *target = parsed;
+        return 0;
+}
+
+int bus_property_get_tristate(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        /* Property getter for an int tristate behind userdata, exposed as D-Bus boolean ('b'): only
+         * positive values are reported as true, everything else defaults to false. */
+        const int *tristate = userdata;
+        int is_true = *tristate > 0;
+
+        return sd_bus_message_append_basic(reply, 'b', &is_true);
+}
+
+int bus_property_get_id128(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        /* Property getter for an sd_id128_t behind userdata, exposed as byte array ('ay'): a regular ID
+         * is sent as its 16 bytes, the null ID as empty array. */
+        const sd_id128_t *id = userdata;
+
+        if (!sd_id128_is_null(*id))
+                return sd_bus_message_append_array(reply, 'y', id->bytes, 16);
+
+        /* Add an empty array if the ID is zero */
+        return sd_bus_message_append(reply, "ay", 0);
+}
+
+#if __SIZEOF_SIZE_T__ != 8
+int bus_property_get_size(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        /* Property getter for a size_t behind userdata that is not 64 bits wide: the value is widened to
+         * uint64_t and appended as 't'. */
+        const size_t *size = userdata;
+        uint64_t wide = *size;
+
+        return sd_bus_message_append_basic(reply, 't', &wide);
+}
+#endif
+
+#if __SIZEOF_LONG__ != 8
+int bus_property_get_long(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        /* Property getter for a 'long' behind userdata that is not 64 bits wide: the value is widened to
+         * int64_t and appended as 'x'. */
+        const long *number = userdata;
+        int64_t wide = *number;
+
+        return sd_bus_message_append_basic(reply, 'x', &wide);
+}
+
+int bus_property_get_ulong(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        /* Property getter for an 'unsigned long' behind userdata that is not 64 bits wide: the value is
+         * widened to uint64_t and appended as 't'. */
+        const unsigned long *number = userdata;
+        uint64_t wide = *number;
+
+        return sd_bus_message_append_basic(reply, 't', &wide);
+}
+#endif
+
+int bus_property_get_rlimit(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        /* Property getter for resource limits. userdata points to a 'struct rlimit*'. If that pointer is
+         * set, the limit is taken from there, otherwise the limit currently in effect for this process is
+         * queried, with the resource being derived from the property name. Properties whose name ends in
+         * "Soft" report the soft limit, all others the hard limit. The value is appended as 't'. */
+
+        struct rlimit queried = {};
+        const struct rlimit *limit;
+        const char *soft_suffix;
+        uint64_t wire;
+        rlim_t value;
+
+        assert(bus);
+        assert(reply);
+        assert(userdata);
+
+        soft_suffix = endswith(property, "Soft");
+
+        limit = *(struct rlimit**) userdata;
+        if (!limit) {
+                const char *name, *resource_name;
+                int resource;
+
+                /* Chop off "Soft" suffix */
+                if (soft_suffix)
+                        name = strndupa_safe(property, soft_suffix - property);
+                else
+                        name = property;
+
+                /* Skip over any prefix, such as "Default" */
+                assert_se(resource_name = strstrafter(name, "Limit"));
+
+                resource = rlimit_from_string(resource_name);
+                assert(resource >= 0);
+
+                /* If this fails the zero-initialized buffer is reported */
+                (void) getrlimit(resource, &queried);
+                limit = &queried;
+        }
+
+        value = soft_suffix ? limit->rlim_cur : limit->rlim_max;
+
+        /* rlim_t might have different sizes, let's map RLIMIT_INFINITY to UINT64_MAX, so that it is the same on all
+         * archs */
+        if (value == RLIM_INFINITY)
+                wire = UINT64_MAX;
+        else
+                wire = (uint64_t) value;
+
+        return sd_bus_message_append(reply, "t", wire);
+}
diff --git a/src/shared/bus-get-properties.h b/src/shared/bus-get-properties.h
new file mode 100644
index 0000000..4c35126
--- /dev/null
+++ b/src/shared/bus-get-properties.h
@@ -0,0 +1,101 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+
+#include "macro.h"
+
+int bus_property_get_bool(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error);
+int bus_property_set_bool(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *value, void *userdata, sd_bus_error *error);
+int bus_property_get_tristate(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error);
+int bus_property_get_id128(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error);
+
+#define bus_property_get_usec ((sd_bus_property_get_t) NULL)
+#define bus_property_set_usec ((sd_bus_property_set_t) NULL)
+
+assert_cc(sizeof(int) == sizeof(int32_t));
+#define bus_property_get_int ((sd_bus_property_get_t) NULL)
+
+assert_cc(sizeof(unsigned) == sizeof(uint32_t));
+#define bus_property_get_unsigned ((sd_bus_property_get_t) NULL)
+
+/* On 64-bit machines we can use the default serializer for size_t and
+ * friends, otherwise we need to cast this manually */
+#if __SIZEOF_SIZE_T__ == 8
+#define bus_property_get_size ((sd_bus_property_get_t) NULL)
+#else
+int bus_property_get_size(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error);
+#endif
+
+#if __SIZEOF_LONG__ == 8
+#define bus_property_get_long ((sd_bus_property_get_t) NULL)
+#define bus_property_get_ulong ((sd_bus_property_get_t) NULL)
+#else
+int bus_property_get_long(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error);
+int bus_property_get_ulong(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error);
+#endif
+
+/* uid_t and friends are 32 bits wide on Linux. This means we can just use the
+ * default serializer for 32-bit unsigned integers to serialize them, and map
+ * the getters to NULL here */
+assert_cc(sizeof(uid_t) == sizeof(uint32_t));
+#define bus_property_get_uid ((sd_bus_property_get_t) NULL)
+
+assert_cc(sizeof(gid_t) == sizeof(uint32_t));
+#define bus_property_get_gid ((sd_bus_property_get_t) NULL)
+
+assert_cc(sizeof(pid_t) == sizeof(uint32_t));
+#define bus_property_get_pid ((sd_bus_property_get_t) NULL)
+
+assert_cc(sizeof(mode_t) == sizeof(uint32_t));
+#define bus_property_get_mode ((sd_bus_property_get_t) NULL)
+
+int bus_property_get_rlimit(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error);
+
+/* Defines a property getter called 'function' that does not look at userdata: it appends the expression
+ * 'val' to the reply, using 'bus_type' as the sd_bus_message_append() type string. */
+#define BUS_DEFINE_PROPERTY_GET_GLOBAL(function, bus_type, val) \
+ int function(sd_bus *bus, \
+ const char *path, \
+ const char *interface, \
+ const char *property, \
+ sd_bus_message *reply, \
+ void *userdata, \
+ sd_bus_error *error) { \
+ \
+ assert(bus); \
+ assert(reply); \
+ \
+ return sd_bus_message_append(reply, bus_type, val); \
+ }
+
+/* Defines a property getter called 'function' that treats userdata (which must not be NULL) as a
+ * 'data_type*' and appends get2(get1(data)) to the reply, using 'bus_type' as the
+ * sd_bus_message_append() type string. */
+#define BUS_DEFINE_PROPERTY_GET2(function, bus_type, data_type, get1, get2) \
+ int function(sd_bus *bus, \
+ const char *path, \
+ const char *interface, \
+ const char *property, \
+ sd_bus_message *reply, \
+ void *userdata, \
+ sd_bus_error *error) { \
+ \
+ data_type *data = ASSERT_PTR(userdata); \
+ \
+ assert(bus); \
+ assert(reply); \
+ \
+ return sd_bus_message_append(reply, bus_type, \
+ get2(get1(data))); \
+ }
+
+/* ident() is the identity transformation. BUS_DEFINE_PROPERTY_GET() uses it as the second step, so that
+ * the generated getter appends get1(data), where data is the userdata pointer cast to 'data_type*'. */
+#define ident(x) (x)
+#define BUS_DEFINE_PROPERTY_GET(function, bus_type, data_type, get1) \
+ BUS_DEFINE_PROPERTY_GET2(function, bus_type, data_type, get1, ident)
+
+/* ref() dereferences its argument. BUS_DEFINE_PROPERTY_GET_REF() uses it as the first step, so that the
+ * generated getter appends get(*data), where data is the userdata pointer cast to 'data_type*'. */
+#define ref(x) (*(x))
+#define BUS_DEFINE_PROPERTY_GET_REF(function, bus_type, data_type, get) \
+ BUS_DEFINE_PROPERTY_GET2(function, bus_type, data_type, ref, get)
+
+/* Defines a getter for a value of type 'type' stored at userdata: the value is passed through
+ * <name>_to_string() and the result is appended to the reply as a string ("s"). */
+#define BUS_DEFINE_PROPERTY_GET_ENUM(function, name, type) \
+ BUS_DEFINE_PROPERTY_GET_REF(function, "s", type, name##_to_string)
+
+/* Expands to two "t" vtable property entries for a 'struct dual_timestamp' located at 'offset': one called
+ * 'name' for the realtime field and one called 'name' "Monotonic" for the monotonic field. 'name' must be
+ * a string literal, since it is concatenated with "Monotonic". */
+#define BUS_PROPERTY_DUAL_TIMESTAMP(name, offset, flags) \
+ SD_BUS_PROPERTY(name, "t", bus_property_get_usec, (offset) + offsetof(struct dual_timestamp, realtime), (flags)), \
+ SD_BUS_PROPERTY(name "Monotonic", "t", bus_property_get_usec, (offset) + offsetof(struct dual_timestamp, monotonic), (flags))
diff --git a/src/shared/bus-locator.c b/src/shared/bus-locator.c
new file mode 100644
index 0000000..ff7a872
--- /dev/null
+++ b/src/shared/bus-locator.c
@@ -0,0 +1,231 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "bus-locator.h"
+#include "macro.h"
+
+/* Locators (destination, object path, interface) of the D-Bus services listed below. */
+
+const BusLocator* const bus_home_mgr = &(BusLocator){
+        .destination = "org.freedesktop.home1",
+        .path = "/org/freedesktop/home1",
+        .interface = "org.freedesktop.home1.Manager",
+};
+
+const BusLocator* const bus_import_mgr = &(BusLocator){
+        .destination = "org.freedesktop.import1",
+        .path = "/org/freedesktop/import1",
+        .interface = "org.freedesktop.import1.Manager",
+};
+
+const BusLocator* const bus_locale = &(BusLocator){
+        .destination = "org.freedesktop.locale1",
+        .path = "/org/freedesktop/locale1",
+        .interface = "org.freedesktop.locale1",
+};
+
+const BusLocator* const bus_login_mgr = &(BusLocator){
+        .destination = "org.freedesktop.login1",
+        .path = "/org/freedesktop/login1",
+        .interface = "org.freedesktop.login1.Manager",
+};
+
+const BusLocator* const bus_machine_mgr = &(BusLocator){
+        .destination = "org.freedesktop.machine1",
+        .path = "/org/freedesktop/machine1",
+        .interface = "org.freedesktop.machine1.Manager",
+};
+
+const BusLocator* const bus_network_mgr = &(BusLocator){
+        .destination = "org.freedesktop.network1",
+        .path = "/org/freedesktop/network1",
+        .interface = "org.freedesktop.network1.Manager",
+};
+
+const BusLocator* const bus_oom_mgr = &(BusLocator){
+        .destination = "org.freedesktop.oom1",
+        .path = "/org/freedesktop/oom1",
+        .interface = "org.freedesktop.oom1.Manager",
+};
+
+const BusLocator* const bus_portable_mgr = &(BusLocator){
+        .destination = "org.freedesktop.portable1",
+        .path = "/org/freedesktop/portable1",
+        .interface = "org.freedesktop.portable1.Manager",
+};
+
+const BusLocator* const bus_resolve_mgr = &(BusLocator){
+        .destination = "org.freedesktop.resolve1",
+        .path = "/org/freedesktop/resolve1",
+        .interface = "org.freedesktop.resolve1.Manager",
+};
+
+const BusLocator* const bus_systemd_mgr = &(BusLocator){
+        .destination = "org.freedesktop.systemd1",
+        .path = "/org/freedesktop/systemd1",
+        .interface = "org.freedesktop.systemd1.Manager",
+};
+
+const BusLocator* const bus_timedate = &(BusLocator){
+        .destination = "org.freedesktop.timedate1",
+        .path = "/org/freedesktop/timedate1",
+        .interface = "org.freedesktop.timedate1",
+};
+
+const BusLocator* const bus_timesync_mgr = &(BusLocator){
+        .destination = "org.freedesktop.timesync1",
+        .path = "/org/freedesktop/timesync1",
+        .interface = "org.freedesktop.timesync1.Manager",
+};
+
+const BusLocator* const bus_hostname = &(BusLocator){
+        .destination = "org.freedesktop.hostname1",
+        .path = "/org/freedesktop/hostname1",
+        .interface = "org.freedesktop.hostname1",
+};
+
+/* Shorthand flavors of the sd-bus convenience helpers with the destination, path and interface strings
+ * encapsulated within a single struct. */
+/* Like sd_bus_call_method_asyncv(), but takes destination, path and interface from 'locator'. The variadic
+ * arguments are the method parameters described by 'types'. */
+int bus_call_method_async(
+                sd_bus *bus,
+                sd_bus_slot **slot,
+                const BusLocator *locator,
+                const char *member,
+                sd_bus_message_handler_t callback,
+                void *userdata,
+                const char *types, ...) {
+
+        va_list args;
+        int ret;
+
+        assert(locator);
+
+        va_start(args, types);
+        ret = sd_bus_call_method_asyncv(
+                        bus,
+                        slot,
+                        locator->destination,
+                        locator->path,
+                        locator->interface,
+                        member,
+                        callback,
+                        userdata,
+                        types,
+                        args);
+        va_end(args);
+
+        return ret;
+}
+
+/* Like sd_bus_call_methodv(), but takes destination, path and interface from 'locator'. The variadic
+ * arguments are the method parameters described by 'types'. */
+int bus_call_method(
+                sd_bus *bus,
+                const BusLocator *locator,
+                const char *member,
+                sd_bus_error *error,
+                sd_bus_message **reply,
+                const char *types, ...) {
+
+        va_list args;
+        int ret;
+
+        assert(locator);
+
+        va_start(args, types);
+        ret = sd_bus_call_methodv(
+                        bus,
+                        locator->destination,
+                        locator->path,
+                        locator->interface,
+                        member,
+                        error,
+                        reply,
+                        types,
+                        args);
+        va_end(args);
+
+        return ret;
+}
+
+/* Like sd_bus_get_property(), but takes destination, path and interface from 'locator'. */
+int bus_get_property(
+                sd_bus *bus,
+                const BusLocator *locator,
+                const char *member,
+                sd_bus_error *error,
+                sd_bus_message **reply,
+                const char *type) {
+
+        assert(locator);
+
+        return sd_bus_get_property(
+                        bus,
+                        locator->destination,
+                        locator->path,
+                        locator->interface,
+                        member,
+                        error,
+                        reply,
+                        type);
+}
+
+/* Like sd_bus_get_property_trivial(), but takes destination, path and interface from 'locator'. */
+int bus_get_property_trivial(
+                sd_bus *bus,
+                const BusLocator *locator,
+                const char *member,
+                sd_bus_error *error,
+                char type,
+                void *ptr) {
+
+        assert(locator);
+
+        return sd_bus_get_property_trivial(
+                        bus,
+                        locator->destination,
+                        locator->path,
+                        locator->interface,
+                        member,
+                        error,
+                        type,
+                        ptr);
+}
+
+/* Like sd_bus_get_property_string(), but takes destination, path and interface from 'locator'. */
+int bus_get_property_string(
+                sd_bus *bus,
+                const BusLocator *locator,
+                const char *member,
+                sd_bus_error *error,
+                char **ret) {
+
+        assert(locator);
+
+        return sd_bus_get_property_string(
+                        bus,
+                        locator->destination,
+                        locator->path,
+                        locator->interface,
+                        member,
+                        error,
+                        ret);
+}
+
+/* Like sd_bus_get_property_strv(), but takes destination, path and interface from 'locator'. */
+int bus_get_property_strv(
+                sd_bus *bus,
+                const BusLocator *locator,
+                const char *member,
+                sd_bus_error *error,
+                char ***ret) {
+
+        assert(locator);
+
+        return sd_bus_get_property_strv(
+                        bus,
+                        locator->destination,
+                        locator->path,
+                        locator->interface,
+                        member,
+                        error,
+                        ret);
+}
+
+/* Like sd_bus_set_propertyv(), but takes destination, path and interface from 'locator'. The variadic
+ * arguments are the new property value described by 'type'. */
+int bus_set_property(
+                sd_bus *bus,
+                const BusLocator *locator,
+                const char *member,
+                sd_bus_error *error,
+                const char *type, ...) {
+
+        va_list args;
+        int ret;
+
+        assert(locator);
+
+        va_start(args, type);
+        ret = sd_bus_set_propertyv(
+                        bus,
+                        locator->destination,
+                        locator->path,
+                        locator->interface,
+                        member,
+                        error,
+                        type,
+                        args);
+        va_end(args);
+
+        return ret;
+}
+
+/* Like sd_bus_match_signal(), but takes sender, path and interface from 'locator'. */
+int bus_match_signal(
+                sd_bus *bus,
+                sd_bus_slot **ret,
+                const BusLocator *locator,
+                const char *member,
+                sd_bus_message_handler_t callback,
+                void *userdata) {
+
+        assert(locator);
+
+        return sd_bus_match_signal(
+                        bus,
+                        ret,
+                        locator->destination,
+                        locator->path,
+                        locator->interface,
+                        member,
+                        callback,
+                        userdata);
+}
+
+/* Like sd_bus_match_signal_async(), but takes sender, path and interface from 'locator'. */
+int bus_match_signal_async(
+                sd_bus *bus,
+                sd_bus_slot **ret,
+                const BusLocator *locator,
+                const char *member,
+                sd_bus_message_handler_t callback,
+                sd_bus_message_handler_t install_callback,
+                void *userdata) {
+
+        assert(locator);
+
+        return sd_bus_match_signal_async(
+                        bus,
+                        ret,
+                        locator->destination,
+                        locator->path,
+                        locator->interface,
+                        member,
+                        callback,
+                        install_callback,
+                        userdata);
+}
+
+/* Like sd_bus_message_new_method_call(), but takes destination, path and interface from 'locator'. */
+int bus_message_new_method_call(
+                sd_bus *bus,
+                sd_bus_message **m,
+                const BusLocator *locator,
+                const char *member) {
+
+        assert(locator);
+
+        return sd_bus_message_new_method_call(
+                        bus,
+                        m,
+                        locator->destination,
+                        locator->path,
+                        locator->interface,
+                        member);
+}
diff --git a/src/shared/bus-locator.h b/src/shared/bus-locator.h
new file mode 100644
index 0000000..4f50a97
--- /dev/null
+++ b/src/shared/bus-locator.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+
+/* Bundles the three strings that address a D-Bus interface on a remote object, so that they can be passed
+ * around as a single argument. */
+typedef struct BusLocator {
+ const char *destination; /* bus name of the service, e.g. "org.freedesktop.login1" */
+ const char *path; /* object path, e.g. "/org/freedesktop/login1" */
+ const char *interface; /* interface name, e.g. "org.freedesktop.login1.Manager" */
+} BusLocator;
+
+extern const BusLocator* const bus_home_mgr;
+extern const BusLocator* const bus_hostname;
+extern const BusLocator* const bus_import_mgr;
+extern const BusLocator* const bus_locale;
+extern const BusLocator* const bus_login_mgr;
+extern const BusLocator* const bus_machine_mgr;
+extern const BusLocator* const bus_network_mgr;
+extern const BusLocator* const bus_oom_mgr;
+extern const BusLocator* const bus_portable_mgr;
+extern const BusLocator* const bus_resolve_mgr;
+extern const BusLocator* const bus_systemd_mgr;
+extern const BusLocator* const bus_timedate;
+extern const BusLocator* const bus_timesync_mgr;
+
+/* Shorthand flavors of the sd-bus convenience helpers with the destination, path and interface strings
+ * encapsulated within a single struct. */
+int bus_call_method_async(sd_bus *bus, sd_bus_slot **slot, const BusLocator *locator, const char *member, sd_bus_message_handler_t callback, void *userdata, const char *types, ...);
+int bus_call_method(sd_bus *bus, const BusLocator *locator, const char *member, sd_bus_error *error, sd_bus_message **reply, const char *types, ...);
+int bus_get_property(sd_bus *bus, const BusLocator *locator, const char *member, sd_bus_error *error, sd_bus_message **reply, const char *type);
+int bus_get_property_trivial(sd_bus *bus, const BusLocator *locator, const char *member, sd_bus_error *error, char type, void *ptr);
+int bus_get_property_string(sd_bus *bus, const BusLocator *locator, const char *member, sd_bus_error *error, char **ret);
+int bus_get_property_strv(sd_bus *bus, const BusLocator *locator, const char *member, sd_bus_error *error, char ***ret);
+int bus_set_property(sd_bus *bus, const BusLocator *locator, const char *member, sd_bus_error *error, const char *type, ...);
+int bus_match_signal(sd_bus *bus, sd_bus_slot **ret, const BusLocator *locator, const char *member, sd_bus_message_handler_t callback, void *userdata);
+int bus_match_signal_async(sd_bus *bus, sd_bus_slot **ret, const BusLocator *locator, const char *member, sd_bus_message_handler_t callback, sd_bus_message_handler_t install_callback, void *userdata);
+int bus_message_new_method_call(sd_bus *bus, sd_bus_message **m, const BusLocator *locator, const char *member);
diff --git a/src/shared/bus-log-control-api.c b/src/shared/bus-log-control-api.c
new file mode 100644
index 0000000..40f99ac
--- /dev/null
+++ b/src/shared/bus-log-control-api.c
@@ -0,0 +1,114 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "bus-get-properties.h"
+#include "bus-log-control-api.h"
+#include "bus-util.h"
+#include "log.h"
+#include "sd-bus.h"
+#include "syslog-util.h"
+
+/* Property getter that appends the name of the current maximum log level to the reply as a string. */
+int bus_property_get_log_level(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        _cleanup_free_ char *level_name = NULL;
+        int r;
+
+        assert(bus);
+        assert(reply);
+
+        r = log_level_to_string_alloc(log_get_max_level(), &level_name);
+        if (r >= 0)
+                r = sd_bus_message_append(reply, "s", level_name);
+
+        return r;
+}
+
+/* Property setter that reads a log level name from 'value' and makes it the new maximum log level. Unknown
+ * names are rejected with SD_BUS_ERROR_INVALID_ARGS. */
+int bus_property_set_log_level(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *value,
+                void *userdata,
+                sd_bus_error *error) {
+
+        const char *name;
+        int level, r;
+
+        assert(bus);
+        assert(value);
+
+        r = sd_bus_message_read(value, "s", &name);
+        if (r < 0)
+                return r;
+
+        level = log_level_from_string(name);
+        if (level < 0)
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid log level '%s'", name);
+
+        log_info("Setting log level to %s.", name);
+        log_set_max_level(level);
+
+        return 0;
+}
+
+/* Property getter that appends the name of the current log target to the reply as a string. */
+BUS_DEFINE_PROPERTY_GET_GLOBAL(bus_property_get_log_target, "s", log_target_to_string(log_get_target()));
+
+/* Property setter that reads a log target name from 'value', then switches to that target and opens it.
+ * Unknown names are rejected with SD_BUS_ERROR_INVALID_ARGS. */
+int bus_property_set_log_target(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *value,
+                void *userdata,
+                sd_bus_error *error) {
+
+        const char *name;
+        LogTarget new_target;
+        int r;
+
+        assert(bus);
+        assert(value);
+
+        r = sd_bus_message_read(value, "s", &name);
+        if (r < 0)
+                return r;
+
+        new_target = log_target_from_string(name);
+        if (new_target < 0)
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid log target '%s'", name);
+
+        log_info("Setting log target to %s.", log_target_to_string(new_target));
+        log_set_target_and_open(new_target);
+
+        return 0;
+}
+
+/* Property getter that appends program_invocation_short_name to the reply as a string. */
+BUS_DEFINE_PROPERTY_GET_GLOBAL(bus_property_get_syslog_identifier, "s", program_invocation_short_name);
+
+/* Vtable of the log control interface: writable "LogLevel" and "LogTarget" properties plus the read-only
+ * "SyslogIdentifier" property, all of type string. */
+static const sd_bus_vtable log_control_vtable[] = {
+ SD_BUS_VTABLE_START(0),
+
+ SD_BUS_WRITABLE_PROPERTY("LogLevel", "s", bus_property_get_log_level, bus_property_set_log_level, 0, 0),
+ SD_BUS_WRITABLE_PROPERTY("LogTarget", "s", bus_property_get_log_target, bus_property_set_log_target, 0, 0),
+ SD_BUS_PROPERTY("SyslogIdentifier", "s", bus_property_get_syslog_identifier, 0, 0),
+
+ /* One of these days we might want to add a similar, second interface to cover common service
+ * operations such as Reload(), Reexecute(), Exit() … and maybe some properties exposing the version
+ * number and other metadata of the service. */
+
+ SD_BUS_VTABLE_END,
+};
+
+/* Bus object implementation that exposes log_control_vtable at the LogControl1 path and interface. */
+const BusObjectImplementation log_control_object = {
+        .path = "/org/freedesktop/LogControl1",
+        .interface = "org.freedesktop.LogControl1",
+        .vtables = BUS_VTABLES(log_control_vtable),
+};
diff --git a/src/shared/bus-log-control-api.h b/src/shared/bus-log-control-api.h
new file mode 100644
index 0000000..85f60a7
--- /dev/null
+++ b/src/shared/bus-log-control-api.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+
+#include "bus-object.h"
+
+extern const BusObjectImplementation log_control_object;
+/* Registers log_control_object on 'bus', without any userdata. Returns what bus_add_implementation()
+ * returns. */
+static inline int bus_log_control_api_register(sd_bus *bus) {
+        int r;
+
+        r = bus_add_implementation(bus, &log_control_object, NULL);
+        return r;
+}
+
+int bus_property_get_log_level(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error);
+int bus_property_set_log_level(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *value, void *userdata, sd_bus_error *error);
+
+int bus_property_get_log_target(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error);
+/* Property setter: 'value' is the message carrying the new log target name. */
+int bus_property_set_log_target(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *value, void *userdata, sd_bus_error *error);
+
+int bus_property_get_syslog_identifier(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error);
diff --git a/src/shared/bus-map-properties.c b/src/shared/bus-map-properties.c
new file mode 100644
index 0000000..809759d
--- /dev/null
+++ b/src/shared/bus-map-properties.c
@@ -0,0 +1,251 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "bus-map-properties.h"
+#include "alloc-util.h"
+#include "bus-util.h"
+#include "strv.h"
+#include "bus-message.h"
+
+/* Property mapper that reads a byte array from 'm' into the sd_id128_t that userdata points to. An empty
+ * array yields SD_ID128_NULL, an array of 16 bytes is copied verbatim, any other size fails with -EINVAL. */
+int bus_map_id128(sd_bus *bus, const char *member, sd_bus_message *m, sd_bus_error *error, void *userdata) {
+        sd_id128_t *id = userdata;
+        const void *bytes;
+        size_t size;
+        int r;
+
+        r = sd_bus_message_read_array(m, SD_BUS_TYPE_BYTE, &bytes, &size);
+        if (r < 0)
+                return bus_log_parse_error_debug(r);
+
+        switch (size) {
+
+        case 0:
+                *id = SD_ID128_NULL;
+                return 0;
+
+        case 16:
+                memcpy(id->bytes, bytes, size);
+                return 0;
+
+        default:
+                return -EINVAL;
+        }
+}
+
+/* Property mapper that reads a string array from 'm', appends it to the strv that userdata points to and
+ * then sorts the combined list. */
+int bus_map_strv_sort(sd_bus *bus, const char *member, sd_bus_message *m, sd_bus_error *error, void *userdata) {
+        _cleanup_strv_free_ char **received = NULL;
+        char ***target = userdata;
+        int r;
+
+        r = sd_bus_message_read_strv_extend(m, &received);
+        if (r >= 0)
+                r = strv_extend_strv(target, received, false);
+        if (r < 0)
+                return bus_log_parse_error_debug(r);
+
+        strv_sort(*target);
+        return 0;
+}
+
+/* Default property mapper: reads the next value from 'm' and stores it at 'userdata', whose expected type
+ * depends on the D-Bus type of the value:
+ *
+ *   string, object path → 'char*' (duplicated if BUS_MAP_STRDUP is set, otherwise the pointer returned by
+ *                         sd_bus_message_read_basic() is stored as-is); empty strings are stored as NULL
+ *   array               → 'char**', the strings read are appended
+ *   boolean             → 'bool' if BUS_MAP_BOOLEAN_AS_BOOL is set, 'int' otherwise
+ *   (u)int32            → 'uint32_t'
+ *   (u)int64            → 'uint64_t'
+ *   double              → 'double'
+ *
+ * Any other type fails with -EOPNOTSUPP. */
+static int map_basic(sd_bus *bus, const char *member, sd_bus_message *m, unsigned flags, sd_bus_error *error, void *userdata) {
+        char type;
+        int r;
+
+        r = sd_bus_message_peek_type(m, &type, NULL);
+        if (r < 0)
+                return bus_log_parse_error_debug(r);
+
+        switch (type) {
+
+        case SD_BUS_TYPE_STRING:
+        case SD_BUS_TYPE_OBJECT_PATH: {
+                const char *value;
+
+                r = sd_bus_message_read_basic(m, type, &value);
+                if (r < 0)
+                        return bus_log_parse_error_debug(r);
+
+                if (isempty(value))
+                        value = NULL;
+
+                if (flags & BUS_MAP_STRDUP)
+                        return free_and_strdup((char **) userdata, value);
+
+                *(const char **) userdata = value;
+                return 0;
+        }
+
+        case SD_BUS_TYPE_ARRAY: {
+                _cleanup_strv_free_ char **received = NULL;
+
+                r = sd_bus_message_read_strv_extend(m, &received);
+                if (r < 0)
+                        return bus_log_parse_error_debug(r);
+
+                return strv_extend_strv((char ***) userdata, received, false);
+        }
+
+        case SD_BUS_TYPE_BOOLEAN: {
+                int value;
+
+                r = sd_bus_message_read_basic(m, type, &value);
+                if (r < 0)
+                        return bus_log_parse_error_debug(r);
+
+                if (flags & BUS_MAP_BOOLEAN_AS_BOOL)
+                        *(bool*) userdata = value;
+                else
+                        *(int*) userdata = value;
+
+                return 0;
+        }
+
+        case SD_BUS_TYPE_INT32:
+        case SD_BUS_TYPE_UINT32: {
+                uint32_t value;
+
+                r = sd_bus_message_read_basic(m, type, &value);
+                if (r < 0)
+                        return bus_log_parse_error_debug(r);
+
+                *(uint32_t*) userdata = value;
+                return 0;
+        }
+
+        case SD_BUS_TYPE_INT64:
+        case SD_BUS_TYPE_UINT64: {
+                uint64_t value;
+
+                r = sd_bus_message_read_basic(m, type, &value);
+                if (r < 0)
+                        return bus_log_parse_error_debug(r);
+
+                *(uint64_t*) userdata = value;
+                return 0;
+        }
+
+        case SD_BUS_TYPE_DOUBLE: {
+                double value;
+
+                r = sd_bus_message_read_basic(m, type, &value);
+                if (r < 0)
+                        return bus_log_parse_error_debug(r);
+
+                *(double*) userdata = value;
+                return 0;
+        }
+
+        default:
+                return -EOPNOTSUPP;
+        }
+}
+
+/* Returns the entry of 'map' (a table terminated by an entry with a NULL member) whose member equals
+ * 'member', or NULL if there is none. */
+static const struct bus_properties_map* find_property_map(
+                const struct bus_properties_map *map,
+                const char *member) {
+
+        for (const struct bus_properties_map *entry = map; entry->member; entry++)
+                if (streq(entry->member, member))
+                        return entry;
+
+        return NULL;
+}
+
+/* Iterates over the "a{sv}" property dictionary at the current read position of 'm'. Each property listed
+ * in 'map' is stored at userdata + offset, using the entry's set() callback if there is one and
+ * map_basic() otherwise. Properties not listed in 'map' are skipped. Returns a negative errno-style error
+ * on failure. */
+int bus_message_map_all_properties(
+                sd_bus_message *m,
+                const struct bus_properties_map *map,
+                unsigned flags,
+                sd_bus_error *error,
+                void *userdata) {
+
+        int r;
+
+        assert(m);
+        assert(map);
+
+        r = sd_bus_message_enter_container(m, SD_BUS_TYPE_ARRAY, "{sv}");
+        if (r < 0)
+                return bus_log_parse_error_debug(r);
+
+        for (;;) {
+                const struct bus_properties_map *prop;
+                const char *member;
+
+                r = sd_bus_message_enter_container(m, SD_BUS_TYPE_DICT_ENTRY, "sv");
+                if (r < 0)
+                        return bus_log_parse_error_debug(r);
+                if (r == 0)
+                        break;
+
+                r = sd_bus_message_read_basic(m, SD_BUS_TYPE_STRING, &member);
+                if (r < 0)
+                        return bus_log_parse_error_debug(r);
+
+                prop = find_property_map(map, member);
+                if (!prop) {
+                        r = sd_bus_message_skip(m, "v");
+                        if (r < 0)
+                                return bus_log_parse_error_debug(r);
+                } else {
+                        const char *contents;
+                        void *field;
+
+                        r = sd_bus_message_peek_type(m, NULL, &contents);
+                        if (r < 0)
+                                return bus_log_parse_error_debug(r);
+
+                        r = sd_bus_message_enter_container(m, SD_BUS_TYPE_VARIANT, contents);
+                        if (r < 0)
+                                return bus_log_parse_error_debug(r);
+
+                        field = (uint8_t *) userdata + prop->offset;
+                        if (prop->set)
+                                r = prop->set(sd_bus_message_get_bus(m), member, m, error, field);
+                        else
+                                r = map_basic(sd_bus_message_get_bus(m), member, m, flags, error, field);
+                        if (r < 0)
+                                return bus_log_parse_error_debug(r);
+
+                        /* Leave the variant */
+                        r = sd_bus_message_exit_container(m);
+                        if (r < 0)
+                                return bus_log_parse_error_debug(r);
+                }
+
+                /* Leave the dictionary entry */
+                r = sd_bus_message_exit_container(m);
+                if (r < 0)
+                        return bus_log_parse_error_debug(r);
+        }
+
+        /* Leave the array */
+        r = sd_bus_message_exit_container(m);
+        if (r < 0)
+                return bus_log_parse_error_debug(r);
+
+        return r;
+}
+
+/* Calls org.freedesktop.DBus.Properties.GetAll() with an empty interface name on the given object and maps
+ * the returned properties into 'userdata' via bus_message_map_all_properties(). If 'reply' is non-NULL a
+ * reference to the reply message is returned in it. Either 'reply' must be passed or BUS_MAP_STRDUP must
+ * be set. */
+int bus_map_all_properties(
+                sd_bus *bus,
+                const char *destination,
+                const char *path,
+                const struct bus_properties_map *map,
+                unsigned flags,
+                sd_bus_error *error,
+                sd_bus_message **reply,
+                void *userdata) {
+
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *properties = NULL;
+        int r;
+
+        assert(bus);
+        assert(destination);
+        assert(path);
+        assert(map);
+        assert(reply || (flags & BUS_MAP_STRDUP));
+
+        r = sd_bus_call_method(
+                        bus,
+                        destination,
+                        path,
+                        "org.freedesktop.DBus.Properties",
+                        "GetAll",
+                        error,
+                        &properties,
+                        "s", "");
+        if (r >= 0)
+                r = bus_message_map_all_properties(properties, map, flags, error, userdata);
+        if (r < 0)
+                return r;
+
+        if (reply)
+                *reply = sd_bus_message_ref(properties);
+
+        return r;
+}
diff --git a/src/shared/bus-map-properties.h b/src/shared/bus-map-properties.h
new file mode 100644
index 0000000..e9f4a92
--- /dev/null
+++ b/src/shared/bus-map-properties.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+
+typedef int (*bus_property_set_t) (sd_bus *bus, const char *member, sd_bus_message *m, sd_bus_error *error, void *userdata);
+
+/* One entry of a property mapping table as consumed by bus_message_map_all_properties(). A table is
+ * terminated by an entry whose member is NULL. */
+struct bus_properties_map {
+ const char *member; /* name of the D-Bus property this entry applies to */
+ const char *signature; /* presumably the expected type signature of the property — TODO confirm */
+ bus_property_set_t set; /* callback that stores the value; if NULL a built-in mapper for basic types is used */
+ size_t offset; /* byte offset added to the userdata pointer to find the destination of the value */
+};
+
+enum {
+ BUS_MAP_STRDUP = 1 << 0, /* If set, each "s" message is duplicated. Thus, each pointer needs to be freed. */
+ BUS_MAP_BOOLEAN_AS_BOOL = 1 << 1, /* If set, each "b" message is written to a bool pointer. If not set, "b" is written to an int pointer. */
+};
+
+int bus_map_id128(sd_bus *bus, const char *member, sd_bus_message *m, sd_bus_error *error, void *userdata);
+int bus_map_strv_sort(sd_bus *bus, const char *member, sd_bus_message *m, sd_bus_error *error, void *userdata);
+
+int bus_message_map_all_properties(sd_bus_message *m, const struct bus_properties_map *map, unsigned flags, sd_bus_error *error, void *userdata);
+int bus_map_all_properties(sd_bus *bus, const char *destination, const char *path, const struct bus_properties_map *map,
+ unsigned flags, sd_bus_error *error, sd_bus_message **reply, void *userdata);
diff --git a/src/shared/bus-message-util.c b/src/shared/bus-message-util.c
new file mode 100644
index 0000000..53f6350
--- /dev/null
+++ b/src/shared/bus-message-util.c
@@ -0,0 +1,185 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "bus-message-util.h"
+
+#include "resolve-util.h"
+
+int bus_message_read_ifindex(sd_bus_message *message, sd_bus_error *error, int *ret) {
+ int ifindex, r;
+
+ assert(message);
+ assert(ret);
+
+ assert_cc(sizeof(int) == sizeof(int32_t));
+
+ r = sd_bus_message_read(message, "i", &ifindex);
+ if (r < 0)
+ return r;
+
+ if (ifindex <= 0)
+ return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid interface index");
+
+ *ret = ifindex;
+
+ return 0;
+}
+
+/* Reads an "i" field from 'message' and returns it in 'ret' if it is AF_INET or AF_INET6. Otherwise
+ * 'error' is set to SD_BUS_ERROR_INVALID_ARGS. */
+int bus_message_read_family(sd_bus_message *message, sd_bus_error *error, int *ret) {
+        int value, r;
+
+        assert(message);
+        assert(ret);
+
+        assert_cc(sizeof(int) == sizeof(int32_t));
+
+        r = sd_bus_message_read(message, "i", &value);
+        if (r < 0)
+                return r;
+
+        if (IN_SET(value, AF_INET, AF_INET6)) {
+                *ret = value;
+                return 0;
+        }
+
+        return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unknown address family %i", value);
+}
+
+/* Reads an address family ("i") followed by a byte array ("ay") from 'message'. The family must be AF_INET
+ * or AF_INET6 and the array must have exactly the address size of that family, otherwise 'error' is set to
+ * SD_BUS_ERROR_INVALID_ARGS. Both output parameters are optional. */
+int bus_message_read_in_addr_auto(sd_bus_message *message, sd_bus_error *error, int *ret_family, union in_addr_union *ret_addr) {
+        const void *data;
+        size_t size;
+        int af, r;
+
+        assert(message);
+
+        r = sd_bus_message_read(message, "i", &af);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_read_array(message, 'y', &data, &size);
+        if (r < 0)
+                return r;
+
+        if (!IN_SET(af, AF_INET, AF_INET6))
+                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Unknown address family %i", af);
+
+        if (size != FAMILY_ADDRESS_SIZE(af))
+                return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid address size");
+
+        if (ret_addr)
+                memcpy(ret_addr, data, size);
+        if (ret_family)
+                *ret_family = af;
+
+        return 0;
+}
+
+/* Reads one DNS server struct from 'message': "(iay)", or "(iayqs)" if 'extended' is set. Returns 1 if an
+ * entry was read, 0 if no further struct could be entered and a negative errno-style error on failure. In
+ * the extended form, ports 53 and 853 are reported as 0. The server name is only read in the extended
+ * form, otherwise NULL is returned for it. */
+static int bus_message_read_dns_one(
+                sd_bus_message *message,
+                sd_bus_error *error,
+                bool extended,
+                int *ret_family,
+                union in_addr_union *ret_address,
+                uint16_t *ret_port,
+                const char **ret_server_name) {
+
+        const char *name = NULL;
+        union in_addr_union address;
+        uint16_t port = 0;
+        int af, r;
+
+        assert(message);
+        assert(ret_family);
+        assert(ret_address);
+        assert(ret_port);
+        assert(ret_server_name);
+
+        r = sd_bus_message_enter_container(message, 'r', extended ? "iayqs" : "iay");
+        if (r <= 0)
+                return r;
+
+        r = bus_message_read_in_addr_auto(message, error, &af, &address);
+        if (r < 0)
+                return r;
+
+        if (!dns_server_address_valid(af, &address)) {
+                r = sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid DNS server address");
+                assert(r < 0);
+                return r;
+        }
+
+        if (extended) {
+                r = sd_bus_message_read(message, "q", &port);
+                if (r < 0)
+                        return r;
+
+                port = IN_SET(port, 53, 853) ? 0 : port;
+
+                r = sd_bus_message_read(message, "s", &name);
+                if (r < 0)
+                        return r;
+        }
+
+        r = sd_bus_message_exit_container(message);
+        if (r < 0)
+                return r;
+
+        *ret_family = af;
+        *ret_address = address;
+        *ret_port = port;
+        *ret_server_name = name;
+
+        return 1;
+}
+
+/* Reads an array of DNS server structs ("a(iay)", or "a(iayqs)" if 'extended' is set) from 'message' and
+ * returns a newly allocated array of in_addr_full objects in 'ret_dns', with its length in 'ret_n_dns'. On
+ * failure everything allocated so far is released again and a negative errno-style error is returned.
+ *
+ * Note that the array container entered here is not exited again before returning. */
+int bus_message_read_dns_servers(
+                sd_bus_message *message,
+                sd_bus_error *error,
+                bool extended,
+                struct in_addr_full ***ret_dns,
+                size_t *ret_n_dns) {
+
+        struct in_addr_full **servers = NULL;
+        const char *server_name;
+        union in_addr_union address;
+        size_t n_servers = 0;
+        uint16_t port;
+        int family, r;
+
+        assert(message);
+        assert(ret_dns);
+        assert(ret_n_dns);
+
+        r = sd_bus_message_enter_container(message, 'a', extended ? "(iayqs)" : "(iay)");
+        if (r < 0)
+                return r;
+
+        while ((r = bus_message_read_dns_one(message, error, extended, &family, &address, &port, &server_name)) > 0) {
+
+                if (!GREEDY_REALLOC(servers, n_servers+1)) {
+                        r = -ENOMEM;
+                        goto clear;
+                }
+
+                r = in_addr_full_new(family, &address, port, 0, server_name, servers + n_servers);
+                if (r < 0)
+                        goto clear;
+
+                n_servers++;
+        }
+        if (r < 0)
+                goto clear;
+
+        *ret_dns = TAKE_PTR(servers);
+        *ret_n_dns = n_servers;
+        return 0;
+
+clear:
+        for (size_t i = 0; i < n_servers; i++)
+                in_addr_full_free(servers[i]);
+        free(servers);
+
+        return r;
+}
diff --git a/src/shared/bus-message-util.h b/src/shared/bus-message-util.h
new file mode 100644
index 0000000..b82c083
--- /dev/null
+++ b/src/shared/bus-message-util.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+
+#include "in-addr-util.h"
+#include "socket-netlink.h"
+
+int bus_message_read_ifindex(sd_bus_message *message, sd_bus_error *error, int *ret);
+int bus_message_read_family(sd_bus_message *message, sd_bus_error *error, int *ret);
+int bus_message_read_in_addr_auto(sd_bus_message *message, sd_bus_error *error, int *ret_family, union in_addr_union *ret_addr);
+
+int bus_message_read_dns_servers(
+ sd_bus_message *message,
+ sd_bus_error *error,
+ bool extended,
+ struct in_addr_full ***ret_dns,
+ size_t *ret_n_dns);
diff --git a/src/shared/bus-object.c b/src/shared/bus-object.c
new file mode 100644
index 0000000..4ed5215
--- /dev/null
+++ b/src/shared/bus-object.c
@@ -0,0 +1,177 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "bus-introspect.h"
+#include "bus-object.h"
+#include "macro.h"
+#include "string-util.h"
+#include "strv.h"
+
+/* Registers everything 'impl' describes on 'bus': its regular vtables, its fallback vtables, the node
+ * enumerator and the object manager (the latter two only if configured), and then recursively all child
+ * implementations. 'userdata' is passed on to the registration calls. Failures are logged and returned as
+ * negative errno-style errors. */
+int bus_add_implementation(sd_bus *bus, const BusObjectImplementation *impl, void *userdata) {
+        int r;
+
+        log_debug("Registering bus object implementation for path=%s iface=%s", impl->path, impl->interface);
+
+        for (size_t i = 0; impl->vtables && impl->vtables[i]; i++) {
+                r = sd_bus_add_object_vtable(
+                                bus,
+                                NULL,
+                                impl->path,
+                                impl->interface,
+                                impl->vtables[i],
+                                userdata);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to register bus path %s with interface %s: %m",
+                                               impl->path,
+                                               impl->interface);
+        }
+
+        for (size_t i = 0; impl->fallback_vtables && impl->fallback_vtables[i].vtable; i++) {
+                const BusObjectVtablePair *pair = &impl->fallback_vtables[i];
+
+                r = sd_bus_add_fallback_vtable(
+                                bus,
+                                NULL,
+                                impl->path,
+                                impl->interface,
+                                pair->vtable,
+                                pair->object_find,
+                                userdata);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to register bus path %s with interface %s: %m",
+                                               impl->path,
+                                               impl->interface);
+        }
+
+        if (impl->node_enumerator) {
+                r = sd_bus_add_node_enumerator(bus, NULL, impl->path, impl->node_enumerator, userdata);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to add node enumerator for %s: %m",
+                                               impl->path);
+        }
+
+        if (impl->manager) {
+                r = sd_bus_add_object_manager(bus, NULL, impl->path);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to add object manager for %s: %m", impl->path);
+        }
+
+        for (const BusObjectImplementation **child = impl->children; child && *child; child++) {
+                r = bus_add_implementation(bus, *child, userdata);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+/* Searches the NULL-terminated list 'bus_objects' depth-first, including all children, for the first
+ * implementation whose path or interface equals 'pattern'. Returns NULL if nothing matches or the list is
+ * NULL. */
+static const BusObjectImplementation* find_implementation(
+                const char *pattern,
+                const BusObjectImplementation* const* bus_objects) {
+
+        if (!bus_objects)
+                return NULL;
+
+        for (const BusObjectImplementation* const* p = bus_objects; *p; p++) {
+                const BusObjectImplementation *found;
+
+                if (STR_IN_SET(pattern, (*p)->path, (*p)->interface))
+                        return *p;
+
+                found = find_implementation(pattern, (*p)->children);
+                if (found)
+                        return found;
+        }
+
+        return NULL;
+}
+
+/* Writes the interface description of every regular and every fallback vtable of 'impl' to 'intro'.
+ * Failures are logged and returned as negative errno-style errors. */
+static int bus_introspect_implementation(
+                struct introspect *intro,
+                const BusObjectImplementation *impl) {
+
+        int r;
+
+        for (size_t i = 0; impl->vtables && impl->vtables[i]; i++) {
+                r = introspect_write_interface(intro, impl->interface, impl->vtables[i]);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to write introspection data: %m");
+        }
+
+        for (size_t i = 0; impl->fallback_vtables && impl->fallback_vtables[i].vtable; i++) {
+                r = introspect_write_interface(intro, impl->interface, impl->fallback_vtables[i].vtable);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to write introspection data: %m");
+        }
+
+        return 0;
+}
+
+/* Prints one "path<TAB>interface" line to 'out' for each implementation in the NULL-terminated list
+ * 'bus_objects', recursing into children right after their parent. A NULL list is treated as empty, like
+ * find_implementation() does. */
+static void list_paths(
+                FILE *out,
+                const BusObjectImplementation* const* bus_objects) {
+
+        for (size_t i = 0; bus_objects && bus_objects[i]; i++) {
+                fprintf(out, "%s\t%s\n", bus_objects[i]->path, bus_objects[i]->interface);
+
+                /* No need to check for NULL here, the loop condition of the recursive call does that */
+                list_paths(out, bus_objects[i]->children);
+        }
+}
+
+/* Writes the introspection XML of the implementation in 'bus_objects' whose object path or interface name
+ * equals 'pattern' to 'out'. If 'pattern' is "list", one "path<TAB>interface" line per known
+ * implementation is printed instead. Returns 0 on success; failures are logged and returned as negative
+ * errno-style errors (-ENOENT if nothing matches 'pattern'). */
+int bus_introspect_implementations(
+                FILE *out,
+                const char *pattern,
+                const BusObjectImplementation* const* bus_objects) {
+
+        const BusObjectImplementation *impl, *main_impl = NULL;
+        _cleanup_free_ char *s = NULL;
+        int r;
+
+        if (streq(pattern, "list")) {
+                list_paths(out, bus_objects);
+                return 0;
+        }
+
+        struct introspect intro = {};
+        bool is_interface = sd_bus_interface_name_is_valid(pattern);
+
+        impl = find_implementation(pattern, bus_objects);
+        if (!impl)
+                return log_error_errno(SYNTHETIC_ERRNO(ENOENT),
+                                       "%s %s not found",
+                                       is_interface ? "Interface" : "Object path",
+                                       pattern);
+
+        /* We use trusted=false here to get all the @org.freedesktop.systemd1.Privileged annotations. */
+        r = introspect_begin(&intro, false);
+        if (r < 0)
+                return log_error_errno(r, "Failed to write introspection data: %m");
+
+        r = introspect_write_default_interfaces(&intro, impl->manager);
+        if (r < 0)
+                return log_error_errno(r, "Failed to write introspection data: %m");
+
+        /* Check if there is a non-fallback path that applies to the given interface, also
+         * print it. This is useful in the case of units: o.fd.systemd1.Service is declared
+         * as a fallback vtable for o/fd/systemd1/unit, and we also want to print
+         * o.fd.systemd1.Unit, which is the non-fallback implementation. */
+        if (impl->fallback_vtables && is_interface)
+                main_impl = find_implementation(impl->path, bus_objects);
+
+        /* bus_introspect_implementation() logs its own failures, so only propagate them here rather than
+         * carrying on and printing an incomplete description. */
+        if (main_impl) {
+                r = bus_introspect_implementation(&intro, main_impl);
+                if (r < 0)
+                        return r;
+        }
+
+        if (impl != main_impl) {
+                r = bus_introspect_implementation(&intro, impl);
+                if (r < 0)
+                        return r;
+        }
+
+        _cleanup_ordered_set_free_ OrderedSet *nodes = NULL;
+
+        for (size_t i = 0; impl->children && impl->children[i]; i++) {
+                r = ordered_set_put_strdup(&nodes, impl->children[i]->path);
+                if (r < 0)
+                        return log_oom();
+        }
+
+        r = introspect_write_child_nodes(&intro, nodes, impl->path);
+        if (r < 0)
+                return log_error_errno(r, "Failed to write introspection data: %m");
+
+        r = introspect_finish(&intro, &s);
+        if (r < 0)
+                return log_error_errno(r, "Failed to write introspection data: %m");
+
+        fputs(s, out);
+        return 0;
+}
diff --git a/src/shared/bus-object.h b/src/shared/bus-object.h
new file mode 100644
index 0000000..145bbd2
--- /dev/null
+++ b/src/shared/bus-object.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+#include <stdio.h>
+
+#include "sd-bus.h"
+
+typedef struct BusObjectImplementation BusObjectImplementation;
+
+typedef struct BusObjectVtablePair { /* Element type of BusObjectImplementation.fallback_vtables */
+ const sd_bus_vtable *vtable; /* a zeroed entry ends arrays built with BUS_FALLBACK_VTABLES() */
+ sd_bus_object_find_t object_find; /* presumably the object lookup callback paired with vtable — registration code is not visible here, TODO confirm */
+} BusObjectVtablePair;
+
+struct BusObjectImplementation { /* Describes one bus object implementation and its child implementations */
+ const char *path; /* object path; bus_introspect_implementations() uses it to look up the main implementation and when writing child nodes */
+ const char *interface; /* presumably the D-Bus interface name matched by find_implementation() — not visible here, TODO confirm */
+ const sd_bus_vtable **vtables; /* NULL-terminated when built with BUS_VTABLES() */
+ const BusObjectVtablePair *fallback_vtables; /* ends with a zeroed entry when built with BUS_FALLBACK_VTABLES() */
+ sd_bus_node_enumerator_t node_enumerator; /* NOTE(review): no use visible in this chunk; presumably registered by bus_add_implementation() */
+ bool manager; /* handed to introspect_write_default_interfaces() when generating introspection data */
+ const BusObjectImplementation **children; /* optional, NULL-terminated (see BUS_IMPLEMENTATIONS()) */
+};
+
+#define BUS_VTABLES(...) ((const sd_bus_vtable* []){ __VA_ARGS__, NULL })
+#define BUS_FALLBACK_VTABLES(...) ((const BusObjectVtablePair[]) { __VA_ARGS__, {} })
+#define BUS_IMPLEMENTATIONS(...) ((const BusObjectImplementation* []) { __VA_ARGS__, NULL })
+
+int bus_add_implementation(sd_bus *bus, const BusObjectImplementation *impl, void *userdata);
+int bus_introspect_implementations(
+ FILE *out,
+ const char *pattern,
+ const BusObjectImplementation* const* bus_objects);
diff --git a/src/shared/bus-polkit.c b/src/shared/bus-polkit.c
new file mode 100644
index 0000000..904b897
--- /dev/null
+++ b/src/shared/bus-polkit.c
@@ -0,0 +1,575 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "bus-internal.h"
+#include "bus-message.h"
+#include "bus-polkit.h"
+#include "bus-util.h"
+#include "strv.h"
+#include "user-util.h"
+
+/* Checks whether the sender of 'm' has the effective UID 'good_user'.
+ *
+ * Returns 0 if 'good_user' is UID_INVALID or the sender's EUID differs, 1 if it matches, and a negative
+ * errno-style value if the sender's credentials cannot be acquired. */
+static int check_good_user(sd_bus_message *m, uid_t good_user) {
+        _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
+        uid_t euid;
+        int r;
+
+        assert(m);
+
+        /* No privileged user was configured, hence nobody can match. */
+        if (good_user == UID_INVALID)
+                return 0;
+
+        r = sd_bus_query_sender_creds(m, SD_BUS_CREDS_EUID, &creds);
+        if (r < 0)
+                return r;
+
+        /* Augmented credentials must not be used for authorization decisions. */
+        assert_return((sd_bus_creds_get_augmented_mask(creds) & SD_BUS_CREDS_EUID) == 0, -EPERM);
+
+        r = sd_bus_creds_get_euid(creds, &euid);
+
+        return r < 0 ? r : (euid == good_user);
+}
+
+#if ENABLE_POLKIT
+/* Appends the entries of 'l', taken pairwise as key and value, to 'm' as an array of "{ss}" dictionary
+ * entries. Returns the result of closing the array container, or a negative errno-style value as soon
+ * as any step fails. */
+static int bus_message_append_strv_key_value(sd_bus_message *m, const char **l) {
+        int r;
+
+        assert(m);
+
+        r = sd_bus_message_open_container(m, 'a', "{ss}");
+        if (r < 0)
+                return r;
+
+        STRV_FOREACH_PAIR(key, value, l) {
+                r = sd_bus_message_append(m, "{ss}", *key, *value);
+                if (r < 0)
+                        return r;
+        }
+
+        return sd_bus_message_close_container(m);
+}
+
+/* Builds, but does not send, a CheckAuthorization() method call to org.freedesktop.PolicyKit1 asking
+ * whether the sender of 'm' may perform 'action'. 'details' is appended as key/value pairs and
+ * 'interactive' is passed on as the "u" argument.
+ *
+ * On success the new message is returned in 'ret' and 0 is returned. Returns -EBADMSG if 'm' carries
+ * no sender, or another negative errno-style value if the message cannot be assembled. */
+static int bus_message_new_polkit_auth_call(
+                sd_bus_message *m,
+                const char *action,
+                const char **details,
+                bool interactive,
+                sd_bus_message **ret) {
+
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *query = NULL;
+        const char *sender;
+        int r;
+
+        assert(m);
+        assert(action);
+        assert(ret);
+
+        /* The polkit subject is identified by its bus name, hence we cannot do without a sender. */
+        sender = sd_bus_message_get_sender(m);
+        if (!sender)
+                return -EBADMSG;
+
+        r = sd_bus_message_new_method_call(
+                        ASSERT_PTR(m->bus),
+                        &query,
+                        "org.freedesktop.PolicyKit1",
+                        "/org/freedesktop/PolicyKit1/Authority",
+                        "org.freedesktop.PolicyKit1.Authority",
+                        "CheckAuthorization");
+        if (r < 0)
+                return r;
+
+        /* Subject and action first, then the details, then the interactivity flag and an unset string. */
+        r = sd_bus_message_append(query, "(sa{sv})s", "system-bus-name", 1, "name", "s", sender, action);
+        if (r >= 0)
+                r = bus_message_append_strv_key_value(query, details);
+        if (r >= 0)
+                r = sd_bus_message_append(query, "us", interactive, NULL);
+        if (r < 0)
+                return r;
+
+        *ret = TAKE_PTR(query);
+        return 0;
+}
+#endif
+
+/* Synchronously and non-interactively tests whether the sender of 'call' is allowed to do 'action'.
+ *
+ * Returns 1 if access is granted (sender is 'good_user', holds 'capability', or polkit authorized it),
+ * 0 if polkit did not authorize and '_challenge' was passed (it then receives polkit's challenge flag),
+ * -EACCES if access is denied, and another negative errno-style value on failure. */
+int bus_test_polkit(
+                sd_bus_message *call,
+                int capability,
+                const char *action,
+                const char **details,
+                uid_t good_user,
+                bool *_challenge,
+                sd_bus_error *ret_error) {
+
+        int r;
+
+        assert(call);
+        assert(action);
+
+        /* Tests non-interactively! */
+
+        r = check_good_user(call, good_user);
+        if (r != 0)
+                return r;
+
+        r = sd_bus_query_sender_privilege(call, capability);
+        if (r < 0)
+                return r;
+        if (r > 0)
+                return 1;
+
+#if ENABLE_POLKIT
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *query = NULL, *answer = NULL;
+        int authorized = false, challenge = false;
+
+        r = bus_message_new_polkit_auth_call(call, action, details, /* interactive = */ false, &query);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_call(call->bus, query, 0, ret_error, &answer);
+        if (r < 0) {
+                if (!bus_error_is_unknown_service(ret_error))
+                        return r;
+
+                /* Treat no PK available as access denied */
+                sd_bus_error_free(ret_error);
+                return -EACCES;
+        }
+
+        r = sd_bus_message_enter_container(answer, 'r', "bba{ss}");
+        if (r >= 0)
+                r = sd_bus_message_read(answer, "bb", &authorized, &challenge);
+        if (r < 0)
+                return r;
+
+        if (authorized)
+                return 1;
+
+        /* Not authorized: tell the caller, if interested, whether polkit reported a challenge. */
+        if (_challenge) {
+                *_challenge = challenge;
+                return 0;
+        }
+#endif
+
+        return -EACCES;
+}
+
+#if ENABLE_POLKIT
+
+typedef struct AsyncPolkitQueryAction { /* One polkit action (plus details) that was or is being checked */
+ char *action; /* strdup()ed copy of the action name, owned by this object */
+ char **details; /* strv_copy()ed copy of the details, owned by this object */
+
+ LIST_FIELDS(struct AsyncPolkitQueryAction, authorized); /* linkage for AsyncPolkitQuery.authorized_actions */
+} AsyncPolkitQueryAction;
+
+/* Releases 'a' together with the action string and details it owns. Accepts NULL; always returns NULL. */
+static AsyncPolkitQueryAction *async_polkit_query_action_free(AsyncPolkitQueryAction *a) {
+        if (a) {
+                free(a->action);
+                strv_free(a->details);
+                free(a);
+        }
+
+        return NULL;
+}
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(AsyncPolkitQueryAction*, async_polkit_query_action_free);
+
+typedef struct AsyncPolkitQuery { /* State of the polkit checks done for one incoming method call */
+ unsigned n_ref; /* reference counter, see async_polkit_query_ref()/async_polkit_query_unref() */
+
+ AsyncPolkitQueryAction *action; /* action whose polkit reply is still outstanding; taken over by async_polkit_read_reply() */
+
+ sd_bus_message *request; /* the original method call (referenced); also the key in 'registry' */
+ sd_bus_slot *slot; /* slot of the sd_bus_call_async() in flight; dropped once the reply is processed */
+
+ Hashmap *registry; /* hashmap this query is stored in, NULL as long as it has not been inserted */
+ sd_event_source *defer_event_source; /* idle-priority defer source running async_polkit_defer() */
+
+ LIST_HEAD(AsyncPolkitQueryAction, authorized_actions); /* actions polkit authorized */
+ AsyncPolkitQueryAction *denied_action; /* action that was denied (or polkit service unknown) */
+ AsyncPolkitQueryAction *error_action; /* action for which 'error' was recorded */
+ sd_bus_error error; /* error returned again when 'error_action' is checked a second time */
+} AsyncPolkitQuery;
+
+/* Destroys 'q': drops its bus slot, removes it from the registry it is stored in (if any) and releases
+ * everything it owns. Accepts NULL; always returns NULL. */
+static AsyncPolkitQuery *async_polkit_query_free(AsyncPolkitQuery *q) {
+        if (!q)
+                return NULL;
+
+        /* Drop the slot of a possibly still outstanding polkit call. */
+        sd_bus_slot_unref(q->slot);
+
+        /* Remove the registry entry while the key (the request message) is still referenced. */
+        if (q->registry && q->request)
+                (void) hashmap_remove(q->registry, q->request);
+        sd_bus_message_unref(q->request);
+
+        sd_event_source_disable_unref(q->defer_event_source);
+
+        /* Release the pending action and all recorded verdicts. */
+        async_polkit_query_action_free(q->action);
+        async_polkit_query_action_free(q->denied_action);
+        async_polkit_query_action_free(q->error_action);
+        LIST_CLEAR(authorized, q->authorized_actions, async_polkit_query_action_free);
+        sd_bus_error_free(&q->error);
+
+        free(q);
+        return NULL;
+}
+
+DEFINE_PRIVATE_TRIVIAL_REF_UNREF_FUNC(AsyncPolkitQuery, async_polkit_query, async_polkit_query_free);
+DEFINE_TRIVIAL_CLEANUP_FUNC(AsyncPolkitQuery*, async_polkit_query_unref);
+
+/* Defer event handler: drops one reference to the query passed as 'userdata'. It is installed with idle
+ * priority once a polkit reply was processed, so that it hopefully runs only after the re-enqueued
+ * method call has been handled. Always returns 0. */
+static int async_polkit_defer(sd_event_source *s, void *userdata) {
+        AsyncPolkitQuery *q = ASSERT_PTR(userdata);
+
+        assert(s);
+
+        async_polkit_query_unref(q);
+
+        return 0;
+}
+
+/* Evaluates the polkit 'reply' for the action pending in 'q' and files that action under exactly one of
+ * q->authorized_actions, q->denied_action or q->error_action (the latter together with q->error).
+ *
+ * Returns 0 once the verdict is recorded, a negative errno-style value if the reply cannot be parsed or
+ * copying the error runs out of memory. In the parse failure case the pending action is released. */
+static int async_polkit_read_reply(sd_bus_message *reply, AsyncPolkitQuery *q) {
+        _cleanup_(async_polkit_query_action_freep) AsyncPolkitQueryAction *pending = NULL;
+        int authorized, challenge, r;
+
+        assert(reply);
+        assert(q);
+
+        /* Processing of a PolicyKit checks is canceled on the first auth. error. */
+        assert(!q->denied_action);
+        assert(!q->error_action);
+        assert(!sd_bus_error_is_set(&q->error));
+
+        assert(q->action);
+        pending = TAKE_PTR(q->action);
+
+        if (sd_bus_message_is_method_error(reply, NULL)) {
+                const sd_bus_error *e = sd_bus_message_get_error(reply);
+
+                if (bus_error_is_unknown_service(e)) {
+                        /* Treat no PK available as access denied */
+                        q->denied_action = TAKE_PTR(pending);
+                        return 0;
+                }
+
+                /* Remember the error from the polkit reply, so that it can be returned when the same
+                 * authorization is attempted a second time. Only running out of memory counts as failure
+                 * of the copy itself. */
+                q->error_action = TAKE_PTR(pending);
+                r = sd_bus_error_copy(&q->error, e);
+
+                return r == -ENOMEM ? r : 0;
+        }
+
+        r = sd_bus_message_enter_container(reply, 'r', "bba{ss}");
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_read(reply, "bb", &authorized, &challenge);
+        if (r < 0)
+                return r;
+
+        if (authorized) {
+                LIST_PREPEND(authorized, q->authorized_actions, TAKE_PTR(pending));
+                return 0;
+        }
+
+        if (!challenge) {
+                q->denied_action = TAKE_PTR(pending);
+                return 0;
+        }
+
+        /* Not authorized, but polkit signalled a challenge: report that interaction is needed. */
+        q->error_action = TAKE_PTR(pending);
+        sd_bus_error_set_const(&q->error, SD_BUS_ERROR_INTERACTIVE_AUTHORIZATION_REQUIRED, "Interactive authentication required.");
+
+        return 0;
+}
+
+/* Handles a polkit 'reply' for query 'q': records the verdict, arms the idle-priority clean-up event and
+ * puts the original method call back into the read queue of its bus.
+ *
+ * Returns 1 on success and a negative errno-style value on failure. */
+static int async_polkit_process_reply(sd_bus_message *reply, AsyncPolkitQuery *q) {
+        int r;
+
+        assert(reply);
+        assert(q);
+
+        /* The asynchronous call has completed, hence its slot is not needed anymore. */
+        assert(q->slot);
+        q->slot = sd_bus_slot_unref(q->slot);
+
+        r = async_polkit_read_reply(reply, q);
+        if (r < 0)
+                return r;
+
+        /* Now, let's dispatch the original message a second time by re-enqueuing it. This will then
+         * traverse the whole message processing again, and thus re-validate and re-retrieve the
+         * "userdata" field.
+         *
+         * We install an idle event loop event to clean up the PolicyKit request data when we are idle
+         * again, i.e. after the second round of processing the message is complete. */
+
+        if (!q->defer_event_source) {
+                sd_event *event = sd_bus_get_event(sd_bus_message_get_bus(reply));
+
+                r = sd_event_add_defer(event, &q->defer_event_source, async_polkit_defer, q);
+                if (r >= 0)
+                        r = sd_event_source_set_priority(q->defer_event_source, SD_EVENT_PRIORITY_IDLE);
+                if (r < 0)
+                        return r;
+        }
+
+        r = sd_event_source_set_enabled(q->defer_event_source, SD_EVENT_ONESHOT);
+        if (r >= 0)
+                r = sd_bus_message_rewind(q->request, true);
+        if (r >= 0)
+                r = sd_bus_enqueue_for_read(sd_bus_message_get_bus(q->request), q->request);
+        if (r < 0)
+                return r;
+
+        return 1;
+}
+
+/* Reply callback of the asynchronous polkit call; 'userdata' is the AsyncPolkitQuery. If the reply
+ * cannot be processed, the original request is answered with the error and one reference to the query
+ * is dropped. Returns whatever async_polkit_process_reply() returned. */
+static int async_polkit_callback(sd_bus_message *reply, void *userdata, sd_bus_error *error) {
+        AsyncPolkitQuery *q = ASSERT_PTR(userdata);
+        int r;
+
+        assert(reply);
+
+        r = async_polkit_process_reply(reply, q);
+        if (r >= 0)
+                return r;
+
+        log_debug_errno(r, "Processing asynchronous PolicyKit reply failed, ignoring: %m");
+        (void) sd_bus_reply_method_errno(q->request, r, NULL);
+        async_polkit_query_unref(q);
+
+        return r;
+}
+
+/* Looks up whether query 'q' already holds a verdict for 'action'.
+ *
+ * Returns 1 if the action was authorized with the very same 'details', -EACCES if it is the denied
+ * action, the result of copying the stored error into 'ret_error' if it is the action that failed, and
+ * 0 if nothing is known about it yet. Note that denied and failed actions are matched by name only. */
+static int async_polkit_query_check_action(
+                AsyncPolkitQuery *q,
+                const char *action,
+                const char **details,
+                sd_bus_error *ret_error) {
+
+        bool failed, denied;
+
+        assert(q);
+        assert(action);
+        assert(ret_error);
+
+        LIST_FOREACH(authorized, a, q->authorized_actions)
+                if (streq(a->action, action) && strv_equal(a->details, (char**) details))
+                        return 1;
+
+        failed = q->error_action && streq(q->error_action->action, action);
+        denied = q->denied_action && streq(q->denied_action->action, action);
+
+        if (failed)
+                return sd_bus_error_copy(ret_error, &q->error);
+        if (denied)
+                return -EACCES;
+
+        return 0;
+}
+
+#endif
+
+/* bus_verify_polkit_async() handles verification of D-Bus calls with polkit. Because the polkit API
+ * is asynchronous, the whole thing is a bit complex and requires some support in the code that uses
+ * it. It relies on sd-bus's support for interrupting the processing of a message.
+ *
+ * Requirements:
+ *
+ * * bus_verify_polkit_async() must be called before any changes to internal state.
+ * * If bus_verify_polkit_async() has made a new polkit query (signaled by return value 0),
+ * processing of the message should be interrupted. This is done by returning 1--which sd-bus
+ * handles specially--and is usually accompanied by a comment. (The message will be queued for
+ * processing again later when a reply from polkit is received.)
+ * * The code needs to keep a hashmap, here called registry, in which bus_verify_polkit_async()
+ * stores active queries. This hashmap's lifetime must be larger than the method handler's;
+ * e.g., it can be a member of some "manager" object or a global variable.
+ *
+ * Return value:
+ *
+ * * 0 - a new polkit call has been made, which means the processing of the message should be
+ * interrupted;
+ * * 1 - the action has been allowed;
+ * * -EACCES - the action has been denied;
+ * * < 0 - an unspecified error.
+ *
+ * A step-by-step description of how it works:
+ *
+ * 1. A D-Bus method handler calls bus_verify_polkit_async(), passing it the D-Bus message being
+ * processed and the polkit action to verify.
+ * 2. bus_verify_polkit_async() checks the registry for an existing query object associated with the
+ * message. Let's assume this is the first call, so it finds nothing.
+ * 3. A new AsyncPolkitQuery object is created and an async. D-Bus call to polkit is made. The
+ * function then returns 0. The method handler returns 1 to tell sd-bus that the processing of
+ * the message has been interrupted.
+ * 4. (Later) A reply from polkit is received and async_polkit_callback() is called.
+ * 5. async_polkit_callback() reads the reply and stores its result in the passed query.
+ * 6. async_polkit_callback() enqueues the original message again.
+ * 7. (Later) The same D-Bus method handler is called for the same message. It calls
+ * bus_verify_polkit_async() again.
+ * 8. bus_verify_polkit_async() checks the registry for an existing query object associated with the
+ * message. It finds one and returns the result for the action.
+ * 9. The method handler continues processing of the message. If there's another action that needs
+ * to be verified:
+ * 10. bus_verify_polkit_async() is called again for the new action. The registry already contains a
+ * query for the message, but the new action hasn't been seen yet, hence a new async. polkit call is
+ * made for the existing query object (as in step 3) and steps 4-8 are repeated.
+ * 11. (In the method handler again.) bus_verify_polkit_async() returns query results for both
+ * actions and the processing continues as in step 9.
+ *
+ * Memory handling:
+ *
+ * async_polkit_callback() registers a deferred call of async_polkit_defer() for the query, which
+ * causes the query to be removed from the registry and freed. Deferred events are run with idle
+ * priority, so this will happen after processing of the D-Bus message, when the query is no longer
+ * needed.
+ *
+ * Schematically:
+ *
+ * (m - D-Bus message, a - polkit action, q - polkit query)
+ *
+ * -> foo_method(m)
+ * -> bus_verify_polkit_async(m, a)
+ * -> async_polkit_query_ref(q)
+ * -> sd_bus_call_async(q)
+ * <- bus_verify_polkit_async(m, a) = 0
+ * <- foo_method(m) = 1
+ * ...
+ * -> async_polkit_callback(q)
+ * -> sd_event_add_defer(async_polkit_defer, q)
+ * -> sd_bus_enqueue_for_read(m)
+ * <- async_polkit_callback(q)
+ * ...
+ * -> foo_method(m)
+ * -> bus_verify_polkit_async(m, a)
+ * <- bus_verify_polkit_async(m, a) = 1/-EACCES/error
+ * ...
+ * // possibly another call to bus_verify_polkit_async with action a2
+ * <- foo_method(m)
+ * ...
+ * -> async_polkit_defer(q)
+ * -> async_polkit_query_unref(q)
+ * <- async_polkit_defer(q)
+ */
+
+int bus_verify_polkit_async(
+                sd_bus_message *call,
+                int capability,
+                const char *action,
+                const char **details,
+                bool interactive,
+                uid_t good_user,
+                Hashmap **registry,
+                sd_bus_error *ret_error) {
+
+        int r;
+
+        assert(call);
+        assert(action);
+        assert(registry);
+        assert(ret_error);
+
+        /* A sender running as the designated user is let through right away. */
+        r = check_good_user(call, good_user);
+        if (r != 0)
+                return r;
+
+#if ENABLE_POLKIT
+        _cleanup_(async_polkit_query_unrefp) AsyncPolkitQuery *q = NULL;
+
+        /* If a query is already registered for this message we are being invoked again, hence check
+         * whether polkit's answer for this action has arrived in the meantime. */
+        q = async_polkit_query_ref(hashmap_get(*registry, call));
+        if (q) {
+                r = async_polkit_query_check_action(q, action, details, ret_error);
+                if (r != 0)
+                        return r;
+        }
+#endif
+
+        r = sd_bus_query_sender_privilege(call, capability);
+        if (r < 0)
+                return r;
+        if (r > 0)
+                return 1;
+
+#if ENABLE_POLKIT
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *check_call = NULL;
+
+        /* The message itself may ask for interactive authorization, too. */
+        r = sd_bus_message_get_allow_interactive_authorization(call);
+        if (r < 0)
+                return r;
+        interactive = interactive || r > 0;
+
+        r = hashmap_ensure_allocated(registry, NULL);
+        if (r >= 0)
+                r = bus_message_new_polkit_auth_call(call, action, details, interactive, &check_call);
+        if (r < 0)
+                return r;
+
+        /* First check for this message: set up a fresh query object. */
+        if (!q) {
+                q = new(AsyncPolkitQuery, 1);
+                if (!q)
+                        return -ENOMEM;
+
+                *q = (AsyncPolkitQuery) {
+                        .n_ref = 1,
+                        .request = sd_bus_message_ref(call),
+                };
+        }
+
+        /* Remember which action the call we are about to issue is for. */
+        assert(!q->action);
+        q->action = new(AsyncPolkitQueryAction, 1);
+        if (!q->action)
+                return -ENOMEM;
+
+        *q->action = (AsyncPolkitQueryAction) {
+                .action = strdup(action),
+                .details = strv_copy((char**) details),
+        };
+        if (!q->action->action || !q->action->details)
+                return -ENOMEM;
+
+        /* Register the query under the message, unless that happened on an earlier invocation. */
+        if (!q->registry) {
+                r = hashmap_put(*registry, call, q);
+                if (r < 0)
+                        return r;
+
+                q->registry = *registry;
+        }
+
+        r = sd_bus_call_async(call->bus, &q->slot, check_call, async_polkit_callback, q, 0);
+        if (r < 0)
+                return r;
+
+        /* The reference held here is handed over; it is dropped again by async_polkit_defer() or by the
+         * failure path of async_polkit_callback(). */
+        TAKE_PTR(q);
+
+        return 0;
+#endif
+
+        return -EACCES;
+}
+
+/* Frees a registry as used by bus_verify_polkit_async() and returns the result of the hashmap free call.
+ * With polkit support compiled in, every query still stored gets unreferenced; without it the registry
+ * is expected to be empty. */
+Hashmap *bus_verify_polkit_async_registry_free(Hashmap *registry) {
+#if !ENABLE_POLKIT
+        assert(hashmap_isempty(registry));
+        return hashmap_free(registry);
+#else
+        return hashmap_free_with_destructor(registry, async_polkit_query_unref);
+#endif
+}
diff --git a/src/shared/bus-polkit.h b/src/shared/bus-polkit.h
new file mode 100644
index 0000000..e2a3b7e
--- /dev/null
+++ b/src/shared/bus-polkit.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+
+#include "hashmap.h"
+
+int bus_test_polkit(sd_bus_message *call, int capability, const char *action, const char **details, uid_t good_user, bool *_challenge, sd_bus_error *e);
+
+int bus_verify_polkit_async(sd_bus_message *call, int capability, const char *action, const char **details, bool interactive, uid_t good_user, Hashmap **registry, sd_bus_error *error);
+Hashmap *bus_verify_polkit_async_registry_free(Hashmap *registry);
diff --git a/src/shared/bus-print-properties.c b/src/shared/bus-print-properties.c
new file mode 100644
index 0000000..6704e1e
--- /dev/null
+++ b/src/shared/bus-print-properties.c
@@ -0,0 +1,440 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "bus-print-properties.h"
+#include "cap-list.h"
+#include "cgroup-util.h"
+#include "escape.h"
+#include "mountpoint-util.h"
+#include "nsflags.h"
+#include "parse-util.h"
+#include "stdio-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "time-util.h"
+#include "user-util.h"
+
+/* Prints one property to stdout, either as "name=value" or, with BUS_PRINT_PROPERTY_ONLY_VALUE, as the
+ * bare value. Nothing is printed if 'expected_value' is set and differs from 'value', or if 'value' is
+ * empty and BUS_PRINT_PROPERTY_SHOW_EMPTY is not set. Always returns 0. */
+int bus_print_property_value(const char *name, const char *expected_value, BusPrintPropertyFlags flags, const char *value) {
+        bool mismatch, suppressed;
+
+        assert(name);
+
+        mismatch = expected_value && !streq_ptr(expected_value, value);
+        if (mismatch)
+                return 0;
+
+        suppressed = !FLAGS_SET(flags, BUS_PRINT_PROPERTY_SHOW_EMPTY) && isempty(value);
+        if (suppressed)
+                return 0;
+
+        if (!FLAGS_SET(flags, BUS_PRINT_PROPERTY_ONLY_VALUE))
+                printf("%s=%s\n", name, strempty(value));
+        else
+                puts(strempty(value));
+
+        return 0;
+}
+
+/* printf()-style variant of bus_print_property_value(): formats the value according to 'fmt' first.
+ * Returns -ENOMEM if formatting fails, otherwise the result of bus_print_property_value(). */
+int bus_print_property_valuef(const char *name, const char *expected_value, BusPrintPropertyFlags flags, const char *fmt, ...) {
+        _cleanup_free_ char *formatted = NULL;
+        va_list args;
+        int n;
+
+        assert(name);
+        assert(fmt);
+
+        va_start(args, fmt);
+        n = vasprintf(&formatted, fmt, args);
+        va_end(args);
+
+        if (n < 0)
+                return -ENOMEM;
+
+        return bus_print_property_value(name, expected_value, flags, formatted);
+}
+
+static int bus_print_property(const char *name, const char *expected_value, sd_bus_message *m, BusPrintPropertyFlags flags) { /* Reads
+ * the next item of 'm' and prints it as property 'name'. The output format is chosen from the D-Bus type
+ * and, for some integer types, from the property name. Returns 1 if the item was read here, 0 if its
+ * type is not handled (nothing was read), and a negative errno-style value on failure. The return
+ * values of the bus_print_property_value*() helpers are not checked. */
+ char type;
+ const char *contents;
+ int r;
+
+ assert(name);
+ assert(m);
+
+ r = sd_bus_message_peek_type(m, &type, &contents);
+ if (r < 0)
+ return r;
+
+ switch (type) {
+
+ case SD_BUS_TYPE_STRING: {
+ const char *s;
+
+ r = sd_bus_message_read_basic(m, type, &s);
+ if (r < 0)
+ return r;
+
+ if (FLAGS_SET(flags, BUS_PRINT_PROPERTY_SHOW_EMPTY) || !isempty(s)) {
+ bool good;
+
+ /* This property has a single value, so we need to take
+ * care not to print a new line, everything else is OK. */
+ good = !strchr(s, '\n');
+ bus_print_property_value(name, expected_value, flags, good ? s : "[unprintable]");
+ }
+
+ return 1;
+ }
+
+ case SD_BUS_TYPE_BOOLEAN: {
+ int b;
+
+ r = sd_bus_message_read_basic(m, type, &b);
+ if (r < 0)
+ return r;
+
+ if (expected_value && parse_boolean(expected_value) != b)
+ return 1; /* read, but not the requested value: print nothing */
+
+ bus_print_property_value(name, NULL, flags, yes_no(b)); /* already filtered above, hence no expected value */
+ return 1;
+ }
+
+ case SD_BUS_TYPE_UINT64: {
+ uint64_t u;
+
+ r = sd_bus_message_read_basic(m, type, &u);
+ if (r < 0)
+ return r;
+
+ /* Yes, heuristics! But we can change this check
+ * should it turn out to not be sufficient. The
+ * branches below are tried in order, the first
+ * match on the property name (and value) wins. */
+
+ if (endswith(name, "Timestamp") ||
+ STR_IN_SET(name, "NextElapseUSecRealtime", "LastTriggerUSec", "TimeUSec", "RTCTimeUSec"))
+
+ bus_print_property_value(name, expected_value, flags, FORMAT_TIMESTAMP(u));
+
+ else if (strstr(name, "USec"))
+ bus_print_property_value(name, expected_value, flags, FORMAT_TIMESPAN(u, 0));
+
+ else if (streq(name, "CoredumpFilter"))
+ bus_print_property_valuef(name, expected_value, flags, "0x%"PRIx64, u);
+
+ else if (streq(name, "RestrictNamespaces")) {
+ _cleanup_free_ char *s = NULL;
+ const char *result;
+
+ if ((u & NAMESPACE_FLAGS_ALL) == 0)
+ result = "yes";
+ else if (FLAGS_SET(u, NAMESPACE_FLAGS_ALL))
+ result = "no";
+ else {
+ r = namespace_flags_to_string(u, &s);
+ if (r < 0)
+ return r;
+
+ result = s;
+ }
+
+ bus_print_property_value(name, expected_value, flags, result);
+
+ } else if (streq(name, "MountFlags")) {
+ const char *result;
+
+ result = mount_propagation_flag_to_string(u);
+ if (!result)
+ return -EINVAL;
+
+ bus_print_property_value(name, expected_value, flags, result);
+
+ } else if (STR_IN_SET(name, "CapabilityBoundingSet", "AmbientCapabilities")) {
+ _cleanup_free_ char *s = NULL;
+
+ r = capability_set_to_string(u, &s);
+ if (r < 0)
+ return r;
+
+ bus_print_property_value(name, expected_value, flags, s);
+
+ } else if (STR_IN_SET(name, "CPUWeight", "StartupCPUWeight") && u == CGROUP_WEIGHT_IDLE)
+ bus_print_property_value(name, expected_value, flags, "idle");
+
+ else if ((STR_IN_SET(name, "CPUWeight", "StartupCPUWeight", "IOWeight", "StartupIOWeight") && u == CGROUP_WEIGHT_INVALID) ||
+ (STR_IN_SET(name, "CPUShares", "StartupCPUShares") && u == CGROUP_CPU_SHARES_INVALID) ||
+ (STR_IN_SET(name, "BlockIOWeight", "StartupBlockIOWeight") && u == CGROUP_BLKIO_WEIGHT_INVALID) ||
+ (STR_IN_SET(name, "MemoryCurrent", "MemoryAvailable", "TasksCurrent") && u == UINT64_MAX) ||
+ (startswith(name, "Memory") && ENDSWITH_SET(name, "Current", "Peak") && u == CGROUP_LIMIT_MAX) ||
+ (startswith(name, "IO") && ENDSWITH_SET(name, "Bytes", "Operations") && u == UINT64_MAX) ||
+ (endswith(name, "NSec") && u == UINT64_MAX))
+
+ bus_print_property_value(name, expected_value, flags, "[not set]");
+
+ else if ((ENDSWITH_SET(name, "MemoryLow", "MemoryMin", "MemoryHigh", "MemoryMax", "MemorySwapMax", "MemoryZSwapMax", "MemoryLimit") &&
+ u == CGROUP_LIMIT_MAX) ||
+ (STR_IN_SET(name, "TasksMax", "DefaultTasksMax") && u == UINT64_MAX) ||
+ (startswith(name, "Limit") && u == UINT64_MAX) ||
+ (startswith(name, "DefaultLimit") && u == UINT64_MAX))
+
+ bus_print_property_value(name, expected_value, flags, "infinity");
+ else if (STR_IN_SET(name, "IPIngressBytes", "IPIngressPackets", "IPEgressBytes", "IPEgressPackets") && u == UINT64_MAX)
+ bus_print_property_value(name, expected_value, flags, "[no data]");
+ else
+ bus_print_property_valuef(name, expected_value, flags, "%"PRIu64, u);
+
+ return 1;
+ }
+
+ case SD_BUS_TYPE_INT64: {
+ int64_t i;
+
+ r = sd_bus_message_read_basic(m, type, &i);
+ if (r < 0)
+ return r;
+
+ bus_print_property_valuef(name, expected_value, flags, "%"PRIi64, i);
+ return 1;
+ }
+
+ case SD_BUS_TYPE_UINT32: {
+ uint32_t u;
+
+ r = sd_bus_message_read_basic(m, type, &u);
+ if (r < 0)
+ return r;
+
+ if (strstr(name, "UMask") || strstr(name, "Mode"))
+ bus_print_property_valuef(name, expected_value, flags, "%04o", u); /* octal, at least four digits */
+
+ else if (streq(name, "UID")) {
+ if (u == UID_INVALID)
+ bus_print_property_value(name, expected_value, flags, "[not set]");
+ else
+ bus_print_property_valuef(name, expected_value, flags, "%"PRIu32, u);
+ } else if (streq(name, "GID")) {
+ if (u == GID_INVALID)
+ bus_print_property_value(name, expected_value, flags, "[not set]");
+ else
+ bus_print_property_valuef(name, expected_value, flags, "%"PRIu32, u);
+ } else
+ bus_print_property_valuef(name, expected_value, flags, "%"PRIu32, u);
+
+ return 1;
+ }
+
+ case SD_BUS_TYPE_INT32: {
+ int32_t i;
+
+ r = sd_bus_message_read_basic(m, type, &i);
+ if (r < 0)
+ return r;
+
+ bus_print_property_valuef(name, expected_value, flags, "%"PRIi32, i);
+ return 1;
+ }
+
+ case SD_BUS_TYPE_DOUBLE: {
+ double d;
+
+ r = sd_bus_message_read_basic(m, type, &d);
+ if (r < 0)
+ return r;
+
+ bus_print_property_valuef(name, expected_value, flags, "%g", d);
+ return 1;
+ }
+
+ case SD_BUS_TYPE_ARRAY: /* only arrays of strings ("s"), bytes ("y") and uint32 ("u") are printed; 'expected_value' is not consulted for them */
+ if (streq(contents, "s")) {
+ bool first = true; /* true until the first element was printed */
+ const char *str;
+
+ r = sd_bus_message_enter_container(m, SD_BUS_TYPE_ARRAY, contents);
+ if (r < 0)
+ return r;
+
+ while ((r = sd_bus_message_read_basic(m, SD_BUS_TYPE_STRING, &str)) > 0) {
+ _cleanup_free_ char *e = NULL;
+
+ e = shell_maybe_quote(str, 0);
+ if (!e)
+ return -ENOMEM;
+
+ if (first) {
+ if (!FLAGS_SET(flags, BUS_PRINT_PROPERTY_ONLY_VALUE))
+ printf("%s=", name);
+ first = false;
+ } else
+ fputs(" ", stdout);
+
+ fputs(e, stdout);
+ }
+ if (r < 0)
+ return r;
+
+ if (first && FLAGS_SET(flags, BUS_PRINT_PROPERTY_SHOW_EMPTY) && !FLAGS_SET(flags, BUS_PRINT_PROPERTY_ONLY_VALUE))
+ printf("%s=", name); /* empty array: the "name=" prefix has not been printed yet */
+ if (!first || FLAGS_SET(flags, BUS_PRINT_PROPERTY_SHOW_EMPTY))
+ puts("");
+
+ r = sd_bus_message_exit_container(m);
+ if (r < 0)
+ return r;
+
+ return 1;
+
+ } else if (streq(contents, "y")) {
+ const uint8_t *u;
+ size_t n;
+
+ r = sd_bus_message_read_array(m, SD_BUS_TYPE_BYTE, (const void**) &u, &n);
+ if (r < 0)
+ return r;
+
+ if (FLAGS_SET(flags, BUS_PRINT_PROPERTY_SHOW_EMPTY) || n > 0) {
+ unsigned i;
+
+ if (!FLAGS_SET(flags, BUS_PRINT_PROPERTY_ONLY_VALUE))
+ printf("%s=", name);
+
+ for (i = 0; i < n; i++)
+ printf("%02x", u[i]); /* two hex digits per byte, no separator */
+
+ puts("");
+ }
+
+ return 1;
+
+ } else if (streq(contents, "u")) {
+ uint32_t *u;
+ size_t n; /* compared against the loop index as an element count — NOTE(review): sd_bus_message_read_array() is documented to return a size in bytes, confirm */
+
+ r = sd_bus_message_read_array(m, SD_BUS_TYPE_UINT32, (const void**) &u, &n);
+ if (r < 0)
+ return r;
+
+ if (FLAGS_SET(flags, BUS_PRINT_PROPERTY_SHOW_EMPTY) || n > 0) {
+ unsigned i;
+
+ if (!FLAGS_SET(flags, BUS_PRINT_PROPERTY_ONLY_VALUE))
+ printf("%s=", name);
+
+ for (i = 0; i < n; i++)
+ printf("%08x", u[i]); /* eight hex digits per element, no separator */
+
+ puts("");
+ }
+
+ return 1;
+ }
+
+ break; /* any other array element type is not handled here */
+ }
+
+ return 0; /* nothing was read; bus_message_print_all_properties() skips the value in this case */
+}
+
+/* Walks the "a{sv}" property dictionary at the current position of 'm' and prints its entries.
+ *
+ * If 'filter' is set, only properties listed in it are printed, either by plain name or as "name=value",
+ * in which case the value is handed on as expected value. 'func', if set, gets the first shot at
+ * printing a property; when it returns 0 (or is unset) bus_print_property() is used. All property names
+ * encountered are added to 'found_properties', if that is non-NULL.
+ *
+ * Returns 0 on success and a negative errno-style value on failure. */
+int bus_message_print_all_properties(
+                sd_bus_message *m,
+                bus_message_print_t func,
+                char **filter,
+                BusPrintPropertyFlags flags,
+                Set **found_properties) {
+
+        int r;
+
+        assert(m);
+
+        r = sd_bus_message_enter_container(m, SD_BUS_TYPE_ARRAY, "{sv}");
+        if (r < 0)
+                return r;
+
+        for (;;) {
+                _cleanup_free_ char *name_with_equal = NULL;
+                const char *name, *contents, *expected_value = NULL;
+                bool wanted;
+
+                r = sd_bus_message_enter_container(m, SD_BUS_TYPE_DICT_ENTRY, "sv");
+                if (r < 0)
+                        return r;
+                if (r == 0) /* end of the dictionary */
+                        break;
+
+                r = sd_bus_message_read_basic(m, SD_BUS_TYPE_STRING, &name);
+                if (r < 0)
+                        return r;
+
+                if (found_properties) {
+                        r = set_ensure_put(found_properties, &string_hash_ops, name);
+                        if (r < 0)
+                                return log_oom();
+                }
+
+                name_with_equal = strjoin(name, "=");
+                if (!name_with_equal)
+                        return log_oom();
+
+                /* Without a filter everything is printed. Otherwise the property has to be listed by
+                 * name, or as "name=value" which then also supplies the expected value. */
+                wanted = !filter || strv_contains(filter, name);
+                if (!wanted) {
+                        expected_value = strv_find_startswith(filter, name_with_equal);
+                        wanted = !!expected_value;
+                }
+
+                if (!wanted) {
+                        r = sd_bus_message_skip(m, "v");
+                        if (r < 0)
+                                return r;
+                } else {
+                        r = sd_bus_message_peek_type(m, NULL, &contents);
+                        if (r < 0)
+                                return r;
+
+                        r = sd_bus_message_enter_container(m, SD_BUS_TYPE_VARIANT, contents);
+                        if (r < 0)
+                                return r;
+
+                        /* Custom printer first, generic printer as fallback. */
+                        r = func ? func(name, expected_value, m, flags) : 0;
+                        if (r == 0)
+                                r = bus_print_property(name, expected_value, m, flags);
+                        if (r < 0)
+                                return r;
+                        if (r == 0) {
+                                if (FLAGS_SET(flags, BUS_PRINT_PROPERTY_SHOW_EMPTY) && !expected_value)
+                                        printf("%s=[unprintable]\n", name);
+
+                                /* skip what we didn't read */
+                                r = sd_bus_message_skip(m, contents);
+                                if (r < 0)
+                                        return r;
+                        }
+
+                        r = sd_bus_message_exit_container(m);
+                        if (r < 0)
+                                return r;
+                }
+
+                r = sd_bus_message_exit_container(m);
+                if (r < 0)
+                        return r;
+        }
+
+        r = sd_bus_message_exit_container(m);
+        if (r < 0)
+                return r;
+
+        return 0;
+}
+
+/* Fetches all properties of the object 'path' on 'dest' with org.freedesktop.DBus.Properties.GetAll()
+ * (passing an empty interface string) and prints them via bus_message_print_all_properties().
+ *
+ * Returns the negative errno-style result of the method call if that fails, otherwise whatever
+ * bus_message_print_all_properties() returns. */
+int bus_print_all_properties(
+                sd_bus *bus,
+                const char *dest,
+                const char *path,
+                bus_message_print_t func,
+                char **filter,
+                BusPrintPropertyFlags flags,
+                Set **found_properties) {
+
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+        int r;
+
+        assert(bus);
+        assert(path);
+
+        r = sd_bus_call_method(
+                        bus,
+                        dest,
+                        path,
+                        "org.freedesktop.DBus.Properties",
+                        "GetAll",
+                        &error,
+                        &reply,
+                        "s", "");
+        if (r >= 0)
+                r = bus_message_print_all_properties(reply, func, filter, flags, found_properties);
+
+        return r;
+}
diff --git a/src/shared/bus-print-properties.h b/src/shared/bus-print-properties.h
new file mode 100644
index 0000000..a17875c
--- /dev/null
+++ b/src/shared/bus-print-properties.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "sd-bus.h"
+
+#include "macro.h"
+#include "set.h"
+
+typedef enum BusPrintPropertyFlags {
+ BUS_PRINT_PROPERTY_ONLY_VALUE = 1 << 0, /* e.g. systemctl --value */
+ BUS_PRINT_PROPERTY_SHOW_EMPTY = 1 << 1, /* e.g. systemctl --all */
+} BusPrintPropertyFlags;
+
+typedef int (*bus_message_print_t) (const char *name, const char *expected_value, sd_bus_message *m, BusPrintPropertyFlags flags);
+
+int bus_print_property_value(const char *name, const char *expected_value, BusPrintPropertyFlags flags, const char *value);
+int bus_print_property_valuef(const char *name, const char *expected_value, BusPrintPropertyFlags flags, const char *fmt, ...) _printf_(4,5);
+int bus_message_print_all_properties(sd_bus_message *m, bus_message_print_t func, char **filter, BusPrintPropertyFlags flags, Set **found_properties);
+int bus_print_all_properties(sd_bus *bus, const char *dest, const char *path, bus_message_print_t func, char **filter, BusPrintPropertyFlags flags, Set **found_properties);
diff --git a/src/shared/bus-unit-procs.c b/src/shared/bus-unit-procs.c
new file mode 100644
index 0000000..8b462b5
--- /dev/null
+++ b/src/shared/bus-unit-procs.c
@@ -0,0 +1,402 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "bus-locator.h"
+#include "bus-unit-procs.h"
+#include "glyph-util.h"
+#include "hashmap.h"
+#include "list.h"
+#include "macro.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "sort-util.h"
+#include "string-util.h"
+#include "terminal-util.h"
+
+/* One node of the cgroup tree built by add_cgroup(); nodes are additionally indexed by cgroup_path in a
+ * Hashmap owned by the caller. */
+struct CGroupInfo {
+ char *cgroup_path;
+ bool is_const; /* If false, cgroup_path should be free()'d */
+
+ Hashmap *pids; /* PID → process name */
+ bool done; /* Set by dump_processes() once this cgroup has been shown */
+
+ struct CGroupInfo *parent; /* Stays NULL for the root cgroup */
+ LIST_FIELDS(struct CGroupInfo, siblings);
+ LIST_HEAD(struct CGroupInfo, children);
+ size_t n_children; /* Incremented by add_cgroup() each time a child is linked in */
+};
+
+/* Looks up the CGroupInfo node for 'path' in 'cgroups', creating it (and, recursively, all of its ancestors)
+ * if it does not exist yet. If 'is_const' is true the node borrows 'path' instead of copying it. Returns 0
+ * if the node already existed, 1 if it was newly created, and a negative errno on failure. */
+static int add_cgroup(Hashmap *cgroups, const char *path, bool is_const, struct CGroupInfo **ret) {
+        struct CGroupInfo *up = NULL, *node;
+        int r;
+
+        assert(cgroups);
+        assert(ret);
+
+        path = empty_to_root(path);
+
+        node = hashmap_get(cgroups, path);
+        if (node) {
+                *ret = node;
+                return 0;
+        }
+
+        /* Everything but the root has a parent; make sure it is registered first */
+        if (!empty_or_root(path)) {
+                const char *slash, *parent_path;
+
+                slash = strrchr(path, '/');
+                if (!slash)
+                        return -EINVAL;
+
+                parent_path = strndupa_safe(path, slash - path);
+
+                r = add_cgroup(cgroups, parent_path, false, &up);
+                if (r < 0)
+                        return r;
+        }
+
+        node = new0(struct CGroupInfo, 1);
+        if (!node)
+                return -ENOMEM;
+
+        node->is_const = is_const;
+        node->parent = up;
+        node->cgroup_path = is_const ? (char*) path : strdup(path);
+        if (!node->cgroup_path) {
+                r = -ENOMEM;
+                goto fail;
+        }
+
+        r = hashmap_put(cgroups, node->cgroup_path, node);
+        if (r < 0)
+                goto fail;
+
+        if (up) {
+                LIST_PREPEND(siblings, up->children, node);
+                up->n_children++;
+        }
+
+        *ret = node;
+        return 1;
+
+fail:
+        /* Only release the path if it is our own copy */
+        if (!is_const)
+                free(node->cgroup_path);
+        free(node);
+        return r;
+}
+
+/* Registers process 'pid' with command name 'name' in the cgroup 'path', creating the cgroup node (and its
+ * ancestors) on demand. Both 'path' and 'name' are stored by reference, not copied, hence they must remain
+ * valid for as long as 'cgroups' is in use. Returns a negative errno on failure, in particular -EINVAL if
+ * 'pid' is not a positive PID. */
+static int add_process(
+                Hashmap *cgroups,
+                const char *path,
+                pid_t pid,
+                const char *name) {
+
+        struct CGroupInfo *cg;
+        int r;
+
+        assert(cgroups);
+        assert(name);
+
+        /* The PID originates from a bus reply (where it is an unsigned 32-bit value), hence refuse zero or
+         * out-of-range values gracefully instead of aborting on them. */
+        if (pid <= 0)
+                return -EINVAL;
+
+        r = add_cgroup(cgroups, path, true, &cg);
+        if (r < 0)
+                return r;
+
+        return hashmap_ensure_put(&cg->pids, &trivial_hash_ops, PID_TO_PTR(pid), (void*) name);
+}
+
+/* Destroys 'cg' together with its whole subtree: each node is unlinked from its parent's child list, dropped
+ * from the 'cgroups' hashmap and released. */
+static void remove_cgroup(Hashmap *cgroups, struct CGroupInfo *cg) {
+        struct CGroupInfo *child;
+
+        assert(cgroups);
+        assert(cg);
+
+        /* Depth first: every recursive call unlinks the child from our list, so the head keeps advancing */
+        while ((child = cg->children))
+                remove_cgroup(cgroups, child);
+
+        if (cg->parent)
+                LIST_REMOVE(siblings, cg->parent->children, cg);
+
+        /* The path is the hashmap key, so it may only be released after the entry is gone */
+        hashmap_remove(cgroups, cg->cgroup_path);
+        hashmap_free(cg->pids);
+
+        if (!cg->is_const)
+                free(cg->cgroup_path);
+
+        free(cg);
+}
+
+/* qsort()-style comparator ordering CGroupInfo pointers by their cgroup path, via strcmp(). */
+static int cgroup_info_compare_func(struct CGroupInfo * const *a, struct CGroupInfo * const *b) {
+        const struct CGroupInfo *x = *a, *y = *b;
+
+        return strcmp(x->cgroup_path, y->cgroup_path);
+}
+
+/* Prints the processes of cgroup 'cgroup_path' followed by its child cgroups (recursively) as a tree on
+ * stdout, each line prefixed with 'prefix'. If 'n_columns' is non-zero process names are ellipsized to fit.
+ * Marks the cgroup as done once shown. Returns 0 on success (also if the cgroup is unknown), negative errno
+ * on failure. */
+static int dump_processes(
+ Hashmap *cgroups,
+ const char *cgroup_path,
+ const char *prefix,
+ unsigned n_columns,
+ OutputFlags flags) {
+
+ struct CGroupInfo *cg;
+ int r;
+
+ assert(prefix);
+
+ cgroup_path = empty_to_root(cgroup_path);
+
+ cg = hashmap_get(cgroups, cgroup_path);
+ if (!cg)
+ return 0;
+
+ if (!hashmap_isempty(cg->pids)) {
+ const char *name;
+ size_t n = 0, i;
+ pid_t *pids;
+ void *pidp;
+ int width;
+
+ /* Order processes by their PID */
+ pids = newa(pid_t, hashmap_size(cg->pids));
+
+ HASHMAP_FOREACH_KEY(name, pidp, cg->pids)
+ pids[n++] = PTR_TO_PID(pidp);
+
+ assert(n == hashmap_size(cg->pids));
+ typesafe_qsort(pids, n, pid_compare_func);
+
+ /* The array is sorted now, hence the last entry determines the width of the PID column */
+ width = DECIMAL_STR_WIDTH(pids[n-1]);
+
+ for (i = 0; i < n; i++) {
+ _cleanup_free_ char *e = NULL;
+ const char *special;
+ bool more;
+
+ name = hashmap_get(cg->pids, PID_TO_PTR(pids[i]));
+ assert(name);
+
+ if (n_columns != 0) {
+ unsigned k;
+
+ /* Space left for the name after glyph, PID column and separator, but at least 20 */
+ k = MAX(LESS_BY(n_columns, 2U + width + 1U), 20U);
+
+ /* If ellipsize() fails the full name is shown */
+ e = ellipsize(name, k, 100);
+ if (e)
+ name = e;
+ }
+
+ /* Use a branch glyph as long as more processes or child cgroups follow */
+ more = i+1 < n || cg->children;
+ special = special_glyph(more ? SPECIAL_GLYPH_TREE_BRANCH : SPECIAL_GLYPH_TREE_RIGHT);
+
+ fprintf(stdout, "%s%s%s%*"PID_PRI" %s%s\n",
+ prefix,
+ special,
+ ansi_grey(),
+ width, pids[i],
+ name,
+ ansi_normal());
+ }
+ }
+
+ if (cg->children) {
+ struct CGroupInfo **children;
+ size_t n = 0, i;
+
+ /* Order subcgroups by their name */
+ children = newa(struct CGroupInfo*, cg->n_children);
+ LIST_FOREACH(siblings, child, cg->children)
+ children[n++] = child;
+ assert(n == cg->n_children);
+ typesafe_qsort(children, n, cgroup_info_compare_func);
+
+ /* The subtree is shown with a longer prefix, hence shrink the width budget accordingly */
+ if (n_columns != 0)
+ n_columns = MAX(LESS_BY(n_columns, 2U), 20U);
+
+ for (i = 0; i < n; i++) {
+ _cleanup_free_ char *pp = NULL;
+ const char *name, *special;
+ bool more;
+
+ /* Show only the last component of the child's path */
+ name = strrchr(children[i]->cgroup_path, '/');
+ if (!name)
+ return -EINVAL;
+ name++;
+
+ more = i+1 < n;
+ special = special_glyph(more ? SPECIAL_GLYPH_TREE_BRANCH : SPECIAL_GLYPH_TREE_RIGHT);
+
+ fputs(prefix, stdout);
+ fputs(special, stdout);
+ fputs(name, stdout);
+ fputc('\n', stdout);
+
+ /* Extend the prefix for the subtree: vertical line if further siblings follow, blank otherwise */
+ special = special_glyph(more ? SPECIAL_GLYPH_TREE_VERTICAL : SPECIAL_GLYPH_TREE_SPACE);
+
+ pp = strjoin(prefix, special);
+ if (!pp)
+ return -ENOMEM;
+
+ r = dump_processes(cgroups, children[i]->cgroup_path, pp, n_columns, flags);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ cg->done = true;
+ return 0;
+}
+
+/* Shows the processes of all cgroups that have not been marked as done yet, as one combined list sorted by
+ * PID, each line prefixed with 'prefix'. Returns 0 on success, negative errno on failure. */
+static int dump_extra_processes(
+                Hashmap *cgroups,
+                const char *prefix,
+                unsigned n_columns,
+                OutputFlags flags) {
+
+        _cleanup_hashmap_free_ Hashmap *names = NULL;
+        _cleanup_free_ pid_t *pids = NULL;
+        struct CGroupInfo *cg;
+        size_t n_pids = 0;
+        int width, r;
+
+        /* First pass: collect the PIDs (and a PID → name lookup table) of every cgroup not displayed so far */
+        HASHMAP_FOREACH(cg, cgroups) {
+                const char *name;
+                void *pidp;
+
+                if (cg->done || hashmap_isempty(cg->pids))
+                        continue;
+
+                r = hashmap_ensure_allocated(&names, &trivial_hash_ops);
+                if (r < 0)
+                        return r;
+
+                if (!GREEDY_REALLOC(pids, n_pids + hashmap_size(cg->pids)))
+                        return -ENOMEM;
+
+                HASHMAP_FOREACH_KEY(name, pidp, cg->pids) {
+                        pids[n_pids++] = PTR_TO_PID(pidp);
+
+                        r = hashmap_put(names, pidp, (void*) name);
+                        if (r < 0)
+                                return r;
+                }
+        }
+
+        if (n_pids == 0)
+                return 0;
+
+        /* Second pass: print in PID order; the largest PID determines the column width */
+        typesafe_qsort(pids, n_pids, pid_compare_func);
+        width = DECIMAL_STR_WIDTH(pids[n_pids-1]);
+
+        for (size_t i = 0; i < n_pids; i++) {
+                _cleanup_free_ char *shortened = NULL;
+                const char *name;
+
+                name = hashmap_get(names, PID_TO_PTR(pids[i]));
+                assert(name);
+
+                if (n_columns != 0) {
+                        unsigned room = MAX(LESS_BY(n_columns, 2U + width + 1U), 20U);
+
+                        /* If ellipsize() fails the full name is shown */
+                        shortened = ellipsize(name, room, 100);
+                        if (shortened)
+                                name = shortened;
+                }
+
+                fprintf(stdout, "%s%s %*" PID_PRI " %s\n",
+                        prefix,
+                        special_glyph(SPECIAL_GLYPH_TRIANGULAR_BULLET),
+                        width, pids[i],
+                        name);
+        }
+
+        return 0;
+}
+
+/* Queries the processes of 'unit' via the GetUnitProcesses() bus call and prints them on stdout: first the
+ * tree below 'cgroup_path', then all remaining processes as a flat list. Entries of the reply that cannot be
+ * registered are logged and skipped, except for -ENOMEM which is fatal. Returns a negative errno on
+ * failure. */
+int unit_show_processes(
+                sd_bus *bus,
+                const char *unit,
+                const char *cgroup_path,
+                const char *prefix,
+                unsigned n_columns,
+                OutputFlags flags,
+                sd_bus_error *error) {
+
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+        const char *proc_cgroup = NULL, *proc_name = NULL;
+        struct CGroupInfo *cg;
+        uint32_t proc_pid = 0;
+        Hashmap *cgroups;
+        int r;
+
+        assert(bus);
+        assert(unit);
+
+        /* Zero columns means: never ellipsize */
+        if (flags & OUTPUT_FULL_WIDTH)
+                n_columns = 0;
+        else if (n_columns == 0)
+                n_columns = columns();
+
+        prefix = strempty(prefix);
+
+        r = bus_call_method(bus, bus_systemd_mgr, "GetUnitProcesses", error, &reply, "s", unit);
+        if (r < 0)
+                return r;
+
+        cgroups = hashmap_new(&path_hash_ops);
+        if (!cgroups)
+                return -ENOMEM;
+
+        r = sd_bus_message_enter_container(reply, 'a', "(sus)");
+        if (r < 0)
+                goto finish;
+
+        while ((r = sd_bus_message_read(reply, "(sus)", &proc_cgroup, &proc_pid, &proc_name)) > 0) {
+                r = add_process(cgroups, proc_cgroup, proc_pid, proc_name);
+                if (r == -ENOMEM)
+                        goto finish;
+                if (r < 0)
+                        log_warning_errno(r, "Invalid process description in GetUnitProcesses reply: cgroup=\"%s\" pid=%u command=\"%s\", ignoring: %m",
+                                          proc_cgroup, proc_pid, proc_name);
+        }
+        if (r < 0)
+                goto finish;
+
+        r = sd_bus_message_exit_container(reply);
+        if (r < 0)
+                goto finish;
+
+        r = dump_processes(cgroups, cgroup_path, prefix, n_columns, flags);
+        if (r < 0)
+                goto finish;
+
+        r = dump_extra_processes(cgroups, prefix, n_columns, flags);
+
+finish:
+        /* The nodes are owned by us, not by the hashmap, hence release them one by one first */
+        for (;;) {
+                cg = hashmap_first(cgroups);
+                if (!cg)
+                        break;
+
+                remove_cgroup(cgroups, cg);
+        }
+
+        hashmap_free(cgroups);
+
+        return r;
+}
diff --git a/src/shared/bus-unit-procs.h b/src/shared/bus-unit-procs.h
new file mode 100644
index 0000000..78c5569
--- /dev/null
+++ b/src/shared/bus-unit-procs.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+
+#include "output-mode.h"
+
+/* Queries the processes of 'unit' via the GetUnitProcesses() bus call and prints them on stdout as a tree
+ * rooted at 'cgroup_path', each line prefixed with 'prefix'. Returns a negative errno on failure. */
+int unit_show_processes(sd_bus *bus, const char *unit, const char *cgroup_path, const char *prefix, unsigned n_columns, OutputFlags flags, sd_bus_error *error);
diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c
new file mode 100644
index 0000000..50de989
--- /dev/null
+++ b/src/shared/bus-unit-util.c
@@ -0,0 +1,2938 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "af-list.h"
+#include "alloc-util.h"
+#include "bus-error.h"
+#include "bus-locator.h"
+#include "bus-unit-util.h"
+#include "bus-util.h"
+#include "cap-list.h"
+#include "cgroup-setup.h"
+#include "cgroup-util.h"
+#include "condition.h"
+#include "coredump-util.h"
+#include "cpu-set-util.h"
+#include "dissect-image.h"
+#include "escape.h"
+#include "exec-util.h"
+#include "exit-status.h"
+#include "fileio.h"
+#include "firewall-util.h"
+#include "hexdecoct.h"
+#include "hostname-util.h"
+#include "in-addr-util.h"
+#include "ioprio-util.h"
+#include "ip-protocol-list.h"
+#include "libmount-util.h"
+#include "locale-util.h"
+#include "log.h"
+#include "macro.h"
+#include "missing_fs.h"
+#include "mountpoint-util.h"
+#include "nsflags.h"
+#include "numa-util.h"
+#include "open-file.h"
+#include "parse-helpers.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "percent-util.h"
+#include "process-util.h"
+#include "rlimit-util.h"
+#include "seccomp-util.h"
+#include "securebits-util.h"
+#include "signal-util.h"
+#include "socket-util.h"
+#include "sort-util.h"
+#include "stdio-util.h"
+#include "string-util.h"
+#include "syslog-util.h"
+#include "terminal-util.h"
+#include "unit-def.h"
+#include "user-util.h"
+#include "utf8.h"
+
+/* Reads one "(ssssssouso)" structure from 'message' into 'u'. The 'machine' field is not part of the wire
+ * format and is reset to NULL. Returns whatever sd_bus_message_read() returns. */
+int bus_parse_unit_info(sd_bus_message *message, UnitInfo *u) {
+        static const char unit_info_signature[] = "(ssssssouso)";
+
+        assert(message);
+        assert(u);
+
+        u->machine = NULL;
+
+        return sd_bus_message_read(
+                        message,
+                        unit_info_signature,
+                        &u->id, &u->description,
+                        &u->load_state, &u->active_state, &u->sub_state,
+                        &u->following,
+                        &u->unit_path,
+                        &u->job_id, &u->job_type, &u->job_path);
+}
+
+/* Defines a static function bus_append_<parse_func>(m, field, eq) that parses 'eq' with
+ * parse_func(eq, &val), where 'val' is of C type 'type', and appends it to 'm' as a "(sv)" pair of 'field'
+ * and a variant with signature 'bus_type' holding the value cast to 'cast_type'. The generated function
+ * returns 1 on success; on failure it logs and returns a negative errno. */
+#define DEFINE_BUS_APPEND_PARSE_PTR(bus_type, cast_type, type, parse_func) \
+ static int bus_append_##parse_func( \
+ sd_bus_message *m, \
+ const char *field, \
+ const char *eq) { \
+ type val; \
+ int r; \
+ \
+ r = parse_func(eq, &val); \
+ if (r < 0) \
+ return log_error_errno(r, "Failed to parse %s=%s: %m", field, eq); \
+ \
+ r = sd_bus_message_append(m, "(sv)", field, \
+ bus_type, (cast_type) val); \
+ if (r < 0) \
+ return bus_log_create_error(r); \
+ \
+ return 1; \
+ }
+
+/* Like DEFINE_BUS_APPEND_PARSE_PTR(), but for parsers that return the parsed value directly (negative on
+ * failure) rather than via an output pointer. The value is appended as int32_t in a variant with signature
+ * 'bus_type'. A parse failure is always logged and returned as -EINVAL, whatever the parser returned. */
+#define DEFINE_BUS_APPEND_PARSE(bus_type, parse_func) \
+ static int bus_append_##parse_func( \
+ sd_bus_message *m, \
+ const char *field, \
+ const char *eq) { \
+ int r; \
+ \
+ r = parse_func(eq); \
+ if (r < 0) \
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to parse %s: %s", field, eq); \
+ \
+ r = sd_bus_message_append(m, "(sv)", field, \
+ bus_type, (int32_t) r); \
+ if (r < 0) \
+ return bus_log_create_error(r); \
+ \
+ return 1; \
+ }
+
+/* Instantiate one bus_append_<parse_func>() helper per parser. The first argument is the D-Bus signature of
+ * the variant payload; for the _PTR variant it is followed by the type the value is cast to for the bus and
+ * the C type the parser writes to. */
+DEFINE_BUS_APPEND_PARSE("b", parse_boolean);
+DEFINE_BUS_APPEND_PARSE("i", ioprio_class_from_string);
+DEFINE_BUS_APPEND_PARSE("i", ip_tos_from_string);
+DEFINE_BUS_APPEND_PARSE("i", log_facility_unshifted_from_string);
+DEFINE_BUS_APPEND_PARSE("i", log_level_from_string);
+DEFINE_BUS_APPEND_PARSE("i", seccomp_parse_errno_or_action);
+DEFINE_BUS_APPEND_PARSE("i", sched_policy_from_string);
+DEFINE_BUS_APPEND_PARSE("i", secure_bits_from_string);
+DEFINE_BUS_APPEND_PARSE("i", signal_from_string);
+DEFINE_BUS_APPEND_PARSE("i", parse_ip_protocol);
+DEFINE_BUS_APPEND_PARSE_PTR("i", int32_t, int, ioprio_parse_priority);
+DEFINE_BUS_APPEND_PARSE_PTR("i", int32_t, int, parse_nice);
+DEFINE_BUS_APPEND_PARSE_PTR("i", int32_t, int, safe_atoi);
+DEFINE_BUS_APPEND_PARSE_PTR("t", uint64_t, nsec_t, parse_nsec);
+DEFINE_BUS_APPEND_PARSE_PTR("t", uint64_t, uint64_t, cg_blkio_weight_parse);
+DEFINE_BUS_APPEND_PARSE_PTR("t", uint64_t, uint64_t, cg_cpu_shares_parse);
+DEFINE_BUS_APPEND_PARSE_PTR("t", uint64_t, uint64_t, cg_weight_parse);
+DEFINE_BUS_APPEND_PARSE_PTR("t", uint64_t, uint64_t, cg_cpu_weight_parse);
+DEFINE_BUS_APPEND_PARSE_PTR("t", uint64_t, unsigned long, mount_propagation_flag_from_string);
+DEFINE_BUS_APPEND_PARSE_PTR("t", uint64_t, uint64_t, safe_atou64);
+DEFINE_BUS_APPEND_PARSE_PTR("u", uint32_t, mode_t, parse_mode);
+DEFINE_BUS_APPEND_PARSE_PTR("u", uint32_t, unsigned, safe_atou);
+DEFINE_BUS_APPEND_PARSE_PTR("x", int64_t, int64_t, safe_atoi64);
+DEFINE_BUS_APPEND_PARSE_PTR("t", uint64_t, uint64_t, coredump_filter_mask_from_string);
+
+/* Appends 'field' and the string 'eq' to 'm' as a "(sv)" pair with a variant of type "s". Returns 1 on
+ * success, otherwise logs and returns a negative errno. */
+static int bus_append_string(sd_bus_message *m, const char *field, const char *eq) {
+        int r = sd_bus_message_append(m, "(sv)", field, "s", eq);
+
+        if (r >= 0)
+                return 1;
+
+        return bus_log_create_error(r);
+}
+
+/* Splits 'eq' into words with extract_first_word() (honouring 'flags') and appends them to 'm' as a "(sv)"
+ * pair of 'field' and a variant holding the string array ("as"). Returns 1 on success, otherwise logs and
+ * returns a negative errno. */
+static int bus_append_strv(sd_bus_message *m, const char *field, const char *eq, ExtractFlags flags) {
+        const char *cursor = eq;
+        int r;
+
+        r = sd_bus_message_open_container(m, SD_BUS_TYPE_STRUCT, "sv");
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = sd_bus_message_append_basic(m, SD_BUS_TYPE_STRING, field);
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = sd_bus_message_open_container(m, SD_BUS_TYPE_VARIANT, "as");
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = sd_bus_message_open_container(m, SD_BUS_TYPE_ARRAY, "s");
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        for (;;) {
+                _cleanup_free_ char *word = NULL;
+
+                r = extract_first_word(&cursor, &word, NULL, flags);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0)
+                        return log_error_errno(r, "Invalid syntax: %s", eq);
+                if (r == 0)
+                        break;
+
+                r = sd_bus_message_append_basic(m, SD_BUS_TYPE_STRING, word);
+                if (r < 0)
+                        return bus_log_create_error(r);
+        }
+
+        /* Close array, variant and struct, in that order */
+        for (unsigned level = 0; level < 3; level++) {
+                r = sd_bus_message_close_container(m);
+                if (r < 0)
+                        return bus_log_create_error(r);
+        }
+
+        return 1;
+}
+
+/* Appends 'field' and the 'n' bytes at 'buf' to 'm' as a "(sv)" pair whose variant holds a byte array
+ * ("ay"). Returns 1 on success, otherwise logs and returns a negative errno. */
+static int bus_append_byte_array(sd_bus_message *m, const char *field, const void *buf, size_t n) {
+        int r;
+
+        r = sd_bus_message_open_container(m, 'r', "sv");
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = sd_bus_message_append_basic(m, 's', field);
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = sd_bus_message_open_container(m, SD_BUS_TYPE_VARIANT, "ay");
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        r = sd_bus_message_append_array(m, SD_BUS_TYPE_BYTE, buf, n);
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        /* Close the variant first, then the enclosing struct */
+        for (unsigned level = 0; level < 2; level++) {
+                r = sd_bus_message_close_container(m);
+                if (r < 0)
+                        return bus_log_create_error(r);
+        }
+
+        return 1;
+}
+
+/* Parses 'eq' as a time span with parse_sec() and appends it to 'm' as a "(sv)" pair with a variant of type
+ * "t", under a property name derived from 'field' by replacing its final three characters (the "Sec"
+ * suffix) with "USec". Returns 1 on success, otherwise logs and returns a negative errno. */
+static int bus_append_parse_sec_rename(sd_bus_message *m, const char *field, const char *eq) {
+        char *n;
+        usec_t t;
+        size_t l;
+        int r;
+
+        r = parse_sec(eq, &t);
+        if (r < 0)
+                return log_error_errno(r, "Failed to parse %s=%s: %m", field, eq);
+
+        l = strlen(field);
+
+        /* The renaming below drops the last three characters. For a shorter name 'l - 3' would wrap around
+         * and mempcpy() would write far beyond the stack buffer, hence insist on a sufficiently long name. */
+        assert(l >= 3);
+
+        /* "Sec" → "USec" makes the name one character longer, plus the trailing NUL */
+        n = newa(char, l + 2);
+        /* Change suffix Sec → USec */
+        strcpy(mempcpy(n, field, l - 3), "USec");
+
+        r = sd_bus_message_append(m, "(sv)", n, "t", t);
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        return 1;
+}
+
+/* Parses 'eq' as a size with parse_size(), passing 'base' through, and appends it to 'm' as a "(sv)" pair of
+ * 'field' and a variant of type "t". Returns 1 on success, otherwise logs and returns a negative errno. */
+static int bus_append_parse_size(sd_bus_message *m, const char *field, const char *eq, uint64_t base) {
+        uint64_t size;
+        int r;
+
+        r = parse_size(eq, base, &size);
+        if (r < 0)
+                return log_error_errno(r, "Failed to parse %s=%s: %m", field, eq);
+
+        r = sd_bus_message_append(m, "(sv)", field, "t", size);
+
+        return r < 0 ? bus_log_create_error(r) : 1;
+}
+
+/* Parses the command line 'eq', including its leading special-character prefixes, and appends it to 'm' as
+ * a "(sv)" pair. Depending on the prefixes used and on whether 'field' ends in "Ex" the variant is either
+ * "a(sasb)" (path, argv, ignore-failure flag) or "a(sasas)" (path, argv, option strings); in the latter case
+ * the property name gets an "Ex" suffix unless it has one already. An empty command line results in an
+ * empty array. Returns 1 on success, otherwise logs and returns a negative errno. */
+static int bus_append_exec_command(sd_bus_message *m, const char *field, const char *eq) {
+ bool explicit_path = false, done = false;
+ _cleanup_strv_free_ char **l = NULL, **ex_opts = NULL;
+ _cleanup_free_ char *path = NULL, *upgraded_name = NULL;
+ ExecCommandFlags flags = 0;
+ bool is_ex_prop = endswith(field, "Ex");
+ int r;
+
+ /* Consume prefix characters one by one. Parsing stops at the first character that is not a prefix, or
+ * that repeats/conflicts with a prefix already seen; that character is then part of the command. */
+ do {
+ switch (*eq) {
+
+ case '-':
+ if (FLAGS_SET(flags, EXEC_COMMAND_IGNORE_FAILURE))
+ done = true;
+ else {
+ flags |= EXEC_COMMAND_IGNORE_FAILURE;
+ eq++;
+ }
+ break;
+
+ case '@':
+ if (explicit_path)
+ done = true;
+ else {
+ explicit_path = true;
+ eq++;
+ }
+ break;
+
+ case ':':
+ if (FLAGS_SET(flags, EXEC_COMMAND_NO_ENV_EXPAND))
+ done = true;
+ else {
+ flags |= EXEC_COMMAND_NO_ENV_EXPAND;
+ eq++;
+ }
+ break;
+
+ case '+':
+ if (flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID|EXEC_COMMAND_AMBIENT_MAGIC))
+ done = true;
+ else {
+ flags |= EXEC_COMMAND_FULLY_PRIVILEGED;
+ eq++;
+ }
+ break;
+
+ case '!':
+ /* A single "!" sets NO_SETUID, a second one turns that into AMBIENT_MAGIC */
+ if (flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_AMBIENT_MAGIC))
+ done = true;
+ else if (FLAGS_SET(flags, EXEC_COMMAND_NO_SETUID)) {
+ flags &= ~EXEC_COMMAND_NO_SETUID;
+ flags |= EXEC_COMMAND_AMBIENT_MAGIC;
+ eq++;
+ } else {
+ flags |= EXEC_COMMAND_NO_SETUID;
+ eq++;
+ }
+ break;
+
+ default:
+ done = true;
+ break;
+ }
+ } while (!done);
+
+ /* Only the ignore-failure flag fits into the boolean of the "a(sasb)" format, any other flag requires
+ * the Ex format */
+ if (!is_ex_prop && (flags & (EXEC_COMMAND_NO_ENV_EXPAND|EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID|EXEC_COMMAND_AMBIENT_MAGIC))) {
+ /* Upgrade the ExecXYZ= property to ExecXYZEx= for convenience */
+ is_ex_prop = true;
+ upgraded_name = strjoin(field, "Ex");
+ if (!upgraded_name)
+ return log_oom();
+ }
+
+ if (is_ex_prop) {
+ r = exec_command_flags_to_strv(flags, &ex_opts);
+ if (r < 0)
+ return log_error_errno(r, "Failed to convert ExecCommandFlags to strv: %m");
+ }
+
+ /* With "@" the first word is the path to execute and the remaining words make up the argument vector */
+ if (explicit_path) {
+ r = extract_first_word(&eq, &path, NULL, EXTRACT_UNQUOTE|EXTRACT_CUNESCAPE);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse path: %m");
+ }
+
+ r = strv_split_full(&l, eq, NULL, EXTRACT_UNQUOTE|EXTRACT_CUNESCAPE);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse command line: %m");
+
+ r = sd_bus_message_open_container(m, SD_BUS_TYPE_STRUCT, "sv");
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ r = sd_bus_message_append_basic(m, SD_BUS_TYPE_STRING, upgraded_name ?: field);
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ r = sd_bus_message_open_container(m, 'v', is_ex_prop ? "a(sasas)" : "a(sasb)");
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ r = sd_bus_message_open_container(m, 'a', is_ex_prop ? "(sasas)" : "(sasb)");
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ /* An empty command line leaves the array empty */
+ if (!strv_isempty(l)) {
+
+ r = sd_bus_message_open_container(m, 'r', is_ex_prop ? "sasas" : "sasb");
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ /* Without an explicit path the first argument doubles as the path */
+ r = sd_bus_message_append(m, "s", path ?: l[0]);
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ r = sd_bus_message_append_strv(m, l);
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ r = is_ex_prop ? sd_bus_message_append_strv(m, ex_opts) : sd_bus_message_append(m, "b", FLAGS_SET(flags, EXEC_COMMAND_IGNORE_FAILURE));
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ r = sd_bus_message_close_container(m);
+ if (r < 0)
+ return bus_log_create_error(r);
+ }
+
+ r = sd_bus_message_close_container(m);
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ r = sd_bus_message_close_container(m);
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ r = sd_bus_message_close_container(m);
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ return 1;
+}
+
+/* Parses 'eq' with open_file_parse() and appends the result to 'm' as a "(sv)" pair of 'field' and a variant
+ * of type "a(sst)" holding a single (path, fdname, flags) entry. Returns 1 on success, otherwise logs and
+ * returns a negative errno. */
+static int bus_append_open_file(sd_bus_message *m, const char *field, const char *eq) {
+        _cleanup_(open_file_freep) OpenFile *of = NULL;
+        int r;
+
+        assert(m);
+
+        r = open_file_parse(eq, &of);
+        if (r < 0)
+                return log_error_errno(r, "Failed to parse OpenFile= setting: %m");
+
+        /* sd_bus_message_append() is variadic, hence each argument must have exactly the type the signature
+         * implies: the array length is passed as a plain int-sized value, and "t" consumes a full uint64_t,
+         * so the flags are widened explicitly. */
+        r = sd_bus_message_append(m, "(sv)", field, "a(sst)", 1, of->path, of->fdname, (uint64_t) of->flags);
+        if (r < 0)
+                return bus_log_create_error(r);
+
+        return 1;
+}
+
+/* Appends one "(iayu)" structure to 'm': the address family, the raw bytes of 'prefix' (as many as
+ * FAMILY_ADDRESS_SIZE() reports for the family) and the prefix length. Returns the result of closing the
+ * container on success, or the first negative error encountered; nothing is logged here. */
+static int bus_append_ip_address_access(sd_bus_message *m, int family, const union in_addr_union *prefix, unsigned char prefixlen) {
+        int r;
+
+        assert(m);
+        assert(prefix);
+
+        r = sd_bus_message_open_container(m, SD_BUS_TYPE_STRUCT, "iayu");
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_append(m, "i", family);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_append_array(m, SD_BUS_TYPE_BYTE, prefix, FAMILY_ADDRESS_SIZE(family));
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_append(m, "u", prefixlen);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_close_container(m);
+        return r;
+}
+
+/* Parses 'eq' as a whitespace-separated list of "source:nfproto:table:set" tuples and appends them to 'm' as
+ * a "(sv)" pair of 'field' and a variant of type "a(iiss)". An empty 'eq' results in an empty array.
+ * Returns 1 on success, otherwise logs and returns a negative errno. */
+static int bus_append_nft_set(sd_bus_message *m, const char *field, const char *eq) {
+ int r;
+
+ assert(m);
+ assert(field);
+ assert(eq);
+
+ if (isempty(eq)) {
+ r = sd_bus_message_append(m, "(sv)", field, "a(iiss)", 0);
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ return 1;
+ }
+
+ r = sd_bus_message_open_container(m, SD_BUS_TYPE_STRUCT, "sv");
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ r = sd_bus_message_append_basic(m, SD_BUS_TYPE_STRING, field);
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ r = sd_bus_message_open_container(m, 'v', "a(iiss)");
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ r = sd_bus_message_open_container(m, 'a', "(iiss)");
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ for (const char *p = eq;;) {
+ _cleanup_free_ char *tuple = NULL, *source_str = NULL, *nfproto_str = NULL, *table = NULL, *set = NULL;
+ const char *q = NULL;
+ int source, nfproto;
+
+ /* First split off one tuple, then break it up at the colons */
+ r = extract_first_word(&p, &tuple, NULL, EXTRACT_UNQUOTE|EXTRACT_RETAIN_ESCAPE);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse %s: %m", field);
+ if (r == 0)
+ break;
+ if (isempty(tuple))
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to parse %s", field);
+
+ q = tuple;
+ r = extract_many_words(&q, ":", EXTRACT_CUNESCAPE, &source_str, &nfproto_str, &table, &set, NULL);
+ if (r == -ENOMEM)
+ return log_oom();
+ /* Exactly four components are required and nothing may be left over; any other result, including
+ * other parse errors, is reported as invalid */
+ if (r != 4 || !isempty(q))
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to parse %s", field);
+
+ assert(source_str);
+ assert(nfproto_str);
+ assert(table);
+ assert(set);
+
+ /* Only the cgroup, user and group sources are accepted here */
+ source = nft_set_source_from_string(source_str);
+ if (!IN_SET(source, NFT_SET_SOURCE_CGROUP, NFT_SET_SOURCE_USER, NFT_SET_SOURCE_GROUP))
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to parse %s", field);
+
+ nfproto = nfproto_from_string(nfproto_str);
+ if (nfproto < 0 || !nft_identifier_valid(table) || !nft_identifier_valid(set))
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to parse %s", field);
+
+ r = sd_bus_message_append(m, "(iiss)", source, nfproto, table, set);
+ if (r < 0)
+ return bus_log_create_error(r);
+ }
+ /* Close array, variant and struct, in that order */
+ r = sd_bus_message_close_container(m);
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ r = sd_bus_message_close_container(m);
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ r = sd_bus_message_close_container(m);
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ return 1;
+}
+
+static int bus_append_cgroup_property(sd_bus_message *m, const char *field, const char *eq) {
+ int r;
+
+ if (STR_IN_SET(field, "DevicePolicy",
+ "Slice",
+ "ManagedOOMSwap",
+ "ManagedOOMMemoryPressure",
+ "ManagedOOMPreference",
+ "MemoryPressureWatch",
+ "DelegateSubgroup"))
+ return bus_append_string(m, field, eq);
+
+ if (STR_IN_SET(field, "ManagedOOMMemoryPressureLimit")) {
+ r = parse_permyriad(eq);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse %s value: %s", field, eq);
+
+ /* Pass around scaled to 2^32-1 == 100% */
+ r = sd_bus_message_append(m, "(sv)", field, "u", UINT32_SCALE_FROM_PERMYRIAD(r));
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ return 1;
+ }
+
+ if (STR_IN_SET(field, "CPUAccounting",
+ "MemoryAccounting",
+ "IOAccounting",
+ "BlockIOAccounting",
+ "TasksAccounting",
+ "IPAccounting",
+ "CoredumpReceive"))
+ return bus_append_parse_boolean(m, field, eq);
+
+ if (STR_IN_SET(field, "CPUWeight",
+ "StartupCPUWeight"))
+ return bus_append_cg_cpu_weight_parse(m, field, eq);
+
+ if (STR_IN_SET(field, "IOWeight",
+ "StartupIOWeight"))
+ return bus_append_cg_weight_parse(m, field, eq);
+
+ if (STR_IN_SET(field, "CPUShares",
+ "StartupCPUShares"))
+ return bus_append_cg_cpu_shares_parse(m, field, eq);
+
+ if (STR_IN_SET(field, "AllowedCPUs",
+ "StartupAllowedCPUs",
+ "AllowedMemoryNodes",
+ "StartupAllowedMemoryNodes")) {
+ _cleanup_(cpu_set_reset) CPUSet cpuset = {};
+ _cleanup_free_ uint8_t *array = NULL;
+ size_t allocated;
+
+ r = parse_cpu_set(eq, &cpuset);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse %s value: %s", field, eq);
+
+ r = cpu_set_to_dbus(&cpuset, &array, &allocated);
+ if (r < 0)
+ return log_error_errno(r, "Failed to serialize CPUSet: %m");
+
+ return bus_append_byte_array(m, field, array, allocated);
+ }
+
+ if (STR_IN_SET(field, "BlockIOWeight",
+ "StartupBlockIOWeight"))
+ return bus_append_cg_blkio_weight_parse(m, field, eq);
+
+ if (streq(field, "DisableControllers"))
+ return bus_append_strv(m, "DisableControllers", eq, EXTRACT_UNQUOTE);
+
+ if (streq(field, "Delegate")) {
+ r = parse_boolean(eq);
+ if (r < 0)
+ return bus_append_strv(m, "DelegateControllers", eq, EXTRACT_UNQUOTE);
+
+ r = sd_bus_message_append(m, "(sv)", "Delegate", "b", r);
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ return 1;
+ }
+
+ if (STR_IN_SET(field, "MemoryMin",
+ "DefaultMemoryLow",
+ "DefaultMemoryMin",
+ "MemoryLow",
+ "MemoryHigh",
+ "MemoryMax",
+ "MemorySwapMax",
+ "MemoryZSwapMax",
+ "MemoryLimit",
+ "TasksMax")) {
+
+ if (streq(eq, "infinity")) {
+ r = sd_bus_message_append(m, "(sv)", field, "t", CGROUP_LIMIT_MAX);
+ if (r < 0)
+ return bus_log_create_error(r);
+ return 1;
+ } else if (isempty(eq)) {
+ uint64_t empty_value = STR_IN_SET(field,
+ "DefaultMemoryLow",
+ "DefaultMemoryMin",
+ "MemoryLow",
+ "MemoryMin") ?
+ CGROUP_LIMIT_MIN :
+ CGROUP_LIMIT_MAX;
+
+ r = sd_bus_message_append(m, "(sv)", field, "t", empty_value);
+ if (r < 0)
+ return bus_log_create_error(r);
+ return 1;
+ }
+
+ r = parse_permyriad(eq);
+ if (r >= 0) {
+ char *n;
+
+ /* When this is a percentage we'll convert this into a relative value in the range 0…UINT32_MAX
+ * and pass it in the MemoryLowScale property (and related ones). This way the physical memory
+ * size can be determined server-side. */
+
+ n = strjoina(field, "Scale");
+ r = sd_bus_message_append(m, "(sv)", n, "u", UINT32_SCALE_FROM_PERMYRIAD(r));
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ return 1;
+ }
+
+ if (streq(field, "TasksMax"))
+ return bus_append_safe_atou64(m, field, eq);
+
+ return bus_append_parse_size(m, field, eq, 1024);
+ }
+
+ if (streq(field, "CPUQuota")) {
+ if (isempty(eq))
+ r = sd_bus_message_append(m, "(sv)", "CPUQuotaPerSecUSec", "t", USEC_INFINITY);
+ else {
+ r = parse_permyriad_unbounded(eq);
+ if (r == 0)
+ return log_error_errno(SYNTHETIC_ERRNO(ERANGE),
+ "CPU quota too small.");
+ if (r < 0)
+ return log_error_errno(r, "CPU quota '%s' invalid.", eq);
+
+ r = sd_bus_message_append(m, "(sv)", "CPUQuotaPerSecUSec", "t", (((uint64_t) r * USEC_PER_SEC) / 10000U));
+ }
+
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ return 1;
+ }
+
+ if (streq(field, "CPUQuotaPeriodSec")) {
+ usec_t u = USEC_INFINITY;
+
+ r = parse_sec_def_infinity(eq, &u);
+ if (r < 0)
+ return log_error_errno(r, "CPU quota period '%s' invalid.", eq);
+
+ r = sd_bus_message_append(m, "(sv)", "CPUQuotaPeriodUSec", "t", u);
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ return 1;
+ }
+
+ if (streq(field, "DeviceAllow")) {
+ if (isempty(eq))
+ r = sd_bus_message_append(m, "(sv)", field, "a(ss)", 0);
+ else {
+ const char *path = eq, *rwm = NULL, *e;
+
+ e = strchr(eq, ' ');
+ if (e) {
+ path = strndupa_safe(eq, e - eq);
+ rwm = e+1;
+ }
+
+ r = sd_bus_message_append(m, "(sv)", field, "a(ss)", 1, path, strempty(rwm));
+ }
+
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ return 1;
+ }
+
+ if (cgroup_io_limit_type_from_string(field) >= 0 || STR_IN_SET(field, "BlockIOReadBandwidth", "BlockIOWriteBandwidth")) {
+ if (isempty(eq))
+ r = sd_bus_message_append(m, "(sv)", field, "a(st)", 0);
+ else {
+ const char *path, *bandwidth, *e;
+ uint64_t bytes;
+
+ e = strchr(eq, ' ');
+ if (!e)
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+ "Failed to parse %s value %s.",
+ field, eq);
+
+ path = strndupa_safe(eq, e - eq);
+ bandwidth = e+1;
+
+ if (streq(bandwidth, "infinity"))
+ bytes = CGROUP_LIMIT_MAX;
+ else {
+ r = parse_size(bandwidth, 1000, &bytes);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse byte value %s: %m", bandwidth);
+ }
+
+ r = sd_bus_message_append(m, "(sv)", field, "a(st)", 1, path, bytes);
+ }
+
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ return 1;
+ }
+
+ if (STR_IN_SET(field, "IODeviceWeight",
+ "BlockIODeviceWeight")) {
+ if (isempty(eq))
+ r = sd_bus_message_append(m, "(sv)", field, "a(st)", 0);
+ else {
+ const char *path, *weight, *e;
+ uint64_t u;
+
+ e = strchr(eq, ' ');
+ if (!e)
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+ "Failed to parse %s value %s.",
+ field, eq);
+
+ path = strndupa_safe(eq, e - eq);
+ weight = e+1;
+
+ r = safe_atou64(weight, &u);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse %s value %s: %m", field, weight);
+
+ r = sd_bus_message_append(m, "(sv)", field, "a(st)", 1, path, u);
+ }
+
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ return 1;
+ }
+
+ if (streq(field, "IODeviceLatencyTargetSec")) {
+ const char *field_usec = "IODeviceLatencyTargetUSec";
+
+ if (isempty(eq))
+ r = sd_bus_message_append(m, "(sv)", field_usec, "a(st)", USEC_INFINITY);
+ else {
+ const char *path, *target, *e;
+ usec_t usec;
+
+ e = strchr(eq, ' ');
+ if (!e)
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+ "Failed to parse %s value %s.",
+ field, eq);
+
+ path = strndupa_safe(eq, e - eq);
+ target = e+1;
+
+ r = parse_sec(target, &usec);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse %s value %s: %m", field, target);
+
+ r = sd_bus_message_append(m, "(sv)", field_usec, "a(st)", 1, path, usec);
+ }
+
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ return 1;
+ }
+
+ if (STR_IN_SET(field, "IPAddressAllow",
+ "IPAddressDeny")) {
+ unsigned char prefixlen;
+ union in_addr_union prefix = {};
+ int family;
+
+ if (isempty(eq)) {
+ r = sd_bus_message_append(m, "(sv)", field, "a(iayu)", 0);
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ return 1;
+ }
+
+ r = sd_bus_message_open_container(m, SD_BUS_TYPE_STRUCT, "sv");
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ r = sd_bus_message_append_basic(m, SD_BUS_TYPE_STRING, field);
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ r = sd_bus_message_open_container(m, 'v', "a(iayu)");
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ r = sd_bus_message_open_container(m, 'a', "(iayu)");
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ if (streq(eq, "any")) {
+ /* "any" is a shortcut for 0.0.0.0/0 and ::/0 */
+
+ r = bus_append_ip_address_access(m, AF_INET, &prefix, 0);
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ r = bus_append_ip_address_access(m, AF_INET6, &prefix, 0);
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ } else if (is_localhost(eq)) {
+ /* "localhost" is a shortcut for 127.0.0.0/8 and ::1/128 */
+
+ prefix.in.s_addr = htobe32(0x7f000000);
+ r = bus_append_ip_address_access(m, AF_INET, &prefix, 8);
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ prefix.in6 = (struct in6_addr) IN6ADDR_LOOPBACK_INIT;
+ r = bus_append_ip_address_access(m, AF_INET6, &prefix, 128);
+ if (r < 0)
+ return r;
+
+ } else if (streq(eq, "link-local")) {
+ /* "link-local" is a shortcut for 169.254.0.0/16 and fe80::/64 */
+
+ prefix.in.s_addr = htobe32((UINT32_C(169) << 24 | UINT32_C(254) << 16));
+ r = bus_append_ip_address_access(m, AF_INET, &prefix, 16);
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ prefix.in6 = (struct in6_addr) {
+ .s6_addr32[0] = htobe32(0xfe800000)
+ };
+ r = bus_append_ip_address_access(m, AF_INET6, &prefix, 64);
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ } else if (streq(eq, "multicast")) {
+ /* "multicast" is a shortcut for 224.0.0.0/4 and ff00::/8 */
+
+ prefix.in.s_addr = htobe32((UINT32_C(224) << 24));
+ r = bus_append_ip_address_access(m, AF_INET, &prefix, 4);
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ prefix.in6 = (struct in6_addr) {
+ .s6_addr32[0] = htobe32(0xff000000)
+ };
+ r = bus_append_ip_address_access(m, AF_INET6, &prefix, 8);
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ } else {
+ for (;;) {
+ _cleanup_free_ char *word = NULL;
+
+ r = extract_first_word(&eq, &word, NULL, 0);
+ if (r == 0)
+ break;
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse %s: %s", field, eq);
+
+ r = in_addr_prefix_from_string_auto(word, &family, &prefix, &prefixlen);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse IP address prefix: %s", word);
+
+ r = bus_append_ip_address_access(m, family, &prefix, prefixlen);
+ if (r < 0)
+ return bus_log_create_error(r);
+ }
+ }
+
+ r = sd_bus_message_close_container(m);
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ r = sd_bus_message_close_container(m);
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ r = sd_bus_message_close_container(m);
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ return 1;
+ }
+
+ if (STR_IN_SET(field, "IPIngressFilterPath",
+ "IPEgressFilterPath")) {
+ if (isempty(eq))
+ r = sd_bus_message_append(m, "(sv)", field, "as", 0);
+ else
+ r = sd_bus_message_append(m, "(sv)", field, "as", 1, eq);
+
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ return 1;
+ }
+
+ if (streq(field, "BPFProgram")) {
+ if (isempty(eq))
+ r = sd_bus_message_append(m, "(sv)", field, "a(ss)", 0);
+ else {
+ _cleanup_free_ char *word = NULL;
+
+ r = extract_first_word(&eq, &word, ":", 0);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse %s: %m", field);
+
+ r = sd_bus_message_append(m, "(sv)", field, "a(ss)", 1, word, eq);
+ }
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ return 1;
+ }
+
+ if (STR_IN_SET(field, "SocketBindAllow",
+ "SocketBindDeny")) {
+ if (isempty(eq))
+ r = sd_bus_message_append(m, "(sv)", field, "a(iiqq)", 0);
+ else {
+ int32_t family, ip_protocol;
+ uint16_t nr_ports, port_min;
+
+ r = parse_socket_bind_item(eq, &family, &ip_protocol, &nr_ports, &port_min);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse %s", field);
+
+ r = sd_bus_message_append(
+ m, "(sv)", field, "a(iiqq)", 1, family, ip_protocol, nr_ports, port_min);
+ }
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ return 1;
+ }
+
+ if (streq(field, "MemoryPressureThresholdSec"))
+ return bus_append_parse_sec_rename(m, field, eq);
+
+ if (streq(field, "NFTSet"))
+ return bus_append_nft_set(m, field, eq);
+
+ return 0;
+}
+
+/* Appends an automount unit setting to 'm'. Each recognized field is handed to the bus_append_*()
+ * helper matching its value syntax and that helper's result is returned unchanged; 0 is returned
+ * when 'field' is none of the settings checked here. */
+static int bus_append_automount_property(sd_bus_message *m, const char *field, const char *eq) {
+        /* Time span value. */
+        if (streq(field, "TimeoutIdleSec"))
+                return bus_append_parse_sec_rename(m, field, eq);
+
+        /* File access mode value. */
+        if (streq(field, "DirectoryMode"))
+                return bus_append_parse_mode(m, field, eq);
+
+        /* Plain string values. */
+        if (streq(field, "Where") || streq(field, "ExtraOptions"))
+                return bus_append_string(m, field, eq);
+
+        return 0;
+}
+
+/* Translates one execution-environment setting (name in 'field', textual value in 'eq') into an "(sv)"
+ * property entry appended to the bus message 'm'.
+ *
+ * Returns 1 when one of the inline branches below appended the property, 0 when 'field' matches none of
+ * the settings handled here, and a negative errno-style value on failure, after reporting it through
+ * log_error_errno(), log_oom() or bus_log_create_error(). Fields that are delegated to a bus_append_*()
+ * helper return whatever that helper returns. */
+static int bus_append_execute_property(sd_bus_message *m, const char *field, const char *eq) {
+        const char *suffix;
+        int r;
+
+        /* Settings passed on as a plain string. */
+        if (STR_IN_SET(field, "User",
+                              "Group",
+                              "UtmpIdentifier",
+                              "UtmpMode",
+                              "PAMName",
+                              "TTYPath",
+                              "WorkingDirectory",
+                              "RootDirectory",
+                              "SyslogIdentifier",
+                              "ProtectSystem",
+                              "ProtectHome",
+                              "SELinuxContext",
+                              "RootImage",
+                              "RootVerity",
+                              "RuntimeDirectoryPreserve",
+                              "Personality",
+                              "KeyringMode",
+                              "ProtectProc",
+                              "ProcSubset",
+                              "NetworkNamespacePath",
+                              "IPCNamespacePath",
+                              "LogNamespace",
+                              "RootImagePolicy",
+                              "MountImagePolicy",
+                              "ExtensionImagePolicy"))
+                return bus_append_string(m, field, eq);
+
+        /* Settings parsed as a boolean. */
+        if (STR_IN_SET(field, "IgnoreSIGPIPE",
+                              "TTYVHangup",
+                              "TTYReset",
+                              "TTYVTDisallocate",
+                              "PrivateTmp",
+                              "PrivateDevices",
+                              "PrivateNetwork",
+                              "PrivateUsers",
+                              "PrivateMounts",
+                              "PrivateIPC",
+                              "NoNewPrivileges",
+                              "SyslogLevelPrefix",
+                              "MemoryDenyWriteExecute",
+                              "RestrictRealtime",
+                              "DynamicUser",
+                              "RemoveIPC",
+                              "ProtectKernelTunables",
+                              "ProtectKernelModules",
+                              "ProtectKernelLogs",
+                              "ProtectClock",
+                              "ProtectControlGroups",
+                              "MountAPIVFS",
+                              "CPUSchedulingResetOnFork",
+                              "LockPersonality",
+                              "ProtectHostname",
+                              "MemoryKSM",
+                              "RestrictSUIDSGID",
+                              "RootEphemeral",
+                              "SetLoginEnvironment"))
+                return bus_append_parse_boolean(m, field, eq);
+
+        /* Settings split into a string list, with quotes removed. */
+        if (STR_IN_SET(field, "ReadWriteDirectories",
+                              "ReadOnlyDirectories",
+                              "InaccessibleDirectories",
+                              "ReadWritePaths",
+                              "ReadOnlyPaths",
+                              "InaccessiblePaths",
+                              "ExecPaths",
+                              "NoExecPaths",
+                              "ExecSearchPath",
+                              "ExtensionDirectories",
+                              "ConfigurationDirectory",
+                              "SupplementaryGroups",
+                              "SystemCallArchitectures"))
+                return bus_append_strv(m, field, eq, EXTRACT_UNQUOTE);
+
+        if (STR_IN_SET(field, "SyslogLevel",
+                              "LogLevelMax"))
+                return bus_append_log_level_from_string(m, field, eq);
+
+        if (streq(field, "SyslogFacility"))
+                return bus_append_log_facility_unshifted_from_string(m, field, eq);
+
+        if (streq(field, "SecureBits"))
+                return bus_append_secure_bits_from_string(m, field, eq);
+
+        if (streq(field, "CPUSchedulingPolicy"))
+                return bus_append_sched_policy_from_string(m, field, eq);
+
+        if (STR_IN_SET(field, "CPUSchedulingPriority",
+                              "OOMScoreAdjust"))
+                return bus_append_safe_atoi(m, field, eq);
+
+        if (streq(field, "CoredumpFilter"))
+                return bus_append_coredump_filter_mask_from_string(m, field, eq);
+
+        if (streq(field, "Nice"))
+                return bus_append_parse_nice(m, field, eq);
+
+        if (streq(field, "SystemCallErrorNumber"))
+                return bus_append_seccomp_parse_errno_or_action(m, field, eq);
+
+        if (streq(field, "IOSchedulingClass"))
+                return bus_append_ioprio_class_from_string(m, field, eq);
+
+        if (streq(field, "IOSchedulingPriority"))
+                return bus_append_ioprio_parse_priority(m, field, eq);
+
+        if (STR_IN_SET(field, "RuntimeDirectoryMode",
+                              "StateDirectoryMode",
+                              "CacheDirectoryMode",
+                              "LogsDirectoryMode",
+                              "ConfigurationDirectoryMode",
+                              "UMask"))
+                return bus_append_parse_mode(m, field, eq);
+
+        if (streq(field, "TimerSlackNSec"))
+                return bus_append_parse_nsec(m, field, eq);
+
+        if (streq(field, "LogRateLimitIntervalSec"))
+                return bus_append_parse_sec_rename(m, field, eq);
+
+        if (STR_IN_SET(field, "LogRateLimitBurst",
+                              "TTYRows",
+                              "TTYColumns"))
+                return bus_append_safe_atou(m, field, eq);
+
+        if (streq(field, "MountFlags"))
+                return bus_append_mount_propagation_flag_from_string(m, field, eq);
+
+        if (STR_IN_SET(field, "Environment",
+                              "UnsetEnvironment",
+                              "PassEnvironment"))
+                return bus_append_strv(m, field, eq, EXTRACT_UNQUOTE|EXTRACT_CUNESCAPE);
+
+        if (streq(field, "EnvironmentFile")) {
+                /* Sent under the plural property name "EnvironmentFiles". A leading '-' is cut off the
+                 * path and transmitted as the boolean of the (sb) pair instead. */
+                if (isempty(eq))
+                        r = sd_bus_message_append(m, "(sv)", "EnvironmentFiles", "a(sb)", 0);
+                else
+                        r = sd_bus_message_append(m, "(sv)", "EnvironmentFiles", "a(sb)", 1,
+                                                  eq[0] == '-' ? eq + 1 : eq,
+                                                  eq[0] == '-');
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                return 1;
+        }
+
+        if (STR_IN_SET(field, "SetCredential", "SetCredentialEncrypted")) {
+                r = sd_bus_message_open_container(m, 'r', "sv");
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_append_basic(m, 's', field);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_open_container(m, 'v', "a(say)");
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                if (isempty(eq))
+                        r = sd_bus_message_append(m, "a(say)", 0);
+                else {
+                        _cleanup_free_ char *word = NULL;
+                        const char *p = eq;
+
+                        /* Value syntax is "ID:DATA": 'word' receives the part before the first ':', 'p' is
+                         * left pointing at the data that follows it. */
+                        r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
+                        if (r == -ENOMEM)
+                                return log_oom();
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse %s= parameter: %s", field, eq);
+                        if (r == 0 || !p)
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing argument to %s=.", field);
+
+                        r = sd_bus_message_open_container(m, 'a', "(say)");
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        r = sd_bus_message_open_container(m, 'r', "say");
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        r = sd_bus_message_append(m, "s", word);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        if (streq(field, "SetCredentialEncrypted")) {
+                                /* Encrypted credentials are given in base64 and sent decoded. */
+                                _cleanup_free_ void *decoded = NULL;
+                                size_t decoded_size;
+
+                                r = unbase64mem(p, SIZE_MAX, &decoded, &decoded_size);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to base64 decode encrypted credential: %m");
+
+                                r = sd_bus_message_append_array(m, 'y', decoded, decoded_size);
+                        } else {
+                                /* Plain credentials are C-unescaped first; NUL bytes are accepted. */
+                                _cleanup_free_ char *unescaped = NULL;
+                                ssize_t l;
+
+                                l = cunescape(p, UNESCAPE_ACCEPT_NUL, &unescaped);
+                                if (l < 0)
+                                        return log_error_errno(l, "Failed to unescape %s= value: %s", field, p);
+
+                                r = sd_bus_message_append_array(m, 'y', unescaped, l);
+                        }
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        r = sd_bus_message_close_container(m);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        r = sd_bus_message_close_container(m);
+                }
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_close_container(m);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_close_container(m);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                return 1;
+        }
+
+        if (STR_IN_SET(field, "LoadCredential", "LoadCredentialEncrypted")) {
+                r = sd_bus_message_open_container(m, 'r', "sv");
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_append_basic(m, 's', field);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_open_container(m, 'v', "a(ss)");
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                if (isempty(eq))
+                        r = sd_bus_message_append(m, "a(ss)", 0);
+                else {
+                        _cleanup_free_ char *word = NULL;
+                        const char *p = eq;
+
+                        r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
+                        if (r == -ENOMEM)
+                                return log_oom();
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse %s= parameter: %s", field, eq);
+                        if (r == 0)
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing argument to %s=.", field);
+
+                        if (isempty(p)) /* If only one field is specified, then this means "inherit from above" */
+                                p = eq;
+
+                        r = sd_bus_message_append(m, "a(ss)", 1, word, p);
+                }
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_close_container(m);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_close_container(m);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                return 1;
+        }
+
+        if (streq(field, "ImportCredential")) {
+                if (isempty(eq))
+                        r = sd_bus_message_append(m, "(sv)", field, "as", 0);
+                else
+                        r = sd_bus_message_append(m, "(sv)", field, "as", 1, eq);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                return 1;
+        }
+
+        if (streq(field, "LogExtraFields")) {
+                /* The value is sent as a single byte array (without trailing NUL) inside an "aay". */
+                r = sd_bus_message_open_container(m, 'r', "sv");
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_append_basic(m, 's', "LogExtraFields");
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_open_container(m, 'v', "aay");
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_open_container(m, 'a', "ay");
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_append_array(m, 'y', eq, strlen(eq));
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_close_container(m);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_close_container(m);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_close_container(m);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                return 1;
+        }
+
+        if (streq(field, "LogFilterPatterns")) {
+                /* The boolean is false when the value starts with '~'; the '~' itself is not sent. */
+                r = sd_bus_message_append(m, "(sv)", "LogFilterPatterns", "a(bs)", 1,
+                                          eq[0] != '~',
+                                          eq[0] != '~' ? eq : eq + 1);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                return 1;
+        }
+
+        if (STR_IN_SET(field, "StandardInput",
+                              "StandardOutput",
+                              "StandardError")) {
+                const char *n, *appended;
+
+                /* Values with an "fd:", "file:", "append:" or "truncate:" prefix are sent under a
+                 * property name derived from 'field' plus a suffix, carrying only the text after the
+                 * prefix. Anything else is sent as-is under 'field'. */
+                if ((n = startswith(eq, "fd:"))) {
+                        appended = strjoina(field, "FileDescriptorName");
+                        r = sd_bus_message_append(m, "(sv)", appended, "s", n);
+                } else if ((n = startswith(eq, "file:"))) {
+                        appended = strjoina(field, "File");
+                        r = sd_bus_message_append(m, "(sv)", appended, "s", n);
+                } else if ((n = startswith(eq, "append:"))) {
+                        appended = strjoina(field, "FileToAppend");
+                        r = sd_bus_message_append(m, "(sv)", appended, "s", n);
+                } else if ((n = startswith(eq, "truncate:"))) {
+                        appended = strjoina(field, "FileToTruncate");
+                        r = sd_bus_message_append(m, "(sv)", appended, "s", n);
+                } else
+                        r = sd_bus_message_append(m, "(sv)", field, "s", eq);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                return 1;
+        }
+
+        if (streq(field, "StandardInputText")) {
+                _cleanup_free_ char *unescaped = NULL;
+                ssize_t l;
+
+                l = cunescape(eq, 0, &unescaped);
+                if (l < 0)
+                        return log_error_errno(l, "Failed to unescape text '%s': %m", eq);
+
+                /* A newline is appended to the text, hence the "l + 1" below. */
+                if (!strextend(&unescaped, "\n"))
+                        return log_oom();
+
+                /* Note that we don't expand specifiers here, but that should be OK, as this is a
+                 * programmatic interface anyway */
+
+                return bus_append_byte_array(m, field, unescaped, l + 1);
+        }
+
+        if (streq(field, "StandardInputData")) {
+                _cleanup_free_ void *decoded = NULL;
+                size_t sz;
+
+                r = unbase64mem(eq, SIZE_MAX, &decoded, &sz);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to decode base64 data '%s': %m", eq);
+
+                return bus_append_byte_array(m, field, decoded, sz);
+        }
+
+        if ((suffix = startswith(field, "Limit"))) {
+                int rl;
+
+                /* Only handled here if the text after "Limit" names a resource limit; otherwise fall
+                 * through to the remaining checks. */
+                rl = rlimit_from_string(suffix);
+                if (rl >= 0) {
+                        const char *sn;
+                        struct rlimit l;
+
+                        r = rlimit_parse(rl, eq, &l);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse resource limit: %s", eq);
+
+                        /* Two properties are sent: 'field' carries rlim_max, 'field' + "Soft" carries
+                         * rlim_cur. */
+                        r = sd_bus_message_append(m, "(sv)", field, "t", (uint64_t) l.rlim_max);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        sn = strjoina(field, "Soft");
+                        r = sd_bus_message_append(m, "(sv)", sn, "t", (uint64_t) l.rlim_cur);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        return 1;
+                }
+        }
+
+        if (STR_IN_SET(field, "AppArmorProfile",
+                              "SmackProcessLabel")) {
+                int ignore = 0;
+                const char *s = eq;
+
+                /* A leading '-' is stripped and sent as the boolean of the (bs) pair. */
+                if (eq[0] == '-') {
+                        ignore = 1;
+                        s = eq + 1;
+                }
+
+                r = sd_bus_message_append(m, "(sv)", field, "(bs)", ignore, s);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                return 1;
+        }
+
+        if (STR_IN_SET(field, "CapabilityBoundingSet",
+                              "AmbientCapabilities")) {
+                uint64_t sum = 0;
+                bool invert = false;
+                const char *p = eq;
+
+                /* A leading '~' requests the bitwise complement of the parsed set. */
+                if (*p == '~') {
+                        invert = true;
+                        p++;
+                }
+
+                r = capability_set_from_string(p, &sum);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse %s value %s: %m", field, eq);
+
+                sum = invert ? ~sum : sum;
+
+                r = sd_bus_message_append(m, "(sv)", field, "t", sum);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                return 1;
+        }
+
+        if (streq(field, "CPUAffinity")) {
+                _cleanup_(cpu_set_reset) CPUSet cpuset = {};
+                _cleanup_free_ uint8_t *array = NULL;
+                size_t allocated;
+
+                /* "numa" is not a CPU list: it is sent as the boolean property CPUAffinityFromNUMA. */
+                if (eq && streq(eq, "numa")) {
+                        r = sd_bus_message_append(m, "(sv)", "CPUAffinityFromNUMA", "b", true);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        /* Return 1 explicitly, like every other branch: sd_bus_message_append() only
+                         * promises a non-negative value on success, and 0 is what this function
+                         * returns for a field it did not handle. */
+                        return 1;
+                }
+
+                r = parse_cpu_set(eq, &cpuset);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse %s value: %s", field, eq);
+
+                r = cpu_set_to_dbus(&cpuset, &array, &allocated);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to serialize CPUAffinity: %m");
+
+                return bus_append_byte_array(m, field, array, allocated);
+        }
+
+        if (streq(field, "NUMAPolicy")) {
+                r = mpol_from_string(eq);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse %s value: %s", field, eq);
+
+                r = sd_bus_message_append(m, "(sv)", field, "i", (int32_t) r);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                return 1;
+        }
+
+        if (streq(field, "NUMAMask")) {
+                _cleanup_(cpu_set_reset) CPUSet nodes = {};
+                _cleanup_free_ uint8_t *array = NULL;
+                size_t allocated;
+
+                if (eq && streq(eq, "all")) {
+                        r = numa_mask_add_all(&nodes);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to create NUMA mask representing \"all\" NUMA nodes: %m");
+                } else {
+                        r = parse_cpu_set(eq, &nodes);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse %s value: %s", field, eq);
+                }
+
+                r = cpu_set_to_dbus(&nodes, &array, &allocated);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to serialize NUMAMask: %m");
+
+                return bus_append_byte_array(m, field, array, allocated);
+        }
+
+        if (STR_IN_SET(field, "RestrictAddressFamilies",
+                              "RestrictFileSystems",
+                              "SystemCallFilter",
+                              "SystemCallLog",
+                              "RestrictNetworkInterfaces")) {
+                int allow_list = 1;
+                const char *p = eq;
+
+                /* Sent as (bas): the boolean is cleared when the value starts with '~', the array
+                 * holds the individual words that follow. */
+                if (*p == '~') {
+                        allow_list = 0;
+                        p++;
+                }
+
+                r = sd_bus_message_open_container(m, SD_BUS_TYPE_STRUCT, "sv");
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_append_basic(m, SD_BUS_TYPE_STRING, field);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_open_container(m, 'v', "(bas)");
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_open_container(m, 'r', "bas");
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_append_basic(m, 'b', &allow_list);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_open_container(m, 'a', "s");
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                for (;;) {
+                        _cleanup_free_ char *word = NULL;
+
+                        r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE);
+                        if (r == 0)
+                                break;
+                        if (r == -ENOMEM)
+                                return log_oom();
+                        if (r < 0)
+                                return log_error_errno(r, "Invalid syntax: %s", eq);
+
+                        r = sd_bus_message_append_basic(m, 's', word);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+                }
+
+                r = sd_bus_message_close_container(m);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_close_container(m);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_close_container(m);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_close_container(m);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                return 1;
+        }
+
+        if (streq(field, "RestrictNamespaces")) {
+                bool invert = false;
+                unsigned long flags;
+
+                /* A true boolean maps to an empty flag set, a false one to NAMESPACE_FLAGS_ALL;
+                 * anything else is parsed as a namespace list, optionally inverted by a leading '~'. */
+                r = parse_boolean(eq);
+                if (r > 0)
+                        flags = 0;
+                else if (r == 0)
+                        flags = NAMESPACE_FLAGS_ALL;
+                else {
+                        if (eq[0] == '~') {
+                                invert = true;
+                                eq++;
+                        }
+
+                        r = namespace_flags_from_string(eq, &flags);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse %s value %s.", field, eq);
+                }
+
+                if (invert)
+                        flags = (~flags) & NAMESPACE_FLAGS_ALL;
+
+                r = sd_bus_message_append(m, "(sv)", field, "t", (uint64_t) flags);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                return 1;
+        }
+
+        if (STR_IN_SET(field, "BindPaths",
+                              "BindReadOnlyPaths")) {
+                const char *p = eq;
+
+                r = sd_bus_message_open_container(m, SD_BUS_TYPE_STRUCT, "sv");
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_append_basic(m, SD_BUS_TYPE_STRING, field);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_open_container(m, 'v', "a(ssbt)");
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_open_container(m, 'a', "(ssbt)");
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                /* Each entry is "[-]SOURCE[:DESTINATION[:OPTIONS]]". */
+                for (;;) {
+                        _cleanup_free_ char *source = NULL, *destination = NULL;
+                        char *s = NULL, *d = NULL;
+                        bool ignore_enoent = false;
+                        uint64_t flags = MS_REC;
+
+                        r = extract_first_word(&p, &source, ":" WHITESPACE, EXTRACT_UNQUOTE|EXTRACT_DONT_COALESCE_SEPARATORS);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse argument: %m");
+                        if (r == 0)
+                                break;
+
+                        s = source;
+                        if (s[0] == '-') {
+                                ignore_enoent = true;
+                                s++;
+                        }
+
+                        /* p[-1] is the separator that ended the previous word: only a ':' means that
+                         * another component of the same entry follows. */
+                        if (p && p[-1] == ':') {
+                                r = extract_first_word(&p, &destination, ":" WHITESPACE, EXTRACT_UNQUOTE|EXTRACT_DONT_COALESCE_SEPARATORS);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to parse argument: %m");
+                                if (r == 0)
+                                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                               "Missing argument after ':': %s",
+                                                               eq);
+
+                                d = destination;
+
+                                if (p && p[-1] == ':') {
+                                        _cleanup_free_ char *options = NULL;
+
+                                        r = extract_first_word(&p, &options, NULL, EXTRACT_UNQUOTE);
+                                        if (r < 0)
+                                                return log_error_errno(r, "Failed to parse argument: %m");
+
+                                        if (isempty(options) || streq(options, "rbind"))
+                                                flags = MS_REC;
+                                        else if (streq(options, "norbind"))
+                                                flags = 0;
+                                        else
+                                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                                       "Unknown options: %s",
+                                                                       eq);
+                                }
+                        } else
+                                d = s; /* No destination given: use the source path for both. */
+
+                        r = sd_bus_message_append(m, "(ssbt)", s, d, ignore_enoent, flags);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+                }
+
+                r = sd_bus_message_close_container(m);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_close_container(m);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_close_container(m);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                return 1;
+        }
+
+        if (streq(field, "TemporaryFileSystem")) {
+                const char *p = eq;
+
+                r = sd_bus_message_open_container(m, SD_BUS_TYPE_STRUCT, "sv");
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_append_basic(m, SD_BUS_TYPE_STRING, field);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_open_container(m, 'v', "a(ss)");
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_open_container(m, 'a', "(ss)");
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                /* Each word is "PATH[:OPTIONS]". */
+                for (;;) {
+                        _cleanup_free_ char *word = NULL, *path = NULL;
+                        const char *w;
+
+                        r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse argument: %m");
+                        if (r == 0)
+                                break;
+
+                        w = word;
+                        r = extract_first_word(&w, &path, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse argument: %m");
+                        if (r == 0)
+                                /* Report the full value: 'p' is only the extraction cursor and may
+                                 * already be NULL once the last word has been consumed. */
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Failed to parse argument: %s",
+                                                       eq);
+
+                        r = sd_bus_message_append(m, "(ss)", path, w);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+                }
+
+                r = sd_bus_message_close_container(m);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_close_container(m);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_close_container(m);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                return 1;
+        }
+
+        if (streq(field, "RootHash")) {
+                _cleanup_free_ void *roothash_decoded = NULL;
+                size_t roothash_decoded_size = 0;
+
+                /* We have the path to a roothash to load and decode, eg: RootHash=/foo/bar.roothash */
+                if (path_is_absolute(eq))
+                        return bus_append_string(m, "RootHashPath", eq);
+
+                /* We have a roothash to decode, eg: RootHash=012345789abcdef */
+                r = unhexmem(eq, strlen(eq), &roothash_decoded, &roothash_decoded_size);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to decode RootHash= '%s': %m", eq);
+                if (roothash_decoded_size < sizeof(sd_id128_t))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "RootHash= '%s' is too short: %m", eq);
+
+                return bus_append_byte_array(m, field, roothash_decoded, roothash_decoded_size);
+        }
+
+        if (streq(field, "RootHashSignature")) {
+                _cleanup_free_ void *roothash_sig_decoded = NULL;
+                char *value;
+                size_t roothash_sig_decoded_size = 0;
+
+                /* We have the path to a roothash signature to load and decode, eg: RootHash=/foo/bar.roothash.p7s */
+                if (path_is_absolute(eq))
+                        return bus_append_string(m, "RootHashSignaturePath", eq);
+
+                if (!(value = startswith(eq, "base64:")))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to decode RootHashSignature= '%s', not a path but doesn't start with 'base64:': %m", eq);
+
+                /* We have a roothash signature to decode, eg: RootHashSignature=base64:012345789abcdef */
+                r = unbase64mem(value, strlen(value), &roothash_sig_decoded, &roothash_sig_decoded_size);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to decode RootHashSignature= '%s': %m", eq);
+
+                return bus_append_byte_array(m, field, roothash_sig_decoded, roothash_sig_decoded_size);
+        }
+
+        if (streq(field, "RootImageOptions")) {
+                _cleanup_strv_free_ char **l = NULL;
+                const char *p = eq;
+
+                r = sd_bus_message_open_container(m, SD_BUS_TYPE_STRUCT, "sv");
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_append_basic(m, SD_BUS_TYPE_STRING, field);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_open_container(m, 'v', "a(ss)");
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_open_container(m, 'a', "(ss)");
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = strv_split_colon_pairs(&l, p);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse argument: %m");
+
+                /* A pair whose second half is empty is sent as ("root", first). */
+                STRV_FOREACH_PAIR(first, second, l) {
+                        r = sd_bus_message_append(m, "(ss)",
+                                                  !isempty(*second) ? *first : "root",
+                                                  !isempty(*second) ? *second : *first);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+                }
+
+                r = sd_bus_message_close_container(m);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_close_container(m);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_close_container(m);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                return 1;
+        }
+
+        if (streq(field, "MountImages")) {
+                const char *p = eq;
+
+                r = sd_bus_message_open_container(m, SD_BUS_TYPE_STRUCT, "sv");
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_append_basic(m, SD_BUS_TYPE_STRING, field);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_open_container(m, 'v', "a(ssba(ss))");
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_open_container(m, 'a', "(ssba(ss))");
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                /* Each whitespace-separated tuple is "[-]SOURCE:DESTINATION[:PARTITION:OPTIONS...]". */
+                for (;;) {
+                        _cleanup_free_ char *first = NULL, *second = NULL, *tuple = NULL;
+                        const char *q = NULL, *source = NULL;
+                        bool permissive = false;
+
+                        r = extract_first_word(&p, &tuple, NULL, EXTRACT_UNQUOTE|EXTRACT_RETAIN_ESCAPE);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse MountImages= property: %s", eq);
+                        if (r == 0)
+                                break;
+
+                        q = tuple;
+                        r = extract_many_words(&q, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &first, &second, NULL);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse MountImages= property: %s", eq);
+                        if (r == 0)
+                                continue;
+
+                        source = first;
+                        if (source[0] == '-') {
+                                permissive = true;
+                                source++;
+                        }
+
+                        if (isempty(second))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Missing argument after ':': %s",
+                                                       eq);
+
+                        r = sd_bus_message_open_container(m, 'r', "ssba(ss)");
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        r = sd_bus_message_append(m, "ssb", source, second, permissive);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        r = sd_bus_message_open_container(m, 'a', "(ss)");
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        for (;;) {
+                                _cleanup_free_ char *partition = NULL, *mount_options = NULL;
+
+                                r = extract_many_words(&q, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &partition, &mount_options, NULL);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to parse MountImages= property: %s", eq);
+                                if (r == 0)
+                                        break;
+                                /* Single set of options, applying to the root partition/single filesystem */
+                                if (r == 1) {
+                                        r = sd_bus_message_append(m, "(ss)", "root", partition);
+                                        if (r < 0)
+                                                return bus_log_create_error(r);
+
+                                        break;
+                                }
+
+                                r = sd_bus_message_append(m, "(ss)", partition, mount_options);
+                                if (r < 0)
+                                        return bus_log_create_error(r);
+                        }
+
+                        r = sd_bus_message_close_container(m);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        r = sd_bus_message_close_container(m);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+                }
+
+                r = sd_bus_message_close_container(m);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_close_container(m);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_close_container(m);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                return 1;
+        }
+
+        if (streq(field, "ExtensionImages")) {
+                const char *p = eq;
+
+                r = sd_bus_message_open_container(m, SD_BUS_TYPE_STRUCT, "sv");
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_append_basic(m, SD_BUS_TYPE_STRING, field);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_open_container(m, 'v', "a(sba(ss))");
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_open_container(m, 'a', "(sba(ss))");
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                /* Each whitespace-separated tuple is "[-]SOURCE[:PARTITION:OPTIONS...]". */
+                for (;;) {
+                        _cleanup_free_ char *source = NULL, *tuple = NULL;
+                        const char *q = NULL, *s = NULL;
+                        bool permissive = false;
+
+                        r = extract_first_word(&p, &tuple, NULL, EXTRACT_UNQUOTE|EXTRACT_RETAIN_ESCAPE);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse ExtensionImages= property: %s", eq);
+                        if (r == 0)
+                                break;
+
+                        q = tuple;
+                        r = extract_first_word(&q, &source, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse ExtensionImages= property: %s", eq);
+                        if (r == 0)
+                                continue;
+
+                        s = source;
+                        if (s[0] == '-') {
+                                permissive = true;
+                                s++;
+                        }
+
+                        r = sd_bus_message_open_container(m, 'r', "sba(ss)");
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        r = sd_bus_message_append(m, "sb", s, permissive);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        r = sd_bus_message_open_container(m, 'a', "(ss)");
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        for (;;) {
+                                _cleanup_free_ char *partition = NULL, *mount_options = NULL;
+
+                                r = extract_many_words(&q, ":", EXTRACT_CUNESCAPE|EXTRACT_UNESCAPE_SEPARATORS, &partition, &mount_options, NULL);
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to parse ExtensionImages= property: %s", eq);
+                                if (r == 0)
+                                        break;
+                                /* Single set of options, applying to the root partition/single filesystem */
+                                if (r == 1) {
+                                        r = sd_bus_message_append(m, "(ss)", "root", partition);
+                                        if (r < 0)
+                                                return bus_log_create_error(r);
+
+                                        break;
+                                }
+
+                                r = sd_bus_message_append(m, "(ss)", partition, mount_options);
+                                if (r < 0)
+                                        return bus_log_create_error(r);
+                        }
+
+                        r = sd_bus_message_close_container(m);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        r = sd_bus_message_close_container(m);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+                }
+
+                r = sd_bus_message_close_container(m);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_close_container(m);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                r = sd_bus_message_close_container(m);
+                if (r < 0)
+                        return bus_log_create_error(r);
+
+                return 1;
+        }
+
+        if (STR_IN_SET(field, "StateDirectory", "RuntimeDirectory", "CacheDirectory", "LogsDirectory")) {
+                _cleanup_strv_free_ char **symlinks = NULL, **sources = NULL;
+                const char *p = eq;
+
+                /* Adding new directories is supported from both *DirectorySymlink methods and the
+                 * older ones, so first parse the input, and if we are given a new-style src:dst
+                 * tuple use the new method, else use the old one. */
+
+                for (;;) {
+                        _cleanup_free_ char *tuple = NULL, *source = NULL, *destination = NULL;
+
+                        r = extract_first_word(&p, &tuple, NULL, EXTRACT_UNQUOTE);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse argument: %m");
+                        if (r == 0)
+                                break;
+
+                        const char *t = tuple;
+                        r = extract_many_words(&t, ":", EXTRACT_UNQUOTE|EXTRACT_DONT_COALESCE_SEPARATORS, &source, &destination, NULL);
+                        if (r <= 0)
+                                return log_error_errno(r ?: SYNTHETIC_ERRNO(EINVAL), "Failed to parse argument: %m");
+
+                        path_simplify(source);
+
+                        if (isempty(destination)) {
+                                /* Growing the list is not a bus message operation, so report a
+                                 * failure as out-of-memory, like the strv_consume_pair() call below. */
+                                r = strv_consume(&sources, TAKE_PTR(source));
+                                if (r < 0)
+                                        return log_oom();
+                        } else {
+                                path_simplify(destination);
+
+                                r = strv_consume_pair(&symlinks, TAKE_PTR(source), TAKE_PTR(destination));
+                                if (r < 0)
+                                        return log_oom();
+                        }
+                }
+
+                if (!strv_isempty(sources)) {
+                        r = sd_bus_message_open_container(m, SD_BUS_TYPE_STRUCT, "sv");
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        r = sd_bus_message_append_basic(m, SD_BUS_TYPE_STRING, field);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        r = sd_bus_message_open_container(m, 'v', "as");
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        r = sd_bus_message_append_strv(m, sources);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        r = sd_bus_message_close_container(m);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        r = sd_bus_message_close_container(m);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+                }
+
+                /* For State and Runtime directories we support an optional destination parameter, which
+                 * will be used to create a symlink to the source. But it is new so we cannot change the
+                 * old DBUS signatures, so append a new message type. */
+                if (!strv_isempty(symlinks)) {
+                        const char *symlink_field;
+
+                        r = sd_bus_message_open_container(m, SD_BUS_TYPE_STRUCT, "sv");
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        if (streq(field, "StateDirectory"))
+                                symlink_field = "StateDirectorySymlink";
+                        else if (streq(field, "RuntimeDirectory"))
+                                symlink_field = "RuntimeDirectorySymlink";
+                        else if (streq(field, "CacheDirectory"))
+                                symlink_field = "CacheDirectorySymlink";
+                        else if (streq(field, "LogsDirectory"))
+                                symlink_field = "LogsDirectorySymlink";
+                        else
+                                assert_not_reached();
+
+                        r = sd_bus_message_append_basic(m, SD_BUS_TYPE_STRING, symlink_field);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        r = sd_bus_message_open_container(m, 'v', "a(sst)");
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        r = sd_bus_message_open_container(m, 'a', "(sst)");
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        STRV_FOREACH_PAIR(source, destination, symlinks) {
+                                r = sd_bus_message_append(m, "(sst)", *source, *destination, UINT64_C(0));
+                                if (r < 0)
+                                        return bus_log_create_error(r);
+                        }
+
+                        r = sd_bus_message_close_container(m);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        r = sd_bus_message_close_container(m);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+
+                        r = sd_bus_message_close_container(m);
+                        if (r < 0)
+                                return bus_log_create_error(r);
+                }
+
+                return 1;
+        }
+
+        return 0;
+}
+
+ /* Handles the kill-related settings: the *Signal= names, the SendSIGHUP=/SendSIGKILL= switches and
+  * KillMode=. Returns 0 if the field is none of them, otherwise the result of the typed append
+  * helper that took it. */
+ static int bus_append_kill_property(sd_bus_message *m, const char *field, const char *eq) {
+         if (STR_IN_SET(field, "KillSignal", "RestartKillSignal", "FinalKillSignal", "WatchdogSignal", "ReloadSignal"))
+                 return bus_append_signal_from_string(m, field, eq);
+
+         if (STR_IN_SET(field, "SendSIGHUP", "SendSIGKILL"))
+                 return bus_append_parse_boolean(m, field, eq);
+
+         if (streq(field, "KillMode"))
+                 return bus_append_string(m, field, eq);
+
+         /* Not ours, let the caller try the next parser. */
+         return 0;
+ }
+
+ /* Handles mount unit settings. Returns 0 when the field is not a mount setting, otherwise the
+  * result of the append helper that handled it. */
+ static int bus_append_mount_property(sd_bus_message *m, const char *field, const char *eq) {
+
+         /* Boolean switches */
+         if (STR_IN_SET(field, "SloppyOptions", "LazyUnmount", "ForceUnmount", "ReadwriteOnly"))
+                 return bus_append_parse_boolean(m, field, eq);
+
+         /* Access mode */
+         if (streq(field, "DirectoryMode"))
+                 return bus_append_parse_mode(m, field, eq);
+
+         /* Time span */
+         if (streq(field, "TimeoutSec"))
+                 return bus_append_parse_sec_rename(m, field, eq);
+
+         /* Plain strings */
+         if (STR_IN_SET(field, "What", "Where", "Options", "Type"))
+                 return bus_append_string(m, field, eq);
+
+         return 0;
+ }
+
+ /* Handles path unit settings. Returns 0 when the field is not a path setting, 1 when one of the
+  * watch settings was appended inline, a negative value on failure, or otherwise the result of the
+  * append helper that handled the field. */
+ static int bus_append_path_property(sd_bus_message *m, const char *field, const char *eq) {
+         int r;
+
+         if (STR_IN_SET(field, "PathExists", "PathExistsGlob", "PathChanged", "PathModified", "DirectoryNotEmpty")) {
+                 /* Every watch kind is sent as an entry of the "Paths" array, keyed by the setting
+                  * name. An empty value sends an empty array instead. */
+                 r = isempty(eq)
+                         ? sd_bus_message_append(m, "(sv)", "Paths", "a(ss)", 0)
+                         : sd_bus_message_append(m, "(sv)", "Paths", "a(ss)", 1, field, eq);
+                 if (r < 0)
+                         return bus_log_create_error(r);
+
+                 return 1;
+         }
+
+         if (STR_IN_SET(field, "TriggerLimitIntervalSec", "PollLimitIntervalSec"))
+                 return bus_append_parse_sec_rename(m, field, eq);
+
+         if (STR_IN_SET(field, "TriggerLimitBurst", "PollLimitBurst"))
+                 return bus_append_safe_atou(m, field, eq);
+
+         if (streq(field, "DirectoryMode"))
+                 return bus_append_parse_mode(m, field, eq);
+
+         if (streq(field, "MakeDirectory"))
+                 return bus_append_parse_boolean(m, field, eq);
+
+         return 0;
+ }
+
+ /* Handles scope unit settings. Returns 0 when the field is not a scope setting, otherwise the
+  * result of the append helper that handled it. */
+ static int bus_append_scope_property(sd_bus_message *m, const char *field, const char *eq) {
+
+         /* Time spans */
+         if (STR_IN_SET(field, "RuntimeMaxSec", "RuntimeRandomizedExtraSec", "TimeoutStopSec"))
+                 return bus_append_parse_sec_rename(m, field, eq);
+
+         /* Plain strings. Scope units have no execution context, but "User" and "Group" shall be
+          * settable nonetheless, hence they are accepted here alongside "OOMPolicy". */
+         if (STR_IN_SET(field, "User", "Group", "OOMPolicy"))
+                 return bus_append_string(m, field, eq);
+
+         return 0;
+ }
+
+ /* Handles service unit settings.
+  *
+  * Most settings are forwarded to a typed append helper. Two are handled specially:
+  *   - "TimeoutSec" is appended twice, as "TimeoutStartSec" and as "TimeoutStopSec".
+  *   - "RestartPreventExitStatus", "RestartForceExitStatus" and "SuccessExitStatus" take a list of
+  *     words, each either an exit status or a signal name, and are sent as "(aiai)": first the
+  *     array of exit statuses, then the array of signals.
+  *
+  * Returns 0 if the field is not a service setting. The exit-status branch returns 1 on success and
+  * a negative value on failure; all other branches return whatever the helper returned. */
+ static int bus_append_service_property(sd_bus_message *m, const char *field, const char *eq) {
+         int r;
+
+         if (STR_IN_SET(field, "PIDFile",
+                               "Type",
+                               "ExitType",
+                               "Restart",
+                               "RestartMode",
+                               "BusName",
+                               "NotifyAccess",
+                               "USBFunctionDescriptors",
+                               "USBFunctionStrings",
+                               "OOMPolicy",
+                               "TimeoutStartFailureMode",
+                               "TimeoutStopFailureMode",
+                               "FileDescriptorStorePreserve"))
+                 return bus_append_string(m, field, eq);
+
+         if (STR_IN_SET(field, "PermissionsStartOnly",
+                               "RootDirectoryStartOnly",
+                               "RemainAfterExit",
+                               "GuessMainPID"))
+                 return bus_append_parse_boolean(m, field, eq);
+
+         if (STR_IN_SET(field, "RestartSec",
+                               "RestartMaxDelaySec",
+                               "TimeoutStartSec",
+                               "TimeoutStopSec",
+                               "TimeoutAbortSec",
+                               "RuntimeMaxSec",
+                               "RuntimeRandomizedExtraSec",
+                               "WatchdogSec"))
+                 return bus_append_parse_sec_rename(m, field, eq);
+
+         if (streq(field, "TimeoutSec")) {
+                 /* Shorthand that sets both the start and the stop timeout. */
+                 r = bus_append_parse_sec_rename(m, "TimeoutStartSec", eq);
+                 if (r < 0)
+                         return r;
+
+                 return bus_append_parse_sec_rename(m, "TimeoutStopSec", eq);
+         }
+
+         if (STR_IN_SET(field, "FileDescriptorStoreMax",
+                               "RestartSteps"))
+                 return bus_append_safe_atou(m, field, eq);
+
+         if (STR_IN_SET(field, "ExecCondition",
+                               "ExecStartPre",
+                               "ExecStart",
+                               "ExecStartPost",
+                               "ExecConditionEx",
+                               "ExecStartPreEx",
+                               "ExecStartEx",
+                               "ExecStartPostEx",
+                               "ExecReload",
+                               "ExecStop",
+                               "ExecStopPost",
+                               "ExecReloadEx",
+                               "ExecStopEx",
+                               "ExecStopPostEx"))
+                 return bus_append_exec_command(m, field, eq);
+
+         if (STR_IN_SET(field, "RestartPreventExitStatus",
+                               "RestartForceExitStatus",
+                               "SuccessExitStatus")) {
+                 _cleanup_free_ int *status = NULL, *signal = NULL;
+                 size_t n_status = 0, n_signal = 0;
+                 const char *p;
+
+                 for (p = eq;;) {
+                         _cleanup_free_ char *word = NULL;
+                         int *grown;
+
+                         r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE);
+                         if (r == 0)
+                                 break;
+                         if (r == -ENOMEM)
+                                 return log_oom();
+                         if (r < 0)
+                                 return log_error_errno(r, "Invalid syntax in %s: %s", field, eq);
+
+                         /* We need to call exit_status_from_string() first, because we want
+                          * to parse numbers as exit statuses, not signals. */
+
+                         r = exit_status_from_string(word);
+                         if (r >= 0) {
+                                 assert(r < 256);
+
+                                 /* Grow via a temporary: on failure the old array must stay
+                                  * referenced by 'status' so that the cleanup handler frees it. */
+                                 grown = reallocarray(status, n_status + 1, sizeof(int));
+                                 if (!grown)
+                                         return log_oom();
+
+                                 status = grown;
+                                 status[n_status++] = r;
+
+                         } else if ((r = signal_from_string(word)) >= 0) {
+                                 grown = reallocarray(signal, n_signal + 1, sizeof(int));
+                                 if (!grown)
+                                         return log_oom();
+
+                                 signal = grown;
+                                 signal[n_signal++] = r;
+
+                         } else
+                                 /* Neither an exit status nor a signal. Note that r now holds the
+                                  * error returned by signal_from_string(). */
+                                 return log_error_errno(r, "Invalid status or signal %s in %s: %m",
+                                                        word, field);
+                 }
+
+                 r = sd_bus_message_open_container(m, SD_BUS_TYPE_STRUCT, "sv");
+                 if (r < 0)
+                         return bus_log_create_error(r);
+
+                 r = sd_bus_message_append_basic(m, SD_BUS_TYPE_STRING, field);
+                 if (r < 0)
+                         return bus_log_create_error(r);
+
+                 r = sd_bus_message_open_container(m, 'v', "(aiai)");
+                 if (r < 0)
+                         return bus_log_create_error(r);
+
+                 r = sd_bus_message_open_container(m, 'r', "aiai");
+                 if (r < 0)
+                         return bus_log_create_error(r);
+
+                 r = sd_bus_message_append_array(m, 'i', status, n_status * sizeof(int));
+                 if (r < 0)
+                         return bus_log_create_error(r);
+
+                 r = sd_bus_message_append_array(m, 'i', signal, n_signal * sizeof(int));
+                 if (r < 0)
+                         return bus_log_create_error(r);
+
+                 /* Close the struct, the variant and the outer "sv" entry, in that order. */
+                 r = sd_bus_message_close_container(m);
+                 if (r < 0)
+                         return bus_log_create_error(r);
+
+                 r = sd_bus_message_close_container(m);
+                 if (r < 0)
+                         return bus_log_create_error(r);
+
+                 r = sd_bus_message_close_container(m);
+                 if (r < 0)
+                         return bus_log_create_error(r);
+
+                 return 1;
+         }
+
+         if (streq(field, "OpenFile"))
+                 return bus_append_open_file(m, field, eq);
+
+         return 0;
+ }
+
+ /* Handles socket unit settings. The Listen*= family is sent as one entry of the "Listen" array,
+  * keyed by the setting name with its "Listen" prefix removed; everything else is forwarded to a
+  * typed append helper. Returns 0 when the field is not a socket setting. */
+ static int bus_append_socket_property(sd_bus_message *m, const char *field, const char *eq) {
+         int r;
+
+         /* Boolean switches */
+         if (STR_IN_SET(field,
+                        "Accept", "FlushPending", "Writable", "KeepAlive", "NoDelay", "FreeBind",
+                        "Transparent", "Broadcast", "PassCredentials", "PassSecurity", "PassPacketInfo",
+                        "ReusePort", "RemoveOnStop", "SELinuxContextFromNet"))
+                 return bus_append_parse_boolean(m, field, eq);
+
+         /* Signed integers */
+         if (STR_IN_SET(field, "Priority", "IPTTL", "Mark"))
+                 return bus_append_safe_atoi(m, field, eq);
+
+         if (streq(field, "IPTOS"))
+                 return bus_append_ip_tos_from_string(m, field, eq);
+
+         /* Unsigned integers */
+         if (STR_IN_SET(field,
+                        "Backlog", "MaxConnections", "MaxConnectionsPerSource", "KeepAliveProbes",
+                        "TriggerLimitBurst", "PollLimitBurst"))
+                 return bus_append_safe_atou(m, field, eq);
+
+         /* Access modes */
+         if (STR_IN_SET(field, "SocketMode", "DirectoryMode"))
+                 return bus_append_parse_mode(m, field, eq);
+
+         /* 64-bit integers */
+         if (STR_IN_SET(field, "MessageQueueMaxMessages", "MessageQueueMessageSize"))
+                 return bus_append_safe_atoi64(m, field, eq);
+
+         /* Time spans */
+         if (STR_IN_SET(field,
+                        "TimeoutSec", "KeepAliveTimeSec", "KeepAliveIntervalSec", "DeferAcceptSec",
+                        "TriggerLimitIntervalSec", "PollLimitIntervalSec"))
+                 return bus_append_parse_sec_rename(m, field, eq);
+
+         /* Sizes, parsed with base 1024 */
+         if (STR_IN_SET(field, "ReceiveBuffer", "SendBuffer", "PipeSize"))
+                 return bus_append_parse_size(m, field, eq, 1024);
+
+         /* Command lines */
+         if (STR_IN_SET(field, "ExecStartPre", "ExecStartPost", "ExecReload", "ExecStopPost"))
+                 return bus_append_exec_command(m, field, eq);
+
+         /* Plain strings */
+         if (STR_IN_SET(field,
+                        "SmackLabel", "SmackLabelIPIn", "SmackLabelIPOut", "TCPCongestion", "BindToDevice",
+                        "BindIPv6Only", "FileDescriptorName", "SocketUser", "SocketGroup", "Timestamping"))
+                 return bus_append_string(m, field, eq);
+
+         if (streq(field, "Symlinks"))
+                 return bus_append_strv(m, field, eq, EXTRACT_UNQUOTE);
+
+         if (streq(field, "SocketProtocol"))
+                 return bus_append_parse_ip_protocol(m, field, eq);
+
+         if (STR_IN_SET(field,
+                        "ListenStream", "ListenDatagram", "ListenSequentialPacket", "ListenNetlink",
+                        "ListenSpecial", "ListenMessageQueue", "ListenFIFO", "ListenUSBFunction")) {
+                 /* An empty value sends an empty array, otherwise a single (type, address) pair. */
+                 r = isempty(eq)
+                         ? sd_bus_message_append(m, "(sv)", "Listen", "a(ss)", 0)
+                         : sd_bus_message_append(m, "(sv)", "Listen", "a(ss)", 1, field + STRLEN("Listen"), eq);
+                 if (r < 0)
+                         return bus_log_create_error(r);
+
+                 return 1;
+         }
+
+         return 0;
+ }
+ /* Handles timer unit settings. The On*Sec= settings are sent as entries of "TimersMonotonic"
+  * (setting name plus parsed time span), OnCalendar= as an entry of "TimersCalendar" (setting name
+  * plus the unparsed value). In both cases an empty value sends an empty array. Returns 0 when the
+  * field is not a timer setting, 1 when an array entry was appended, negative on failure. */
+ static int bus_append_timer_property(sd_bus_message *m, const char *field, const char *eq) {
+         int r;
+
+         if (STR_IN_SET(field, "AccuracySec", "RandomizedDelaySec"))
+                 return bus_append_parse_sec_rename(m, field, eq);
+
+         if (STR_IN_SET(field,
+                        "WakeSystem", "RemainAfterElapse", "Persistent",
+                        "OnTimezoneChange", "OnClockChange", "FixedRandomDelay"))
+                 return bus_append_parse_boolean(m, field, eq);
+
+         if (streq(field, "OnCalendar")) {
+                 r = isempty(eq)
+                         ? sd_bus_message_append(m, "(sv)", "TimersCalendar", "a(ss)", 0)
+                         : sd_bus_message_append(m, "(sv)", "TimersCalendar", "a(ss)", 1, field, eq);
+                 if (r < 0)
+                         return bus_log_create_error(r);
+
+                 return 1;
+         }
+
+         if (STR_IN_SET(field, "OnActiveSec", "OnBootSec", "OnStartupSec", "OnUnitActiveSec", "OnUnitInactiveSec")) {
+                 usec_t span;
+
+                 if (isempty(eq)) {
+                         r = sd_bus_message_append(m, "(sv)", "TimersMonotonic", "a(st)", 0);
+                         if (r < 0)
+                                 return bus_log_create_error(r);
+
+                         return 1;
+                 }
+
+                 r = parse_sec(eq, &span);
+                 if (r < 0)
+                         return log_error_errno(r, "Failed to parse %s=%s: %m", field, eq);
+
+                 r = sd_bus_message_append(m, "(sv)", "TimersMonotonic", "a(st)", 1, field, span);
+                 if (r < 0)
+                         return bus_log_create_error(r);
+
+                 return 1;
+         }
+
+         return 0;
+ }
+
+ /* Handles the settings that are common to all unit types.
+  *
+  * Besides the simple string, boolean, time span and integer settings this covers:
+  *   - "SuccessActionExitStatus"/"FailureActionExitStatus": sent as "i"; an empty value sends -1,
+  *     otherwise the value must parse as an 8-bit unsigned integer.
+  *   - anything unit_dependency_from_string() recognizes, plus "Documentation",
+  *     "RequiresMountsFor" and "Markers": sent as string lists.
+  *   - anything condition_type_from_string() or assert_type_from_string() recognizes: sent as one
+  *     "(sbbs)" entry of "Conditions" or "Asserts" respectively. A leading '|' sets the trigger
+  *     flag and a following '!' the negate flag; the remainder is the parameter.
+  *
+  * Returns 0 if the field is not known here, 1 when appended inline, negative on failure, or
+  * otherwise the result of the append helper that handled the field. */
+ static int bus_append_unit_property(sd_bus_message *m, const char *field, const char *eq) {
+         ConditionType t = _CONDITION_TYPE_INVALID;
+         bool is_condition = false;
+         int r;
+
+         if (STR_IN_SET(field, "Description",
+                               "SourcePath",
+                               "OnFailureJobMode",
+                               "JobTimeoutAction",
+                               "JobTimeoutRebootArgument",
+                               "StartLimitAction",
+                               "FailureAction",
+                               "SuccessAction",
+                               "RebootArgument",
+                               "CollectMode"))
+                 return bus_append_string(m, field, eq);
+
+         if (STR_IN_SET(field, "StopWhenUnneeded",
+                               "RefuseManualStart",
+                               "RefuseManualStop",
+                               "AllowIsolate",
+                               "IgnoreOnIsolate",
+                               "SurviveFinalKillSignal",
+                               "DefaultDependencies"))
+                 return bus_append_parse_boolean(m, field, eq);
+
+         if (STR_IN_SET(field, "JobTimeoutSec",
+                               "JobRunningTimeoutSec",
+                               "StartLimitIntervalSec"))
+                 return bus_append_parse_sec_rename(m, field, eq);
+
+         if (streq(field, "StartLimitBurst"))
+                 return bus_append_safe_atou(m, field, eq);
+
+         if (STR_IN_SET(field, "SuccessActionExitStatus",
+                               "FailureActionExitStatus")) {
+                 if (isempty(eq))
+                         r = sd_bus_message_append(m, "(sv)", field, "i", -1);
+                 else {
+                         uint8_t u;
+
+                         r = safe_atou8(eq, &u);
+                         if (r < 0)
+                                 /* Include the errno text, like the other parse failures in this file do. */
+                                 return log_error_errno(r, "Failed to parse %s=%s: %m", field, eq);
+
+                         r = sd_bus_message_append(m, "(sv)", field, "i", (int) u);
+                 }
+                 if (r < 0)
+                         return bus_log_create_error(r);
+
+                 return 1;
+         }
+
+         if (unit_dependency_from_string(field) >= 0 ||
+             STR_IN_SET(field, "Documentation",
+                               "RequiresMountsFor",
+                               "Markers"))
+                 return bus_append_strv(m, field, eq, EXTRACT_UNQUOTE);
+
+         /* Try the field as a condition name first, and as an assert name only if that fails. */
+         t = condition_type_from_string(field);
+         if (t >= 0)
+                 is_condition = true;
+         else
+                 t = assert_type_from_string(field);
+         if (t >= 0) {
+                 if (isempty(eq))
+                         r = sd_bus_message_append(m, "(sv)", is_condition ? "Conditions" : "Asserts", "a(sbbs)", 0);
+                 else {
+                         const char *p = eq;
+                         int trigger, negate;
+
+                         /* Optional prefixes, in this order: '|' (trigger), then '!' (negate). */
+                         trigger = *p == '|';
+                         if (trigger)
+                                 p++;
+
+                         negate = *p == '!';
+                         if (negate)
+                                 p++;
+
+                         r = sd_bus_message_append(m, "(sv)", is_condition ? "Conditions" : "Asserts", "a(sbbs)", 1,
+                                                   field, trigger, negate, p);
+                 }
+                 if (r < 0)
+                         return bus_log_create_error(r);
+
+                 return 1;
+         }
+
+         return 0;
+ }
+
+ /* Parses one "NAME=VALUE" assignment and appends it to the message as a unit property.
+  *
+  * The parsers are tried in a fixed order: cgroup, execute and kill settings (each only for the
+  * unit types that have them), then the settings specific to the unit type, and finally the
+  * settings common to all units. The first parser returning non-zero decides the result. If no
+  * parser takes the field, or the string contains no '=', -EINVAL is logged and returned. */
+ int bus_append_unit_property_assignment(sd_bus_message *m, UnitType t, const char *assignment) {
+         bool with_cgroup = false, with_execute = false, with_kill = false;
+         const char *sep, *field, *value;
+         int r;
+
+         assert(m);
+         assert(assignment);
+
+         sep = strchr(assignment, '=');
+         if (!sep)
+                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                        "Not an assignment: %s", assignment);
+
+         field = strndupa_safe(assignment, sep - assignment);
+         value = sep + 1;
+
+         /* Step 1: figure out which of the shared setting groups apply to this unit type. */
+         switch (t) {
+
+         case UNIT_SERVICE:
+         case UNIT_SOCKET:
+         case UNIT_MOUNT:
+                 with_cgroup = with_execute = with_kill = true;
+                 break;
+
+         case UNIT_SCOPE:
+                 with_cgroup = with_kill = true;
+                 break;
+
+         case UNIT_SLICE:
+                 with_cgroup = true;
+                 break;
+
+         case UNIT_TIMER:
+         case UNIT_PATH:
+         case UNIT_AUTOMOUNT:
+         case UNIT_TARGET:
+         case UNIT_DEVICE:
+         case UNIT_SWAP:
+                 break;
+
+         default:
+                 assert_not_reached();
+         }
+
+         /* Step 2: try the shared groups. */
+         if (with_cgroup) {
+                 r = bus_append_cgroup_property(m, field, value);
+                 if (r != 0)
+                         return r;
+         }
+
+         if (with_execute) {
+                 r = bus_append_execute_property(m, field, value);
+                 if (r != 0)
+                         return r;
+         }
+
+         if (with_kill) {
+                 r = bus_append_kill_property(m, field, value);
+                 if (r != 0)
+                         return r;
+         }
+
+         /* Step 3: try the settings specific to the unit type, if it has any. */
+         switch (t) {
+
+         case UNIT_SERVICE:
+                 r = bus_append_service_property(m, field, value);
+                 break;
+
+         case UNIT_SOCKET:
+                 r = bus_append_socket_property(m, field, value);
+                 break;
+
+         case UNIT_TIMER:
+                 r = bus_append_timer_property(m, field, value);
+                 break;
+
+         case UNIT_PATH:
+                 r = bus_append_path_property(m, field, value);
+                 break;
+
+         case UNIT_SCOPE:
+                 r = bus_append_scope_property(m, field, value);
+                 break;
+
+         case UNIT_MOUNT:
+                 r = bus_append_mount_property(m, field, value);
+                 break;
+
+         case UNIT_AUTOMOUNT:
+                 r = bus_append_automount_property(m, field, value);
+                 break;
+
+         default:
+                 r = 0;
+         }
+         if (r != 0)
+                 return r;
+
+         /* Step 4: finally, the settings every unit type has. */
+         r = bus_append_unit_property(m, field, value);
+         if (r != 0)
+                 return r;
+
+         return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                "Unknown assignment: %s", assignment);
+ }
+
+ /* Appends every assignment of the string list 'l' via bus_append_unit_property_assignment(),
+  * stopping at the first one that fails. Returns 0 if all were appended, otherwise the negative
+  * value of the failing call. */
+ int bus_append_unit_property_assignment_many(sd_bus_message *m, UnitType t, char **l) {
+         assert(m);
+
+         STRV_FOREACH(assignment, l) {
+                 int k;
+
+                 k = bus_append_unit_property_assignment(m, t, *assignment);
+                 if (k < 0)
+                         return k;
+         }
+
+         return 0;
+ }
+
+ /* Appends the process referenced by 'pidref' to the message: as a one-element "PIDFDs" array if
+  * the reference carries a file descriptor, otherwise as a one-element "PIDs" array. Returns
+  * -ESRCH if the reference is not set, otherwise the result of sd_bus_message_append(). */
+ int bus_append_scope_pidref(sd_bus_message *m, const PidRef *pidref) {
+         assert(m);
+
+         if (!pidref_is_set(pidref))
+                 return -ESRCH;
+
+         if (pidref->fd < 0)
+                 return sd_bus_message_append(m, "(sv)", "PIDs", "au", 1, pidref->pid);
+
+         return sd_bus_message_append(m, "(sv)", "PIDFDs", "ah", 1, pidref->fd);
+ }
+
+ /* Reads an array of (type, path, source) string triples from the message, collects them as
+  * install changes and dumps them via install_changes_dump(). Entries whose type string is not
+  * recognized are logged at notice level and skipped. Returns 0 on success, negative on failure. */
+ int bus_deserialize_and_dump_unit_file_changes(sd_bus_message *m, bool quiet) {
+         const char *type, *path, *source;
+         InstallChange *changes = NULL;
+         size_t n_changes = 0;
+         int r;
+
+         CLEANUP_ARRAY(changes, n_changes, install_changes_free);
+
+         r = sd_bus_message_enter_container(m, SD_BUS_TYPE_ARRAY, "(sss)");
+         if (r < 0)
+                 return bus_log_parse_error(r);
+
+         for (;;) {
+                 InstallChangeType change_type;
+
+                 r = sd_bus_message_read(m, "(sss)", &type, &path, &source);
+                 if (r < 0)
+                         return bus_log_parse_error(r);
+                 if (r == 0)
+                         break; /* end of array */
+
+                 /* We expect only "success" changes to be sent over the bus. Hence, reject anything
+                  * negative. */
+                 change_type = install_change_type_from_string(type);
+                 if (change_type < 0) {
+                         log_notice_errno(change_type, "Manager reported unknown change type \"%s\" for path \"%s\", ignoring.",
+                                          type, path);
+                         continue;
+                 }
+
+                 r = install_changes_add(&changes, &n_changes, change_type, path, source);
+                 if (r < 0)
+                         return r;
+         }
+
+         r = sd_bus_message_exit_container(m);
+         if (r < 0)
+                 return bus_log_parse_error(r);
+
+         install_changes_dump(0, NULL, changes, n_changes, quiet);
+
+         return 0;
+ }
+
+ /* Queries the "LoadState" property of the unit 'name' from org.freedesktop.systemd1 and stores the
+  * resulting string in *load_state. Logs on failure. Returns 0 on success, negative on failure. */
+ int unit_load_state(sd_bus *bus, const char *name, char **load_state) {
+         _cleanup_(sd_bus_error_free) sd_bus_error bus_error = SD_BUS_ERROR_NULL;
+         _cleanup_free_ char *object_path = NULL;
+         int r;
+
+         object_path = unit_dbus_path_from_name(name);
+         if (!object_path)
+                 return log_oom();
+
+         /* The error is logged right here rather than by the caller, since handing the D-Bus error
+          * message back up would be awkward. */
+
+         r = sd_bus_get_property_string(bus,
+                                        "org.freedesktop.systemd1",
+                                        object_path,
+                                        "org.freedesktop.systemd1.Unit",
+                                        "LoadState",
+                                        &bus_error,
+                                        load_state);
+         if (r < 0)
+                 return log_error_errno(r, "Failed to get load state of %s: %s", name, bus_error_message(&bus_error, r));
+
+         return 0;
+ }
+
+ /* Comparison function for UnitInfo records. Sort keys, all compared case-insensitively:
+  *   1. the machine,
+  *   2. the unit type, i.e. the part of the id starting at its last '.',
+  *   3. the full id. */
+ int unit_info_compare(const UnitInfo *a, const UnitInfo *b) {
+         const char *suffix_a, *suffix_b;
+         int d;
+
+         d = strcasecmp_ptr(a->machine, b->machine);
+         if (d != 0)
+                 return d;
+
+         suffix_a = strrchr(a->id, '.');
+         suffix_b = strrchr(b->id, '.');
+
+         d = strcasecmp_ptr(suffix_a, suffix_b);
+         if (d != 0)
+                 return d;
+
+         return strcasecmp(a->id, b->id);
+ }
+
+ /* Calls the service manager's "Reload" method and waits for the reply, using
+  * DAEMON_RELOAD_TIMEOUT_SEC as call timeout. Logs on failure. Returns 0 on success, negative on
+  * failure. */
+ int bus_service_manager_reload(sd_bus *bus) {
+         _cleanup_(sd_bus_message_unrefp) sd_bus_message *request = NULL;
+         _cleanup_(sd_bus_error_free) sd_bus_error bus_error = SD_BUS_ERROR_NULL;
+         int r;
+
+         assert(bus);
+
+         r = bus_message_new_method_call(bus, &request, bus_systemd_mgr, "Reload");
+         if (r < 0)
+                 return bus_log_create_error(r);
+
+         /* A reload can take a while, hence the dedicated, longer timeout. */
+         r = sd_bus_call(bus, request, DAEMON_RELOAD_TIMEOUT_SEC, &bus_error, NULL);
+         if (r < 0)
+                 return log_error_errno(r, "Failed to reload service manager: %s", bus_error_message(&bus_error, r));
+
+         return 0;
+ }
diff --git a/src/shared/bus-unit-util.h b/src/shared/bus-unit-util.h
new file mode 100644
index 0000000..d52c847
--- /dev/null
+++ b/src/shared/bus-unit-util.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+
+#include "install.h"
+#include "pidref.h"
+#include "unit-def.h"
+
+ /* Record describing one unit. All strings are held as const pointers, i.e. the struct itself
+  * declares no ownership of them.
+  * NOTE(review): presumably filled in by bus_parse_unit_info() with pointers into the parsed
+  * message — confirm against its implementation. */
+ typedef struct UnitInfo {
+ const char *machine; /* first sort key of unit_info_compare() */
+ const char *id; /* unit_info_compare() sorts by the part after the last '.', then by the whole string */
+ const char *description;
+ const char *load_state;
+ const char *active_state;
+ const char *sub_state;
+ const char *following;
+ const char *unit_path;
+ uint32_t job_id;
+ const char *job_type;
+ const char *job_path;
+ } UnitInfo;
+
+ /* Fills in a UnitInfo from a bus message. */
+ int bus_parse_unit_info(sd_bus_message *message, UnitInfo *u);
+
+ /* Parse "NAME=VALUE" assignment(s) for a unit of type 't' and append them to the message. The
+  * _many() variant takes a string list and stops at the first failing entry. */
+ int bus_append_unit_property_assignment(sd_bus_message *m, UnitType t, const char *assignment);
+ int bus_append_unit_property_assignment_many(sd_bus_message *m, UnitType t, char **l);
+
+ /* Appends a "PIDFDs" or "PIDs" property for the given process reference; -ESRCH if it is unset. */
+ int bus_append_scope_pidref(sd_bus_message *m, const PidRef *pidref);
+
+ /* Reads an array of (type, path, source) changes from the message and dumps them. */
+ int bus_deserialize_and_dump_unit_file_changes(sd_bus_message *m, bool quiet);
+
+ /* Reads the "LoadState" property of the named unit into *load_state; logs on failure. */
+ int unit_load_state(sd_bus *bus, const char *name, char **load_state);
+
+ /* Orders UnitInfo records by machine, then unit type suffix, then id (case-insensitive). */
+ int unit_info_compare(const UnitInfo *a, const UnitInfo *b);
+
+ /* Calls the service manager's "Reload" method; logs on failure. */
+ int bus_service_manager_reload(sd_bus *bus);
diff --git a/src/shared/bus-util.c b/src/shared/bus-util.c
new file mode 100644
index 0000000..4123152
--- /dev/null
+++ b/src/shared/bus-util.c
@@ -0,0 +1,711 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <stdlib.h>
+#include <sys/ioctl.h>
+#include <sys/resource.h>
+#include <sys/socket.h>
+#include <unistd.h>
+
+#include "sd-bus.h"
+#include "sd-daemon.h"
+#include "sd-event.h"
+#include "sd-id128.h"
+
+#include "bus-common-errors.h"
+#include "bus-internal.h"
+#include "bus-label.h"
+#include "bus-util.h"
+#include "data-fd-util.h"
+#include "fd-util.h"
+#include "memstream-util.h"
+#include "path-util.h"
+#include "socket-util.h"
+#include "stdio-util.h"
+
+ /* Match callback installed by bus_async_unregister_and_exit(): closes the bus connection the
+  * message arrived on and asks the event loop passed as userdata to exit with code 0. */
+ static int name_owner_change_callback(sd_bus_message *m, void *userdata, sd_bus_error *ret_error) {
+         sd_event *event = ASSERT_PTR(userdata);
+         sd_bus *bus;
+
+         assert(m);
+
+         bus = sd_bus_message_get_bus(m);
+         sd_bus_close(bus);
+
+         sd_event_exit(event, 0);
+
+         return 1;
+ }
+
+ /* Logs a failure to set the bus address. For the local transport failing with -ENOMEDIUM a more
+  * helpful message about the missing environment variables is used. Returns what
+  * log_error_errno() returns for 'r'. */
+ int bus_log_address_error(int r, BusTransport transport) {
+         const char *format;
+
+         if (transport == BUS_TRANSPORT_LOCAL && r == -ENOMEDIUM)
+                 format = "Failed to set bus address: $DBUS_SESSION_BUS_ADDRESS and $XDG_RUNTIME_DIR not defined (consider using --machine=<user>@.host --user to connect to bus of other user)";
+         else
+                 format = "Failed to set bus address: %m";
+
+         return log_error_errno(r, format);
+ }
+
+ /* Logs a failure to connect to the bus. For the local transport two cases get a more helpful
+  * message: -ENOMEDIUM (the environment variables needed to find the bus are not set) and
+  * privilege errors. Returns what log_error_errno() returns for 'r'. */
+ int bus_log_connect_error(int r, BusTransport transport) {
+         bool hint_vars = transport == BUS_TRANSPORT_LOCAL && r == -ENOMEDIUM,
+              hint_addr = transport == BUS_TRANSPORT_LOCAL && ERRNO_IS_PRIVILEGE(r);
+
+         /* Test the flags themselves: comparing them against 'r' (a negative errno) could never
+          * be true, and hence would never select either hint. */
+         return log_error_errno(r,
+                                hint_vars ? "Failed to connect to bus: $DBUS_SESSION_BUS_ADDRESS and $XDG_RUNTIME_DIR not defined (consider using --machine=<user>@.host --user to connect to bus of other user)" :
+                                hint_addr ? "Failed to connect to bus: Operation not permitted (consider using --machine=<user>@.host --user to connect to bus of other user)" :
+                                            "Failed to connect to bus: %m");
+ }
+
+ /* Asynchronously releases the well-known bus name 'name' and installs a match for the
+  * NameOwnerChanged signal that reports this release; once it arrives,
+  * name_owner_change_callback() closes the bus and exits the event loop 'e'. Returns 0 on
+  * success, negative on failure. */
+ int bus_async_unregister_and_exit(sd_event *e, sd_bus *bus, const char *name) {
+         const char *own_name, *rule;
+         int r;
+
+         assert(e);
+         assert(bus);
+         assert(name);
+
+         /* Rather than quitting right away, give up the name and keep running until the bus
+          * tells us the name is gone. That way any requests still queued for us get processed
+          * before we really exit. */
+
+         r = sd_bus_get_unique_name(bus, &own_name);
+         if (r < 0)
+                 return r;
+
+         /* Match the ownership change of 'name' from us (arg1) to nobody (arg2). */
+         rule = strjoina("sender='org.freedesktop.DBus',"
+                         "type='signal',"
+                         "interface='org.freedesktop.DBus',"
+                         "member='NameOwnerChanged',"
+                         "path='/org/freedesktop/DBus',"
+                         "arg0='", name, "',",
+                         "arg1='", own_name, "',",
+                         "arg2=''");
+
+         r = sd_bus_add_match_async(bus, NULL, rule, name_owner_change_callback, NULL, e);
+         if (r < 0)
+                 return r;
+
+         r = sd_bus_release_name_async(bus, NULL, name, NULL, NULL);
+         if (r < 0)
+                 return r;
+
+         return 0;
+ }
+
+ /* Runs the event loop 'e' until it finishes. Whenever an iteration passes 'timeout' without
+  * dispatching anything while the service is idle (as reported by 'check_idle', or always if that
+  * is NULL), the exit sequence is started: the service manager is notified with STOPPING=1 and the
+  * bus name is released via bus_async_unregister_and_exit(). From then on the loop runs without a
+  * timeout until it finishes. Returns the event loop's exit code, or a negative value on failure. */
+ int bus_event_loop_with_idle(
+                 sd_event *e,
+                 sd_bus *bus,
+                 const char *name,
+                 usec_t timeout,
+                 check_idle_t check_idle,
+                 void *userdata) {
+
+         bool shutting_down = false;
+         int r, exit_code;
+
+         assert(e);
+         assert(bus);
+         assert(name);
+
+         for (;;) {
+                 bool idle;
+
+                 r = sd_event_get_state(e);
+                 if (r < 0)
+                         return r;
+                 if (r == SD_EVENT_FINISHED)
+                         break;
+
+                 idle = check_idle ? check_idle(userdata) : true;
+
+                 /* Only apply the idle timeout while we are idle and not yet on our way out. */
+                 r = sd_event_run(e, !shutting_down && idle ? timeout : UINT64_MAX);
+                 if (r < 0)
+                         return r;
+
+                 if (r != 0 || shutting_down || !idle)
+                         continue;
+
+                 /* Inform the service manager that we are going down, so that it will queue all
+                  * further start requests, instead of assuming we are already running. */
+                 sd_notify(false, "STOPPING=1");
+
+                 r = bus_async_unregister_and_exit(e, bus, name);
+                 if (r < 0)
+                         return r;
+
+                 shutting_down = true;
+         }
+
+         r = sd_event_get_exit_code(e, &exit_code);
+         if (r < 0)
+                 return r;
+
+         return exit_code;
+ }
+
+ /* Asks the bus driver whether the bus name 'name' currently has an owner. Returns the boolean
+  * reply (0 or non-zero) on success, negative on failure, in which case 'error' is filled in by
+  * the failing call. */
+ int bus_name_has_owner(sd_bus *c, const char *name, sd_bus_error *error) {
+         _cleanup_(sd_bus_message_unrefp) sd_bus_message *rep = NULL;
+         int r, has_owner = 0;
+
+         assert(c);
+         assert(name);
+
+         /* Address the bus driver at its canonical object path, the same one the
+          * NameOwnerChanged match rule in this file uses, rather than relying on the driver
+          * accepting legacy calls on arbitrary paths. */
+         r = sd_bus_call_method(c,
+                                "org.freedesktop.DBus",
+                                "/org/freedesktop/DBus",
+                                "org.freedesktop.DBus",
+                                "NameHasOwner",
+                                error,
+                                &rep,
+                                "s",
+                                name);
+         if (r < 0)
+                 return r;
+
+         r = sd_bus_message_read_basic(rep, 'b', &has_owner);
+         if (r < 0)
+                 return sd_bus_error_set_errno(error, r);
+
+         return has_owner;
+ }
+
+bool bus_error_is_unknown_service(const sd_bus_error *error) {
+ return sd_bus_error_has_names(error,
+ SD_BUS_ERROR_SERVICE_UNKNOWN,
+ SD_BUS_ERROR_NAME_HAS_NO_OWNER,
+ BUS_ERROR_NO_SUCH_UNIT);
+}
+
+ /* Verifies the peer of the bus connection's file descriptor: it must run either as root or under
+  * our own effective UID. Returns 1 if so, -EPERM if not, or another negative value if the
+  * descriptor or the peer credentials cannot be obtained. */
+ int bus_check_peercred(sd_bus *c) {
+         struct ucred peer;
+         int bus_fd, r;
+
+         assert(c);
+
+         bus_fd = sd_bus_get_fd(c);
+         if (bus_fd < 0)
+                 return bus_fd;
+
+         r = getpeercred(bus_fd, &peer);
+         if (r < 0)
+                 return r;
+
+         if (peer.uid == 0 || peer.uid == geteuid())
+                 return 1;
+
+         return -EPERM;
+ }
+
+ /* Connects to the system service manager. Unprivileged callers get the default system bus. Root
+  * first tries the private socket /run/systemd/private and falls back to the default system bus
+  * if that connection cannot be started. A private connection is only handed out after its peer
+  * credentials have been checked. Returns 0 on success of the private connection, otherwise the
+  * result of sd_bus_default_system() or a negative value on failure. */
+ int bus_connect_system_systemd(sd_bus **ret_bus) {
+         _cleanup_(sd_bus_close_unrefp) sd_bus *private_bus = NULL;
+         int r;
+
+         assert(ret_bus);
+
+         /* Only root talks directly to the system instance, everybody else goes via the bus. */
+         if (geteuid() != 0)
+                 return sd_bus_default_system(ret_bus);
+
+         r = sd_bus_new(&private_bus);
+         if (r < 0)
+                 return r;
+
+         r = sd_bus_set_address(private_bus, "unix:path=/run/systemd/private");
+         if (r < 0)
+                 return r;
+
+         r = sd_bus_start(private_bus);
+         if (r < 0)
+                 /* Private socket not usable, fall back to the regular bus. */
+                 return sd_bus_default_system(ret_bus);
+
+         r = bus_check_peercred(private_bus);
+         if (r < 0)
+                 return r;
+
+         *ret_bus = TAKE_PTR(private_bus);
+         return 0;
+ }
+
+ /* Connects to the user service manager. If $XDG_RUNTIME_DIR is set, the private socket
+  * "systemd/private" below it is tried first; if the variable is unset or that connection cannot
+  * be started, the default user bus is used instead. A private connection is only handed out
+  * after its peer credentials have been checked. Returns 0 on success of the private connection,
+  * otherwise the result of sd_bus_default_user() or a negative value on failure. */
+ int bus_connect_user_systemd(sd_bus **ret_bus) {
+         _cleanup_(sd_bus_close_unrefp) sd_bus *private_bus = NULL;
+         _cleanup_free_ char *escaped_dir = NULL;
+         const char *runtime_dir;
+         int r;
+
+         assert(ret_bus);
+
+         runtime_dir = secure_getenv("XDG_RUNTIME_DIR");
+         if (!runtime_dir)
+                 return sd_bus_default_user(ret_bus);
+
+         /* The directory ends up inside a bus address string, hence escape it first. */
+         escaped_dir = bus_address_escape(runtime_dir);
+         if (!escaped_dir)
+                 return -ENOMEM;
+
+         r = sd_bus_new(&private_bus);
+         if (r < 0)
+                 return r;
+
+         private_bus->address = strjoin("unix:path=", escaped_dir, "/systemd/private");
+         if (!private_bus->address)
+                 return -ENOMEM;
+
+         r = sd_bus_start(private_bus);
+         if (r < 0)
+                 /* Private socket not usable, fall back to the regular bus. */
+                 return sd_bus_default_user(ret_bus);
+
+         r = bus_check_peercred(private_bus);
+         if (r < 0)
+                 return r;
+
+         *ret_bus = TAKE_PTR(private_bus);
+         return 0;
+ }
+
+ /* Opens a bus connection for the given transport and runtime scope and enables
+  * exit-on-disconnect on it.
+  *
+  * 'host' must be NULL for the local transport and non-NULL otherwise (-EINVAL). The remote
+  * transport only supports the system scope (-EOPNOTSUPP). For the local system scope, -EHOSTDOWN
+  * is logged and returned when sd_booted() does not report a systemd-booted system.
+  *
+  * Returns 0 and stores the connection in *ret on success, negative on failure. */
+ int bus_connect_transport(
+                 BusTransport transport,
+                 const char *host,
+                 RuntimeScope runtime_scope,
+                 sd_bus **ret) {
+
+         _cleanup_(sd_bus_close_unrefp) sd_bus *bus = NULL;
+         int r;
+
+         assert(transport >= 0);
+         assert(transport < _BUS_TRANSPORT_MAX);
+         assert(ret);
+
+         assert_return((transport == BUS_TRANSPORT_LOCAL) == !host, -EINVAL);
+         assert_return(transport != BUS_TRANSPORT_REMOTE || runtime_scope == RUNTIME_SCOPE_SYSTEM, -EOPNOTSUPP);
+
+         switch (transport) {
+
+         case BUS_TRANSPORT_LOCAL:
+                 if (runtime_scope == RUNTIME_SCOPE_USER)
+                         r = sd_bus_default_user(&bus);
+                 else if (runtime_scope == RUNTIME_SCOPE_SYSTEM) {
+                         /* Print a friendly message when the local system is actually not running systemd as PID 1. */
+                         if (sd_booted() <= 0)
+                                 return log_error_errno(SYNTHETIC_ERRNO(EHOSTDOWN),
+                                                        "System has not been booted with systemd as init system (PID 1). Can't operate.");
+
+                         r = sd_bus_default_system(&bus);
+                 } else
+                         assert_not_reached();
+                 break;
+
+         case BUS_TRANSPORT_REMOTE:
+                 r = sd_bus_open_system_remote(&bus, host);
+                 break;
+
+         case BUS_TRANSPORT_MACHINE:
+                 if (runtime_scope == RUNTIME_SCOPE_USER)
+                         r = sd_bus_open_user_machine(&bus, host);
+                 else if (runtime_scope == RUNTIME_SCOPE_SYSTEM)
+                         r = sd_bus_open_system_machine(&bus, host);
+                 else
+                         assert_not_reached();
+                 break;
+
+         default:
+                 assert_not_reached();
+         }
+         if (r < 0)
+                 return r;
+
+         r = sd_bus_set_exit_on_disconnect(bus, true);
+         if (r < 0)
+                 return r;
+
+         *ret = TAKE_PTR(bus);
+         return 0;
+ }
+
+ /* Opens a connection to a service manager for the given transport and runtime scope.
+  *
+  * 'host' must be NULL for the local transport and non-NULL otherwise (-EINVAL). Only the local
+  * transport supports a scope other than the system one (-EOPNOTSUPP). Local connections go
+  * through bus_connect_user_systemd()/bus_connect_system_systemd(); for the local system scope,
+  * -EHOSTDOWN is logged and returned when sd_booted() does not report a systemd-booted system.
+  *
+  * Returns the result of the function that opened the connection. */
+ int bus_connect_transport_systemd(BusTransport transport, const char *host, RuntimeScope runtime_scope, sd_bus **bus) {
+         assert(transport >= 0);
+         assert(transport < _BUS_TRANSPORT_MAX);
+         assert(bus);
+
+         assert_return((transport == BUS_TRANSPORT_LOCAL) == !host, -EINVAL);
+         assert_return(transport == BUS_TRANSPORT_LOCAL || runtime_scope == RUNTIME_SCOPE_SYSTEM, -EOPNOTSUPP);
+
+         /* Non-local transports first, they need no further distinction. */
+         switch (transport) {
+
+         case BUS_TRANSPORT_REMOTE:
+                 return sd_bus_open_system_remote(bus, host);
+
+         case BUS_TRANSPORT_MACHINE:
+                 return sd_bus_open_system_machine(bus, host);
+
+         case BUS_TRANSPORT_LOCAL:
+                 break;
+
+         default:
+                 assert_not_reached();
+         }
+
+         /* Local transport: pick the manager by scope. */
+         switch (runtime_scope) {
+
+         case RUNTIME_SCOPE_USER:
+                 return bus_connect_user_systemd(bus);
+
+         case RUNTIME_SCOPE_SYSTEM:
+                 /* Print a friendly message when the local system is actually not running systemd as PID 1. */
+                 if (sd_booted() <= 0)
+                         return log_error_errno(SYNTHETIC_ERRNO(EHOSTDOWN),
+                                                "System has not been booted with systemd as init system (PID 1). Can't operate.");
+
+                 return bus_connect_system_systemd(bus);
+
+         default:
+                 assert_not_reached();
+         }
+ }
+
+/**
+ * bus_path_encode_unique() - encode unique object path
+ * @b: bus connection or NULL
+ * @prefix: object path prefix
+ * @sender_id: unique-name of client, or NULL
+ * @external_id: external ID to be chosen by client, or NULL
+ * @ret_path: storage for encoded object path pointer
+ *
+ * Whenever we provide a bus API that allows clients to create and manage
+ * server-side objects, we need to provide a unique name for these objects. If
+ * we let the server choose the name, we suffer from a race condition: If a
+ * client creates an object asynchronously, it cannot destroy that object until
+ * it received the method reply. It cannot know the name of the new object,
+ * thus, it cannot destroy it. Furthermore, it enforces a round-trip.
+ *
+ * Therefore, many APIs allow the client to choose the unique name for newly
+ * created objects. There're two problems to solve, though:
+ * 1) Object names are usually defined via dbus object paths, which are
+ * usually globally namespaced. Therefore, multiple clients must be able
+ * to choose unique object names without interference.
+ * 2) If multiple libraries share the same bus connection, they must be
+ * able to choose unique object names without interference.
+ * The first problem is solved easily by prefixing a name with the
+ * unique-bus-name of a connection. The server side must enforce this and
+ * reject any other name. The second problem is solved by providing unique
+ * suffixes from within sd-bus.
+ *
+ * This helper allows clients to create unique object-paths. It uses the
+ * template '/prefix/sender_id/external_id' and returns the new path in
+ * @ret_path (must be freed by the caller).
+ * If @sender_id is NULL, the unique-name of @b is used. If @external_id is
+ * NULL, this function allocates a unique suffix via @b (by requesting a new
+ * cookie). If both @sender_id and @external_id are given, @b can be passed as
+ * NULL.
+ *
+ * Returns: 0 on success, negative error code on failure.
+ */
+ int bus_path_encode_unique(sd_bus *b, const char *prefix, const char *sender_id, const char *external_id, char **ret_path) {
+         _cleanup_free_ char *escaped_sender = NULL, *escaped_external = NULL;
+         char cookie_buf[DECIMAL_STR_MAX(uint64_t)];
+         char *joined;
+         int r;
+
+         /* Without a bus connection both IDs have to be supplied by the caller. */
+         assert_return(b || (sender_id && external_id), -EINVAL);
+         assert_return(sd_bus_object_path_is_valid(prefix), -EINVAL);
+         assert_return(ret_path, -EINVAL);
+
+         /* Default sender ID: the unique name of the connection. */
+         if (!sender_id) {
+                 r = sd_bus_get_unique_name(b, &sender_id);
+                 if (r < 0)
+                         return r;
+         }
+
+         /* Default external ID: the next cookie of the connection, formatted in decimal. */
+         if (!external_id) {
+                 xsprintf(cookie_buf, "%"PRIu64, ++b->cookie);
+                 external_id = cookie_buf;
+         }
+
+         /* Escape both IDs so that each can be used as a single object path label. */
+         escaped_sender = bus_label_escape(sender_id);
+         if (!escaped_sender)
+                 return -ENOMEM;
+
+         escaped_external = bus_label_escape(external_id);
+         if (!escaped_external)
+                 return -ENOMEM;
+
+         joined = path_join(prefix, escaped_sender, escaped_external);
+         if (!joined)
+                 return -ENOMEM;
+
+         *ret_path = joined;
+         return 0;
+ }
+
+/**
+ * bus_path_decode_unique() - decode unique object path
+ * @path: object path to decode
+ * @prefix: object path prefix
+ * @ret_sender: output parameter for sender-id label
+ * @ret_external: output parameter for external-id label
+ *
+ * This does the reverse of bus_path_encode_unique() (see its description for
+ * details). Both trailing labels, sender-id and external-id, are unescaped and
+ * returned in the given output parameters (the caller must free them).
+ *
+ * Note that this function returns 0 if the path does not match the template
+ * (see bus_path_encode_unique()), 1 if it matched.
+ *
+ * Returns: Negative error code on failure, 0 if the given object path does not
+ * match the template (return parameters are set to NULL), 1 if it was
+ * parsed successfully (return parameters contain allocated labels).
+ */
+int bus_path_decode_unique(const char *path, const char *prefix, char **ret_sender, char **ret_external) {
+        const char *rest, *slash = NULL;
+        char *sender_label, *external_label;
+
+        assert(sd_bus_object_path_is_valid(path));
+        assert(sd_bus_object_path_is_valid(prefix));
+        assert(ret_sender);
+        assert(ret_external);
+
+        /* The part after the prefix must consist of two labels separated by a slash. */
+        rest = object_path_startswith(path, prefix);
+        if (rest)
+                slash = strchr(rest, '/');
+
+        if (!slash) {
+                /* Path does not match the template: report "no match" with both outputs cleared. */
+                *ret_sender = NULL;
+                *ret_external = NULL;
+                return 0;
+        }
+
+        sender_label = bus_label_unescape_n(rest, slash - rest);
+        external_label = bus_label_unescape(slash + 1);
+        if (!sender_label || !external_label) {
+                /* One of the two may have been allocated; release both, the outputs stay untouched. */
+                free(sender_label);
+                free(external_label);
+                return -ENOMEM;
+        }
+
+        *ret_sender = sender_label;
+        *ret_external = external_label;
+        return 1;
+}
+
+int bus_track_add_name_many(sd_bus_track *t, char **l) {
+        int ret = 0;
+
+        assert(t);
+
+        /* Every name in the list is attempted, even after an earlier one failed; the first failure is
+         * what gets returned. */
+        STRV_FOREACH(name, l) {
+                int q = sd_bus_track_add_name(t, *name);
+
+                RET_GATHER(ret, q);
+        }
+
+        return ret;
+}
+
+int bus_open_system_watch_bind_with_description(sd_bus **ret, const char *description) {
+        _cleanup_(sd_bus_close_unrefp) sd_bus *conn = NULL;
+        const char *address;
+        int r;
+
+        assert(ret);
+
+        /* Sets up a system bus connection by hand so that the "watch_bind" feature and the Connected()
+         * signal can be switched on before the connection is started. */
+
+        r = sd_bus_new(&conn);
+        if (r < 0)
+                return r;
+
+        if (description) {
+                r = sd_bus_set_description(conn, description);
+                if (r < 0)
+                        return r;
+        }
+
+        /* $DBUS_SYSTEM_BUS_ADDRESS overrides the compiled-in default address. */
+        address = secure_getenv("DBUS_SYSTEM_BUS_ADDRESS") ?: DEFAULT_SYSTEM_BUS_ADDRESS;
+
+        r = sd_bus_set_address(conn, address);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_set_bus_client(conn, true);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_negotiate_creds(conn, true, SD_BUS_CREDS_UID|SD_BUS_CREDS_EUID|SD_BUS_CREDS_EFFECTIVE_CAPS);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_set_watch_bind(conn, true);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_set_connected_signal(conn, true);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_start(conn);
+        if (r < 0)
+                return r;
+
+        /* Success: hand the connection to the caller and disarm the cleanup handler. */
+        *ret = TAKE_PTR(conn);
+        return 0;
+}
+
+int bus_reply_pair_array(sd_bus_message *m, char **l) {
+        _cleanup_(sd_bus_message_unrefp) sd_bus_message *response = NULL;
+        int r;
+
+        assert(m);
+
+        /* Answers method call 'm' with an "a{ss}" dictionary. The strv 'l' is consumed pairwise: each
+         * even entry is a key, the entry following it the matching value. */
+
+        r = sd_bus_message_new_method_return(m, &response);
+        if (r < 0)
+                return r;
+
+        r = sd_bus_message_open_container(response, 'a', "{ss}");
+        if (r < 0)
+                return r;
+
+        STRV_FOREACH_PAIR(key, value, l) {
+                r = sd_bus_message_append(response, "{ss}", *key, *value);
+                if (r < 0)
+                        return r;
+        }
+
+        r = sd_bus_message_close_container(response);
+        if (r < 0)
+                return r;
+
+        return sd_bus_send(NULL, response, NULL);
+}
+
+static int method_dump_memory_state_by_fd(sd_bus_message *message, void *userdata, sd_bus_error *ret_error) {
+        _cleanup_(memstream_done) MemStream stream = {};
+        _cleanup_free_ char *text = NULL;
+        _cleanup_close_ int data_fd = -EBADF;
+        size_t text_size;
+        FILE *out;
+        int r;
+
+        assert(message);
+
+        /* Collect the malloc_info() report in an in-memory stream first ... */
+        out = memstream_init(&stream);
+        if (!out)
+                return -ENOMEM;
+
+        r = RET_NERRNO(malloc_info(/* options= */ 0, out));
+        if (r < 0)
+                return r;
+
+        r = memstream_finalize(&stream, &text, &text_size);
+        if (r < 0)
+                return r;
+
+        /* ... then wrap it into a file descriptor and send that back as the method reply. */
+        data_fd = acquire_data_fd(text, text_size, 0);
+        if (data_fd < 0)
+                return data_fd;
+
+        r = sd_bus_reply_method_return(message, "h", data_fd);
+        if (r < 0)
+                return r;
+
+        return 1; /* Stop further processing */
+}
+
+/* The default install callback will fail and disconnect the bus if it cannot register the match, but this
+ * is only a debug method, we definitely don't want to fail in case there's some permission issue.
+ *
+ * This replacement ignores all of its arguments and unconditionally returns 1. It is passed as the install
+ * callback when bus_register_malloc_status() adds its match. */
+static int dummy_install_callback(sd_bus_message *message, void *userdata, sd_bus_error *ret_error) {
+ return 1;
+}
+
+int bus_register_malloc_status(sd_bus *bus, const char *destination) {
+        const char *rule;
+        int r;
+
+        assert(bus);
+        assert(!isempty(destination));
+
+        /* Match rule selecting GetMallocInfo() method calls on the MemoryAllocation1 object that are
+         * addressed to 'destination'. */
+        rule = strjoina("type='method_call',"
+                        "interface='org.freedesktop.MemoryAllocation1',"
+                        "path='/org/freedesktop/MemoryAllocation1',"
+                        "destination='", destination, "',",
+                        "member='GetMallocInfo'");
+
+        /* Installed asynchronously, with an install callback that never reports failure. */
+        r = sd_bus_add_match_async(bus, NULL, rule, method_dump_memory_state_by_fd, dummy_install_callback, NULL);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to subscribe to GetMallocInfo() calls on MemoryAllocation1 interface: %m");
+
+        return 0;
+}
+
+/* Adapter with a 'void *' parameter so that sd_bus_message_unref() can be used as the .free_value
+ * callback of the hash_ops below. */
+static void bus_message_unref_wrapper(void *m) {
+ sd_bus_message_unref(m);
+}
+
+/* Hash operations that use the trivial hash/compare functions for keys and drop a reference on the
+ * stored sd_bus_message value via the wrapper above when a value is freed. */
+const struct hash_ops bus_message_hash_ops = {
+ .hash = trivial_hash_func,
+ .compare = trivial_compare_func,
+ .free_value = bus_message_unref_wrapper,
+};
+
+int bus_message_append_string_set(sd_bus_message *m, Set *set) {
+        const char *entry;
+        int r;
+
+        assert(m);
+
+        /* Serializes the strings in 'set' as an "as" array appended to 'm'. */
+
+        r = sd_bus_message_open_container(m, 'a', "s");
+        if (r < 0)
+                return r;
+
+        SET_FOREACH(entry, set) {
+                r = sd_bus_message_append(m, "s", entry);
+                if (r < 0)
+                        return r;
+        }
+
+        r = sd_bus_message_close_container(m);
+        return r;
+}
+
+int bus_property_get_string_set(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        /* Property getter: 'userdata' points at a 'Set *' whose strings are written into 'reply'. */
+        Set **setp = ASSERT_PTR(userdata);
+
+        assert(bus);
+        assert(property);
+        assert(reply);
+
+        Set *strings = *setp;
+
+        return bus_message_append_string_set(reply, strings);
+}
diff --git a/src/shared/bus-util.h b/src/shared/bus-util.h
new file mode 100644
index 0000000..869c639
--- /dev/null
+++ b/src/shared/bus-util.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <sys/types.h>
+
+#include "sd-bus.h"
+#include "sd-event.h"
+
+#include "errno-util.h"
+#include "macro.h"
+#include "runtime-scope.h"
+#include "set.h"
+#include "string-util.h"
+#include "time-util.h"
+
+/* Transport selector taken by the bus_connect_transport*() and bus_log_*_error() helpers declared
+ * below. _BUS_TRANSPORT_MAX counts the regular values; _BUS_TRANSPORT_INVALID is a negative
+ * sentinel (-EINVAL). */
+typedef enum BusTransport {
+ BUS_TRANSPORT_LOCAL,
+ BUS_TRANSPORT_REMOTE,
+ BUS_TRANSPORT_MACHINE,
+ _BUS_TRANSPORT_MAX,
+ _BUS_TRANSPORT_INVALID = -EINVAL,
+} BusTransport;
+
+int bus_async_unregister_and_exit(sd_event *e, sd_bus *bus, const char *name);
+
+typedef bool (*check_idle_t)(void *userdata);
+
+int bus_event_loop_with_idle(sd_event *e, sd_bus *bus, const char *name, usec_t timeout, check_idle_t check_idle, void *userdata);
+
+int bus_name_has_owner(sd_bus *c, const char *name, sd_bus_error *error);
+bool bus_error_is_unknown_service(const sd_bus_error *error);
+
+int bus_check_peercred(sd_bus *c);
+
+int bus_connect_system_systemd(sd_bus **ret_bus);
+int bus_connect_user_systemd(sd_bus **ret_bus);
+
+int bus_connect_transport(BusTransport transport, const char *host, RuntimeScope runtime_scope, sd_bus **bus);
+int bus_connect_transport_systemd(BusTransport transport, const char *host, RuntimeScope runtime_scope, sd_bus **bus);
+
+int bus_log_address_error(int r, BusTransport transport);
+int bus_log_connect_error(int r, BusTransport transport);
+
+/* Logging shorthands for bus message handling failures. Each forwards 'r' together with a fixed
+ * message to the corresponding log_*_errno() macro and evaluates to whatever that macro yields. */
+
+/* Message parsing failed; logged at error level. */
+#define bus_log_parse_error(r) \
+ log_error_errno(r, "Failed to parse bus message: %m")
+
+/* Message parsing failed; logged at debug level. */
+#define bus_log_parse_error_debug(r) \
+ log_debug_errno(r, "Failed to parse bus message: %m")
+
+/* Message construction failed; logged at error level. */
+#define bus_log_create_error(r) \
+ log_error_errno(r, "Failed to create bus message: %m")
+
+int bus_path_encode_unique(sd_bus *b, const char *prefix, const char *sender_id, const char *external_id, char **ret_path);
+int bus_path_decode_unique(const char *path, const char *prefix, char **ret_sender, char **ret_external);
+
+int bus_track_add_name_many(sd_bus_track *t, char **l);
+
+int bus_open_system_watch_bind_with_description(sd_bus **ret, const char *description);
+/* Same as bus_open_system_watch_bind_with_description(), but passes NULL as the description. */
+static inline int bus_open_system_watch_bind(sd_bus **ret) {
+ return bus_open_system_watch_bind_with_description(ret, NULL);
+}
+
+int bus_reply_pair_array(sd_bus_message *m, char **l);
+
+/* Listen to GetMallocInfo() calls to 'destination' and return malloc_info() via FD */
+int bus_register_malloc_status(sd_bus *bus, const char *destination);
+
+extern const struct hash_ops bus_message_hash_ops;
+
+int bus_message_append_string_set(sd_bus_message *m, Set *s);
+
+int bus_property_get_string_set(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error);
diff --git a/src/shared/bus-wait-for-jobs.c b/src/shared/bus-wait-for-jobs.c
new file mode 100644
index 0000000..969c629
--- /dev/null
+++ b/src/shared/bus-wait-for-jobs.c
@@ -0,0 +1,333 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "bus-wait-for-jobs.h"
+#include "set.h"
+#include "bus-util.h"
+#include "bus-internal.h"
+#include "unit-def.h"
+#include "escape.h"
+#include "strv.h"
+
+/* State of one "wait for jobs" context: the bus it listens on, the job paths still outstanding and
+ * the data taken from the most recent matching JobRemoved signal. */
+typedef struct BusWaitForJobs {
+ /* Connection we hold a reference on (taken in bus_wait_for_jobs_new()) */
+ sd_bus *bus;
+
+ /* The set of jobs to wait for, as bus object paths */
+ Set *jobs;
+
+ /* The unit name and job result of the last Job message */
+ char *name;
+ char *result;
+
+ /* Match slots for the JobRemoved and Disconnected signals */
+ sd_bus_slot *slot_job_removed;
+ sd_bus_slot *slot_disconnected;
+} BusWaitForJobs;
+
+static int match_disconnected(sd_bus_message *m, void *userdata, sd_bus_error *error) {
+        sd_bus *conn;
+
+        assert(m);
+
+        /* Handler for the local Disconnected signal: log it, then close the connection the message
+         * arrived on. */
+        log_error("Warning! D-Bus connection terminated.");
+
+        conn = sd_bus_message_get_bus(m);
+        sd_bus_close(conn);
+
+        return 0;
+}
+
+static int match_job_removed(sd_bus_message *m, void *userdata, sd_bus_error *error) {
+        const char *job_path, *unit_name, *job_result;
+        BusWaitForJobs *d = ASSERT_PTR(userdata);
+        uint32_t job_id;
+        char *tracked;
+        int r;
+
+        assert(m);
+
+        /* Signal payload: job id, job object path, unit name, result string. */
+        r = sd_bus_message_read(m, "uoss", &job_id, &job_path, &unit_name, &job_result);
+        if (r < 0) {
+                bus_log_parse_error(r);
+                return 0;
+        }
+
+        /* Only jobs we were asked to wait for are of interest; removal hands back the stored key. */
+        tracked = set_remove(d->jobs, (char*) job_path);
+        if (tracked) {
+                free(tracked);
+
+                /* Remember result and unit of this job; empty strings are stored as NULL and
+                 * allocation failures are deliberately ignored. */
+                (void) free_and_strdup(&d->result, empty_to_null(job_result));
+                (void) free_and_strdup(&d->name, empty_to_null(unit_name));
+        }
+
+        return 0;
+}
+
+BusWaitForJobs* bus_wait_for_jobs_free(BusWaitForJobs *d) {
+        /* Releases everything owned by the context; accepts NULL and always returns NULL. */
+        if (d) {
+                set_free(d->jobs);
+
+                /* Drop the match slots before the reference on the connection itself. */
+                sd_bus_slot_unref(d->slot_disconnected);
+                sd_bus_slot_unref(d->slot_job_removed);
+
+                sd_bus_unref(d->bus);
+
+                free(d->name);
+                free(d->result);
+
+                free(d);
+        }
+
+        return NULL;
+}
+
+int bus_wait_for_jobs_new(sd_bus *bus, BusWaitForJobs **ret) {
+        _cleanup_(bus_wait_for_jobs_freep) BusWaitForJobs *ctx = NULL;
+        const char *sender;
+        int r;
+
+        assert(bus);
+        assert(ret);
+
+        ctx = new(BusWaitForJobs, 1);
+        if (!ctx)
+                return -ENOMEM;
+
+        *ctx = (BusWaitForJobs) {
+                .bus = sd_bus_ref(bus),
+        };
+
+        /* When we are a bus client we match by sender. Direct
+         * connections OTOH have no initialized sender field, and
+         * hence we ignore the sender then */
+        if (bus->bus_client)
+                sender = "org.freedesktop.systemd1";
+        else
+                sender = NULL;
+
+        r = sd_bus_match_signal_async(
+                        bus,
+                        &ctx->slot_job_removed,
+                        sender,
+                        "/org/freedesktop/systemd1",
+                        "org.freedesktop.systemd1.Manager",
+                        "JobRemoved",
+                        match_job_removed, NULL, ctx);
+        if (r < 0)
+                return r;
+
+        /* Also watch for the connection going away. */
+        r = sd_bus_match_signal_async(
+                        bus,
+                        &ctx->slot_disconnected,
+                        "org.freedesktop.DBus.Local",
+                        NULL,
+                        "org.freedesktop.DBus.Local",
+                        "Disconnected",
+                        match_disconnected, NULL, ctx);
+        if (r < 0)
+                return r;
+
+        *ret = TAKE_PTR(ctx);
+        return 0;
+}
+
+static int bus_process_wait(sd_bus *bus) {
+        int r;
+
+        /* Keeps processing until sd_bus_process() reports that it did some work (> 0), sleeping on
+         * the connection without timeout whenever there was nothing to do. Errors are passed up. */
+        while ((r = sd_bus_process(bus, NULL)) == 0) {
+                r = sd_bus_wait(bus, UINT64_MAX);
+                if (r < 0)
+                        return r;
+        }
+
+        return r < 0 ? r : 0;
+}
+
+static int bus_job_get_service_result(BusWaitForJobs *d, char **result) {
+        _cleanup_free_ char *unit_path = NULL;
+        int r;
+
+        assert(d);
+        assert(d->name);
+        assert(result);
+
+        /* The "Result" property is queried on the Service interface, so refuse other unit types. */
+        if (!endswith(d->name, ".service"))
+                return -EINVAL;
+
+        unit_path = unit_dbus_path_from_name(d->name);
+        if (!unit_path)
+                return -ENOMEM;
+
+        r = sd_bus_get_property_string(
+                        d->bus,
+                        "org.freedesktop.systemd1",
+                        unit_path,
+                        "org.freedesktop.systemd1.Service",
+                        "Result",
+                        NULL,
+                        result);
+        return r;
+}
+
+/* Logs an error about a failed job of 'service', followed by a hint which commands to run for details.
+ *
+ * service:    unit name; must not be NULL.
+ * result:     the service's result string, may be NULL or empty. If it matches one of the entries in
+ *             the table below the corresponding explanation is included in the message.
+ * extra_args: optional strv; if non-empty its entries are joined with spaces and appended to the
+ *             "systemctl"/"journalctl" command names shown in the hint. */
+static void log_job_error_with_service_result(const char* service, const char *result, const char* const* extra_args) {
+ _cleanup_free_ char *service_shell_quoted = NULL;
+ const char *systemctl = "systemctl", *journalctl = "journalctl";
+
+ /* Maps known result strings to the text inserted after "failed because" */
+ static const struct {
+ const char *result, *explanation;
+ } explanations[] = {
+ { "resources", "of unavailable resources or another system error" },
+ { "protocol", "the service did not take the steps required by its unit configuration" },
+ { "timeout", "a timeout was exceeded" },
+ { "exit-code", "the control process exited with error code" },
+ { "signal", "a fatal signal was delivered to the control process" },
+ { "core-dump", "a fatal signal was delivered causing the control process to dump core" },
+ { "watchdog", "the service failed to send watchdog ping" },
+ { "start-limit", "start of the service was attempted too often" }
+ };
+
+ assert(service);
+
+ /* May end up NULL; every use below falls back to the "<service>" placeholder in that case */
+ service_shell_quoted = shell_maybe_quote(service, 0);
+
+ if (!strv_isempty((char**) extra_args)) {
+ _cleanup_free_ char *t = NULL;
+
+ /* If joining fails, "<args>" is shown instead of the real arguments */
+ t = strv_join((char**) extra_args, " ");
+ systemctl = strjoina("systemctl ", t ?: "<args>");
+ journalctl = strjoina("journalctl ", t ?: "<args>");
+ }
+
+ if (!isempty(result)) {
+ size_t i;
+
+ /* Linear search; i == ELEMENTSOF(explanations) afterwards means "not found" */
+ for (i = 0; i < ELEMENTSOF(explanations); ++i)
+ if (streq(result, explanations[i].result))
+ break;
+
+ if (i < ELEMENTSOF(explanations)) {
+ log_error("Job for %s failed because %s.\n"
+ "See \"%s status %s\" and \"%s -xeu %s\" for details.\n",
+ service,
+ explanations[i].explanation,
+ systemctl,
+ service_shell_quoted ?: "<service>",
+ journalctl,
+ service_shell_quoted ?: "<service>");
+ goto finish;
+ }
+ }
+
+ /* No (known) result string: generic message without an explanation */
+ log_error("Job for %s failed.\n"
+ "See \"%s status %s\" and \"%s -xeu %s\" for details.\n",
+ service,
+ systemctl,
+ service_shell_quoted ?: "<service>",
+ journalctl,
+ service_shell_quoted ?: "<service>");
+
+finish:
+ /* For some results maybe additional explanation is required */
+ if (streq_ptr(result, "start-limit"))
+ log_info("To force a start use \"%1$s reset-failed %2$s\"\n"
+ "followed by \"%1$s start %2$s\" again.",
+ systemctl,
+ service_shell_quoted ?: "<service>");
+}
+
+/* Evaluates the job result stored in d->result for the unit in d->name.
+ *
+ * Unless 'quiet' is set, a human-readable error is logged for every result other than "done" and
+ * "skipped". For unrecognized results of .service units the service's Result property is fetched
+ * to produce a more detailed message; 'extra_args' is passed on to that message's command hints.
+ *
+ * Returns 0 for "done"/"skipped", a result-specific negative errno for the other known results,
+ * and -EIO (with a debug log message) for results not known here. */
+static int check_wait_response(BusWaitForJobs *d, bool quiet, const char* const* extra_args) {
+ assert(d);
+ assert(d->name);
+ assert(d->result);
+
+ /* Part 1: user-facing messages */
+ if (!quiet) {
+ if (streq(d->result, "canceled"))
+ log_error("Job for %s canceled.", strna(d->name));
+ else if (streq(d->result, "timeout"))
+ log_error("Job for %s timed out.", strna(d->name));
+ else if (streq(d->result, "dependency"))
+ log_error("A dependency job for %s failed. See 'journalctl -xe' for details.", strna(d->name));
+ else if (streq(d->result, "invalid"))
+ log_error("%s is not active, cannot reload.", strna(d->name));
+ else if (streq(d->result, "assert"))
+ log_error("Assertion failed on job for %s.", strna(d->name));
+ else if (streq(d->result, "unsupported"))
+ log_error("Operation on or unit type of %s not supported on this system.", strna(d->name));
+ else if (streq(d->result, "collected"))
+ log_error("Queued job for %s was garbage collected.", strna(d->name));
+ else if (streq(d->result, "once"))
+ log_error("Unit %s was started already once and can't be started again.", strna(d->name));
+ else if (!STR_IN_SET(d->result, "done", "skipped")) {
+
+ /* Some other failure: for services try to explain it via the unit's Result property */
+ if (d->name && endswith(d->name, ".service")) {
+ _cleanup_free_ char *result = NULL;
+ int q;
+
+ /* A failed lookup is only logged at debug level; 'result' stays NULL then */
+ q = bus_job_get_service_result(d, &result);
+ if (q < 0)
+ log_debug_errno(q, "Failed to get Result property of unit %s: %m", d->name);
+
+ log_job_error_with_service_result(d->name, result, extra_args);
+ } else
+ log_error("Job failed. See \"journalctl -xe\" for details.");
+ }
+ }
+
+ /* Part 2: translate the result string into a return code */
+ if (STR_IN_SET(d->result, "canceled", "collected"))
+ return -ECANCELED;
+ else if (streq(d->result, "timeout"))
+ return -ETIME;
+ else if (streq(d->result, "dependency"))
+ return -EIO;
+ else if (streq(d->result, "invalid"))
+ return -ENOEXEC;
+ else if (streq(d->result, "assert"))
+ return -EPROTO;
+ else if (streq(d->result, "unsupported"))
+ return -EOPNOTSUPP;
+ else if (streq(d->result, "once"))
+ return -ESTALE;
+ else if (STR_IN_SET(d->result, "done", "skipped"))
+ return 0;
+
+ return log_debug_errno(SYNTHETIC_ERRNO(EIO),
+ "Unexpected job result, assuming server side newer than us: %s", d->result);
+}
+
+int bus_wait_for_jobs(BusWaitForJobs *d, bool quiet, const char* const* extra_args) {
+        int first_error = 0;
+
+        assert(d);
+
+        /* Runs the bus until the set of awaited jobs is empty. Results are evaluated as they arrive;
+         * a failure to process the bus aborts the wait immediately. */
+        while (!set_isempty(d->jobs)) {
+                int k;
+
+                k = bus_process_wait(d->bus);
+                if (k < 0)
+                        return log_error_errno(k, "Failed to wait for response: %m");
+
+                /* Both fields are only set once a JobRemoved signal for one of our jobs came in. */
+                if (d->name && d->result) {
+                        k = check_wait_response(d, quiet, extra_args);
+
+                        /* The first error is the most likely to be meaningful, hence keep that one. */
+                        if (first_error == 0 && k < 0)
+                                first_error = k;
+
+                        log_full_errno_zerook(LOG_DEBUG, k,
+                                              "Got result %s/%m for job %s", d->result, d->name);
+                }
+
+                /* Reset for the next signal. */
+                d->name = mfree(d->name);
+                d->result = mfree(d->result);
+        }
+
+        return first_error;
+}
+
+/* Adds a copy of the job object path 'path' to the set of jobs to wait for. Returns the result of
+ * set_put_strdup(). */
+int bus_wait_for_jobs_add(BusWaitForJobs *d, const char *path) {
+ assert(d);
+
+ return set_put_strdup(&d->jobs, path);
+}
+
+int bus_wait_for_jobs_one(BusWaitForJobs *d, const char *path, bool quiet, const char* const* extra_args) {
+        /* Convenience wrapper: register a single job path, then wait for all registered jobs. A failure
+         * to register is reported as out-of-memory. */
+        if (bus_wait_for_jobs_add(d, path) < 0)
+                return log_oom();
+
+        return bus_wait_for_jobs(d, quiet, extra_args);
+}
diff --git a/src/shared/bus-wait-for-jobs.h b/src/shared/bus-wait-for-jobs.h
new file mode 100644
index 0000000..5acf8b9
--- /dev/null
+++ b/src/shared/bus-wait-for-jobs.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-bus.h"
+
+#include "macro.h"
+
+typedef struct BusWaitForJobs BusWaitForJobs;
+
+int bus_wait_for_jobs_new(sd_bus *bus, BusWaitForJobs **ret);
+BusWaitForJobs* bus_wait_for_jobs_free(BusWaitForJobs *d);
+int bus_wait_for_jobs_add(BusWaitForJobs *d, const char *path);
+int bus_wait_for_jobs(BusWaitForJobs *d, bool quiet, const char* const* extra_args);
+int bus_wait_for_jobs_one(BusWaitForJobs *d, const char *path, bool quiet, const char* const* extra_args);
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(BusWaitForJobs*, bus_wait_for_jobs_free);
diff --git a/src/shared/bus-wait-for-units.c b/src/shared/bus-wait-for-units.c
new file mode 100644
index 0000000..0dd2a29
--- /dev/null
+++ b/src/shared/bus-wait-for-units.c
@@ -0,0 +1,426 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "bus-error.h"
+#include "bus-map-properties.h"
+#include "bus-wait-for-units.h"
+#include "hashmap.h"
+#include "string-util.h"
+#include "strv.h"
+#include "unit-def.h"
+
+/* One unit being waited for. */
+typedef struct WaitForItem {
+ /* Owning context; only set once the item has been inserted into parent->items */
+ BusWaitForUnits *parent;
+
+ /* Conditions to wait for, plus BUS_WAIT_REFFED once a reference on the unit is held */
+ BusWaitForUnitsFlags flags;
+
+ /* Object path of the unit; also the key in parent->items */
+ char *bus_path;
+
+ /* Slots of the pending GetAll() call and of the PropertiesChanged match */
+ sd_bus_slot *slot_get_all;
+ sd_bus_slot *slot_properties_changed;
+
+ /* Invoked once right before the item is freed, with 'userdata' passed through */
+ bus_wait_for_units_unit_callback unit_callback;
+ void *userdata;
+
+ /* Last seen values of the unit's ActiveState, Job (id only) and CleanResult properties */
+ char *active_state;
+ uint32_t job_id;
+ char *clean_result;
+} WaitForItem;
+
+/* Context for waiting on a number of units at once. */
+typedef struct BusWaitForUnits {
+ /* Connection in use; reset to NULL by bus_wait_for_units_clear() */
+ sd_bus *bus;
+ sd_bus_slot *slot_disconnected;
+
+ /* Units still being waited for: bus path → WaitForItem */
+ Hashmap *items;
+
+ /* Optional callback with its userdata, see bus_wait_for_units_set_ready_callback() */
+ bus_wait_for_units_ready_callback ready_callback;
+ void *userdata;
+
+ /* Item whose unit callback is currently being invoked */
+ WaitForItem *current;
+
+ /* Overall state; has_failed records whether any unit was found in a failed condition so far */
+ BusWaitForUnitsState state;
+ bool has_failed:1;
+} BusWaitForUnits;
+
+/* Destroys an item: drops the unit reference if we hold one, detaches the item from its parent and
+ * releases all memory. Accepts NULL; always returns NULL. */
+static WaitForItem *wait_for_item_free(WaitForItem *item) {
+ int r;
+
+ if (!item)
+ return NULL;
+
+ /* Items not yet linked into a context (parent == NULL) skip the Unref() call and the hashmap
+ * removal. */
+ if (item->parent) {
+ if (FLAGS_SET(item->flags, BUS_WAIT_REFFED) && item->bus_path && item->parent->bus) {
+ /* Fire-and-forget: no slot and no reply callback are requested */
+ r = sd_bus_call_method_async(
+ item->parent->bus,
+ NULL,
+ "org.freedesktop.systemd1",
+ item->bus_path,
+ "org.freedesktop.systemd1.Unit",
+ "Unref",
+ NULL,
+ NULL,
+ NULL);
+ if (r < 0)
+ log_debug_errno(r, "Failed to drop reference to unit %s, ignoring: %m", item->bus_path);
+ }
+
+ /* A linked item must be registered in the parent's hashmap under its bus path */
+ assert_se(hashmap_remove(item->parent->items, item->bus_path) == item);
+
+ /* Don't leave a dangling pointer behind */
+ if (item->parent->current == item)
+ item->parent->current = NULL;
+ }
+
+ sd_bus_slot_unref(item->slot_properties_changed);
+ sd_bus_slot_unref(item->slot_get_all);
+
+ free(item->bus_path);
+ free(item->active_state);
+ free(item->clean_result);
+
+ return mfree(item);
+}
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(WaitForItem*, wait_for_item_free);
+
+/* Marks 'item' as the context's current item, invokes its unit callback (if any) with the outcome
+ * 'good', and then frees the item. The item must not be used by the caller afterwards. */
+static void call_unit_callback_and_wait(BusWaitForUnits *d, WaitForItem *item, bool good) {
+ d->current = item;
+
+ if (item->unit_callback)
+ item->unit_callback(d, item->bus_path, good, item->userdata);
+
+ /* Also resets d->current again if the item is linked to d */
+ wait_for_item_free(item);
+}
+
+static void bus_wait_for_units_clear(BusWaitForUnits *d) {
+        assert(d);
+
+        /* Let go of the connection first; items freed below see d->bus == NULL and hence do not try
+         * to send Unref() calls anymore. */
+        d->slot_disconnected = sd_bus_slot_unref(d->slot_disconnected);
+        d->bus = sd_bus_unref(d->bus);
+
+        /* Report every remaining unit as "not good" and free it. Freeing an item removes it from
+         * the hashmap, so this terminates once the map is drained. */
+        for (;;) {
+                WaitForItem *first = hashmap_first(d->items);
+
+                if (!first)
+                        break;
+
+                call_unit_callback_and_wait(d, first, false);
+        }
+
+        d->items = hashmap_free(d->items);
+}
+
+/* Handler for the local Disconnected signal.
+ *
+ * Drops all pending items (their unit callbacks are invoked with good=false) and marks the whole
+ * wait operation as failed. If a ready callback is registered it is told about the failure,
+ * otherwise the connection is closed so that the caller's event loop exits.
+ *
+ * Always returns 0. */
+static int match_disconnected(sd_bus_message *m, void *userdata, sd_bus_error *error) {
+        BusWaitForUnits *d = ASSERT_PTR(userdata);
+
+        assert(m);
+
+        log_error("Warning! D-Bus connection terminated.");
+
+        bus_wait_for_units_clear(d);
+
+        /* Nothing is left to wait for and the units' outcome is unknown, so record a failure. This also
+         * stops bus_wait_for_units_run() from looping on d->bus, which was just reset to NULL. */
+        d->has_failed = true;
+        d->state = BUS_WAIT_FAILURE;
+
+        if (d->ready_callback)
+                /* The callback takes a BusWaitForUnitsState: passing 'false' here would be
+                 * BUS_WAIT_SUCCESS, i.e. report the lost connection as a success. */
+                d->ready_callback(d, BUS_WAIT_FAILURE, d->userdata);
+        else /* If no ready callback is specified close the connection so that the event loop exits */
+                sd_bus_close(sd_bus_message_get_bus(m));
+
+        return 0;
+}
+
+int bus_wait_for_units_new(sd_bus *bus, BusWaitForUnits **ret) {
+        _cleanup_(bus_wait_for_units_freep) BusWaitForUnits *ctx = NULL;
+        int r;
+
+        assert(bus);
+        assert(ret);
+
+        ctx = new(BusWaitForUnits, 1);
+        if (!ctx)
+                return -ENOMEM;
+
+        /* A fresh context has nothing to wait for, hence starts out in the "success" state. */
+        *ctx = (BusWaitForUnits) {
+                .bus = sd_bus_ref(bus),
+                .state = BUS_WAIT_SUCCESS,
+        };
+
+        /* Get notified when the connection goes away. */
+        r = sd_bus_match_signal_async(
+                        bus,
+                        &ctx->slot_disconnected,
+                        "org.freedesktop.DBus.Local",
+                        NULL,
+                        "org.freedesktop.DBus.Local",
+                        "Disconnected",
+                        match_disconnected, NULL, ctx);
+        if (r < 0)
+                return r;
+
+        *ret = TAKE_PTR(ctx);
+
+        return 0;
+}
+
+/* Destroys the context. Any items still pending get their unit callback invoked with good=false as
+ * part of bus_wait_for_units_clear(). Accepts NULL; always returns NULL. */
+BusWaitForUnits* bus_wait_for_units_free(BusWaitForUnits *d) {
+ if (!d)
+ return NULL;
+
+ bus_wait_for_units_clear(d);
+ /* NOTE(review): bus_wait_for_units_clear() already unrefs both of these and stores the unref
+ * functions' return values back into the fields; assuming those return NULL, the two calls below
+ * are no-ops — confirm. */
+ sd_bus_slot_unref(d->slot_disconnected);
+ sd_bus_unref(d->bus);
+
+ return mfree(d);
+}
+
+static bool bus_wait_for_units_is_ready(BusWaitForUnits *d) {
+        assert(d);
+
+        /* Ready means: either we got disconnected (no bus anymore), or no item is left to wait for. */
+        return !d->bus || hashmap_isempty(d->items);
+}
+
+/* Installs (or, with callback == NULL, removes) the callback that is invoked with the overall state
+ * once nothing is left to wait for, and when the bus connection is lost. 'userdata' is passed to the
+ * callback unmodified. */
+void bus_wait_for_units_set_ready_callback(BusWaitForUnits *d, bus_wait_for_units_ready_callback callback, void *userdata) {
+ assert(d);
+
+ d->ready_callback = callback;
+ d->userdata = userdata;
+}
+
+static void bus_wait_for_units_check_ready(BusWaitForUnits *d) {
+        assert(d);
+
+        /* Once nothing is pending anymore, settle the final state and tell the ready callback, if
+         * there is one. */
+        if (bus_wait_for_units_is_ready(d)) {
+                if (d->has_failed)
+                        d->state = BUS_WAIT_FAILURE;
+                else
+                        d->state = BUS_WAIT_SUCCESS;
+
+                if (d->ready_callback)
+                        d->ready_callback(d, d->state, d->userdata);
+        }
+}
+
+/* Checks whether all conditions requested in item->flags are met by the property values cached in
+ * the item. If so, the unit callback is invoked with good=true, the item is freed, and the parent is
+ * checked for overall completion. If not, returns without doing anything (apart from possibly
+ * recording a failure in the parent). */
+static void wait_for_item_check_ready(WaitForItem *item) {
+ BusWaitForUnits *d;
+
+ assert(item);
+ assert_se(d = item->parent);
+
+ if (FLAGS_SET(item->flags, BUS_WAIT_FOR_MAINTENANCE_END)) {
+
+ /* Any CleanResult other than "success" counts as failure */
+ if (item->clean_result && !streq(item->clean_result, "success"))
+ d->has_failed = true;
+
+ /* State not known yet, or still in maintenance: keep waiting */
+ if (!item->active_state || streq(item->active_state, "maintenance"))
+ return;
+ }
+
+ /* A non-zero job id means a job is still queued (or the Job property was not seen yet, as the
+ * field is initialized to UINT32_MAX when the item is created) */
+ if (FLAGS_SET(item->flags, BUS_WAIT_NO_JOB) && item->job_id != 0)
+ return;
+
+ if (FLAGS_SET(item->flags, BUS_WAIT_FOR_INACTIVE)) {
+
+ /* "failed" ends the wait too, but is recorded as failure; anything else but "inactive"
+ * means we have to wait longer */
+ if (streq_ptr(item->active_state, "failed"))
+ d->has_failed = true;
+ else if (!streq_ptr(item->active_state, "inactive"))
+ return;
+ }
+
+ /* 'item' is freed by this call, do not touch it afterwards */
+ call_unit_callback_and_wait(d, item, true);
+ bus_wait_for_units_check_ready(d);
+}
+
+static int property_map_job(
+                sd_bus *bus,
+                const char *member,
+                sd_bus_message *m,
+                sd_bus_error *error,
+                void *userdata) {
+
+        WaitForItem *item = ASSERT_PTR(userdata);
+        const char *job_path;
+        uint32_t job_id;
+        int r;
+
+        /* The "Job" property is an (id, object path) pair; only the id is kept. */
+        r = sd_bus_message_read(m, "(uo)", &job_id, &job_path);
+        if (r >= 0) {
+                item->job_id = job_id;
+                r = 0;
+        }
+
+        return r;
+}
+
+static int wait_for_item_parse_properties(WaitForItem *item, sd_bus_message *m) {
+
+        /* Properties of interest and where they end up in WaitForItem: the two strings are stored
+         * directly at the given offsets, the job is handled by property_map_job(). */
+        static const struct bus_properties_map property_table[] = {
+                { "ActiveState", "s",    NULL,             offsetof(WaitForItem, active_state) },
+                { "Job",         "(uo)", property_map_job, 0                                   },
+                { "CleanResult", "s",    NULL,             offsetof(WaitForItem, clean_result) },
+                {}
+        };
+
+        int r;
+
+        assert(item);
+        assert(m);
+
+        r = bus_message_map_all_properties(m, property_table, BUS_MAP_STRDUP, NULL, item);
+        if (r < 0)
+                return r;
+
+        /* The cached values may have changed, re-evaluate. Note that this may free 'item'. */
+        wait_for_item_check_ready(item);
+
+        return 0;
+}
+
+static int on_properties_changed(sd_bus_message *m, void *userdata, sd_bus_error *error) {
+        WaitForItem *item = ASSERT_PTR(userdata);
+        const char *iface;
+        int r;
+
+        /* The signal starts with the name of the interface the changed properties belong to. */
+        r = sd_bus_message_read(m, "s", &iface);
+        if (r < 0) {
+                log_debug_errno(r, "Failed to parse PropertiesChanged signal: %m");
+                return 0;
+        }
+
+        /* Only changes on the generic Unit interface are evaluated; failures are merely logged. */
+        if (streq(iface, "org.freedesktop.systemd1.Unit")) {
+                r = wait_for_item_parse_properties(item, m);
+                if (r < 0)
+                        log_debug_errno(r, "Failed to process PropertiesChanged signal: %m");
+        }
+
+        return 0;
+}
+
+static int on_get_all_properties(sd_bus_message *m, void *userdata, sd_bus_error *ret_error) {
+        WaitForItem *item = ASSERT_PTR(userdata);
+        const sd_bus_error *e;
+        int r;
+
+        e = sd_bus_message_get_error(m);
+        if (!e) {
+                /* Regular reply: take over the property values. */
+                r = wait_for_item_parse_properties(item, m);
+                if (r < 0)
+                        log_debug_errno(r, "Failed to process GetAll method reply: %m");
+
+                return 0;
+        }
+
+        /* Error reply: give up on this unit and count it as failed. */
+        BusWaitForUnits *d = item->parent;
+
+        d->has_failed = true;
+
+        r = sd_bus_error_get_errno(e);
+        log_debug_errno(r, "GetAll() failed for %s: %s",
+                        item->bus_path, bus_error_message(e, r));
+
+        /* Frees 'item'; afterwards see whether that was the last unit we were waiting for. */
+        call_unit_callback_and_wait(d, item, false);
+        bus_wait_for_units_check_ready(d);
+
+        return 0;
+}
+
+/* Starts waiting for a unit.
+ *
+ * d:        the wait context
+ * unit:     unit name; converted to a bus object path via unit_dbus_path_from_name()
+ * flags:    non-empty combination of BusWaitForUnitsFlags selecting what to wait for; pass
+ *           BUS_WAIT_REFFED if a reference on the unit is already held
+ * callback: optional, invoked once when the wait for this unit ends
+ * userdata: passed to 'callback'
+ *
+ * Queues the required bus requests asynchronously and switches the context to BUS_WAIT_RUNNING.
+ * Returns 0 on success, a negative errno-style error otherwise. */
+int bus_wait_for_units_add_unit(
+ BusWaitForUnits *d,
+ const char *unit,
+ BusWaitForUnitsFlags flags,
+ bus_wait_for_units_unit_callback callback,
+ void *userdata) {
+
+ /* Until TAKE_PTR() at the end, every error path frees the half-initialized item. As item->parent
+ * is only set at the very end, wait_for_item_free() skips its parent-related steps then. */
+ _cleanup_(wait_for_item_freep) WaitForItem *item = NULL;
+ int r;
+
+ assert(d);
+ assert(unit);
+
+ assert(flags != 0);
+
+ r = hashmap_ensure_allocated(&d->items, &string_hash_ops);
+ if (r < 0)
+ return r;
+
+ item = new(WaitForItem, 1);
+ if (!item)
+ return -ENOMEM;
+
+ /* job_id starts out as UINT32_MAX, i.e. non-zero: with BUS_WAIT_NO_JOB the item is not considered
+ * job-free before the Job property has actually been read. */
+ *item = (WaitForItem) {
+ .flags = flags,
+ .bus_path = unit_dbus_path_from_name(unit),
+ .unit_callback = callback,
+ .userdata = userdata,
+ .job_id = UINT32_MAX,
+ };
+
+ if (!item->bus_path)
+ return -ENOMEM;
+
+ /* Take a reference on the unit unless the caller says one is held already. The call is queued
+ * without reply callback; the flag is set so that the reference is dropped again when the item
+ * is freed. */
+ if (!FLAGS_SET(item->flags, BUS_WAIT_REFFED)) {
+ r = sd_bus_call_method_async(
+ d->bus,
+ NULL,
+ "org.freedesktop.systemd1",
+ item->bus_path,
+ "org.freedesktop.systemd1.Unit",
+ "Ref",
+ NULL,
+ NULL,
+ NULL);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to add reference to unit %s: %m", unit);
+
+ item->flags |= BUS_WAIT_REFFED;
+ }
+
+ /* Subscribe to property changes before querying the current values */
+ r = sd_bus_match_signal_async(
+ d->bus,
+ &item->slot_properties_changed,
+ "org.freedesktop.systemd1",
+ item->bus_path,
+ "org.freedesktop.DBus.Properties",
+ "PropertiesChanged",
+ on_properties_changed,
+ NULL,
+ item);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to request match for PropertiesChanged signal: %m");
+
+ /* Query the current property values. When waiting for the end of maintenance no interface name
+ * is given (NULL) instead of restricting the query to the Unit interface.
+ * NOTE(review): presumably because CleanResult lives on the type-specific interface — confirm. */
+ r = sd_bus_call_method_async(
+ d->bus,
+ &item->slot_get_all,
+ "org.freedesktop.systemd1",
+ item->bus_path,
+ "org.freedesktop.DBus.Properties",
+ "GetAll",
+ on_get_all_properties,
+ item,
+ "s", FLAGS_SET(item->flags, BUS_WAIT_FOR_MAINTENANCE_END) ? NULL : "org.freedesktop.systemd1.Unit");
+ if (r < 0)
+ return log_debug_errno(r, "Failed to request properties of unit %s: %m", unit);
+
+ r = hashmap_put(d->items, item->bus_path, item);
+ if (r < 0)
+ return r;
+
+ /* From here on the item is owned by the context */
+ d->state = BUS_WAIT_RUNNING;
+ item->parent = d;
+ TAKE_PTR(item);
+ return 0;
+}
+
+int bus_wait_for_units_run(BusWaitForUnits *d) {
+        assert(d);
+
+        /* Simple blocking main loop: process the bus for as long as we are in the "running" state,
+         * and sleep on the connection whenever processing had nothing to do. */
+        while (d->state == BUS_WAIT_RUNNING) {
+                int r;
+
+                r = sd_bus_process(d->bus, NULL);
+                if (r < 0)
+                        return r;
+
+                if (r == 0) {
+                        r = sd_bus_wait(d->bus, UINT64_MAX);
+                        if (r < 0)
+                                return r;
+                }
+        }
+
+        /* The final state doubles as the (non-negative) return value. */
+        return d->state;
+}
+
+/* Returns the current overall state of the wait context. */
+BusWaitForUnitsState bus_wait_for_units_state(BusWaitForUnits *d) {
+ assert(d);
+
+ return d->state;
+}
diff --git a/src/shared/bus-wait-for-units.h b/src/shared/bus-wait-for-units.h
new file mode 100644
index 0000000..2623e72
--- /dev/null
+++ b/src/shared/bus-wait-for-units.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "macro.h"
+#include "sd-bus.h"
+
+typedef struct BusWaitForUnits BusWaitForUnits;
+
+/* Overall state of a BusWaitForUnits context, as returned by bus_wait_for_units_state() and
+ * bus_wait_for_units_run() and passed to the ready callback. */
+typedef enum BusWaitForUnitsState {
+ BUS_WAIT_SUCCESS, /* Nothing to wait for anymore and nothing failed */
+ BUS_WAIT_FAILURE, /* ditto, but something failed */
+ BUS_WAIT_RUNNING, /* Still something to wait for */
+ _BUS_WAIT_FOR_UNITS_STATE_MAX,
+ _BUS_WAIT_FOR_UNITS_STATE_INVALID = -EINVAL,
+} BusWaitForUnitsState;
+
+/* Bit flags for bus_wait_for_units_add_unit(); at least one must be set. The BUS_WAIT_FOR_xyz and
+ * BUS_WAIT_NO_JOB flags select conditions that all have to be met before the wait for the unit ends. */
+typedef enum BusWaitForUnitsFlags {
+ BUS_WAIT_FOR_MAINTENANCE_END = 1 << 0, /* Wait until the unit is no longer in maintenance state */
+ BUS_WAIT_FOR_INACTIVE = 1 << 1, /* Wait until the unit is back in inactive or dead state */
+ BUS_WAIT_NO_JOB = 1 << 2, /* Wait until there's no more job pending */
+ BUS_WAIT_REFFED = 1 << 3, /* The unit is already reffed with RefUnit() */
+} BusWaitForUnitsFlags;
+
+typedef void (*bus_wait_for_units_ready_callback)(BusWaitForUnits *d, BusWaitForUnitsState state, void *userdata);
+typedef void (*bus_wait_for_units_unit_callback)(BusWaitForUnits *d, const char *unit_path, bool good, void *userdata);
+
+int bus_wait_for_units_new(sd_bus *bus, BusWaitForUnits **ret);
+BusWaitForUnits* bus_wait_for_units_free(BusWaitForUnits *d);
+
+BusWaitForUnitsState bus_wait_for_units_state(BusWaitForUnits *d);
+void bus_wait_for_units_set_ready_callback(BusWaitForUnits *d, bus_wait_for_units_ready_callback callback, void *userdata);
+int bus_wait_for_units_add_unit(BusWaitForUnits *d, const char *unit, BusWaitForUnitsFlags flags, bus_wait_for_units_unit_callback callback, void *userdata);
+int bus_wait_for_units_run(BusWaitForUnits *d);
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(BusWaitForUnits*, bus_wait_for_units_free);
diff --git a/src/shared/calendarspec.c b/src/shared/calendarspec.c
new file mode 100644
index 0000000..039080f
--- /dev/null
+++ b/src/shared/calendarspec.c
@@ -0,0 +1,1435 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <ctype.h>
+#include <errno.h>
+#include <limits.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "calendarspec.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "macro.h"
+#include "memstream-util.h"
+#include "parse-util.h"
+#include "process-util.h"
+#include "sort-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "time-util.h"
+/* BITS_WEEKDAYS is the weekday mask with all seven day bits set (bit 0 = Monday … bit 6 = Sunday).
+ * MIN_YEAR and MAX_YEAR bound the years accepted by calendar_spec_valid(). */
+#define BITS_WEEKDAYS 127
+#define MIN_YEAR 1970
+#define MAX_YEAR 2199
+
+/* An arbitrary limit on the length of the chains of components. We don't want to
+ * build a very long linked list, which would be slow to iterate over and might cause
+ * our stack to overflow. It's unlikely that legitimate uses require more than a few
+ * linked components anyway. */
+#define CALENDARSPEC_COMPONENTS_MAX 240
+
+/* Let's make sure that the microsecond component is safe to be stored in an 'int' */
+assert_cc(INT_MAX >= USEC_PER_SEC);
+
+/* Releases every element of the singly linked component list that starts at 'c' (which may be NULL).
+ * Always returns NULL, so the result can be assigned back to the owning pointer. */
+static CalendarComponent* chain_free(CalendarComponent *c) {
+        CalendarComponent *following;
+
+        for (; c; c = following) {
+                /* Remember the successor before the current element goes away */
+                following = c->next;
+                free(c);
+        }
+
+        return NULL;
+}
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(CalendarComponent*, chain_free);
+
+/* Destroys a CalendarSpec together with the six component chains and the timezone string it owns.
+ * NULL is accepted. Always returns NULL. */
+CalendarSpec* calendar_spec_free(CalendarSpec *c) {
+        if (c) {
+                free(c->timezone);
+
+                chain_free(c->microsecond);
+                chain_free(c->minute);
+                chain_free(c->hour);
+                chain_free(c->day);
+                chain_free(c->month);
+                chain_free(c->year);
+
+                free(c);
+        }
+
+        return NULL;
+}
+
+/* Ordering used when sorting a chain: compares by 'start', then by 'stop', then by 'repeat'.
+ * Returns 0 only if all three fields are equal. */
+static int component_compare(CalendarComponent * const *a, CalendarComponent * const *b) {
+        const CalendarComponent *lhs = *a, *rhs = *b;
+        int r;
+
+        r = CMP(lhs->start, rhs->start);
+        if (r == 0)
+                r = CMP(lhs->stop, rhs->stop);
+        if (r == 0)
+                r = CMP(lhs->repeat, rhs->repeat);
+
+        return r;
+}
+/* Brings the chain at *c into canonical form: every element gets its 'stop'/'repeat' pair cleaned up,
+ * then the elements are sorted with component_compare() and elements that compare equal are freed.
+ * On return *c points to the new head of the chain. */
+static void normalize_chain(CalendarComponent **c) {
+ assert(c);
+
+ size_t n = 0;
+ for (CalendarComponent *i = *c; i; i = i->next) {
+ n++;
+
+ /* While we're counting the chain, also normalize 'stop'
+ * so the length of the range is a multiple of 'repeat'. */
+ if (i->stop > i->start && i->repeat > 0)
+ i->stop -= (i->stop - i->start) % i->repeat;
+
+ /* If a repeat value is specified, but it cannot even be triggered once, let's suppress it.
+ *
+ * Similarly, if the stop value is the same as the start value, then let's just make this a
+ * non-repeating chain element. */
+ if ((i->stop > i->start && i->repeat > 0 && i->start + i->repeat > i->stop) ||
+ i->start == i->stop) {
+ i->repeat = 0;
+ i->stop = -1;
+ }
+ }
+ /* Nothing to sort or deduplicate in a chain of zero or one elements */
+ if (n <= 1)
+ return;
+ /* Collect the element pointers in a temporary array of n entries so that they can be sorted */
+ CalendarComponent **b, **j;
+ b = j = newa(CalendarComponent*, n);
+ for (CalendarComponent *i = *c; i; i = i->next)
+ *(j++) = i;
+
+ typesafe_qsort(b, n, component_compare);
+ /* Relink from the back: 'next' is always the head of the part of the list rebuilt so far */
+ b[n-1]->next = NULL;
+ CalendarComponent *next = b[n-1];
+
+ /* Drop non-unique entries */
+ for (size_t k = n-1; k > 0; k--) {
+ if (component_compare(&b[k-1], &next) == 0) {
+ free(b[k-1]);
+ continue;
+ }
+
+ b[k-1]->next = next;
+ next = b[k-1];
+ }
+
+ *c = next;
+}
+
+/* Expands two-digit years in every element of the chain, for both 'start' and 'stop':
+ * 0…69 become 2000…2069 and 70…99 become 1970…1999 (e.g. 12 → 2012, 89 → 1989).
+ * Negative values and values of 100 or more are left untouched. */
+static void fix_year(CalendarComponent *c) {
+        for (; c; c = c->next) {
+                if (c->start >= 0 && c->start < 70)
+                        c->start += 2000;
+                else if (c->start >= 70 && c->start < 100)
+                        c->start += 1900;
+
+                if (c->stop >= 0 && c->stop < 70)
+                        c->stop += 2000;
+                else if (c->stop >= 70 && c->stop < 100)
+                        c->stop += 1900;
+        }
+}
+
+/* Brings a freshly parsed spec into canonical form:
+ *  - a timezone string of "UTC" is replaced by the 'utc' flag,
+ *  - a weekday mask that is empty, negative or names all seven days becomes -1,
+ *  - 'end_of_month' is cleared if there is no day chain it could apply to,
+ *  - two-digit years are expanded and all six component chains are normalized. */
+static void calendar_spec_normalize(CalendarSpec *c) {
+        assert(c);
+
+        if (streq_ptr(c->timezone, "UTC")) {
+                c->utc = true;
+                c->timezone = mfree(c->timezone);
+        }
+
+        if (!(c->weekdays_bits > 0 && c->weekdays_bits < BITS_WEEKDAYS))
+                c->weekdays_bits = -1;
+
+        if (!c->day)
+                c->end_of_month = false;
+
+        /* Years must be expanded before the year chain is sorted and deduplicated */
+        fix_year(c->year);
+
+        CalendarComponent **chains[] = {
+                &c->year,
+                &c->month,
+                &c->day,
+                &c->hour,
+                &c->minute,
+                &c->microsecond,
+        };
+
+        for (size_t i = 0; i < ELEMENTSOF(chains); i++)
+                normalize_chain(chains[i]);
+}
+
+/* Checks that every element of chain 'c' lies within the inclusive interval [from, to].
+ *
+ * For each element: 'start' (and 'stop', if set, i.e. >= 0) must be inside the interval, 'repeat' must
+ * not exceed the interval length, and at least one repetition must fit. If 'end_of_month' is true the
+ * values count days back from the end of the month, and the upper bound is lowered by three.
+ *
+ * An empty chain is valid. Returns true if all elements are valid, false otherwise.
+ *
+ * The end-of-month adjustment is applied exactly once for the whole chain. (It used to be re-applied
+ * for every further element, which shrank the permitted range by three per element and rejected
+ * chains such as "~26,27" whose elements are each valid on their own.) The chain is walked
+ * iteratively rather than recursively. */
+static bool chain_valid(CalendarComponent *c, int from, int to, bool end_of_month) {
+        assert(to >= from);
+
+        /* Forbid dates more than 28 days from the end of the month */
+        if (end_of_month)
+                to -= 3;
+
+        for (; c; c = c->next) {
+                if (c->start < from || c->start > to)
+                        return false;
+
+                /* Avoid overly large values that could cause overflow */
+                if (c->repeat > to - from)
+                        return false;
+
+                /*
+                 * c->repeat must be short enough so at least one repetition may
+                 * occur before the end of the interval. For dates scheduled
+                 * relative to the end of the month, c->start and c->stop
+                 * correspond to the Nth last day of the month.
+                 */
+                if (c->stop >= 0) {
+                        if (c->stop < from || c->stop > to)
+                                return false;
+
+                        if (c->start + c->repeat > c->stop)
+                                return false;
+                } else if (end_of_month) {
+                        if (c->start - c->repeat < from)
+                                return false;
+                } else {
+                        if (c->start + c->repeat > to)
+                                return false;
+                }
+        }
+
+        return true;
+}
+
+/* Returns true if the weekday mask does not exceed BITS_WEEKDAYS and every component chain stays
+ * within its permitted range: years MIN_YEAR…MAX_YEAR, months 1…12, days 1…31 (taking the
+ * end-of-month flag into account), hours 0…23, minutes 0…59, and less than 60s worth of
+ * microseconds. */
+_pure_ bool calendar_spec_valid(CalendarSpec *c) {
+        assert(c);
+
+        return c->weekdays_bits <= BITS_WEEKDAYS &&
+                chain_valid(c->year, MIN_YEAR, MAX_YEAR, false) &&
+                chain_valid(c->month, 1, 12, false) &&
+                chain_valid(c->day, 1, 31, c->end_of_month) &&
+                chain_valid(c->hour, 0, 23, false) &&
+                chain_valid(c->minute, 0, 59, false) &&
+                chain_valid(c->microsecond, 0, 60*USEC_PER_SEC-1, false);
+}
+/* Writes the weekday mask of 'c' to 'f' as a comma-separated list of three-letter day names. A run of
+ * two adjacent days is printed as "A,B", a run of three or more as "A..B". The caller must ensure the
+ * mask is non-empty and within BITS_WEEKDAYS (asserted below). */
+static void format_weekdays(FILE *f, const CalendarSpec *c) {
+ static const char *const days[] = {
+ "Mon",
+ "Tue",
+ "Wed",
+ "Thu",
+ "Fri",
+ "Sat",
+ "Sun",
+ };
+ /* 'l' is the index of the first day of the run currently being printed, or -1 outside a run */
+ int l, x;
+ bool need_comma = false;
+
+ assert(f);
+ assert(c);
+ assert(c->weekdays_bits > 0 && c->weekdays_bits <= BITS_WEEKDAYS);
+
+ for (x = 0, l = -1; x < (int) ELEMENTSOF(days); x++) {
+
+ if (c->weekdays_bits & (1 << x)) {
+ /* First day of a new run: print it, separated by a comma from any previous run */
+ if (l < 0) {
+ if (need_comma)
+ fputc(',', f);
+ else
+ need_comma = true;
+
+ fputs(days[x], f);
+ l = x;
+ }
+
+ } else if (l >= 0) {
+ /* The run ended at day x-1: print its last day unless the run was a single day */
+ if (x > l + 1) {
+ fputs(x > l + 2 ? ".." : ",", f);
+ fputs(days[x-1], f);
+ }
+
+ l = -1;
+ }
+ }
+ /* Close a run that is still open after the last day of the week */
+ if (l >= 0 && x > l + 1) {
+ fputs(x > l + 2 ? ".." : ",", f);
+ fputs(days[x-1], f);
+ }
+}
+
+/* Tells whether the whole chain can be printed as '*'. That is the case for an empty chain, and, for
+ * the microsecond chain only ('usec' true), when any element starts at 0, has no stop value and
+ * repeats every full second. */
+static bool chain_is_star(const CalendarComponent *c, bool usec) {
+        if (!c)
+                return true;
+
+        if (!usec)
+                return false;
+
+        while (c) {
+                if (c->start == 0 && c->stop < 0 && c->repeat == USEC_PER_SEC)
+                        return true;
+
+                c = c->next;
+        }
+
+        return false;
+}
+/* Writes chain 'c' to 'f' as "START[..STOP][/REPEAT]", with further elements appended after a comma.
+ * 'space' is the minimum zero-padded width of the start and stop values. 'start' must be true for the
+ * first element only; only then is a chain that chain_is_star() accepts printed as '*'. With 'usec'
+ * set, values are stored in microseconds and printed as seconds with an optional six-digit fraction. */
+static void _format_chain(FILE *f, int space, const CalendarComponent *c, bool start, bool usec) {
+ int d = usec ? (int) USEC_PER_SEC : 1;
+ /* 'd' is the divisor that turns stored values into whole printed units */
+ assert(f);
+
+ if (start && chain_is_star(c, usec)) {
+ fputc('*', f);
+ return;
+ }
+
+ assert(c->start >= 0);
+
+ fprintf(f, "%0*i", space, c->start / d);
+ if (c->start % d > 0)
+ fprintf(f, ".%06i", c->start % d);
+
+ if (c->stop > 0)
+ fprintf(f, "..%0*i", space, c->stop / d);
+ if (c->stop % d > 0)
+ fprintf(f, ".%06i", c->stop % d);
+ /* A repeat of exactly one unit is implied by a range and hence not printed for ranges */
+ if (c->repeat > 0 && !(c->stop > 0 && c->repeat == d))
+ fprintf(f, "/%i", c->repeat / d);
+ if (c->repeat % d > 0)
+ fprintf(f, ".%06i", c->repeat % d);
+
+ if (c->next) {
+ fputc(',', f);
+ _format_chain(f, space, c->next, false, usec);
+ }
+}
+/* Formats a complete chain, starting at its first element; see _format_chain() for the parameters. */
+static void format_chain(FILE *f, int space, const CalendarComponent *c, bool usec) {
+ _format_chain(f, space, c, /* start = */ true, usec);
+}
+/* Formats 'c' as "[WEEKDAYS ]YEAR-MONTH(-|~)DAY HOUR:MINUTE:SECOND[ TIMEZONE]" into a memory stream
+ * and hands the resulting string to the caller via *ret. Returns -ENOMEM if the stream cannot be set
+ * up, otherwise whatever memstream_finalize() returns. */
+int calendar_spec_to_string(const CalendarSpec *c, char **ret) {
+ _cleanup_(memstream_done) MemStream m = {};
+ FILE *f;
+
+ assert(c);
+ assert(ret);
+
+ f = memstream_init(&m);
+ if (!f)
+ return -ENOMEM;
+ /* The weekday part is only printed if some, but not necessarily all, days are selected */
+ if (c->weekdays_bits > 0 && c->weekdays_bits <= BITS_WEEKDAYS) {
+ format_weekdays(f, c);
+ fputc(' ', f);
+ }
+
+ format_chain(f, 4, c->year, false);
+ fputc('-', f);
+ format_chain(f, 2, c->month, false);
+ fputc(c->end_of_month ? '~' : '-', f);
+ format_chain(f, 2, c->day, false);
+ fputc(' ', f);
+ format_chain(f, 2, c->hour, false);
+ fputc(':', f);
+ format_chain(f, 2, c->minute, false);
+ fputc(':', f);
+ format_chain(f, 2, c->microsecond, true);
+ /* Timezone suffix: the UTC flag wins over an explicit timezone name, which wins over a DST setting */
+ if (c->utc)
+ fputs(" UTC", f);
+ else if (c->timezone) {
+ fputc(' ', f);
+ fputs(c->timezone, f);
+ } else if (IN_SET(c->dst, 0, 1)) {
+
+ /* If daylight saving is explicitly on or off, let's show the used timezone. */
+
+ tzset();
+
+ if (!isempty(tzname[c->dst])) {
+ fputc(' ', f);
+ fputs(tzname[c->dst], f);
+ }
+ }
+
+ return memstream_finalize(&m, ret, NULL);
+}
+/* Parses an optional weekday expression at *p, e.g. "Mon", "Mon,Wed", "Mon..Fri" or "Mon-Fri", and
+ * sets the matching bits in c->weekdays_bits (bit 0 = Monday … bit 6 = Sunday). Day names are matched
+ * case-insensitively; each long name is listed before its abbreviation so that it is tried first.
+ *
+ * On success *p is advanced past the expression and any spaces following it. If the string does not
+ * begin with a day name, 0 is returned and *p is left untouched. Returns -EINVAL on malformed input,
+ * such as a descending or unterminated range. */
+static int parse_weekdays(const char **p, CalendarSpec *c) {
+ static const struct {
+ const char *name;
+ const int nr;
+ } day_nr[] = {
+ { "Monday", 0 },
+ { "Mon", 0 },
+ { "Tuesday", 1 },
+ { "Tue", 1 },
+ { "Wednesday", 2 },
+ { "Wed", 2 },
+ { "Thursday", 3 },
+ { "Thu", 3 },
+ { "Friday", 4 },
+ { "Fri", 4 },
+ { "Saturday", 5 },
+ { "Sat", 5 },
+ { "Sunday", 6 },
+ { "Sun", 6 },
+ };
+ /* 'l' is the first day of a range whose end has not been parsed yet, or -1 if no range is open */
+ int l = -1;
+ bool first = true;
+
+ assert(p);
+ assert(*p);
+ assert(c);
+
+ for (;;) {
+ size_t i;
+
+ for (i = 0; i < ELEMENTSOF(day_nr); i++) {
+ size_t skip;
+
+ if (!startswith_no_case(*p, day_nr[i].name))
+ continue;
+
+ skip = strlen(day_nr[i].name);
+ /* The name must be followed by a separator or the end of the string */
+ if (!IN_SET((*p)[skip], 0, '-', '.', ',', ' '))
+ return -EINVAL;
+
+ c->weekdays_bits |= 1 << day_nr[i].nr;
+ /* This day closes an open range: also set all days in between */
+ if (l >= 0) {
+ if (l > day_nr[i].nr)
+ return -EINVAL;
+
+ for (int j = l + 1; j < day_nr[i].nr; j++)
+ c->weekdays_bits |= 1 << j;
+ }
+
+ *p += skip;
+ break;
+ }
+
+ /* Couldn't find this prefix, so let's assume the
+ weekday was not specified and let's continue with
+ the date */
+ if (i >= ELEMENTSOF(day_nr))
+ return first ? 0 : -EINVAL;
+
+ /* We reached the end of the string */
+ if (**p == 0)
+ return 0;
+
+ /* We reached the end of the weekday spec part */
+ if (**p == ' ') {
+ *p += strspn(*p, " ");
+ return 0;
+ }
+
+ if (**p == '.') {
+ if (l >= 0)
+ return -EINVAL;
+
+ if ((*p)[1] != '.')
+ return -EINVAL;
+
+ l = day_nr[i].nr;
+ *p += 2;
+
+ /* Support ranges with "-" for backwards compatibility */
+ } else if (**p == '-') {
+ if (l >= 0)
+ return -EINVAL;
+
+ l = day_nr[i].nr;
+ *p += 1;
+
+ } else if (**p == ',') {
+ l = -1;
+ *p += 1;
+ }
+
+ /* Allow a trailing comma but not an open range */
+ if (IN_SET(**p, 0, ' ')) {
+ *p += strspn(*p, " ");
+ return l < 0 ? 0 : -EINVAL;
+ }
+
+ first = false;
+ }
+}
+
+/* Parses an unsigned decimal number at the start of 'p'.
+ *
+ * On success returns 0, stores the value in *ret and a pointer to the first character after the
+ * number in *e. Returns -EINVAL if 'p' does not begin with a decimal digit, or the negated errno
+ * set by strtoul() (e.g. -ERANGE on overflow). Neither output is touched on failure. */
+static int parse_one_number(const char *p, const char **e, unsigned long *ret) {
+        char *end = NULL;
+        unsigned long value;
+
+        /* strtoul() skips leading whitespace and accepts a sign, and it negates "-N" in unsigned
+         * arithmetic instead of failing, so that e.g. "-18446744073709551615" would come out as 1
+         * on 64-bit. Insist on a digit up front, like parse_component_decimal() does. */
+        if (*p < '0' || *p > '9')
+                return -EINVAL;
+
+        errno = 0;
+        value = strtoul(p, &end, 10);
+        if (errno > 0)
+                return -errno;
+        if (end == p)
+                return -EINVAL;
+
+        *ret = value;
+        *e = end;
+        return 0;
+}
+/* Parses one non-negative number at *p into *res and advances *p past it. With 'usec' set the number
+ * is taken as seconds, may carry a fractional part of up to six digits, and is stored in microseconds.
+ * Returns -EINVAL if *p does not begin with a digit, -ERANGE if the result does not fit into an int,
+ * or an error passed on from the helpers. *p and *res are only written on success. */
+static int parse_component_decimal(const char **p, bool usec, int *res) {
+ unsigned long value;
+ const char *e = NULL;
+ int r;
+
+ if (!ascii_isdigit(**p))
+ return -EINVAL;
+
+ r = parse_one_number(*p, &e, &value);
+ if (r < 0)
+ return r;
+
+ if (usec) {
+ if (value * USEC_PER_SEC / USEC_PER_SEC != value) /* multiplication would wrap around */
+ return -ERANGE;
+
+ value *= USEC_PER_SEC;
+
+ /* One "." is a decimal point, but ".." is a range separator */
+ if (e[0] == '.' && e[1] != '.') {
+ unsigned add;
+
+ e++;
+ r = parse_fractional_part_u(&e, 6, &add);
+ if (r < 0)
+ return r;
+
+ if (add + value < value) /* addition would wrap around */
+ return -ERANGE;
+ value += add;
+ }
+ }
+
+ if (value > INT_MAX)
+ return -ERANGE;
+
+ *p = e;
+ *res = value;
+
+ return 0;
+}
+
+/* Allocates a component that matches exactly 'value' (no stop value, no repetition) and prepends it
+ * to the chain at *c. Returns 0 on success and -ENOMEM if the allocation fails, in which case *c is
+ * left unchanged. */
+static int const_chain(int value, CalendarComponent **c) {
+        assert(c);
+
+        CalendarComponent *head = new(CalendarComponent, 1);
+        if (!head)
+                return -ENOMEM;
+
+        head->start = value;
+        head->stop = -1;
+        head->repeat = 0;
+        head->next = *c;
+
+        *c = head;
+        return 0;
+}
+/* Sets up 'c' so that it matches exactly the point in time 'time' (seconds since the epoch), broken
+ * down as UTC via gmtime_r(). Returns -ERANGE if the time cannot be broken down or the year would
+ * overflow, and -ENOMEM on allocation failure; 'c' is only modified if everything succeeded. */
+static int calendarspec_from_time_t(CalendarSpec *c, time_t time) {
+ _cleanup_(chain_freep) CalendarComponent
+ *year = NULL, *month = NULL, *day = NULL,
+ *hour = NULL, *minute = NULL, *us = NULL;
+ struct tm tm;
+ int r;
+
+ if (!gmtime_r(&time, &tm))
+ return -ERANGE;
+
+ if (tm.tm_year > INT_MAX - 1900)
+ return -ERANGE;
+ /* struct tm counts years from 1900 and months from 0, the spec uses calendar numbers */
+ r = const_chain(tm.tm_year + 1900, &year);
+ if (r < 0)
+ return r;
+
+ r = const_chain(tm.tm_mon + 1, &month);
+ if (r < 0)
+ return r;
+
+ r = const_chain(tm.tm_mday, &day);
+ if (r < 0)
+ return r;
+
+ r = const_chain(tm.tm_hour, &hour);
+ if (r < 0)
+ return r;
+
+ r = const_chain(tm.tm_min, &minute);
+ if (r < 0)
+ return r;
+
+ r = const_chain(tm.tm_sec * USEC_PER_SEC, &us);
+ if (r < 0)
+ return r;
+ /* Everything was allocated: hand ownership of the chains over to the spec */
+ c->utc = true;
+ c->year = TAKE_PTR(year);
+ c->month = TAKE_PTR(month);
+ c->day = TAKE_PTR(day);
+ c->hour = TAKE_PTR(hour);
+ c->minute = TAKE_PTR(minute);
+ c->microsecond = TAKE_PTR(us);
+ return 0;
+}
+/* Parses one element "START[..STOP][/REPEAT]" at *p, prepends it to the chain at *c, and recurses for
+ * every further comma-separated element. 'nesting' counts the elements parsed so far; more than
+ * CALENDARSPEC_COMPONENTS_MAX of them are refused with -ENOBUFS. 'usec' selects parsing of seconds
+ * with a fractional part, stored in microseconds.
+ *
+ * Returns 0 on success, with *p advanced past everything that was parsed. On failure a negative
+ * errno-style value is returned; elements prepended by earlier recursion levels stay in *c and must
+ * be freed by the caller. */
+static int prepend_component(const char **p, bool usec, unsigned nesting, CalendarComponent **c) {
+ int r, start, stop = -1, repeat = 0;
+ CalendarComponent *cc;
+ const char *e = *p; /* NOTE(review): p is dereferenced here, before the assert(p) below */
+
+ assert(p);
+ assert(c);
+
+ if (nesting > CALENDARSPEC_COMPONENTS_MAX)
+ return -ENOBUFS;
+
+ r = parse_component_decimal(&e, usec, &start);
+ if (r < 0)
+ return r;
+ /* A range: unless a repeat value follows, it steps by one unit (one second for the usec chain) */
+ if (e[0] == '.' && e[1] == '.') {
+ e += 2;
+ r = parse_component_decimal(&e, usec, &stop);
+ if (r < 0)
+ return r;
+
+ repeat = usec ? USEC_PER_SEC : 1;
+ }
+
+ if (*e == '/') {
+ e++;
+ r = parse_component_decimal(&e, usec, &repeat);
+ if (r < 0)
+ return r;
+
+ if (repeat == 0)
+ return -ERANGE;
+ } else {
+ /* If no repeat value is specified for the μs component, then let's explicitly refuse ranges
+ * below 1s because our default repeat granularity is beyond that. */
+
+ /* Overflow check */
+ if (start > INT_MAX - repeat)
+ return -ERANGE;
+
+ if (usec && stop >= 0 && start + repeat > stop)
+ return -EINVAL;
+ }
+ /* Only the end of the string or a separator of the surrounding syntax may follow an element */
+ if (!IN_SET(*e, 0, ' ', ',', '-', '~', ':'))
+ return -EINVAL;
+
+ cc = new(CalendarComponent, 1);
+ if (!cc)
+ return -ENOMEM;
+
+ *cc = (CalendarComponent) {
+ .start = start,
+ .stop = stop,
+ .repeat = repeat,
+ .next = *c,
+ };
+
+ *p = e;
+ *c = cc;
+
+ if (*e ==',') {
+ *p += 1;
+ return prepend_component(p, usec, nesting + 1, c);
+ }
+
+ return 0;
+}
+/* Parses one field of a date or time expression at *p into a newly built chain returned in *c.
+ * A '*' yields an empty chain (NULL), or for the microsecond field ('usec' true) a single element
+ * starting at 0 that repeats every second. Anything else is parsed as a comma-separated list of
+ * elements. Returns 0 on success and advances *p; returns a negative errno-style value on failure,
+ * in which case the partially built list is freed. */
+static int parse_chain(const char **p, bool usec, CalendarComponent **c) {
+ _cleanup_(chain_freep) CalendarComponent *cc = NULL;
+ const char *t;
+ int r;
+
+ assert(p);
+ assert(c);
+
+ t = *p;
+
+ if (t[0] == '*') {
+ if (usec) {
+ r = const_chain(0, c);
+ if (r < 0)
+ return r;
+ (*c)->repeat = USEC_PER_SEC;
+ } else
+ *c = NULL;
+
+ *p = t + 1;
+ return 0;
+ }
+ /* Build the list in a local, auto-freed pointer so that nothing leaks if parsing fails midway */
+ r = prepend_component(&t, usec, 0, &cc);
+ if (r < 0)
+ return r;
+
+ *p = t;
+ *c = TAKE_PTR(cc);
+ return 0;
+}
+/* Parses the date part of a calendar expression at *p. Accepted forms are "@SECONDS" (UNIX time),
+ * "MONTH-DAY" and "YEAR-MONTH-DAY", where a '~' in front of the day field selects counting from the
+ * end of the month.
+ *
+ * Returns 1 if an "@SECONDS" timestamp was parsed, which also fixes the time of day, so no time part
+ * may follow. Returns 0 otherwise, including when the input turned out to hold no date at all (empty
+ * string, or the first field is followed by ':' or the end of the string); *p is then left untouched
+ * so that the same text can be parsed as a time. Returns a negative errno-style value on failure. */
+static int parse_date(const char **p, CalendarSpec *c) {
+ _cleanup_(chain_freep) CalendarComponent *first = NULL, *second = NULL, *third = NULL;
+ const char *t;
+ int r;
+
+ assert(p);
+ assert(*p);
+ assert(c);
+
+ t = *p;
+
+ if (*t == 0)
+ return 0;
+
+ /* @TIMESTAMP — UNIX time in seconds since the epoch */
+ if (*t == '@') {
+ unsigned long value;
+ time_t time;
+
+ r = parse_one_number(t + 1, &t, &value);
+ if (r < 0)
+ return r;
+
+ time = value;
+ if ((unsigned long) time != value) /* value does not survive the round trip through time_t */
+ return -ERANGE;
+
+ r = calendarspec_from_time_t(c, time);
+ if (r < 0)
+ return r;
+
+ *p = t;
+ return 1; /* finito, don't parse H:M:S after that */
+ }
+
+ r = parse_chain(&t, false, &first);
+ if (r < 0)
+ return r;
+
+ /* Already the end? A ':' as separator? In that case this was a time, not a date */
+ if (IN_SET(*t, 0, ':'))
+ return 0;
+
+ if (*t == '~')
+ c->end_of_month = true;
+ else if (*t != '-')
+ return -EINVAL;
+
+ t++;
+ r = parse_chain(&t, false, &second);
+ if (r < 0)
+ return r;
+
+ /* Got two parts, hence it's month and day */
+ if (IN_SET(*t, 0, ' ')) {
+ *p = t + strspn(t, " ");
+ c->month = TAKE_PTR(first);
+ c->day = TAKE_PTR(second);
+ return 0;
+ } else if (c->end_of_month) /* with three fields, '~' is only valid in front of the day */
+ return -EINVAL;
+
+ if (*t == '~')
+ c->end_of_month = true;
+ else if (*t != '-')
+ return -EINVAL;
+
+ t++;
+ r = parse_chain(&t, false, &third);
+ if (r < 0)
+ return r;
+
+ if (!IN_SET(*t, 0, ' '))
+ return -EINVAL;
+
+ /* Got three parts, hence it is year, month and day */
+ *p = t + strspn(t, " ");
+ c->year = TAKE_PTR(first);
+ c->month = TAKE_PTR(second);
+ c->day = TAKE_PTR(third);
+ return 0;
+}
+/* Parses the time part of a calendar expression at *p: "HOUR:MINUTE" or "HOUR:MINUTE:SECOND", which
+ * must extend to the end of the string. An empty string stands for 00:00:00, and omitted seconds
+ * default to 0. On success the three chains are stored in 'c', *p is advanced and 0 is returned;
+ * otherwise a negative errno-style value is returned. */
+static int parse_calendar_time(const char **p, CalendarSpec *c) {
+ _cleanup_(chain_freep) CalendarComponent *h = NULL, *m = NULL, *s = NULL;
+ const char *t;
+ int r;
+
+ assert(p);
+ assert(*p);
+ assert(c);
+
+ t = *p;
+
+ /* If no time is specified at all, then this means 00:00:00 */
+ if (*t == 0)
+ goto null_hour;
+
+ r = parse_chain(&t, false, &h);
+ if (r < 0)
+ return r;
+
+ if (*t != ':')
+ return -EINVAL;
+
+ t++;
+ r = parse_chain(&t, false, &m);
+ if (r < 0)
+ return r;
+
+ /* Already at the end? Then it's hours and minutes, and seconds are 0 */
+ if (*t == 0)
+ goto null_second;
+
+ if (*t != ':')
+ return -EINVAL;
+
+ t++;
+ r = parse_chain(&t, true, &s);
+ if (r < 0)
+ return r;
+
+ /* At the end? Then it's hours, minutes and seconds */
+ if (*t == 0)
+ goto finish;
+
+ return -EINVAL;
+ /* The labels below fall through on purpose: each one fills in the defaults that are still missing */
+null_hour:
+ r = const_chain(0, &h);
+ if (r < 0)
+ return r;
+
+ r = const_chain(0, &m);
+ if (r < 0)
+ return r;
+
+null_second:
+ r = const_chain(0, &s);
+ if (r < 0)
+ return r;
+
+finish:
+ *p = t;
+ c->hour = TAKE_PTR(h);
+ c->minute = TAKE_PTR(m);
+ c->microsecond = TAKE_PTR(s);
+
+ return 0;
+}
+/* Parses the calendar expression 'p' into a newly allocated CalendarSpec.
+ *
+ * A trailing timezone is split off first: " UTC", one of the two names in tzname[] (which sets the
+ * 'dst' field), or any last word that timezone_is_valid() accepts. The remainder is either one of
+ * the shorthands ("minutely", "hourly", "daily", "weekly", "monthly", "quarterly", "semiannually",
+ * "yearly", and their aliases), or an expression made of an optional weekday part, an optional date
+ * part and an optional time part.
+ *
+ * The result is normalized and validated. On success 0 is returned and, if 'ret' is non-NULL, the
+ * spec is stored in *ret (otherwise it is freed again, i.e. the string is only checked). Returns
+ * -ENOMEM on allocation failure, -EINVAL on an empty or invalid expression, or another negative
+ * errno-style value passed on from the parsers. */
+int calendar_spec_from_string(const char *p, CalendarSpec **ret) {
+ const char *utc;
+ _cleanup_(calendar_spec_freep) CalendarSpec *c = NULL;
+ _cleanup_free_ char *p_tmp = NULL;
+ int r;
+
+ assert(p);
+
+ c = new(CalendarSpec, 1);
+ if (!c)
+ return -ENOMEM;
+
+ *c = (CalendarSpec) {
+ .dst = -1,
+ .timezone = NULL,
+ };
+ /* Whenever a timezone suffix is cut off, 'p' is redirected to a private copy held in 'p_tmp' */
+ utc = endswith_no_case(p, " UTC");
+ if (utc) {
+ c->utc = true;
+ p = p_tmp = strndup(p, utc - p);
+ if (!p)
+ return -ENOMEM;
+ } else {
+ const char *e = NULL;
+ int j;
+
+ tzset();
+
+ /* Check if the local timezone was specified? */
+ for (j = 0; j <= 1; j++) {
+ if (isempty(tzname[j]))
+ continue;
+
+ e = endswith_no_case(p, tzname[j]);
+ if (!e)
+ continue;
+ if (e == p)
+ continue;
+ if (e[-1] != ' ')
+ continue;
+
+ break;
+ }
+
+ /* Found one of the two timezones specified? */
+ if (IN_SET(j, 0, 1)) {
+ p = p_tmp = strndup(p, e - p - 1);
+ if (!p)
+ return -ENOMEM;
+
+ c->dst = j;
+ } else {
+ const char *last_space;
+
+ last_space = strrchr(p, ' ');
+ if (last_space != NULL && timezone_is_valid(last_space + 1, LOG_DEBUG)) {
+ c->timezone = strdup(last_space + 1);
+ if (!c->timezone)
+ return -ENOMEM;
+
+ p = p_tmp = strndup(p, last_space - p);
+ if (!p)
+ return -ENOMEM;
+ }
+ }
+ }
+
+ if (isempty(p))
+ return -EINVAL;
+ /* Shorthands: pin every field below the named period to its first value, leave the others open */
+ if (strcaseeq(p, "minutely")) {
+ r = const_chain(0, &c->microsecond);
+ if (r < 0)
+ return r;
+
+ } else if (strcaseeq(p, "hourly")) {
+ r = const_chain(0, &c->minute);
+ if (r < 0)
+ return r;
+ r = const_chain(0, &c->microsecond);
+ if (r < 0)
+ return r;
+
+ } else if (strcaseeq(p, "daily")) {
+ r = const_chain(0, &c->hour);
+ if (r < 0)
+ return r;
+ r = const_chain(0, &c->minute);
+ if (r < 0)
+ return r;
+ r = const_chain(0, &c->microsecond);
+ if (r < 0)
+ return r;
+
+ } else if (strcaseeq(p, "monthly")) {
+ r = const_chain(1, &c->day);
+ if (r < 0)
+ return r;
+ r = const_chain(0, &c->hour);
+ if (r < 0)
+ return r;
+ r = const_chain(0, &c->minute);
+ if (r < 0)
+ return r;
+ r = const_chain(0, &c->microsecond);
+ if (r < 0)
+ return r;
+
+ } else if (STRCASE_IN_SET(p,
+ "annually",
+ "yearly",
+ "anually") /* backwards compatibility */ ) {
+
+ r = const_chain(1, &c->month);
+ if (r < 0)
+ return r;
+ r = const_chain(1, &c->day);
+ if (r < 0)
+ return r;
+ r = const_chain(0, &c->hour);
+ if (r < 0)
+ return r;
+ r = const_chain(0, &c->minute);
+ if (r < 0)
+ return r;
+ r = const_chain(0, &c->microsecond);
+ if (r < 0)
+ return r;
+
+ } else if (strcaseeq(p, "weekly")) {
+ /* Bit 0 is Monday */
+ c->weekdays_bits = 1;
+
+ r = const_chain(0, &c->hour);
+ if (r < 0)
+ return r;
+ r = const_chain(0, &c->minute);
+ if (r < 0)
+ return r;
+ r = const_chain(0, &c->microsecond);
+ if (r < 0)
+ return r;
+
+ } else if (strcaseeq(p, "quarterly")) {
+
+ r = const_chain(1, &c->month);
+ if (r < 0)
+ return r;
+ r = const_chain(4, &c->month);
+ if (r < 0)
+ return r;
+ r = const_chain(7, &c->month);
+ if (r < 0)
+ return r;
+ r = const_chain(10, &c->month);
+ if (r < 0)
+ return r;
+ r = const_chain(1, &c->day);
+ if (r < 0)
+ return r;
+ r = const_chain(0, &c->hour);
+ if (r < 0)
+ return r;
+ r = const_chain(0, &c->minute);
+ if (r < 0)
+ return r;
+ r = const_chain(0, &c->microsecond);
+ if (r < 0)
+ return r;
+
+ } else if (STRCASE_IN_SET(p,
+ "biannually",
+ "bi-annually",
+ "semiannually",
+ "semi-annually")) {
+
+ r = const_chain(1, &c->month);
+ if (r < 0)
+ return r;
+ r = const_chain(7, &c->month);
+ if (r < 0)
+ return r;
+ r = const_chain(1, &c->day);
+ if (r < 0)
+ return r;
+ r = const_chain(0, &c->hour);
+ if (r < 0)
+ return r;
+ r = const_chain(0, &c->minute);
+ if (r < 0)
+ return r;
+ r = const_chain(0, &c->microsecond);
+ if (r < 0)
+ return r;
+
+ } else {
+ r = parse_weekdays(&p, c);
+ if (r < 0)
+ return r;
+
+ r = parse_date(&p, c);
+ if (r < 0)
+ return r;
+
+ if (r == 0) { /* parse_date() returns 1 for "@SECONDS", which already includes the time */
+ r = parse_calendar_time(&p, c);
+ if (r < 0)
+ return r;
+ }
+
+ if (*p != 0)
+ return -EINVAL;
+ }
+
+ calendar_spec_normalize(c);
+
+ if (!calendar_spec_valid(c))
+ return -EINVAL;
+
+ if (ret)
+ *ret = TAKE_PTR(c);
+ return 0;
+}
+
+/* Translates "the day-th last day" of the month of 'tm' into a regular day of the month: the date is
+ * set to day (1 - day) of the following month and normalized by mktime_or_timegm(). Returns the
+ * resulting day of the month, or -1 if normalization fails or ends up outside the month of 'tm'. */
+static int find_end_of_month(const struct tm *tm, bool utc, int day) {
+        struct tm probe = *tm;
+
+        probe.tm_mon = tm->tm_mon + 1;
+        probe.tm_mday = 1 - day;
+
+        if (mktime_or_timegm(&probe, utc) < 0)
+                return -1;
+
+        return probe.tm_mon == tm->tm_mon ? probe.tm_mday : -1;
+}
+/* Finds the smallest value >= *val that chain 'c' matches. For the day chain of a spec with
+ * end_of_month set, the start and stop values are first converted from "Nth last day" into regular
+ * days of the month of 'tm'. See the comment in the body for the return values. */
+static int find_matching_component(
+ const CalendarSpec *spec,
+ const CalendarComponent *c,
+ const struct tm *tm, /* tm is only used for end-of-month calculations */
+ int *val) {
+
+ int d = -1, r;
+ bool d_set = false;
+
+ assert(val);
+
+ /* Finds the *earliest* matching time specified by one of the CalendarComponent items in chain c.
+ * If no matches can be found, returns -ENOENT.
+ * Otherwise, updates *val to the matching time. 1 is returned if *val was changed, 0 otherwise.
+ */
+ /* An empty chain puts no constraint on this field: every value matches */
+ if (!c)
+ return 0;
+
+ bool end_of_month = spec->end_of_month && c == spec->day;
+
+ while (c) {
+ int start, stop;
+
+ if (end_of_month) {
+ start = find_end_of_month(tm, spec->utc, c->start);
+ stop = find_end_of_month(tm, spec->utc, c->stop);
+ /* Counting from the end of the month reverses the order of the two bounds */
+ if (stop > 0)
+ SWAP_TWO(start, stop);
+ } else {
+ start = c->start;
+ stop = c->stop;
+ }
+
+ if (start >= *val) {
+
+ if (!d_set || start < d) {
+ d = start;
+ d_set = true;
+ }
+
+ } else if (c->repeat > 0) {
+ int k;
+ /* First repetition that is not before *val */
+ k = start + ROUND_UP(*val - start, c->repeat);
+
+ if ((!d_set || k < d) && (stop < 0 || k <= stop)) {
+ d = k;
+ d_set = true;
+ }
+ }
+
+ c = c->next;
+ }
+
+ if (!d_set)
+ return -ENOENT;
+
+ r = *val != d;
+ *val = d;
+ return r;
+}
+/* Checks whether 'tm' already is a normalized date within the supported range.
+ *
+ * Returns 1 if normalizing via mktime_or_timegm() did not change any field from the year down to the
+ * second. Returns 0 if normalization moved the time forward, in which case *tm is replaced by the
+ * normalized time with the unit below the first changed one reset. Returns -ERANGE if the year lies
+ * beyond MAX_YEAR, -EDEADLK if normalization would move the time backward, or a negative errno if
+ * mktime_or_timegm() fails. */
+static int tm_within_bounds(struct tm *tm, bool utc) {
+ struct tm t;
+ int cmp;
+ assert(tm);
+
+ /*
+ * Set an upper bound on the year so impossible dates like "*-02-31"
+ * don't cause find_next() to loop forever. tm_year contains years
+ * since 1900, so adjust it accordingly.
+ */
+ if (tm->tm_year + 1900 > MAX_YEAR)
+ return -ERANGE;
+
+ t = *tm;
+ if (mktime_or_timegm(&t, utc) < 0)
+ return negative_errno();
+
+ /*
+ * Did any normalization take place? If so, it was out of bounds before.
+ * Normalization could skip next elapse, e.g. result of normalizing 3-33
+ * is 4-2. This skips 4-1. So reset the sub time unit if upper unit was
+ * out of bounds. Normalization has occurred implies find_matching_component() > 0,
+ * other sub time units are already reset in find_next().
+ */
+ if ((cmp = CMP(t.tm_year, tm->tm_year)) != 0)
+ t.tm_mon = 0;
+ else if ((cmp = CMP(t.tm_mon, tm->tm_mon)) != 0)
+ t.tm_mday = 1;
+ else if ((cmp = CMP(t.tm_mday, tm->tm_mday)) != 0)
+ t.tm_hour = 0;
+ else if ((cmp = CMP(t.tm_hour, tm->tm_hour)) != 0)
+ t.tm_min = 0;
+ else if ((cmp = CMP(t.tm_min, tm->tm_min)) != 0)
+ t.tm_sec = 0;
+ else
+ cmp = CMP(t.tm_sec, tm->tm_sec);
+
+ if (cmp < 0)
+ return -EDEADLK; /* Refuse to go backward */
+ if (cmp > 0)
+ *tm = t;
+ return cmp == 0;
+}
+
+/* Tells whether the date in 'tm' falls on one of the weekdays selected in 'weekdays_bits'
+ * (bit 0 = Monday … bit 6 = Sunday). A negative mask, or one of BITS_WEEKDAYS or more, matches every
+ * day. Returns false if the date cannot be normalized. */
+static bool matches_weekday(int weekdays_bits, const struct tm *tm, bool utc) {
+        if (!(weekdays_bits >= 0 && weekdays_bits < BITS_WEEKDAYS))
+                return true;
+
+        /* Work on a copy, normalization fills in tm_wday */
+        struct tm normalized = *tm;
+        if (mktime_or_timegm(&normalized, utc) < 0)
+                return false;
+
+        /* struct tm counts weekdays from Sunday = 0, the mask from Monday = bit 0 */
+        int bit = (normalized.tm_wday + 6) % 7;
+
+        return (weekdays_bits & (1 << bit)) != 0;
+}
+
+/* A safety valve: if we get stuck in the calculation, return an error.
+ * C.f. https://bugzilla.redhat.com/show_bug.cgi?id=1941335. */
+#define MAX_CALENDAR_ITERATIONS 1000
+/* Advances the broken-down time *tm / *usec to the earliest point in time, not before it, that
+ * matches 'spec'.
+ *
+ * Each loop iteration walks the fields from year down to (micro)seconds. Whenever a field had to be
+ * moved forward, all lower fields are reset to their minimum; whenever a field has no match left, the
+ * next higher field is bumped and the search restarts. Returns 0 on success, -ENOENT if the spec will
+ * not elapse anymore, and -EDEADLK if no result was found within MAX_CALENDAR_ITERATIONS rounds. */
+static int find_next(const CalendarSpec *spec, struct tm *tm, usec_t *usec) {
+ struct tm c;
+ int tm_usec;
+ int r;
+
+ /* Returns -ENOENT if the expression is not going to elapse anymore */
+
+ assert(spec);
+ assert(tm);
+
+ c = *tm;
+ tm_usec = *usec;
+
+ for (unsigned iteration = 0; iteration < MAX_CALENDAR_ITERATIONS; iteration++) {
+ /* Normalize the current date */
+ (void) mktime_or_timegm(&c, spec->utc);
+ c.tm_isdst = spec->dst;
+ /* The spec uses calendar years, struct tm counts years from 1900 */
+ c.tm_year += 1900;
+ r = find_matching_component(spec, spec->year, &c, &c.tm_year);
+ c.tm_year -= 1900;
+
+ if (r > 0) {
+ c.tm_mon = 0;
+ c.tm_mday = 1;
+ c.tm_hour = c.tm_min = c.tm_sec = tm_usec = 0;
+ }
+ if (r < 0)
+ return r;
+ if (tm_within_bounds(&c, spec->utc) <= 0)
+ return -ENOENT;
+ /* The spec counts months from 1, struct tm from 0 */
+ c.tm_mon += 1;
+ r = find_matching_component(spec, spec->month, &c, &c.tm_mon);
+ c.tm_mon -= 1;
+
+ if (r > 0) {
+ c.tm_mday = 1;
+ c.tm_hour = c.tm_min = c.tm_sec = tm_usec = 0;
+ }
+ if (r < 0 || (r = tm_within_bounds(&c, spec->utc)) < 0) {
+ c.tm_year++;
+ c.tm_mon = 0;
+ c.tm_mday = 1;
+ c.tm_hour = c.tm_min = c.tm_sec = tm_usec = 0;
+ continue;
+ }
+ if (r == 0)
+ continue;
+
+ r = find_matching_component(spec, spec->day, &c, &c.tm_mday);
+ if (r > 0)
+ c.tm_hour = c.tm_min = c.tm_sec = tm_usec = 0;
+ if (r < 0 || (r = tm_within_bounds(&c, spec->utc)) < 0) {
+ c.tm_mon++;
+ c.tm_mday = 1;
+ c.tm_hour = c.tm_min = c.tm_sec = tm_usec = 0;
+ continue;
+ }
+ if (r == 0)
+ continue;
+
+ if (!matches_weekday(spec->weekdays_bits, &c, spec->utc)) {
+ c.tm_mday++;
+ c.tm_hour = c.tm_min = c.tm_sec = tm_usec = 0;
+ continue;
+ }
+
+ r = find_matching_component(spec, spec->hour, &c, &c.tm_hour);
+ if (r > 0)
+ c.tm_min = c.tm_sec = tm_usec = 0;
+ if (r < 0 || (r = tm_within_bounds(&c, spec->utc)) < 0) {
+ c.tm_mday++;
+ c.tm_hour = c.tm_min = c.tm_sec = tm_usec = 0;
+ continue;
+ }
+ if (r == 0)
+ /* The next hour we set might be missing if there
+ * are time zone changes. Let's try again starting at
+ * normalized time. */
+ continue;
+
+ r = find_matching_component(spec, spec->minute, &c, &c.tm_min);
+ if (r > 0)
+ c.tm_sec = tm_usec = 0;
+ if (r < 0 || (r = tm_within_bounds(&c, spec->utc)) < 0) {
+ c.tm_hour++;
+ c.tm_min = c.tm_sec = tm_usec = 0;
+ continue;
+ }
+ if (r == 0)
+ continue;
+ /* Temporarily fold seconds and microseconds into one value, the unit of the microsecond chain */
+ c.tm_sec = c.tm_sec * USEC_PER_SEC + tm_usec;
+ r = find_matching_component(spec, spec->microsecond, &c, &c.tm_sec);
+ tm_usec = c.tm_sec % USEC_PER_SEC;
+ c.tm_sec /= USEC_PER_SEC;
+
+ if (r < 0 || (r = tm_within_bounds(&c, spec->utc)) < 0) {
+ c.tm_min++;
+ c.tm_sec = tm_usec = 0;
+ continue;
+ }
+ if (r == 0)
+ continue;
+
+ *tm = c;
+ *usec = tm_usec;
+ return 0;
+ }
+
+ /* It seems we entered an infinite loop. Let's gracefully return an error instead of hanging or
+ * aborting. This code is also exercised when timers.target is brought up during early boot, so
+ * aborting here is problematic and hard to diagnose for users. */
+ _cleanup_free_ char *s = NULL;
+ (void) calendar_spec_to_string(spec, &s);
+ return log_warning_errno(SYNTHETIC_ERRNO(EDEADLK),
+ "Infinite loop in calendar calculation: %s", strna(s));
+}
+/* Computes the next point in time strictly after 'usec' (microseconds since the epoch) at which
+ * 'spec' elapses, using the timezone currently in effect for the process, or UTC if spec->utc is set.
+ * On success returns 0 and, if 'ret_next' is non-NULL, stores the result there. Returns -EINVAL if
+ * 'usec' exceeds USEC_TIMESTAMP_FORMATTABLE_MAX or the result cannot be converted back, or an error
+ * from find_next(). */
+static int calendar_spec_next_usec_impl(const CalendarSpec *spec, usec_t usec, usec_t *ret_next) {
+ struct tm tm;
+ time_t t;
+ int r;
+ usec_t tm_usec;
+
+ assert(spec);
+
+ if (usec > USEC_TIMESTAMP_FORMATTABLE_MAX)
+ return -EINVAL;
+ /* Start one microsecond later, so that a time that matches exactly is not returned again */
+ usec++;
+ t = (time_t) (usec / USEC_PER_SEC);
+ assert_se(localtime_or_gmtime_r(&t, &tm, spec->utc));
+ tm_usec = usec % USEC_PER_SEC;
+
+ r = find_next(spec, &tm, &tm_usec);
+ if (r < 0)
+ return r;
+
+ t = mktime_or_timegm(&tm, spec->utc);
+ if (t < 0)
+ return -EINVAL;
+
+ if (ret_next)
+ *ret_next = (usec_t) t * USEC_PER_SEC + tm_usec;
+
+ return 0;
+}
+/* Result record that the forked child in calendar_spec_next_usec() writes into shared memory. */
+typedef struct SpecNextResult {
+ usec_t next; /* Next elapse time; only meaningful if return_value is 0 */
+ int return_value; /* 0 on success, negative errno-style value on failure */
+} SpecNextResult;
+/* Computes the next point in time after 'usec' at which 'spec' elapses; see
+ * calendar_spec_next_usec_impl() for parameters and return values.
+ *
+ * If the spec names no timezone, the calculation runs directly in this process. Otherwise it runs in
+ * a forked child that sets $TZ to the spec's timezone, so that this process' own timezone settings
+ * are not touched; the child reports its result through an anonymous shared memory mapping. */
+int calendar_spec_next_usec(const CalendarSpec *spec, usec_t usec, usec_t *ret_next) {
+ SpecNextResult *shared, tmp;
+ int r;
+
+ assert(spec);
+
+ if (isempty(spec->timezone))
+ return calendar_spec_next_usec_impl(spec, usec, ret_next);
+
+ shared = mmap(NULL, sizeof *shared, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0);
+ if (shared == MAP_FAILED)
+ return negative_errno();
+
+ r = safe_fork("(sd-calendar)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGKILL|FORK_WAIT, NULL);
+ if (r < 0) {
+ (void) munmap(shared, sizeof *shared);
+ return r;
+ }
+ if (r == 0) { /* Child process */
+ char *colon_tz;
+
+ /* tzset(3) says $TZ should be prefixed with ":" if we reference timezone files */
+ colon_tz = strjoina(":", spec->timezone);
+
+ if (setenv("TZ", colon_tz, 1) != 0) {
+ shared->return_value = negative_errno();
+ _exit(EXIT_FAILURE);
+ }
+
+ tzset();
+
+ shared->return_value = calendar_spec_next_usec_impl(spec, usec, &shared->next);
+
+ _exit(EXIT_SUCCESS);
+ }
+ /* Parent process. NOTE(review): this presumably relies on FORK_WAIT making safe_fork() return only
+ * after the child has exited; if the child died before storing a result, the mapping still holds
+ * its initial contents — TODO confirm how that case is meant to be reported. */
+ tmp = *shared;
+ if (munmap(shared, sizeof *shared) < 0)
+ return negative_errno();
+
+ if (tmp.return_value == 0 && ret_next)
+ *ret_next = tmp.next;
+
+ return tmp.return_value;
+}
diff --git a/src/shared/calendarspec.h b/src/shared/calendarspec.h
new file mode 100644
index 0000000..60c1c79
--- /dev/null
+++ b/src/shared/calendarspec.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+/* A structure for specifying (possibly repetitive) points in calendar
+ * time, a la cron */
+
+#include <stdbool.h>
+
+#include "time-util.h"
+/* One element of a singly linked chain describing the permitted values of a single calendar field:
+ * either a single value, or a range and/or a repetition. */
+typedef struct CalendarComponent {
+ int start; /* First (or only) value */
+ int stop; /* Last value of a range; negative (-1) if no stop value is set */
+ int repeat; /* Step between successive values; 0 if the element does not repeat */
+
+ struct CalendarComponent *next;
+} CalendarComponent;
+/* A parsed calendar expression. A NULL chain places no restriction on the respective field. */
+typedef struct CalendarSpec {
+ int weekdays_bits; /* Weekday mask, bit 0 = Monday … bit 6 = Sunday; -1 after normalization if any day matches */
+ bool end_of_month:1; /* The values of the 'day' chain count back from the end of the month ('~' syntax) */
+ bool utc:1; /* Times are in UTC */
+ signed int dst:2; /* -1 if unset; otherwise 0 or 1, the index into tzname[] of the name given in the expression */
+ char *timezone; /* Explicit timezone name, or NULL */
+
+ CalendarComponent *year;
+ CalendarComponent *month;
+ CalendarComponent *day;
+
+ CalendarComponent *hour;
+ CalendarComponent *minute;
+ CalendarComponent *microsecond; /* Seconds, stored in microseconds */
+} CalendarSpec;
+
+CalendarSpec* calendar_spec_free(CalendarSpec *c);
+
+bool calendar_spec_valid(CalendarSpec *spec);
+
+int calendar_spec_to_string(const CalendarSpec *spec, char **ret);
+int calendar_spec_from_string(const char *p, CalendarSpec **ret);
+
+int calendar_spec_next_usec(const CalendarSpec *spec, usec_t usec, usec_t *next);
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(CalendarSpec*, calendar_spec_free);
diff --git a/src/shared/cgroup-setup.c b/src/shared/cgroup-setup.c
new file mode 100644
index 0000000..934a16e
--- /dev/null
+++ b/src/shared/cgroup-setup.c
@@ -0,0 +1,1008 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <unistd.h>
+
+#include "cgroup-setup.h"
+#include "cgroup-util.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "fs-util.h"
+#include "missing_threads.h"
+#include "mkdir.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "proc-cmdline.h"
+#include "process-util.h"
+#include "recurse-dir.h"
+#include "stdio-util.h"
+#include "string-util.h"
+#include "user-util.h"
+#include "virt.h"
+
+/* Parses /proc/cgroups and tells whether any enabled controller that we know about is attached to a
+ * cgroup v1 hierarchy. Returns 1 if one is, 0 if none is, and propagates the failure if the file cannot be
+ * read, split or parsed. */
+static int cg_any_controller_used_for_v1(void) {
+        _cleanup_strv_free_ char **lines = NULL;
+        _cleanup_free_ char *contents = NULL;
+        int r;
+
+        r = read_full_virtual_file("/proc/cgroups", &contents, NULL);
+        if (r < 0)
+                return log_debug_errno(r, "Could not read /proc/cgroups, ignoring: %m");
+
+        r = strv_split_newlines_full(&lines, contents, 0);
+        if (r < 0)
+                return r;
+
+        /* What we want to find out is whether a fully unified cgroup tree is possible at all, i.e. whether
+         * none of the enabled kernel controllers is currently claimed by cgroup1. For reference:
+         * https://systemd.io/CGROUP_DELEGATION/#three-different-tree-setups-
+         *
+         * This is mostly relevant inside a container, where we do not know which setup the host uses: on a
+         * legacy or hybrid host some or all controllers would be missing from a unified tree. Ideally the
+         * container manager has mounted /sys/fs/cgroup appropriately already; this check is the fallback
+         * for the case where it has not. */
+        STRV_FOREACH(line, lines) {
+                _cleanup_free_ char *name = NULL, *hierarchy_id = NULL, *num = NULL, *enabled = NULL;
+                const char *p;
+
+                /* The header line is marked with a leading "#" */
+                if (startswith(*line, "#"))
+                        continue;
+
+                p = *line;
+                r = extract_many_words(&p, NULL, 0, &name, &hierarchy_id, &num, &enabled, NULL);
+                if (r < 0)
+                        return log_debug_errno(r, "Error parsing /proc/cgroups line, ignoring: %m");
+                if (r < 4) {
+                        log_debug("Invalid /proc/cgroups line, ignoring.");
+                        continue;
+                }
+
+                /* Neither disabled controllers nor controllers we do not manage are of interest. */
+                if (streq(enabled, "0") || cgroup_controller_from_string(name) < 0)
+                        continue;
+
+                /* The unified hierarchy always has ID 0. Any other ID means the controller is bound to a
+                 * legacy (or hybrid) v1 hierarchy and hence unavailable to a unified one. */
+                if (streq(hierarchy_id, "0"))
+                        continue;
+
+                log_debug("Cgroup controller %s in use by legacy v1 hierarchy.", name);
+                return 1;
+        }
+
+        return 0;
+}
+
+/* Decides whether the fully unified cgroup hierarchy shall be used. The answer is computed once per
+ * thread and cached in a thread-local variable. */
+bool cg_is_unified_wanted(void) {
+        static thread_local int wanted = -1;
+        _cleanup_free_ char *no_v1 = NULL;
+        bool requested;
+        int r;
+
+        /* Answered before? Then reuse the cached result. */
+        if (wanted >= 0)
+                return wanted;
+
+        /* A hierarchy that is mounted already dictates the answer. */
+        r = cg_unified_cached(true);
+        if (r >= 0) {
+                wanted = r >= CGROUP_UNIFIED_ALL;
+                return wanted;
+        }
+
+        /* An explicit systemd.unified_cgroup_hierarchy= switch on the kernel command line wins next. */
+        r = proc_cmdline_get_bool("systemd.unified_cgroup_hierarchy", /* flags = */ 0, &requested);
+        if (r > 0) {
+                wanted = requested;
+                return wanted;
+        }
+
+        /* cgroup_no_v1=all without any other hint makes it highly unlikely that hybrid or legacy is what
+         * the user is after. */
+        r = proc_cmdline_get_key("cgroup_no_v1", 0, &no_v1);
+        if (r > 0 && streq_ptr(no_v1, "all")) {
+                wanted = true;
+                return wanted;
+        }
+
+        /* A controller that is in use by v1 rules out the unified hierarchy. */
+        if (cg_any_controller_used_for_v1() > 0) {
+                wanted = false;
+                return wanted;
+        }
+
+        /* Nothing decisive found, fall back to the compile-time default. */
+        wanted = DEFAULT_HIERARCHY == CGROUP_UNIFIED_ALL;
+        return wanted;
+}
+
+/* Tells whether (at least partially) legacy cgroup v1 hierarchies are wanted. The answer is cached in a
+ * thread-local variable after the first call. */
+bool cg_is_legacy_wanted(void) {
+        static thread_local int wanted = -1;
+
+        /* If cgroup v2 is fully mounted, legacy is not wanted. In every other case assume that at least
+         * partial legacy is wanted, since cgroup v2 should have been mounted by now otherwise. */
+        if (wanted < 0)
+                wanted = cg_unified_cached(true) != CGROUP_UNIFIED_ALL;
+
+        return wanted;
+}
+
+/* Tells whether the hybrid cgroup setup is wanted. The answer is cached in a thread-local variable after
+ * the first call. */
+bool cg_is_hybrid_wanted(void) {
+        static thread_local int wanted = -1;
+        bool legacy_controller;
+        int r;
+
+        /* Reuse the cached answer if there is one. */
+        if (wanted >= 0)
+                return wanted;
+
+        /* A fully unified hierarchy that is mounted already settles the question. */
+        if (cg_unified_cached(true) == CGROUP_UNIFIED_ALL) {
+                wanted = false;
+                return wanted;
+        }
+
+        /* Otherwise consult the kernel command line. Looking it up is expensive, which is why the outcome
+         * is cached. */
+        r = proc_cmdline_get_bool("systemd.legacy_systemd_cgroup_controller", /* flags = */ 0, &legacy_controller);
+        if (r > 0)
+                /* The kernel option has the opposite polarity of our return value, hence negate it. */
+                wanted = !legacy_controller;
+        else
+                /* Default to true if the compiled-in default is "hybrid", and also if it is "unified":
+                 * getting here at all means the unified hierarchy was not mounted. */
+                wanted = DEFAULT_HIERARCHY >= CGROUP_UNIFIED_SYSTEMD;
+
+        return wanted;
+}
+
+/* Parses a cgroup weight. An empty or NULL string yields CGROUP_WEIGHT_INVALID. Returns 0 on success,
+ * the error of safe_atou64() if the string is not a number, and -ERANGE if the number lies outside of
+ * CGROUP_WEIGHT_MIN…CGROUP_WEIGHT_MAX. *ret is only written on success. */
+int cg_weight_parse(const char *s, uint64_t *ret) {
+        uint64_t weight = CGROUP_WEIGHT_INVALID;
+
+        if (!isempty(s)) {
+                int r;
+
+                r = safe_atou64(s, &weight);
+                if (r < 0)
+                        return r;
+
+                if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
+                        return -ERANGE;
+        }
+
+        *ret = weight;
+        return 0;
+}
+
+/* Like cg_weight_parse(), but additionally accepts the literal string "idle", which is stored as
+ * CGROUP_WEIGHT_IDLE. Returns 0 on success, otherwise whatever cg_weight_parse() returns. */
+int cg_cpu_weight_parse(const char *s, uint64_t *ret) {
+        if (streq_ptr(s, "idle")) {
+                /* Store the value and report success explicitly, like the sibling parsers do. Returning
+                 * the uint64_t constant converted to int would only be a valid status code for as long
+                 * as that constant happens to be zero. */
+                *ret = CGROUP_WEIGHT_IDLE;
+                return 0;
+        }
+
+        return cg_weight_parse(s, ret);
+}
+
+/* Parses a CPU shares value. An empty or NULL string yields CGROUP_CPU_SHARES_INVALID. Returns 0 on
+ * success, the error of safe_atou64() if the string is not a number, and -ERANGE if the number lies
+ * outside of CGROUP_CPU_SHARES_MIN…CGROUP_CPU_SHARES_MAX. *ret is only written on success. */
+int cg_cpu_shares_parse(const char *s, uint64_t *ret) {
+        uint64_t shares = CGROUP_CPU_SHARES_INVALID;
+
+        if (!isempty(s)) {
+                int r;
+
+                r = safe_atou64(s, &shares);
+                if (r < 0)
+                        return r;
+
+                if (shares < CGROUP_CPU_SHARES_MIN || shares > CGROUP_CPU_SHARES_MAX)
+                        return -ERANGE;
+        }
+
+        *ret = shares;
+        return 0;
+}
+
+/* Parses a blkio weight. An empty or NULL string yields CGROUP_BLKIO_WEIGHT_INVALID. Returns 0 on
+ * success, the error of safe_atou64() if the string is not a number, and -ERANGE if the number lies
+ * outside of CGROUP_BLKIO_WEIGHT_MIN…CGROUP_BLKIO_WEIGHT_MAX. *ret is only written on success. */
+int cg_blkio_weight_parse(const char *s, uint64_t *ret) {
+        uint64_t weight = CGROUP_BLKIO_WEIGHT_INVALID;
+
+        if (!isempty(s)) {
+                int r;
+
+                r = safe_atou64(s, &weight);
+                if (r < 0)
+                        return r;
+
+                if (weight < CGROUP_BLKIO_WEIGHT_MIN || weight > CGROUP_BLKIO_WEIGHT_MAX)
+                        return -ERANGE;
+        }
+
+        *ret = weight;
+        return 0;
+}
+
+/* recurse_dir_at() callback used by cg_trim(): when a directory is left, try to remove it. Always asks
+ * for the iteration to continue. */
+static int trim_cb(
+                RecurseDirEvent event,
+                const char *path,
+                int dir_fd,
+                int inode_fd,
+                const struct dirent *de,
+                const struct statx *sx,
+                void *userdata) {
+
+        /* Only directories we are just leaving are candidates for removal. */
+        if (event != RECURSE_DIR_LEAVE || de->d_type != DT_DIR)
+                return RECURSE_DIR_CONTINUE;
+
+        /* Failing to delete an inner cgroup is not fatal; only unexpected error codes get a debug log. */
+        if (unlinkat(dir_fd, de->d_name, AT_REMOVEDIR) < 0 &&
+            !IN_SET(errno, ENOENT, ENOTEMPTY, EBUSY))
+                log_debug_errno(errno, "Failed to trim inner cgroup %s, ignoring: %m", path);
+
+        return RECURSE_DIR_CONTINUE;
+}
+
+/* Removes the cgroups below 'path' in the hierarchy of 'controller' and, if 'delete_root' is true and
+ * 'path' is not the root cgroup, also the cgroup 'path' itself.
+ *
+ * Returns the result of the directory iteration (with -ENOENT mapped to 0), or, if that was not negative,
+ * -errno of a failed rmdir() of the top-level cgroup (ENOENT is not treated as failure). Failures of
+ * cg_get_path() and cg_hybrid_unified() are returned as they are.
+ *
+ * If cg_hybrid_unified() returns > 0 and 'controller' is SYSTEMD_CGROUP_CONTROLLER, the same is repeated
+ * for SYSTEMD_CGROUP_CONTROLLER_LEGACY, with the result of that call ignored. */
+int cg_trim(const char *controller, const char *path, bool delete_root) {
+        _cleanup_free_ char *fs = NULL;
+        int r, q;
+
+        assert(path);
+        assert(controller);
+
+        r = cg_get_path(controller, path, NULL, &fs);
+        if (r < 0)
+                return r;
+
+        /* Walk the tree below 'fs'; trim_cb() removes each directory as it is left. */
+        r = recurse_dir_at(
+                        AT_FDCWD,
+                        fs,
+                        /* statx_mask= */ 0,
+                        /* n_depth_max= */ UINT_MAX,
+                        RECURSE_DIR_ENSURE_TYPE,
+                        trim_cb,
+                        NULL);
+        if (r == -ENOENT) /* non-existing is the ultimate trimming, hence no error */
+                r = 0;
+        else if (r < 0)
+                log_debug_errno(r, "Failed to iterate through cgroup %s: %m", path);
+
+        /* If we shall delete the top-level cgroup, then propagate the failure to do so (except if it is
+         * already gone anyway). Also, let's debug log about this failure, except if the error code is an
+         * expected one. */
+        if (delete_root && !empty_or_root(path) &&
+            rmdir(fs) < 0 && errno != ENOENT) {
+                if (!IN_SET(errno, ENOTEMPTY, EBUSY))
+                        log_debug_errno(errno, "Failed to trim cgroup %s: %m", path);
+                /* An earlier iteration error takes precedence over the rmdir() error. */
+                if (r >= 0)
+                        r = -errno;
+        }
+
+        q = cg_hybrid_unified();
+        if (q < 0)
+                return q;
+        /* Best effort only: the result of trimming the compat hierarchy is deliberately discarded. */
+        if (q > 0 && streq(controller, SYSTEMD_CGROUP_CONTROLLER))
+                (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, delete_root);
+
+        return r;
+}
+
+/* Creates the cgroup 'path' (including missing parents) in the hierarchy of 'controller'.
+ *
+ * Returns 1 if the cgroup was created, 0 if it existed already, and a negative error otherwise. If
+ * cg_hybrid_unified() returns > 0 and 'controller' is SYSTEMD_CGROUP_CONTROLLER, the cgroup is also
+ * created for SYSTEMD_CGROUP_CONTROLLER_LEGACY; a failure to do that is logged but not returned. */
+int cg_create(const char *controller, const char *path) {
+        _cleanup_free_ char *fs = NULL;
+        int r;
+
+        r = cg_get_path_and_check(controller, path, NULL, &fs);
+        if (r < 0)
+                return r;
+
+        r = mkdir_parents(fs, 0755);
+        if (r < 0)
+                return r;
+
+        r = RET_NERRNO(mkdir(fs, 0755));
+        if (r < 0)
+                /* A cgroup that is there already is not a failure, just nothing new. */
+                return r == -EEXIST ? 0 : r;
+
+        r = cg_hybrid_unified();
+        if (r < 0)
+                return r;
+
+        /* Only the systemd controller on a hybrid setup has a compat hierarchy to mirror into. */
+        if (r == 0 || !streq(controller, SYSTEMD_CGROUP_CONTROLLER))
+                return 1;
+
+        r = cg_create(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path);
+        if (r < 0)
+                log_warning_errno(r, "Failed to create compat systemd cgroup %s: %m", path);
+
+        return 1;
+}
+
+/* Creates the cgroup via cg_create() and then moves 'pid' into it via cg_attach(). Returns the result
+ * of cg_create() if both steps work, otherwise the negative error of the step that failed. Note that the
+ * cgroup is not removed again if attaching fails. */
+int cg_create_and_attach(const char *controller, const char *path, pid_t pid) {
+        int created, attached;
+
+        assert(pid >= 0);
+
+        created = cg_create(controller, path);
+        if (created < 0)
+                return created;
+
+        attached = cg_attach(controller, path, pid);
+
+        return attached < 0 ? attached : created;
+}
+
+/* Moves process 'pid' into the cgroup 'path' of 'controller' by writing the PID to its "cgroup.procs"
+ * attribute. A 'pid' of 0 refers to the calling process.
+ *
+ * Returns 0 on success and a negative error otherwise; -EUCLEAN is returned if the write fails with
+ * -EOPNOTSUPP and cg_is_threaded() reports the cgroup as threaded. If cg_hybrid_unified() returns > 0 and
+ * 'controller' is SYSTEMD_CGROUP_CONTROLLER, the process is also attached for
+ * SYSTEMD_CGROUP_CONTROLLER_LEGACY; a failure to do that is logged but not returned. */
+int cg_attach(const char *controller, const char *path, pid_t pid) {
+        char line[DECIMAL_STR_MAX(pid_t) + 2];
+        _cleanup_free_ char *procs = NULL;
+        int r;
+
+        assert(path);
+        assert(pid >= 0);
+
+        r = cg_get_path_and_check(controller, path, "cgroup.procs", &procs);
+        if (r < 0)
+                return r;
+
+        if (pid == 0)
+                pid = getpid_cached();
+
+        xsprintf(line, PID_FMT "\n", pid);
+
+        r = write_string_file(procs, line, WRITE_STRING_FILE_DISABLE_BUFFER);
+        if (r < 0) {
+                /* In threaded mode the file can be neither read nor written. Map that case to an error
+                 * code callers can recognize. */
+                if (r == -EOPNOTSUPP && cg_is_threaded(path) > 0)
+                        return -EUCLEAN;
+
+                return r;
+        }
+
+        r = cg_hybrid_unified();
+        if (r < 0)
+                return r;
+
+        /* Only the systemd controller on a hybrid setup has a compat hierarchy to mirror into. */
+        if (r == 0 || !streq(controller, SYSTEMD_CGROUP_CONTROLLER))
+                return 0;
+
+        r = cg_attach(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, pid);
+        if (r < 0)
+                log_warning_errno(r, "Failed to attach "PID_FMT" to compat systemd cgroup %s: %m", pid, path);
+
+        return 0;
+}
+
+/* Like cg_attach(), but if attaching to 'path' fails, the prefixes of 'path' enumerated by
+ * PATH_FOREACH_PREFIX() are tried in turn. Returns the result of the first attach that works, or the
+ * error of the initial attempt if none does. */
+int cg_attach_fallback(const char *controller, const char *path, pid_t pid) {
+        int r;
+
+        assert(controller);
+        assert(path);
+        assert(pid >= 0);
+
+        r = cg_attach(controller, path, pid);
+        if (r >= 0)
+                return r;
+
+        /* The destination itself did not work, hence walk through all of its prefixes. */
+        char prefix[strlen(path) + 1];
+
+        PATH_FOREACH_PREFIX(prefix, path) {
+                int q = cg_attach(controller, prefix, pid);
+
+                if (q >= 0)
+                        return q;
+        }
+
+        return r;
+}
+
+/* Hands the cgroup 'path' of 'controller' over to 'uid'/'gid': the cgroup directory gets mode 0755, and
+ * a fixed list of attribute files (a different one for cgroup v1 and v2) gets mode 0644, all with the
+ * given ownership. Does nothing if both 'uid' and 'gid' are invalid.
+ *
+ * Returns 0 on success and a negative error otherwise. Failures on attributes not marked as fatal are
+ * only logged. For SYSTEMD_CGROUP_CONTROLLER on a hybrid setup the same is applied to
+ * SYSTEMD_CGROUP_CONTROLLER_LEGACY, where failures are only logged as well. */
+int cg_set_access(
+                const char *controller,
+                const char *path,
+                uid_t uid,
+                gid_t gid) {
+
+        struct Attribute {
+                const char *name;
+                bool fatal;
+        };
+
+        /* Attributes of cgroup v1 (legacy/non-unified) */
+        static const struct Attribute legacy_attributes[] = {
+                { "cgroup.procs",           true  },
+                { "tasks",                  false },
+                { "cgroup.clone_children",  false },
+                {},
+        };
+
+        /* Attributes of cgroup v2 (unified) */
+        static const struct Attribute unified_attributes[] = {
+                { "cgroup.procs",           true  },
+                { "cgroup.subtree_control", true  },
+                { "cgroup.threads",         false },
+                { "memory.oom.group",       false },
+                { "memory.reclaim",         false },
+                {},
+        };
+
+        /* Indexed by "is this controller on the unified hierarchy?" */
+        static const struct Attribute* const attributes[] = {
+                [false] = legacy_attributes,
+                [true]  = unified_attributes,
+        };
+
+        _cleanup_free_ char *fs = NULL;
+        int r, unified;
+
+        assert(path);
+
+        if (uid == UID_INVALID && gid == GID_INVALID)
+                return 0;
+
+        unified = cg_unified_controller(controller);
+        if (unified < 0)
+                return unified;
+
+        /* First, the cgroup directory itself */
+        r = cg_get_path(controller, path, NULL, &fs);
+        if (r < 0)
+                return r;
+
+        r = chmod_and_chown(fs, 0755, uid, gid);
+        if (r < 0)
+                return r;
+
+        /* Second, the attribute files inside of it */
+        for (const struct Attribute *attr = attributes[unified]; attr->name; attr++) {
+                fs = mfree(fs);
+
+                r = cg_get_path(controller, path, attr->name, &fs);
+                if (r < 0)
+                        return r;
+
+                r = chmod_and_chown(fs, 0644, uid, gid);
+                if (r >= 0)
+                        continue;
+
+                if (attr->fatal)
+                        return r;
+
+                log_debug_errno(r, "Failed to set access on cgroup %s, ignoring: %m", fs);
+        }
+
+        if (!streq(controller, SYSTEMD_CGROUP_CONTROLLER))
+                return 0;
+
+        r = cg_hybrid_unified();
+        if (r < 0)
+                return r;
+        if (r > 0) {
+                /* The access mode is always propagated from the unified to the legacy controller */
+                r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, uid, gid);
+                if (r < 0)
+                        log_debug_errno(r, "Failed to set access on compatibility systemd cgroup %s, ignoring: %m", path);
+        }
+
+        return 0;
+}
+
+/* State handed from cg_set_access_recursive() to access_callback() through the userdata pointer. */
+struct access_callback_data {
+        uid_t uid; /* owner passed to chown() for each visited inode */
+        gid_t gid; /* group passed to chown() for each visited inode */
+        int error; /* positive errno recorded while it was still 0, i.e. of the first failure; 0 if none */
+};
+
+/* recurse_dir() callback used by cg_set_access_recursive(): changes the ownership of every inode reported
+ * with a RECURSE_DIR_ENTER or RECURSE_DIR_ENTRY event to the uid/gid stored in 'userdata' (a struct
+ * access_callback_data). Failures do not stop the iteration; the errno of the first failure is recorded
+ * in the 'error' field. Always returns RECURSE_DIR_CONTINUE. */
+static int access_callback(
+                RecurseDirEvent event,
+                const char *path,
+                int dir_fd,
+                int inode_fd,
+                const struct dirent *de,
+                const struct statx *sx,
+                void *userdata) {
+
+        struct access_callback_data *d = ASSERT_PTR(userdata);
+
+        if (!IN_SET(event, RECURSE_DIR_ENTER, RECURSE_DIR_ENTRY))
+                return RECURSE_DIR_CONTINUE;
+
+        assert(inode_fd >= 0);
+
+        /* fchown() doesn't support O_PATH fds, hence we use the /proc/self/fd/ trick */
+        if (chown(FORMAT_PROC_FD_PATH(inode_fd), d->uid, d->gid) < 0) {
+                /* Capture errno right away, so that the value we record cannot be affected by anything
+                 * the logging call below does. */
+                int saved_errno = errno;
+
+                log_debug_errno(saved_errno, "Failed to change ownership of '%s', ignoring: %m", ASSERT_PTR(path));
+
+                if (d->error == 0) /* Report the first error to the caller */
+                        d->error = saved_errno;
+        }
+
+        return RECURSE_DIR_CONTINUE;
+}
+
+/* Recursive counterpart of cg_set_access(). Unlike cg_set_access(), which only touches an allowlist of
+ * attributes, this changes the ownership of *all* files below the cgroup. Use cg_set_access() on the
+ * cgroup you want to delegate, and cg_set_access_recursive() for any subcgroups you might want to create
+ * below it.
+ *
+ * Does nothing if neither 'uid' nor 'gid' is valid. Returns a negative error if the cgroup cannot be
+ * resolved, opened or iterated, otherwise the negated errno recorded by access_callback() (0 if no
+ * chown() failed). */
+int cg_set_access_recursive(
+                const char *controller,
+                const char *path,
+                uid_t uid,
+                gid_t gid) {
+
+        struct access_callback_data d = {
+                .uid = uid,
+                .gid = gid,
+        };
+        _cleanup_close_ int fd = -EBADF;
+        _cleanup_free_ char *fs = NULL;
+        int r;
+
+        if (!uid_is_valid(uid) && !gid_is_valid(gid))
+                return 0;
+
+        r = cg_get_path(controller, path, NULL, &fs);
+        if (r < 0)
+                return r;
+
+        fd = open(fs, O_DIRECTORY|O_CLOEXEC|O_RDONLY);
+        if (fd < 0)
+                return -errno;
+
+        r = recurse_dir(fd,
+                        fs,
+                        /* statx_mask= */ 0,
+                        /* n_depth_max= */ UINT_MAX,
+                        RECURSE_DIR_SAME_MOUNT|RECURSE_DIR_INODE_FD|RECURSE_DIR_TOPLEVEL,
+                        access_callback,
+                        &d);
+
+        return r < 0 ? r : -d.error;
+}
+
+/* Moves the processes of cgroup 'pfrom' (hierarchy 'cfrom') into cgroup 'pto' (hierarchy 'cto').
+ *
+ * The process list is enumerated repeatedly until a full pass attaches no PID that was not handled
+ * before; the set 's' remembers the PIDs already handled. With CGROUP_IGNORE_SELF in 'flags' the calling
+ * process is skipped, and kernel threads are skipped when 'pfrom' is the root cgroup.
+ *
+ * Returns 0 if nothing was attached, 1 if at least one process was attached, and a negative error
+ * otherwise. -ESRCH from cg_attach() and -ENOENT from cg_enumerate_processes() are not recorded as
+ * errors. Once 'ret' holds an error, later errors do not replace it. */
+int cg_migrate(
+                const char *cfrom,
+                const char *pfrom,
+                const char *cto,
+                const char *pto,
+                CGroupFlags flags) {
+
+        bool done = false;
+        _cleanup_set_free_ Set *s = NULL;
+        int r, ret = 0;
+        pid_t my_pid;
+
+        assert(cfrom);
+        assert(pfrom);
+        assert(cto);
+        assert(pto);
+
+        s = set_new(NULL);
+        if (!s)
+                return -ENOMEM;
+
+        my_pid = getpid_cached();
+
+        do {
+                _cleanup_fclose_ FILE *f = NULL;
+                pid_t pid = 0;
+                /* Assume this is the final pass; reset below as soon as a new PID is handled. */
+                done = true;
+
+                r = cg_enumerate_processes(cfrom, pfrom, &f);
+                if (r < 0) {
+                        if (ret >= 0 && r != -ENOENT)
+                                return r;
+
+                        return ret;
+                }
+
+                while ((r = cg_read_pid(f, &pid)) > 0) {
+
+                        /* This might do weird stuff if we aren't a
+                         * single-threaded program. However, we
+                         * luckily know we are not */
+                        if ((flags & CGROUP_IGNORE_SELF) && pid == my_pid)
+                                continue;
+
+                        /* Skip PIDs we already dealt with in an earlier pass. */
+                        if (set_get(s, PID_TO_PTR(pid)) == PID_TO_PTR(pid))
+                                continue;
+
+                        /* Ignore kernel threads. Since they can only
+                         * exist in the root cgroup, we only check for
+                         * them there. */
+                        if (cfrom &&
+                            empty_or_root(pfrom) &&
+                            pid_is_kernel_thread(pid) > 0)
+                                continue;
+
+                        r = cg_attach(cto, pto, pid);
+                        if (r < 0) {
+                                /* Keep the first error, but do not count -ESRCH as one. */
+                                if (ret >= 0 && r != -ESRCH)
+                                        ret = r;
+                        } else if (ret == 0)
+                                ret = 1;
+
+                        /* A new PID was handled, hence one more pass is needed. */
+                        done = false;
+
+                        r = set_put(s, PID_TO_PTR(pid));
+                        if (r < 0) {
+                                if (ret >= 0)
+                                        return r;
+
+                                return ret;
+                        }
+                }
+
+                /* cg_read_pid() failed (as opposed to returning 0 at the end of the list) */
+                if (r < 0) {
+                        if (ret >= 0)
+                                return r;
+
+                        return ret;
+                }
+        } while (!done);
+
+        return ret;
+}
+
+/* Like cg_migrate(), but also descends into all subgroups of 'pfrom' and migrates their processes to
+ * the very same destination cgroup. With CGROUP_REMOVE in 'flags' the source cgroup is removed after it
+ * has been processed.
+ *
+ * Returns the first non-zero result collected from cg_migrate() and the recursive calls (later results
+ * only replace it while it is not negative), or a negative error if enumerating, allocating or removing
+ * fails. -ENOENT from cg_enumerate_subgroups() and -ENOENT/-EBUSY from cg_rmdir() are tolerated. */
+int cg_migrate_recursive(
+                const char *cfrom,
+                const char *pfrom,
+                const char *cto,
+                const char *pto,
+                CGroupFlags flags) {
+
+        _cleanup_closedir_ DIR *d = NULL;
+        int r, ret;
+
+        assert(cfrom);
+        assert(pfrom);
+        assert(cto);
+        assert(pto);
+
+        /* First the processes of this cgroup itself ... */
+        ret = cg_migrate(cfrom, pfrom, cto, pto, flags);
+
+        /* ... then those of everything below it. */
+        r = cg_enumerate_subgroups(cfrom, pfrom, &d);
+        if (r < 0)
+                return ret >= 0 && r != -ENOENT ? r : ret;
+
+        for (;;) {
+                _cleanup_free_ char *fn = NULL, *p = NULL;
+
+                r = cg_read_subgroup(d, &fn);
+                if (r <= 0)
+                        break;
+
+                p = path_join(empty_to_root(pfrom), fn);
+                if (!p)
+                        return -ENOMEM;
+
+                r = cg_migrate_recursive(cfrom, p, cto, pto, flags);
+                if (r != 0 && ret >= 0)
+                        ret = r;
+        }
+
+        /* Here 'r' is the final result of cg_read_subgroup(): 0 at the end of the list, negative on error */
+        if (r < 0 && ret >= 0)
+                ret = r;
+
+        if (flags & CGROUP_REMOVE) {
+                r = cg_rmdir(cfrom, pfrom);
+                if (r < 0 && ret >= 0 && !IN_SET(r, -ENOENT, -EBUSY))
+                        return r;
+        }
+
+        return ret;
+}
+
+/* Like cg_migrate_recursive(), but if migrating to 'pto' fails, the prefixes of 'pto' enumerated by
+ * PATH_FOREACH_PREFIX() are tried as destination in turn. Returns the result of the first migration
+ * that works, or the error of the initial attempt if none does. */
+int cg_migrate_recursive_fallback(
+                const char *cfrom,
+                const char *pfrom,
+                const char *cto,
+                const char *pto,
+                CGroupFlags flags) {
+
+        int r;
+
+        assert(cfrom);
+        assert(pfrom);
+        assert(cto);
+        assert(pto);
+
+        r = cg_migrate_recursive(cfrom, pfrom, cto, pto, flags);
+        if (r >= 0)
+                return r;
+
+        /* The destination itself did not work, hence walk through all of its prefixes. */
+        char prefix[strlen(pto) + 1];
+
+        PATH_FOREACH_PREFIX(prefix, pto) {
+                int q = cg_migrate_recursive(cfrom, pfrom, cto, prefix, flags);
+
+                if (q >= 0)
+                        return q;
+        }
+
+        return r;
+}
+
+/* Creates the cgroup 'path' in our private (systemd) hierarchy and, unless everything is unified, also
+ * in each supported cgroup v1 hierarchy selected by 'mask'. Failures in those other hierarchies are
+ * ignored.
+ *
+ * Returns 0 if the group already existed in the systemd hierarchy, 1 if it was created there, negative
+ * otherwise. */
+int cg_create_everywhere(CGroupMask supported, CGroupMask mask, const char *path) {
+        CGroupMask done = 0;
+        bool created;
+        int r;
+
+        /* Our own hierarchy comes first. */
+        r = cg_create(SYSTEMD_CGROUP_CONTROLLER, path);
+        if (r < 0)
+                return r;
+        created = r > 0;
+
+        /* On a fully unified setup there are no other hierarchies to take care of. */
+        r = cg_all_unified();
+        if (r < 0)
+                return r;
+        if (r > 0)
+                return created;
+
+        supported &= CGROUP_MASK_V1;
+        mask = CGROUP_MASK_EXTEND_JOINED(mask);
+
+        /* Now repeat the same for the remaining hierarchies, handling joined controllers only once. */
+        for (CGroupController c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
+                CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
+
+                if (!FLAGS_SET(supported, bit) || FLAGS_SET(done, bit))
+                        continue;
+
+                if (FLAGS_SET(mask, bit))
+                        (void) cg_create(cgroup_controller_to_string(c), path);
+
+                done |= CGROUP_MASK_EXTEND_JOINED(bit);
+        }
+
+        return created;
+}
+
+/* Attaches 'pid' to the cgroup 'path' in the systemd hierarchy and, unless everything is unified, to the
+ * corresponding cgroup in each supported cgroup v1 hierarchy as well. If 'path_callback' is set, it is
+ * asked for the path to use for each controller; 'path' is used whenever it is unset or returns NULL.
+ *
+ * Returns 0 on success and a negative error if the systemd hierarchy attach or cg_all_unified() fails.
+ * Failures in the other hierarchies are ignored. */
+int cg_attach_everywhere(CGroupMask supported, const char *path, pid_t pid, cg_migrate_callback_t path_callback, void *userdata) {
+        CGroupMask done = 0;
+        int r;
+
+        r = cg_attach(SYSTEMD_CGROUP_CONTROLLER, path, pid);
+        if (r < 0)
+                return r;
+
+        r = cg_all_unified();
+        if (r < 0)
+                return r;
+        if (r > 0)
+                return 0;
+
+        supported &= CGROUP_MASK_V1;
+
+        for (CGroupController c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
+                CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
+                const char *p;
+
+                /* Skip what is unsupported and what was covered through a joined controller already. */
+                if (!FLAGS_SET(supported, bit) || FLAGS_SET(done, bit))
+                        continue;
+
+                p = path_callback ? path_callback(bit, userdata) : NULL;
+
+                (void) cg_attach_fallback(cgroup_controller_to_string(c), p ? p : path, pid);
+                done |= CGROUP_MASK_EXTEND_JOINED(bit);
+        }
+
+        return 0;
+}
+
+/* For each supported cgroup v1 controller selected by 'mask', migrates the processes of cgroup 'from' of
+ * the systemd hierarchy into the cgroup returned by 'to_callback' for that controller. Joined controllers
+ * are handled only once.
+ *
+ * Returns the first negative result encountered, otherwise the result of the last migration (0 if there
+ * was none). An error does not stop the remaining controllers from being processed. */
+int cg_migrate_v1_controllers(CGroupMask supported, CGroupMask mask, const char *from, cg_migrate_callback_t to_callback, void *userdata) {
+        CGroupMask done = 0;
+        int ret = 0;
+
+        assert(to_callback);
+
+        supported &= CGROUP_MASK_V1;
+        mask = CGROUP_MASK_EXTEND_JOINED(mask);
+
+        for (CGroupController c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
+                CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
+                const char *to;
+                int q;
+
+                if (!FLAGS_SET(supported, bit) ||
+                    FLAGS_SET(done, bit) ||
+                    !FLAGS_SET(mask, bit))
+                        continue;
+
+                to = to_callback(bit, userdata);
+
+                /* Hold on to the first error, but keep going regardless */
+                q = cg_migrate_recursive_fallback(SYSTEMD_CGROUP_CONTROLLER, from, cgroup_controller_to_string(c), to, 0);
+                if (ret >= 0)
+                        ret = q;
+
+                done |= CGROUP_MASK_EXTEND_JOINED(bit);
+        }
+
+        return ret;
+}
+
+/* Trims the cgroup 'path' in the systemd hierarchy and, unless everything is unified, in all supported
+ * cgroup v1 hierarchies as well. Returns a negative error if trimming the systemd hierarchy or
+ * cg_all_unified() fails; otherwise the result of cg_trim() on a unified setup, or that of
+ * cg_trim_v1_controllers() on any other. */
+int cg_trim_everywhere(CGroupMask supported, const char *path, bool delete_root) {
+        int r, unified;
+
+        r = cg_trim(SYSTEMD_CGROUP_CONTROLLER, path, delete_root);
+        if (r < 0)
+                return r;
+
+        unified = cg_all_unified();
+        if (unified < 0)
+                return unified;
+
+        return unified > 0 ? r : cg_trim_v1_controllers(supported, _CGROUP_MASK_ALL, path, delete_root);
+}
+
+/* Trims the cgroup 'path' in each supported cgroup v1 hierarchy selected by 'mask'. Joined controllers
+ * are handled only once. Returns the first negative result encountered, otherwise the result of the last
+ * cg_trim() call (0 if there was none). An error does not stop the remaining controllers from being
+ * processed. */
+int cg_trim_v1_controllers(CGroupMask supported, CGroupMask mask, const char *path, bool delete_root) {
+        CGroupMask done = 0;
+        int ret = 0;
+
+        supported &= CGROUP_MASK_V1;
+        mask = CGROUP_MASK_EXTEND_JOINED(mask);
+
+        for (CGroupController c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
+                CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
+
+                if (!FLAGS_SET(supported, bit) || FLAGS_SET(done, bit))
+                        continue;
+
+                if (FLAGS_SET(mask, bit)) {
+                        /* Hold on to the first error, but keep going regardless */
+                        int q = cg_trim(cgroup_controller_to_string(c), path, delete_root);
+
+                        if (ret >= 0)
+                                ret = q;
+                }
+
+                done |= CGROUP_MASK_EXTEND_JOINED(bit);
+        }
+
+        return ret;
+}
+
+/* Enables the controllers set in 'mask', and disables the ones not set in it, for the cgroup 'p' by
+ * writing "+name"/"-name" to its "cgroup.subtree_control" attribute. Only controllers contained in both
+ * CGROUP_MASK_V2 and 'supported' are touched.
+ *
+ * If 'ret_result_mask' is non-NULL it receives the mask of controllers considered enabled afterwards.
+ * Returns 0 on success (individual write failures are only logged), and a negative error if
+ * cg_all_unified() or cg_get_path() fails or the attribute file cannot be opened. */
+int cg_enable_everywhere(
+                CGroupMask supported,
+                CGroupMask mask,
+                const char *p,
+                CGroupMask *ret_result_mask) {
+
+        _cleanup_fclose_ FILE *f = NULL;
+        _cleanup_free_ char *fs = NULL;
+        CGroupController c;
+        CGroupMask ret = 0;
+        int r;
+
+        assert(p);
+
+        /* Nothing supported means nothing to do. */
+        if (supported == 0) {
+                if (ret_result_mask)
+                        *ret_result_mask = 0;
+                return 0;
+        }
+
+        r = cg_all_unified();
+        if (r < 0)
+                return r;
+        if (r == 0) {
+                /* On the legacy hierarchy there's no concept of "enabling" controllers in cgroups defined. Let's claim
+                 * complete success right away. (If you wonder why we return the full mask here, rather than zero: the
+                 * caller tends to use the returned mask later on to compare if all controllers were properly joined,
+                 * and if not requeues realization. This use is the primary purpose of the return value, hence let's
+                 * minimize surprises here and reduce triggers for re-realization by always saying we fully
+                 * succeeded.) */
+                /* If you wonder why we mask this with CGROUP_MASK_V2: The 'supported' mask might contain
+                 * pure-V1 or BPF controllers, and we never want to claim that we could enable those with
+                 * cgroup.subtree_control */
+                if (ret_result_mask)
+                        *ret_result_mask = mask & supported & CGROUP_MASK_V2;
+                return 0;
+        }
+
+        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, p, "cgroup.subtree_control", &fs);
+        if (r < 0)
+                return r;
+
+        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
+                CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
+                const char *n;
+
+                if (!FLAGS_SET(CGROUP_MASK_V2, bit))
+                        continue;
+
+                if (!FLAGS_SET(supported, bit))
+                        continue;
+
+                n = cgroup_controller_to_string(c);
+                {
+                        /* Room for the '+'/'-' prefix, the controller name and the trailing NUL */
+                        char s[1 + strlen(n) + 1];
+
+                        s[0] = FLAGS_SET(mask, bit) ? '+' : '-';
+                        strcpy(s + 1, n);
+
+                        /* The file is opened lazily, on the first controller we actually write. */
+                        if (!f) {
+                                f = fopen(fs, "we");
+                                if (!f)
+                                        return log_debug_errno(errno, "Failed to open cgroup.subtree_control file of %s: %m", p);
+                        }
+
+                        r = write_string_stream(f, s, WRITE_STRING_FILE_DISABLE_BUFFER);
+                        if (r < 0) {
+                                log_debug_errno(r, "Failed to %s controller %s for %s (%s): %m",
+                                                FLAGS_SET(mask, bit) ? "enable" : "disable", n, p, fs);
+                                /* Reset the stream's error indicator before the next controller is written. */
+                                clearerr(f);
+
+                                /* If we can't turn off a controller, leave it on in the reported resulting mask. This
+                                 * happens for example when we attempt to turn off a controller up in the tree that is
+                                 * used down in the tree. */
+                                /* You might wonder why we check for EBUSY only here, and not follow the same
+                                 * logic for other errors such as EINVAL or EOPNOTSUPP or anything else. That's
+                                 * because EBUSY indicates that the controller is currently enabled and cannot be
+                                 * disabled because something down the hierarchy is still using it. Any other
+                                 * error most likely means something like "I never heard of this controller" or
+                                 * similar. In the former case it's hence safe to assume the controller is still
+                                 * on after the failed operation, while in the latter case it's safer to assume
+                                 * the controller is unknown and hence certainly not enabled. */
+                                if (!FLAGS_SET(mask, bit) && r == -EBUSY)
+                                        ret |= bit;
+                        } else {
+                                /* Otherwise, if we managed to turn on a controller, set the bit reflecting that. */
+                                if (FLAGS_SET(mask, bit))
+                                        ret |= bit;
+                        }
+                }
+        }
+
+        /* Let's return the precise set of controllers now enabled for the cgroup. */
+        if (ret_result_mask)
+                *ret_result_mask = ret;
+
+        return 0;
+}
diff --git a/src/shared/cgroup-setup.h b/src/shared/cgroup-setup.h
new file mode 100644
index 0000000..1b6f071
--- /dev/null
+++ b/src/shared/cgroup-setup.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <sys/types.h>
+
+#include "cgroup-util.h"
+
+/* Which cgroup setup shall be used? Each answer is computed once per thread and then cached. */
+bool cg_is_unified_wanted(void);
+bool cg_is_legacy_wanted(void);
+bool cg_is_hybrid_wanted(void);
+
+/* Parsers for weight/shares settings. An empty string yields the respective *_INVALID value; all return 0
+ * on success, -ERANGE for out-of-range numbers and the error of safe_atou64() for non-numbers.
+ * cg_cpu_weight_parse() additionally accepts "idle". */
+int cg_weight_parse(const char *s, uint64_t *ret);
+int cg_cpu_weight_parse(const char *s, uint64_t *ret);
+int cg_cpu_shares_parse(const char *s, uint64_t *ret);
+int cg_blkio_weight_parse(const char *s, uint64_t *ret);
+
+/* Removes the sub-cgroups of 'path', and with delete_root also 'path' itself unless it is the root. */
+int cg_trim(const char *controller, const char *path, bool delete_root);
+
+/* cg_create() returns 1 if the cgroup was created and 0 if it existed already. cg_attach() treats a pid
+ * of 0 as the calling process. The _fallback variant retries with the prefixes of 'path'. */
+int cg_create(const char *controller, const char *path);
+int cg_attach(const char *controller, const char *path, pid_t pid);
+int cg_attach_fallback(const char *controller, const char *path, pid_t pid);
+int cg_create_and_attach(const char *controller, const char *path, pid_t pid);
+
+/* cg_set_access() changes the cgroup directory and a fixed list of attributes; the _recursive variant
+ * changes the ownership of everything below the cgroup. */
+int cg_set_access(const char *controller, const char *path, uid_t uid, gid_t gid);
+int cg_set_access_recursive(const char *controller, const char *path, uid_t uid, gid_t gid);
+
+/* Move processes from cgroup 'pfrom' of hierarchy 'cfrom' to cgroup 'pto' of hierarchy 'cto'. */
+int cg_migrate(const char *cfrom, const char *pfrom, const char *cto, const char *pto, CGroupFlags flags);
+int cg_migrate_recursive(const char *cfrom, const char *pfrom, const char *cto, const char *pto, CGroupFlags flags);
+int cg_migrate_recursive_fallback(const char *cfrom, const char *pfrom, const char *cto, const char *pto, CGroupFlags flags);
+
+/* Variants that operate on the systemd hierarchy and/or on the supported cgroup v1 hierarchies. */
+int cg_create_everywhere(CGroupMask supported, CGroupMask mask, const char *path);
+int cg_attach_everywhere(CGroupMask supported, const char *path, pid_t pid, cg_migrate_callback_t callback, void *userdata);
+int cg_migrate_v1_controllers(CGroupMask supported, CGroupMask mask, const char *from, cg_migrate_callback_t to_callback, void *userdata);
+int cg_trim_everywhere(CGroupMask supported, const char *path, bool delete_root);
+int cg_trim_v1_controllers(CGroupMask supported, CGroupMask mask, const char *path, bool delete_root);
+int cg_enable_everywhere(CGroupMask supported, CGroupMask mask, const char *p, CGroupMask *ret_result_mask);
diff --git a/src/shared/cgroup-show.c b/src/shared/cgroup-show.c
new file mode 100644
index 0000000..c2ee1c5
--- /dev/null
+++ b/src/shared/cgroup-show.c
@@ -0,0 +1,471 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <dirent.h>
+#include <errno.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "alloc-util.h"
+#include "bus-error.h"
+#include "bus-util.h"
+#include "cgroup-show.h"
+#include "cgroup-util.h"
+#include "env-file.h"
+#include "escape.h"
+#include "fd-util.h"
+#include "format-util.h"
+#include "hostname-util.h"
+#include "locale-util.h"
+#include "macro.h"
+#include "nulstr-util.h"
+#include "output-mode.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "sort-util.h"
+#include "string-util.h"
+#include "terminal-util.h"
+#include "unit-name.h"
+#include "xattr-util.h"
+
+/* Prints one output line per entry of pids[], consisting of the PID and the process' command line.
+ * The array is sorted and de-duplicated in place before printing. If 'extra' is true every line is
+ * introduced by a bullet glyph, otherwise by a tree glyph; 'more' tells us that further tree entries
+ * follow, so that even the final PID is drawn as a branch. Does nothing if n_pids is zero. */
+static void show_pid_array(
+ pid_t pids[],
+ size_t n_pids,
+ const char *prefix,
+ size_t n_columns,
+ bool extra,
+ bool more,
+ OutputFlags flags) {
+
+ size_t i, j, pid_width;
+
+ if (n_pids == 0)
+ return;
+
+ typesafe_qsort(pids, n_pids, pid_compare_func);
+
+ /* Filter duplicates */
+ for (j = 0, i = 1; i < n_pids; i++) {
+ if (pids[i] == pids[j])
+ continue;
+ pids[++j] = pids[i];
+ }
+ n_pids = j + 1;
+ /* pids[j] is the last entry that survived the de-duplication; its width is used for the PID column
+ * of every line printed below */
+ pid_width = DECIMAL_STR_WIDTH(pids[j]);
+
+ if (flags & OUTPUT_FULL_WIDTH)
+ n_columns = SIZE_MAX;
+ else {
+ if (n_columns > pid_width + 3) /* something like "├─1114784 " */
+ n_columns -= pid_width + 3;
+ else
+ n_columns = 20; /* too narrow to subtract the PID column, fall back to a fixed width */
+ }
+ for (i = 0; i < n_pids; i++) {
+ _cleanup_free_ char *t = NULL;
+
+ /* Errors are deliberately ignored here; t is passed through strna() when printed */
+ (void) pid_get_cmdline(pids[i], n_columns,
+ PROCESS_CMDLINE_COMM_FALLBACK | PROCESS_CMDLINE_USE_LOCALE,
+ &t);
+
+ if (extra)
+ printf("%s%s ", prefix, special_glyph(SPECIAL_GLYPH_TRIANGULAR_BULLET));
+ else
+ printf("%s%s", prefix, special_glyph(((more || i < n_pids-1) ? SPECIAL_GLYPH_TREE_BRANCH : SPECIAL_GLYPH_TREE_RIGHT)));
+
+ printf("%s%*"PID_PRI" %s%s\n", ansi_grey(), (int) pid_width, pids[i], strna(t), ansi_normal());
+ }
+}
+
+/* Collects the PIDs listed in the "cgroup.procs" file below the cgroup directory 'path' and prints them
+ * via show_pid_array(). Kernel threads are left out unless OUTPUT_KERNEL_THREADS is set in 'flags'.
+ * 'more' is handed through to show_pid_array(). Returns 0 on success, a negative errno-style value if
+ * the path cannot be mangled, the file cannot be opened or read, or memory runs out. */
+static int show_cgroup_one_by_path(
+ const char *path,
+ const char *prefix,
+ size_t n_columns,
+ bool more,
+ OutputFlags flags) {
+
+ _cleanup_free_ pid_t *pids = NULL;
+ _cleanup_fclose_ FILE *f = NULL;
+ _cleanup_free_ char *p = NULL;
+ size_t n = 0;
+ char *fn;
+ int r;
+
+ r = cg_mangle_path(path, &p);
+ if (r < 0)
+ return r;
+
+ fn = strjoina(p, "/cgroup.procs");
+ f = fopen(fn, "re");
+ if (!f)
+ return -errno;
+
+ for (;;) {
+ pid_t pid;
+
+ /* libvirt / qemu uses threaded mode and cgroup.procs cannot be read at the lower levels.
+ * From https://docs.kernel.org/admin-guide/cgroup-v2.html#threads,
+ * “cgroup.procs” in a threaded domain cgroup contains the PIDs of all processes in
+ * the subtree and is not readable in the subtree proper. */
+ r = cg_read_pid(f, &pid);
+ /* A return of 0 ends the loop (presumably end of file, confirm against cg_read_pid());
+ * -EOPNOTSUPP is handled the same way, so whatever was collected so far is still shown */
+ if (IN_SET(r, 0, -EOPNOTSUPP))
+ break;
+ if (r < 0)
+ return r;
+
+ if (!(flags & OUTPUT_KERNEL_THREADS) && pid_is_kernel_thread(pid) > 0)
+ continue;
+
+ if (!GREEDY_REALLOC(pids, n + 1))
+ return -ENOMEM;
+
+ pids[n++] = pid;
+ }
+
+ show_pid_array(pids, n, prefix, n_columns, false, more, flags);
+
+ return 0;
+}
+
+/* Prints the line for a single cgroup: 'prefix', the tree glyph and the last component of 'path' (passed
+ * through cg_unescape()). Delegated cgroups are underlined and followed by an ellipsis glyph. With
+ * OUTPUT_CGROUP_ID the cgroup ID is appended; with OUTPUT_CGROUP_XATTRS every "user." and "trusted."
+ * extended attribute is listed on a line of its own below the name.
+ *
+ * Returns 0 on success and a negative errno-style value if the cgroup cannot be opened, its file name
+ * cannot be extracted, or memory runs out. Failures to query delegation, the cgroup ID or the xattrs
+ * are only logged at debug level. */
+static int show_cgroup_name(
+                const char *path,
+                const char *prefix,
+                SpecialGlyph glyph,
+                OutputFlags flags) {
+
+        uint64_t cgroupid = UINT64_MAX;
+        _cleanup_free_ char *b = NULL;
+        _cleanup_close_ int fd = -EBADF;
+        bool delegate;
+        int r;
+
+        fd = open(path, O_PATH|O_CLOEXEC|O_NOFOLLOW|O_DIRECTORY, 0);
+        if (fd < 0)
+                return log_debug_errno(errno, "Failed to open cgroup '%s', ignoring: %m", path);
+
+        r = cg_is_delegated_fd(fd);
+        if (r < 0)
+                log_debug_errno(r, "Failed to check if cgroup is delegated, ignoring: %m");
+        delegate = r > 0;
+
+        if (FLAGS_SET(flags, OUTPUT_CGROUP_ID)) {
+                cg_file_handle fh = CG_FILE_HANDLE_INIT;
+                int mnt_id = -1;
+
+                if (name_to_handle_at(
+                                    fd,
+                                    "",
+                                    &fh.file_handle,
+                                    &mnt_id,
+                                    AT_EMPTY_PATH) < 0)
+                        log_debug_errno(errno, "Failed to determine cgroup ID of %s, ignoring: %m", path);
+                else
+                        cgroupid = CG_FILE_HANDLE_CGROUPID(fh);
+        }
+
+        r = path_extract_filename(path, &b);
+        if (r < 0)
+                return log_error_errno(r, "Failed to extract filename from cgroup path: %m");
+
+        printf("%s%s%s%s%s",
+               prefix, special_glyph(glyph),
+               delegate ? ansi_underline() : "",
+               cg_unescape(b),
+               delegate ? ansi_normal() : "");
+
+        if (delegate)
+                printf(" %s%s%s",
+                       ansi_highlight(),
+                       special_glyph(SPECIAL_GLYPH_ELLIPSIS),
+                       ansi_normal());
+
+        /* UINT64_MAX is the "not determined" marker set at the top */
+        if (cgroupid != UINT64_MAX)
+                printf(" %s(#%" PRIu64 ")%s", ansi_grey(), cgroupid, ansi_normal());
+
+        printf("\n");
+
+        if (FLAGS_SET(flags, OUTPUT_CGROUP_XATTRS)) {
+                _cleanup_free_ char *nl = NULL;
+
+                r = flistxattr_malloc(fd, &nl);
+                if (r < 0)
+                        log_debug_errno(r, "Failed to enumerate xattrs on '%s', ignoring: %m", path);
+
+                NULSTR_FOREACH(xa, nl) {
+                        _cleanup_free_ char *x = NULL, *y = NULL, *buf = NULL;
+                        int n;
+
+                        if (!STARTSWITH_SET(xa, "user.", "trusted."))
+                                continue;
+
+                        n = fgetxattr_malloc(fd, xa, &buf);
+                        if (n < 0) {
+                                /* The error of this call is in 'n'; 'r' still holds the (non-negative)
+                                 * result of flistxattr_malloc() above and must not be used for %m. */
+                                log_debug_errno(n, "Failed to read xattr '%s' off '%s', ignoring: %m", xa, path);
+                                continue;
+                        }
+
+                        x = cescape(xa);
+                        if (!x)
+                                return -ENOMEM;
+
+                        y = cescape_length(buf, n);
+                        if (!y)
+                                return -ENOMEM;
+
+                        /* Keep the vertical tree line going below a branch entry, pad below the last one */
+                        printf("%s%s%s %s%s%s: %s\n",
+                               prefix,
+                               glyph == SPECIAL_GLYPH_TREE_BRANCH ? special_glyph(SPECIAL_GLYPH_TREE_VERTICAL) : " ",
+                               special_glyph(SPECIAL_GLYPH_ARROW_RIGHT),
+                               ansi_blue(), x, ansi_normal(),
+                               y);
+                }
+        }
+
+        return 0;
+}
+
+/* Recursively prints the cgroup tree rooted at the file system path 'path': first the processes of the
+ * cgroup itself, then each child cgroup's name followed by that child's own subtree. 'prefix' (NULL is
+ * treated as the empty string) is put in front of every line; if 'n_columns' is 0 the width is taken
+ * from columns(). Child cgroups for which cg_is_empty_recursive() reports > 0 are skipped unless
+ * OUTPUT_SHOW_ALL is set.
+ *
+ * Returns 0 on success, a negative errno-style value on failure. The return values of the recursive
+ * calls for child cgroups are not checked. */
+int show_cgroup_by_path(
+ const char *path,
+ const char *prefix,
+ size_t n_columns,
+ OutputFlags flags) {
+
+ _cleanup_free_ char *fn = NULL, *p1 = NULL, *last = NULL, *p2 = NULL;
+ _cleanup_closedir_ DIR *d = NULL;
+ bool shown_pids = false;
+ char *gn = NULL;
+ int r;
+
+ assert(path);
+
+ if (n_columns <= 0)
+ n_columns = columns();
+
+ prefix = strempty(prefix);
+
+ r = cg_mangle_path(path, &fn);
+ if (r < 0)
+ return r;
+
+ d = opendir(fn);
+ if (!d)
+ return -errno;
+
+ while ((r = cg_read_subgroup(d, &gn)) > 0) {
+ _cleanup_free_ char *k = NULL;
+
+ k = path_join(fn, gn);
+ /* gn is released here by hand, before the OOM check, so that it is freed on both paths */
+ free(gn);
+ if (!k)
+ return -ENOMEM;
+
+ if (!(flags & OUTPUT_SHOW_ALL) && cg_is_empty_recursive(NULL, k) > 0)
+ continue;
+
+ /* The cgroup's own processes are printed once, right before the first child that is shown */
+ if (!shown_pids) {
+ (void) show_cgroup_one_by_path(path, prefix, n_columns, true, flags);
+ shown_pids = true;
+ }
+
+ /* Printing runs one child behind: the previously found child is shown only now that we know
+ * another one follows, so that the very last child can get a different glyph after the loop */
+ if (last) {
+ r = show_cgroup_name(last, prefix, SPECIAL_GLYPH_TREE_BRANCH, flags);
+ if (r < 0)
+ return r;
+
+ if (!p1) {
+ p1 = strjoin(prefix, special_glyph(SPECIAL_GLYPH_TREE_VERTICAL));
+ if (!p1)
+ return -ENOMEM;
+ }
+
+ show_cgroup_by_path(last, p1, n_columns-2, flags);
+ /* 'last' is overwritten unconditionally right below, hence no dangling pointer remains */
+ free(last);
+ }
+
+ last = TAKE_PTR(k);
+ }
+
+ if (r < 0)
+ return r;
+
+ /* No child was shown, hence the processes have not been printed yet */
+ if (!shown_pids)
+ (void) show_cgroup_one_by_path(path, prefix, n_columns, !!last, flags);
+
+ if (last) {
+ r = show_cgroup_name(last, prefix, SPECIAL_GLYPH_TREE_RIGHT, flags);
+ if (r < 0)
+ return r;
+
+ if (!p2) {
+ p2 = strjoin(prefix, " ");
+ if (!p2)
+ return -ENOMEM;
+ }
+
+ show_cgroup_by_path(last, p2, n_columns-2, flags);
+ }
+
+ return 0;
+}
+
+/* Translates the controller/cgroup pair into a file system path with cg_get_path() and shows the tree
+ * below it via show_cgroup_by_path(). Returns whatever the failing step returned, or the result of
+ * show_cgroup_by_path(). */
+int show_cgroup(const char *controller,
+                const char *path,
+                const char *prefix,
+                size_t n_columns,
+                OutputFlags flags) {
+        _cleanup_free_ char *fs_path = NULL;
+        int r;
+
+        assert(path);
+
+        r = cg_get_path(controller, path, NULL, &fs_path);
+        if (r >= 0)
+                r = show_cgroup_by_path(fs_path, prefix, n_columns, flags);
+
+        return r;
+}
+
+/* Prints those of the given PIDs whose cgroup path (as reported by cg_pid_get_path()) does not start
+ * with 'path', using the "extra" bullet style of show_pid_array(). Returns 0 on success (including
+ * when there is nothing to show), a negative errno-style value on failure. */
+static int show_extra_pids(
+                const char *controller,
+                const char *path,
+                const char *prefix,
+                size_t n_columns,
+                const pid_t pids[],
+                size_t n_pids,
+                OutputFlags flags) {
+
+        _cleanup_free_ pid_t *outside = NULL;
+        size_t n_outside = 0;
+        int r;
+
+        assert(path);
+
+        if (n_pids == 0)
+                return 0;
+
+        if (n_columns == 0)
+                n_columns = columns();
+
+        prefix = strempty(prefix);
+
+        /* Worst case none of the PIDs lives below 'path' */
+        outside = new(pid_t, n_pids);
+        if (!outside)
+                return -ENOMEM;
+
+        for (const pid_t *pid = pids; pid < pids + n_pids; pid++) {
+                _cleanup_free_ char *cg = NULL;
+
+                r = cg_pid_get_path(controller, *pid, &cg);
+                if (r < 0)
+                        return r;
+
+                if (!path_startswith(cg, path))
+                        outside[n_outside++] = *pid;
+        }
+
+        show_pid_array(outside, n_outside, prefix, n_columns, true, false, flags);
+
+        return 0;
+}
+
+/* Shows the cgroup tree via show_cgroup() and, if that worked, appends the extra PIDs via
+ * show_extra_pids(). Returns the first negative result, otherwise the result of show_extra_pids(). */
+int show_cgroup_and_extra(
+                const char *controller,
+                const char *path,
+                const char *prefix,
+                size_t n_columns,
+                const pid_t extra_pids[],
+                size_t n_extra_pids,
+                OutputFlags flags) {
+
+        int r;
+
+        assert(path);
+
+        r = show_cgroup(controller, path, prefix, n_columns, flags);
+
+        return r < 0 ? r : show_extra_pids(controller, path, prefix, n_columns, extra_pids, n_extra_pids, flags);
+}
+
+/* Reads the "ControlGroup" property of the D-Bus object belonging to 'unit' from
+ * org.freedesktop.systemd1 and stores the string in *ret. Returns 0 on success; on failure an error is
+ * logged and a negative errno-style value is returned. */
+int show_cgroup_get_unit_path_and_warn(
+                sd_bus *bus,
+                const char *unit,
+                char **ret) {
+
+        _cleanup_(sd_bus_error_free) sd_bus_error bus_error = SD_BUS_ERROR_NULL;
+        _cleanup_free_ char *object_path = NULL;
+        int r;
+
+        object_path = unit_dbus_path_from_name(unit);
+        if (!object_path)
+                return log_oom();
+
+        r = sd_bus_get_property_string(
+                        bus,
+                        "org.freedesktop.systemd1",
+                        object_path,
+                        unit_dbus_interface_from_name(unit),
+                        "ControlGroup",
+                        &bus_error,
+                        ret);
+        if (r >= 0)
+                return 0;
+
+        return log_error_errno(r, "Failed to query unit control group path: %s",
+                               bus_error_message(&bus_error, r));
+}
+
+/* Determines the cgroup path to start showing from and stores it as a newly allocated string in *ret.
+ * If 'machine' is set, the SCOPE= entry of /run/systemd/machines/<machine> names the unit whose control
+ * group is queried over the local system bus; otherwise cg_get_root_path() is used. If 'prefix' is set
+ * it is appended to that root with path_join(). Returns 0 on success; on failure an error is logged and
+ * a negative errno-style value is returned. */
+int show_cgroup_get_path_and_warn(
+ const char *machine,
+ const char *prefix,
+ char **ret) {
+
+ _cleanup_free_ char *root = NULL;
+ int r;
+
+ if (machine) {
+ _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
+ _cleanup_free_ char *unit = NULL;
+ const char *m;
+
+ /* The name is validated first, since it becomes part of a file system path below */
+ if (!hostname_is_valid(machine, 0))
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Machine name is not valid: %s", machine);
+
+ m = strjoina("/run/systemd/machines/", machine);
+ r = parse_env_file(NULL, m, "SCOPE", &unit);
+ if (r < 0)
+ return log_error_errno(r, "Failed to load machine data: %m");
+
+ r = bus_connect_transport_systemd(BUS_TRANSPORT_LOCAL, NULL, RUNTIME_SCOPE_SYSTEM, &bus);
+ if (r < 0)
+ return bus_log_connect_error(r, BUS_TRANSPORT_LOCAL);
+
+ /* NOTE(review): 'unit' is not checked for NULL before it is passed on; presumably it stays
+ * NULL if the file has no SCOPE= line. Confirm how the callee deals with that. */
+ r = show_cgroup_get_unit_path_and_warn(bus, unit, &root);
+ if (r < 0)
+ return r;
+ } else {
+ r = cg_get_root_path(&root);
+ if (r == -ENOMEDIUM)
+ return log_error_errno(r, "Failed to get root control group path.\n"
+ "No cgroup filesystem mounted on /sys/fs/cgroup");
+ if (r < 0)
+ return log_error_errno(r, "Failed to get root control group path: %m");
+ }
+
+ if (prefix) {
+ char *t;
+
+ t = path_join(root, prefix);
+ if (!t)
+ return log_oom();
+
+ /* The caller gets the joined string; 'root' itself is released by its cleanup handler */
+ *ret = t;
+ } else
+ *ret = TAKE_PTR(root);
+
+ return 0;
+}
diff --git a/src/shared/cgroup-show.h b/src/shared/cgroup-show.h
new file mode 100644
index 0000000..db3c9c9
--- /dev/null
+++ b/src/shared/cgroup-show.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+#include <sys/types.h>
+
+#include "sd-bus.h"
+
+#include "logs-show.h"
+#include "output-mode.h"
+
+int show_cgroup_by_path(const char *path, const char *prefix, size_t n_columns, OutputFlags flags);
+int show_cgroup(const char *controller, const char *path, const char *prefix, size_t n_columns, OutputFlags flags);
+
+int show_cgroup_and_extra(const char *controller, const char *path, const char *prefix, size_t n_columns, const pid_t extra_pids[], size_t n_extra_pids, OutputFlags flags);
+
+int show_cgroup_get_unit_path_and_warn(
+ sd_bus *bus,
+ const char *unit,
+ char **ret);
+int show_cgroup_get_path_and_warn(
+ const char *machine,
+ const char *prefix,
+ char **ret);
diff --git a/src/shared/chown-recursive.c b/src/shared/chown-recursive.c
new file mode 100644
index 0000000..6aa5f67
--- /dev/null
+++ b/src/shared/chown-recursive.c
@@ -0,0 +1,177 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/xattr.h>
+
+#include "chown-recursive.h"
+#include "dirent-util.h"
+#include "fd-util.h"
+#include "fs-util.h"
+#include "macro.h"
+#include "stdio-util.h"
+#include "strv.h"
+#include "user-util.h"
+
+/* Drops the POSIX access and default ACL xattrs from the inode referenced by 'fd' and then applies the
+ * masked mode plus the new owner/group via fchmod_and_chown(). Returns 1 on success, a negative
+ * errno-style value on failure. */
+static int chown_one(
+                int fd,
+                const struct stat *st,
+                uid_t uid,
+                gid_t gid,
+                mode_t mask) {
+
+        int r;
+
+        assert(fd >= 0);
+        assert(st);
+
+        /* The xattrs are removed through the /proc/self/fd/%i path, which gives us a stable reference
+         * that also works for O_PATH file descriptors. A missing ACL is not an error. */
+        FOREACH_STRING(acl_xattr, "system.posix_acl_access", "system.posix_acl_default")
+                if (removexattr(FORMAT_PROC_FD_PATH(fd), acl_xattr) < 0 && !ERRNO_IS_XATTR_ABSENT(errno))
+                        return -errno;
+
+        r = fchmod_and_chown(fd, st->st_mode & mask, uid, gid);
+
+        return r < 0 ? r : 1;
+}
+
+/* Applies chown_one() to every entry below the directory referenced by 'fd' (descending into
+ * subdirectories) and finally to the directory itself; 'st' is the stat data of that directory.
+ * The function takes possession of 'fd': it is handed to fdopendir(), or closed right away if that
+ * fails. Returns 1 if something was changed, 0 if not, a negative errno-style value on failure (in
+ * which case the traversal stops at the first error). */
+static int chown_recursive_internal(
+ int fd,
+ const struct stat *st,
+ uid_t uid,
+ gid_t gid,
+ mode_t mask) {
+
+ _cleanup_closedir_ DIR *d = NULL;
+ bool changed = false;
+ int r;
+
+ assert(fd >= 0);
+ assert(st);
+
+ d = fdopendir(fd);
+ if (!d) {
+ /* NOTE(review): errno is read after safe_close(); this relies on safe_close() leaving
+ * errno untouched, confirm */
+ safe_close(fd);
+ return -errno;
+ }
+
+ FOREACH_DIRENT_ALL(de, d, return -errno) {
+ _cleanup_close_ int path_fd = -EBADF;
+ struct stat fst;
+
+ if (dot_or_dot_dot(de->d_name))
+ continue;
+
+ /* Let's pin the child inode we want to fix now with an O_PATH fd, so that it cannot be swapped out
+ * while we manipulate it. */
+ path_fd = openat(dirfd(d), de->d_name, O_PATH|O_CLOEXEC|O_NOFOLLOW);
+ if (path_fd < 0)
+ return -errno;
+
+ if (fstat(path_fd, &fst) < 0)
+ return -errno;
+
+ if (S_ISDIR(fst.st_mode)) {
+ int subdir_fd;
+
+ /* Convert it to a "real" (i.e. non-O_PATH) fd now */
+ subdir_fd = fd_reopen(path_fd, O_RDONLY|O_CLOEXEC|O_NOATIME);
+ if (subdir_fd < 0)
+ return subdir_fd;
+
+ r = chown_recursive_internal(subdir_fd, &fst, uid, gid, mask); /* takes possession of subdir_fd even on failure */
+ if (r < 0)
+ return r;
+ if (r > 0)
+ changed = true;
+ } else {
+ r = chown_one(path_fd, &fst, uid, gid, mask);
+ if (r < 0)
+ return r;
+ if (r > 0)
+ changed = true;
+ }
+ }
+
+ /* The directory itself comes last, after all of its entries have been processed */
+ r = chown_one(dirfd(d), st, uid, gid, mask);
+ if (r < 0)
+ return r;
+
+ return r > 0 || changed;
+}
+
+/* Recursively adjusts ownership and access mode of the directory tree at 'path'. The only flag
+ * accepted is AT_SYMLINK_FOLLOW; without it a symlink as the final path component is refused. Returns 0
+ * if there was nothing to do, otherwise the result of chown_recursive_internal(), or a negative
+ * errno-style value if the directory cannot be opened or examined. */
+int path_chown_recursive(
+                const char *path,
+                uid_t uid,
+                gid_t gid,
+                mode_t mask,
+                int flags) {
+
+        int open_flags = O_RDONLY|O_DIRECTORY|O_CLOEXEC|O_NOATIME;
+        _cleanup_close_ int dir_fd = -EBADF;
+        bool owner_ok, group_ok, mode_ok;
+        struct stat st;
+
+        assert((flags & ~AT_SYMLINK_FOLLOW) == 0);
+
+        if (!FLAGS_SET(flags, AT_SYMLINK_FOLLOW))
+                open_flags |= O_NOFOLLOW;
+
+        dir_fd = open(path, open_flags);
+        if (dir_fd < 0)
+                return -errno;
+
+        /* Neither owner nor group requested and the mask keeps all mode bits: nothing to do */
+        if (!(uid_is_valid(uid) || gid_is_valid(gid)) && FLAGS_SET(mask, 07777))
+                return 0;
+
+        if (fstat(dir_fd, &st) < 0)
+                return -errno;
+
+        /* Shortcut: if the top-level directory already looks right we assume the rest of the tree does
+         * too and do not descend at all. */
+        owner_ok = !uid_is_valid(uid) || st.st_uid == uid;
+        group_ok = !gid_is_valid(gid) || st.st_gid == gid;
+        mode_ok = (st.st_mode & ~mask & 07777) == 0;
+        if (owner_ok && group_ok && mode_ok)
+                return 0;
+
+        /* The fd is donated to the callee, no matter whether it succeeds or fails */
+        return chown_recursive_internal(TAKE_FD(dir_fd), &st, uid, gid, mask);
+}
+
+/* Same as path_chown_recursive(), but operates on an already open directory fd, which remains owned by
+ * the caller. Returns -ENOTDIR if the fd does not refer to a directory, 0 if there was nothing to do,
+ * otherwise the result of chown_recursive_internal() or a negative errno-style value. */
+int fd_chown_recursive(
+                int fd,
+                uid_t uid,
+                gid_t gid,
+                mode_t mask) {
+
+        bool owner_ok, group_ok, mode_ok;
+        int fd_copy = -EBADF;
+        struct stat st;
+
+        /* The order of fstat() and the checks differs slightly from path_chown_recursive(): there
+         * O_DIRECTORY on open() guarantees a directory, here we have to verify that ourselves before
+         * deciding whether the operation is redundant. */
+
+        if (fstat(fd, &st) < 0)
+                return -errno;
+
+        if (!S_ISDIR(st.st_mode))
+                return -ENOTDIR;
+
+        /* Neither owner nor group requested and the mask keeps all mode bits: nothing to do */
+        if (!(uid_is_valid(uid) || gid_is_valid(gid)) && FLAGS_SET(mask, 07777))
+                return 0;
+
+        /* Same shortcut as in path_chown_recursive() */
+        owner_ok = !uid_is_valid(uid) || st.st_uid == uid;
+        group_ok = !gid_is_valid(gid) || st.st_gid == gid;
+        mode_ok = (st.st_mode & ~mask & 07777) == 0;
+        if (owner_ok && group_ok && mode_ok)
+                return 0;
+
+        /* Work on a duplicate, since the directory stream takes possession of the fd and closes it */
+        fd_copy = fcntl(fd, F_DUPFD_CLOEXEC, 3);
+        if (fd_copy < 0)
+                return -errno;
+
+        /* The duplicate is donated to the callee even if it fails */
+        return chown_recursive_internal(fd_copy, &st, uid, gid, mask);
+}
diff --git a/src/shared/chown-recursive.h b/src/shared/chown-recursive.h
new file mode 100644
index 0000000..2aab8e7
--- /dev/null
+++ b/src/shared/chown-recursive.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <sys/types.h>
+
+int path_chown_recursive(const char *path, uid_t uid, gid_t gid, mode_t mask, int flags);
+
+int fd_chown_recursive(int fd, uid_t uid, gid_t gid, mode_t mask);
diff --git a/src/shared/clean-ipc.c b/src/shared/clean-ipc.c
new file mode 100644
index 0000000..bbb343f
--- /dev/null
+++ b/src/shared/clean-ipc.c
@@ -0,0 +1,452 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <mqueue.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <sys/ipc.h>
+#include <sys/msg.h>
+#include <sys/sem.h>
+#include <sys/shm.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "clean-ipc.h"
+#include "dirent-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "format-util.h"
+#include "log.h"
+#include "macro.h"
+#include "string-util.h"
+#include "strv.h"
+#include "user-util.h"
+
+/* Returns true if the subject is owned by the UID to delete or by the GID to delete. An invalid
+ * delete_uid or delete_gid never matches. */
+static bool match_uid_gid(uid_t subject_uid, gid_t subject_gid, uid_t delete_uid, gid_t delete_gid) {
+        bool uid_hit, gid_hit;
+
+        uid_hit = uid_is_valid(delete_uid) && subject_uid == delete_uid;
+        gid_hit = gid_is_valid(delete_gid) && subject_gid == delete_gid;
+
+        return uid_hit || gid_hit;
+}
+
+/* Walks /proc/sysvipc/shm and handles every SysV shared memory segment that has no attachments and is
+ * owned by 'delete_uid' or 'delete_gid'. If 'rm' is false, returns 1 as soon as one such segment is
+ * found. If 'rm' is true the segments are removed; the return value is then the result of the last
+ * failed removal (negative), 1 if at least one segment was removed and none failed so far, or 0 if
+ * nothing matched. A missing /proc/sysvipc/shm counts as "nothing found". */
+static int clean_sysvipc_shm(uid_t delete_uid, gid_t delete_gid, bool rm) {
+        _cleanup_fclose_ FILE *f = NULL;
+        bool first = true;
+        int ret = 0, r;
+
+        f = fopen("/proc/sysvipc/shm", "re");
+        if (!f) {
+                if (errno == ENOENT)
+                        return 0;
+
+                return log_warning_errno(errno, "Failed to open /proc/sysvipc/shm: %m");
+        }
+
+        for (;;) {
+                _cleanup_free_ char *line = NULL;
+                unsigned n_attached;
+                pid_t cpid, lpid;
+                uid_t uid, cuid;
+                gid_t gid, cgid;
+                int shmid;
+
+                r = read_line(f, LONG_LINE_MAX, &line);
+                if (r < 0)
+                        /* read_line() reports its error through the return value, hence log 'r' rather
+                         * than errno, the same way the semaphore and message queue variants do */
+                        return log_warning_errno(r, "Failed to read /proc/sysvipc/shm: %m");
+                if (r == 0)
+                        break;
+
+                /* The first line is skipped, lines that do not parse are ignored */
+                if (first) {
+                        first = false;
+                        continue;
+                }
+
+                if (sscanf(line, "%*i %i %*o %*u " PID_FMT " " PID_FMT " %u " UID_FMT " " GID_FMT " " UID_FMT " " GID_FMT,
+                           &shmid, &cpid, &lpid, &n_attached, &uid, &gid, &cuid, &cgid) != 8)
+                        continue;
+
+                /* Segments that are still attached somewhere are left alone */
+                if (n_attached > 0)
+                        continue;
+
+                if (!match_uid_gid(uid, gid, delete_uid, delete_gid))
+                        continue;
+
+                if (!rm)
+                        return 1;
+
+                if (shmctl(shmid, IPC_RMID, NULL) < 0) {
+
+                        /* Ignore entries that are already deleted */
+                        if (IN_SET(errno, EIDRM, EINVAL))
+                                continue;
+
+                        ret = log_warning_errno(errno,
+                                                "Failed to remove SysV shared memory segment %i: %m",
+                                                shmid);
+                } else {
+                        log_debug("Removed SysV shared memory segment %i.", shmid);
+                        if (ret == 0)
+                                ret = 1;
+                }
+        }
+
+        return ret;
+}
+
+/* Walks /proc/sysvipc/sem and handles every SysV semaphore set owned by 'delete_uid' or 'delete_gid'.
+ * If 'rm' is false, returns 1 as soon as one matching entry is found. If 'rm' is true the entries are
+ * removed; the return value is then the result of the last failed removal (negative), 1 if at least one
+ * entry was removed and none failed so far, or 0 if nothing matched. A missing /proc/sysvipc/sem counts
+ * as "nothing found". */
+static int clean_sysvipc_sem(uid_t delete_uid, gid_t delete_gid, bool rm) {
+ _cleanup_fclose_ FILE *f = NULL;
+ bool first = true;
+ int ret = 0, r;
+
+ f = fopen("/proc/sysvipc/sem", "re");
+ if (!f) {
+ if (errno == ENOENT)
+ return 0;
+
+ return log_warning_errno(errno, "Failed to open /proc/sysvipc/sem: %m");
+ }
+
+ for (;;) {
+ _cleanup_free_ char *line = NULL;
+ uid_t uid, cuid;
+ gid_t gid, cgid;
+ int semid;
+
+ r = read_line(f, LONG_LINE_MAX, &line);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to read /proc/sysvipc/sem: %m");
+ if (r == 0)
+ break;
+
+ /* The first line is skipped, lines that do not parse are ignored */
+ if (first) {
+ first = false;
+ continue;
+ }
+
+ if (sscanf(line, "%*i %i %*o %*u " UID_FMT " " GID_FMT " " UID_FMT " " GID_FMT,
+ &semid, &uid, &gid, &cuid, &cgid) != 5)
+ continue;
+
+ if (!match_uid_gid(uid, gid, delete_uid, delete_gid))
+ continue;
+
+ /* Search mode: one hit is all the caller wants to know about */
+ if (!rm)
+ return 1;
+
+ if (semctl(semid, 0, IPC_RMID) < 0) {
+
+ /* Ignore entries that are already deleted */
+ if (IN_SET(errno, EIDRM, EINVAL))
+ continue;
+
+ ret = log_warning_errno(errno,
+ "Failed to remove SysV semaphores object %i: %m",
+ semid);
+ } else {
+ log_debug("Removed SysV semaphore %i.", semid);
+ if (ret == 0)
+ ret = 1;
+ }
+ }
+
+ return ret;
+}
+
+/* Walks /proc/sysvipc/msg and handles every SysV message queue owned by 'delete_uid' or 'delete_gid'.
+ * If 'rm' is false, returns 1 as soon as one matching queue is found. If 'rm' is true the queues are
+ * removed; the return value is then the result of the last failed removal (negative), 1 if at least one
+ * queue was removed and none failed so far, or 0 if nothing matched. A missing /proc/sysvipc/msg counts
+ * as "nothing found". */
+static int clean_sysvipc_msg(uid_t delete_uid, gid_t delete_gid, bool rm) {
+ _cleanup_fclose_ FILE *f = NULL;
+ bool first = true;
+ int ret = 0, r;
+
+ f = fopen("/proc/sysvipc/msg", "re");
+ if (!f) {
+ if (errno == ENOENT)
+ return 0;
+
+ return log_warning_errno(errno, "Failed to open /proc/sysvipc/msg: %m");
+ }
+
+ for (;;) {
+ _cleanup_free_ char *line = NULL;
+ uid_t uid, cuid;
+ gid_t gid, cgid;
+ pid_t cpid, lpid;
+ int msgid;
+
+ r = read_line(f, LONG_LINE_MAX, &line);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to read /proc/sysvipc/msg: %m");
+ if (r == 0)
+ break;
+
+ /* The first line is skipped, lines that do not parse are ignored */
+ if (first) {
+ first = false;
+ continue;
+ }
+
+ if (sscanf(line, "%*i %i %*o %*u %*u " PID_FMT " " PID_FMT " " UID_FMT " " GID_FMT " " UID_FMT " " GID_FMT,
+ &msgid, &cpid, &lpid, &uid, &gid, &cuid, &cgid) != 7)
+ continue;
+
+ if (!match_uid_gid(uid, gid, delete_uid, delete_gid))
+ continue;
+
+ /* Search mode: one hit is all the caller wants to know about */
+ if (!rm)
+ return 1;
+
+ if (msgctl(msgid, IPC_RMID, NULL) < 0) {
+
+ /* Ignore entries that are already deleted */
+ if (IN_SET(errno, EIDRM, EINVAL))
+ continue;
+
+ ret = log_warning_errno(errno,
+ "Failed to remove SysV message queue %i: %m",
+ msgid);
+ } else {
+ log_debug("Removed SysV message queue %i.", msgid);
+ if (ret == 0)
+ ret = 1;
+ }
+ }
+
+ return ret;
+}
+
+/* Handles all entries of the already opened directory 'dir' ('dirname' is only used for log messages)
+ * that are owned by 'uid' or 'gid', descending into subdirectories first. If 'rm' is false, returns 1 as
+ * soon as a matching entry is found anywhere in the tree. If 'rm' is true matching entries are unlinked;
+ * the return value is then the last error encountered (negative), 1 if something was removed and no
+ * error occurred so far, or 0 if nothing matched. */
+static int clean_posix_shm_internal(const char *dirname, DIR *dir, uid_t uid, gid_t gid, bool rm) {
+        int ret = 0, r;
+
+        assert(dir);
+
+        FOREACH_DIRENT_ALL(de, dir, goto fail) {
+                struct stat st;
+
+                if (dot_or_dot_dot(de->d_name))
+                        continue;
+
+                if (fstatat(dirfd(dir), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0) {
+                        if (errno == ENOENT)
+                                continue;
+
+                        ret = log_warning_errno(errno, "Failed to stat() POSIX shared memory segment %s/%s: %m",
+                                                dirname, de->d_name);
+                        continue;
+                }
+
+                if (S_ISDIR(st.st_mode)) {
+                        _cleanup_closedir_ DIR *kid = NULL;
+
+                        kid = xopendirat(dirfd(dir), de->d_name, O_NOFOLLOW|O_NOATIME);
+                        if (!kid) {
+                                if (errno != ENOENT)
+                                        ret = log_warning_errno(errno, "Failed to enter shared memory directory %s/%s: %m",
+                                                                dirname, de->d_name);
+                        } else {
+                                r = clean_posix_shm_internal(de->d_name, kid, uid, gid, rm);
+                                if (r < 0)
+                                        ret = r;
+                                else if (r > 0) {
+                                        /* Something inside the subdirectory matched. Propagate that even
+                                         * if the subdirectory itself belongs to somebody else: in search
+                                         * mode we are done, in removal mode remember that we removed
+                                         * something. */
+                                        if (!rm)
+                                                return 1;
+
+                                        if (ret == 0)
+                                                ret = 1;
+                                }
+                        }
+
+                        if (!match_uid_gid(st.st_uid, st.st_gid, uid, gid))
+                                continue;
+
+                        if (!rm)
+                                return 1;
+
+                        if (unlinkat(dirfd(dir), de->d_name, AT_REMOVEDIR) < 0) {
+
+                                if (errno == ENOENT)
+                                        continue;
+
+                                ret = log_warning_errno(errno, "Failed to remove POSIX shared memory directory %s/%s: %m",
+                                                        dirname, de->d_name);
+                        } else {
+                                log_debug("Removed POSIX shared memory directory %s", de->d_name);
+                                if (ret == 0)
+                                        ret = 1;
+                        }
+                } else {
+
+                        if (!match_uid_gid(st.st_uid, st.st_gid, uid, gid))
+                                continue;
+
+                        if (!rm)
+                                return 1;
+
+                        if (unlinkat(dirfd(dir), de->d_name, 0) < 0) {
+
+                                if (errno == ENOENT)
+                                        continue;
+
+                                ret = log_warning_errno(errno, "Failed to remove POSIX shared memory segment %s: %m", de->d_name);
+                        } else {
+                                log_debug("Removed POSIX shared memory segment %s", de->d_name);
+                                if (ret == 0)
+                                        ret = 1;
+                        }
+                }
+        }
+
+        return ret;
+
+fail:
+        return log_warning_errno(errno, "Failed to read /dev/shm: %m");
+}
+
+/* Opens /dev/shm and hands it to clean_posix_shm_internal(). A missing /dev/shm yields 0, any other
+ * failure to open it is logged and returned as negative errno-style value. */
+static int clean_posix_shm(uid_t uid, gid_t gid, bool rm) {
+        _cleanup_closedir_ DIR *shm_dir = NULL;
+
+        shm_dir = opendir("/dev/shm");
+        if (shm_dir)
+                return clean_posix_shm_internal("/dev/shm", shm_dir, uid, gid, rm);
+
+        if (errno == ENOENT)
+                return 0;
+
+        return log_warning_errno(errno, "Failed to open /dev/shm: %m");
+}
+
+/* Handles all entries of /dev/mqueue that are owned by 'uid' or 'gid'. If 'rm' is false, returns 1 as
+ * soon as a matching entry is found. If 'rm' is true matching queues are removed with mq_unlink(); the
+ * return value is then the last error encountered (negative), 1 if something was removed and no error
+ * occurred so far, or 0 if nothing matched. A missing /dev/mqueue counts as "nothing found". */
+static int clean_posix_mq(uid_t uid, gid_t gid, bool rm) {
+ _cleanup_closedir_ DIR *dir = NULL;
+ int ret = 0;
+
+ dir = opendir("/dev/mqueue");
+ if (!dir) {
+ if (errno == ENOENT)
+ return 0;
+
+ return log_warning_errno(errno, "Failed to open /dev/mqueue: %m");
+ }
+
+ FOREACH_DIRENT_ALL(de, dir, goto fail) {
+ struct stat st;
+ /* Room for the leading "/", the entry name and the trailing NUL */
+ char fn[1+strlen(de->d_name)+1];
+
+ if (dot_or_dot_dot(de->d_name))
+ continue;
+
+ if (fstatat(dirfd(dir), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0) {
+ if (errno == ENOENT)
+ continue;
+
+ ret = log_warning_errno(errno,
+ "Failed to stat() MQ segment %s: %m",
+ de->d_name);
+ continue;
+ }
+
+ if (!match_uid_gid(st.st_uid, st.st_gid, uid, gid))
+ continue;
+
+ /* Search mode: one hit is all the caller wants to know about */
+ if (!rm)
+ return 1;
+
+ /* mq_unlink() is called with the entry name prefixed by a slash */
+ fn[0] = '/';
+ strcpy(fn+1, de->d_name);
+
+ if (mq_unlink(fn) < 0) {
+ if (errno == ENOENT)
+ continue;
+
+ ret = log_warning_errno(errno,
+ "Failed to unlink POSIX message queue %s: %m",
+ fn);
+ } else {
+ log_debug("Removed POSIX message queue %s", fn);
+ if (ret == 0)
+ ret = 1;
+ }
+ }
+
+ return ret;
+
+fail:
+ return log_warning_errno(errno, "Failed to read /dev/mqueue: %m");
+}
+
+int clean_ipc_internal(uid_t uid, gid_t gid, bool rm) {
+        /* The individual IPC flavours, in the order in which they are processed */
+        static int (* const handlers[])(uid_t, gid_t, bool) = {
+                clean_sysvipc_shm,
+                clean_sysvipc_sem,
+                clean_sysvipc_msg,
+                clean_posix_shm,
+                clean_posix_mq,
+        };
+        int ret = 0;
+
+        /* With 'rm' set: remove all IPC objects owned by either the given UID or the given GID. Returns
+         * the first non-zero result of the individual steps, i.e. < 0 for an error, > 0 if matching
+         * objects were found and removed, and == 0 if no matching objects exist.
+         *
+         * With 'rm' unset: only look for IPC objects owned by either the given UID or the given GID.
+         * Returns < 0 on error, > 0 if a matching object was found, == 0 if not.
+         *
+         * Special rule: root's objects are never cleaned up. If the UID or GID is root we silently skip
+         * it when removing, and always claim that objects exist when searching. */
+
+        if (!rm && (uid == 0 || gid == 0))
+                return 1;
+
+        if (uid == 0)
+                uid = UID_INVALID;
+        if (gid == 0)
+                gid = GID_INVALID;
+
+        /* Nothing left to match against? */
+        if (!(uid_is_valid(uid) || gid_is_valid(gid)))
+                return 0;
+
+        for (size_t i = 0; i < sizeof(handlers) / sizeof(handlers[0]); i++) {
+                int r;
+
+                r = handlers[i](uid, gid, rm);
+                if (r == 0)
+                        continue;
+
+                /* When searching, the first hit or error ends the search */
+                if (!rm)
+                        return r;
+
+                if (ret == 0)
+                        ret = r;
+        }
+
+        return ret;
+}
+
+/* Removes the IPC objects owned by 'uid'; no GID is matched. Return value as for clean_ipc_internal(). */
+int clean_ipc_by_uid(uid_t uid) {
+ return clean_ipc_internal(uid, GID_INVALID, true);
+}
+
+/* Removes the IPC objects owned by 'gid'; no UID is matched. Return value as for clean_ipc_internal(). */
+int clean_ipc_by_gid(gid_t gid) {
+ return clean_ipc_internal(UID_INVALID, gid, true);
+}
diff --git a/src/shared/clean-ipc.h b/src/shared/clean-ipc.h
new file mode 100644
index 0000000..ed348fb
--- /dev/null
+++ b/src/shared/clean-ipc.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <sys/types.h>
+
+#include "user-util.h"
+
+int clean_ipc_internal(uid_t uid, gid_t gid, bool rm);
+
+/* Remove all IPC objects owned by the specified UID or GID */
+int clean_ipc_by_uid(uid_t uid);
+int clean_ipc_by_gid(gid_t gid);
+
+/* Check if any IPC object owned by the specified UID or GID exists, without removing anything. Returns > 0
+ * if so, == 0 if not, < 0 on error. */
+static inline int search_ipc(uid_t uid, gid_t gid) {
+ return clean_ipc_internal(uid, gid, false);
+}
diff --git a/src/shared/clock-util.c b/src/shared/clock-util.c
new file mode 100644
index 0000000..b0cbe30
--- /dev/null
+++ b/src/shared/clock-util.c
@@ -0,0 +1,167 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <time.h>
+#include <linux/rtc.h>
+#include <stdio.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+
+#include "alloc-util.h"
+#include "clock-util.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "macro.h"
+#include "string-util.h"
+
+/* Reads the current time from /dev/rtc into *tm. Returns 0 on success, a negative errno-style value if
+ * the device cannot be opened or queried. */
+int clock_get_hwclock(struct tm *tm) {
+        _cleanup_close_ int rtc_fd = -EBADF;
+
+        assert(tm);
+
+        rtc_fd = open("/dev/rtc", O_RDONLY|O_CLOEXEC);
+
+        /* Note that RTC_RD_TIME leaves the timezone fields of struct tm uninitialized! */
+        if (rtc_fd >= 0 && ioctl(rtc_fd, RTC_RD_TIME, tm) >= 0) {
+                /* We have no idea about daylight saving, hence reset the field so that mktime() does
+                 * not get confused. */
+                tm->tm_isdst = -1;
+                return 0;
+        }
+
+        /* Either open() or ioctl() failed, errno is from whichever call was last */
+        return -errno;
+}
+
+/* Writes *tm to /dev/rtc using the RTC_SET_TIME ioctl. Returns a negative errno-style value if the
+ * device cannot be opened, otherwise the ioctl result as converted by RET_NERRNO(). */
+int clock_set_hwclock(const struct tm *tm) {
+ _cleanup_close_ int fd = -EBADF;
+
+ assert(tm);
+
+ fd = open("/dev/rtc", O_RDONLY|O_CLOEXEC);
+ if (fd < 0)
+ return -errno;
+
+ return RET_NERRNO(ioctl(fd, RTC_SET_TIME, tm));
+}
+
+/* Determines from the adjtime file (default: /etc/adjtime) whether the RTC is kept in local time.
+ * Returns > 0 if the third line is exactly "LOCAL", 0 otherwise (including when the file is missing or
+ * has fewer than three lines), and a negative errno-style value on read errors. */
+int clock_is_localtime(const char* adjtime_path) {
+        _cleanup_fclose_ FILE *adjtime = NULL;
+        _cleanup_free_ char *third = NULL;
+        int r;
+
+        if (!adjtime_path)
+                adjtime_path = "/etc/adjtime";
+
+        /*
+         * The third line of adjtime is "UTC" or "LOCAL" or nothing.
+         *   # /etc/adjtime
+         *   0.0 0 0
+         *   0
+         *   UTC
+         */
+        adjtime = fopen(adjtime_path, "re");
+        if (!adjtime)
+                /* adjtime not present → default to UTC */
+                return errno == ENOENT ? false : -errno;
+
+        /* Lines one and two are read and thrown away, only line three is kept */
+        for (unsigned lineno = 1; lineno <= 3; lineno++) {
+                r = read_line(adjtime, LONG_LINE_MAX, lineno < 3 ? NULL : &third);
+                if (r < 0)
+                        return r;
+                if (r == 0)
+                        return false; /* less than three lines → default to UTC */
+        }
+
+        return streq(third, "LOCAL");
+}
+
+/* Tells the kernel the current local timezone offset via settimeofday(). On success returns 0 and, if
+ * 'ret_minutesdelta' is non-NULL, stores the offset in minutes (tm_gmtoff / 60) there. Returns a
+ * negative errno-style value if settimeofday() fails. */
+int clock_set_timezone(int *ret_minutesdelta) {
+        struct timespec now_ts;
+        struct timezone kernel_tz;
+        struct tm local;
+        int offset_minutes;
+
+        assert_se(clock_gettime(CLOCK_REALTIME, &now_ts) == 0);
+        assert_se(localtime_r(&now_ts.tv_sec, &local));
+        offset_minutes = local.tm_gmtoff / 60;
+
+        kernel_tz.tz_minuteswest = -offset_minutes;
+        kernel_tz.tz_dsttime = 0; /* DST_NONE */
+
+        /* If the RTC runs in local time rather than UTC, the very first settimeofday() call sets the
+         * kernel's timezone and warps the system clock, so that it runs in UTC instead of the local
+         * time we have read from the RTC. */
+        if (settimeofday(NULL, &kernel_tz) < 0)
+                return -errno;
+
+        if (ret_minutesdelta)
+                *ret_minutesdelta = offset_minutes;
+
+        return 0;
+}
+
+/* Issues a settimeofday() call with an all-zero timezone. Returns the result converted by RET_NERRNO(). */
+int clock_reset_timewarp(void) {
+        static const struct timezone zero_tz = {
+                .tz_minuteswest = 0,
+                .tz_dsttime = 0, /* DST_NONE */
+        };
+
+        /* Only the very first settimeofday() call does the time warp magic. By doing a dummy call here
+         * the warping is sealed, and all later calls behave as expected. */
+        return RET_NERRNO(settimeofday(NULL, &zero_tz));
+}
+
+#define EPOCH_FILE "/usr/lib/clock-epoch"
+
+/* Makes sure the system clock lies within the range considered valid: not before the epoch (the mtime
+ * of EPOCH_FILE, or TIME_EPOCH if that file cannot be stat()ed) and, if CLOCK_VALID_RANGE_USEC_MAX is
+ * non-zero, not more than that much after it. If the clock is outside the range it is set to the epoch.
+ * Returns 0 if nothing had to be done, 1 if the clock was set, a negative errno-style value if setting
+ * it failed. */
+int clock_apply_epoch(ClockChangeDirection *ret_attempted_change) {
+ usec_t epoch_usec, now_usec;
+ struct stat st;
+
+ /* NB: we update *ret_attempted_change in *all* cases, both
+ * on success and failure, to indicate what we intended to do! */
+
+ assert(ret_attempted_change);
+
+ if (stat(EPOCH_FILE, &st) < 0) {
+ /* A missing file is expected and not worth a warning */
+ if (errno != ENOENT)
+ log_warning_errno(errno, "Cannot stat " EPOCH_FILE ": %m");
+
+ epoch_usec = (usec_t) TIME_EPOCH * USEC_PER_SEC;
+ } else
+ epoch_usec = timespec_load(&st.st_mtim);
+
+ now_usec = now(CLOCK_REALTIME);
+ if (now_usec < epoch_usec)
+ *ret_attempted_change = CLOCK_CHANGE_FORWARD;
+ else if (CLOCK_VALID_RANGE_USEC_MAX > 0 && now_usec > usec_add(epoch_usec, CLOCK_VALID_RANGE_USEC_MAX))
+ *ret_attempted_change = CLOCK_CHANGE_BACKWARD;
+ else {
+ *ret_attempted_change = CLOCK_CHANGE_NOOP;
+ return 0;
+ }
+
+ /* In both directions the clock is set to the epoch itself */
+ if (clock_settime(CLOCK_REALTIME, TIMESPEC_STORE(epoch_usec)) < 0)
+ return -errno;
+
+ return 1;
+}
diff --git a/src/shared/clock-util.h b/src/shared/clock-util.h
new file mode 100644
index 0000000..c8f6d1b
--- /dev/null
+++ b/src/shared/clock-util.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <errno.h>
+#include <time.h>
+
+/* What clock_apply_epoch() did, or tried to do, to the system clock */
+typedef enum ClockChangeDirection {
+ CLOCK_CHANGE_NOOP, /* the clock was within the valid range, nothing was changed */
+ CLOCK_CHANGE_FORWARD, /* the clock was before the epoch */
+ CLOCK_CHANGE_BACKWARD, /* the clock was too far past the epoch */
+ _CLOCK_CHANGE_MAX,
+ _CLOCK_CHANGE_INVALID = -EINVAL,
+} ClockChangeDirection;
+
+int clock_is_localtime(const char* adjtime_path);
+int clock_set_timezone(int *ret_minutesdelta);
+int clock_reset_timewarp(void);
+int clock_get_hwclock(struct tm *tm);
+int clock_set_hwclock(const struct tm *tm);
+int clock_apply_epoch(ClockChangeDirection *ret_attempted_change);
diff --git a/src/shared/common-signal.c b/src/shared/common-signal.c
new file mode 100644
index 0000000..8e70e36
--- /dev/null
+++ b/src/shared/common-signal.c
@@ -0,0 +1,85 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "common-signal.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "memstream-util.h"
+#include "process-util.h"
+#include "signal-util.h"
+
+/* sd-event signal handler for the SIGRTMIN+18 control signal. The command is taken from the integer
+ * value queued along with the signal (si->ssi_int); signals that were not sent with a value
+ * (ssi_code != SI_QUEUE) are logged and ignored.
+ *
+ * userdata may be NULL or point to a struct sigrtmin18_info providing a memory pressure hook.
+ *
+ * Returns 0, except for the memory pressure command with a hook installed, where the hook's return
+ * value is passed through. */
+int sigrtmin18_handler(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
+        struct sigrtmin18_info *hooks = userdata;
+        _cleanup_free_ char *sender = NULL;
+
+        assert(s);
+        assert(si);
+
+        /* Best effort, the sender's name is only used in log messages (strna() covers failure) */
+        (void) pid_get_comm(si->ssi_pid, &sender);
+
+        if (si->ssi_code != SI_QUEUE) {
+                log_notice("Received control signal %s from process " PID_FMT " (%s) without command value, ignoring.",
+                           signal_to_string(si->ssi_signo),
+                           (pid_t) si->ssi_pid,
+                           strna(sender));
+                return 0;
+        }
+
+        log_debug("Received control signal %s from process " PID_FMT " (%s) with command 0x%08x.",
+                  signal_to_string(si->ssi_signo),
+                  (pid_t) si->ssi_pid,
+                  strna(sender),
+                  (unsigned) si->ssi_int);
+
+        switch (si->ssi_int) {
+
+        /* Log level commands: the offset from the base is the syslog level */
+        case _COMMON_SIGNAL_COMMAND_LOG_LEVEL_BASE..._COMMON_SIGNAL_COMMAND_LOG_LEVEL_END:
+                log_set_max_level(si->ssi_int - _COMMON_SIGNAL_COMMAND_LOG_LEVEL_BASE);
+                break;
+
+        /* Log target commands */
+        case COMMON_SIGNAL_COMMAND_CONSOLE:
+                log_set_target_and_open(LOG_TARGET_CONSOLE);
+                break;
+
+        case COMMON_SIGNAL_COMMAND_JOURNAL:
+                log_set_target_and_open(LOG_TARGET_JOURNAL);
+                break;
+
+        case COMMON_SIGNAL_COMMAND_KMSG:
+                log_set_target_and_open(LOG_TARGET_KMSG);
+                break;
+
+        case COMMON_SIGNAL_COMMAND_NULL:
+                log_set_target_and_open(LOG_TARGET_NULL);
+                break;
+
+        case COMMON_SIGNAL_COMMAND_MEMORY_PRESSURE:
+                /* Without a service-specific hook, fall back to the generic trimming */
+                if (!hooks || !hooks->memory_pressure_handler) {
+                        sd_event_trim_memory();
+                        break;
+                }
+
+                return hooks->memory_pressure_handler(s, hooks->memory_pressure_userdata);
+
+        case COMMON_SIGNAL_COMMAND_MALLOC_INFO: {
+                _cleanup_(memstream_done) MemStream stream = {};
+                FILE *out = memstream_init(&stream);
+
+                if (!out)
+                        log_oom();
+                else if (malloc_info(0, out) < 0)
+                        log_error_errno(errno, "Failed to invoke malloc_info(): %m");
+                else
+                        (void) memstream_dump(LOG_INFO, &stream);
+
+                break;
+        }
+
+        default:
+                log_notice("Received control signal %s with unknown command 0x%08x, ignoring.",
+                           signal_to_string(si->ssi_signo), (unsigned) si->ssi_int);
+                break;
+        }
+
+        return 0;
+}
diff --git a/src/shared/common-signal.h b/src/shared/common-signal.h
new file mode 100644
index 0000000..1fe7b76
--- /dev/null
+++ b/src/shared/common-signal.h
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <syslog.h>
+
+#include <sd-event.h>
+
+/* All our long-running services should implement a SIGRTMIN+18 handler that can be used to trigger certain
+ * actions that affect service runtime. The specific action is indicated via the "value integer" you can pass
+ * along realtime signals. This is mostly intended for debugging purposes and is entirely asynchronous in
+ * nature.
+ *
+ * Currently available operations:
+ *
+ * • Change maximum log level
+ * • Change log target
+ * • Invoke memory trimming, like under memory pressure
+ * • Write glibc malloc() allocation info to logs
+ *
+ * How to use this? Via a command like the following:
+ *
+ * /usr/bin/kill -s RTMIN+18 -q 768 1
+ *
+ * (This will tell PID 1 to trim its memory use.)
+ *
+ * or:
+ *
+ * systemctl kill --kill-value=0x300 -s RTMIN+18 systemd-journald
+ *
+ * (This will tell journald to trim its memory use.)
+ */
+
+/* Command values carried in the integer payload of SIGRTMIN+18, as dispatched by sigrtmin18_handler(). */
+enum {
+ /* Log level commands: the handler passes (value - base), i.e. the syslog level, to log_set_max_level() */
+ _COMMON_SIGNAL_COMMAND_LOG_LEVEL_BASE = 0x100,
+ COMMON_SIGNAL_COMMAND_LOG_EMERG = _COMMON_SIGNAL_COMMAND_LOG_LEVEL_BASE + LOG_EMERG,
+ COMMON_SIGNAL_COMMAND_LOG_ALERT = _COMMON_SIGNAL_COMMAND_LOG_LEVEL_BASE + LOG_ALERT,
+ COMMON_SIGNAL_COMMAND_LOG_CRIT = _COMMON_SIGNAL_COMMAND_LOG_LEVEL_BASE + LOG_CRIT,
+ COMMON_SIGNAL_COMMAND_LOG_ERR = _COMMON_SIGNAL_COMMAND_LOG_LEVEL_BASE + LOG_ERR,
+ COMMON_SIGNAL_COMMAND_LOG_WARNING = _COMMON_SIGNAL_COMMAND_LOG_LEVEL_BASE + LOG_WARNING,
+ COMMON_SIGNAL_COMMAND_LOG_NOTICE = _COMMON_SIGNAL_COMMAND_LOG_LEVEL_BASE + LOG_NOTICE,
+ COMMON_SIGNAL_COMMAND_LOG_INFO = _COMMON_SIGNAL_COMMAND_LOG_LEVEL_BASE + LOG_INFO,
+ COMMON_SIGNAL_COMMAND_LOG_DEBUG = _COMMON_SIGNAL_COMMAND_LOG_LEVEL_BASE + LOG_DEBUG,
+ _COMMON_SIGNAL_COMMAND_LOG_LEVEL_END = COMMON_SIGNAL_COMMAND_LOG_DEBUG,
+
+ /* Log target commands: switch the log target to console, journal, kmsg or null respectively */
+ COMMON_SIGNAL_COMMAND_CONSOLE = 0x200,
+ COMMON_SIGNAL_COMMAND_JOURNAL,
+ COMMON_SIGNAL_COMMAND_KMSG,
+ COMMON_SIGNAL_COMMAND_NULL,
+
+ /* Memory commands: trigger memory trimming (or the registered hook), or log malloc_info() output */
+ COMMON_SIGNAL_COMMAND_MEMORY_PRESSURE = 0x300,
+ COMMON_SIGNAL_COMMAND_MALLOC_INFO,
+
+ /* Private signals start at 0x500 */
+ _COMMON_SIGNAL_COMMAND_PRIVATE_BASE = 0x500,
+ _COMMON_SIGNAL_COMMAND_PRIVATE_END = 0xfff,
+};
+
+/* Optional hooks handed to sigrtmin18_handler() as userdata. If memory_pressure_handler is set, it is
+ * called with memory_pressure_userdata for COMMON_SIGNAL_COMMAND_MEMORY_PRESSURE; otherwise the handler
+ * calls sd_event_trim_memory() itself. */
+struct sigrtmin18_info {
+ sd_event_handler_t memory_pressure_handler;
+ void *memory_pressure_userdata;
+};
+
+/* Signal handler for SIGRTMIN+18; userdata may be NULL or a struct sigrtmin18_info pointer. */
+int sigrtmin18_handler(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata);
diff --git a/src/shared/compare-operator.c b/src/shared/compare-operator.c
new file mode 100644
index 0000000..0da28fc
--- /dev/null
+++ b/src/shared/compare-operator.c
@@ -0,0 +1,119 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <fnmatch.h>
+
+#include "compare-operator.h"
+#include "string-util.h"
+
+/* Parses a comparison operator at the start of *s. On success, *s is advanced past the operator and
+ * the operator is returned. If no operator is recognized, or the recognized operator is not permitted
+ * by 'flags', _COMPARE_OPERATOR_INVALID is returned and *s is left untouched. */
+CompareOperator parse_compare_operator(const char **s, CompareOperatorParseFlags flags) {
+        /* Entries are tried in order, hence operators that are prefixes of others must come later. */
+        static const struct {
+                CompareOperator op;
+                const char *str;
+                CompareOperatorParseFlags valid_mask; /* If this operator appears when flags in mask not set, fail */
+                CompareOperatorParseFlags need_mask;  /* Skip over this operator when flags in mask not set */
+        } table[] = {
+                { COMPARE_FNMATCH_EQUAL,    "$=",  .valid_mask = COMPARE_ALLOW_FNMATCH   },
+                { COMPARE_FNMATCH_UNEQUAL,  "!$=", .valid_mask = COMPARE_ALLOW_FNMATCH   },
+
+                { COMPARE_UNEQUAL,          "<>"                                         },
+                { COMPARE_LOWER_OR_EQUAL,   "<="                                         },
+                { COMPARE_GREATER_OR_EQUAL, ">="                                         },
+                { COMPARE_LOWER,            "<"                                          },
+                { COMPARE_GREATER,          ">"                                          },
+                { COMPARE_EQUAL,            "=="                                         },
+                { COMPARE_STRING_EQUAL,     "=",   .need_mask = COMPARE_EQUAL_BY_STRING  },
+                { COMPARE_EQUAL,            "="                                          },
+                { COMPARE_STRING_UNEQUAL,   "!=",  .need_mask = COMPARE_EQUAL_BY_STRING  },
+                { COMPARE_UNEQUAL,          "!="                                         },
+
+                { COMPARE_LOWER,            "lt",  .valid_mask = COMPARE_ALLOW_TEXTUAL   },
+                { COMPARE_LOWER_OR_EQUAL,   "le",  .valid_mask = COMPARE_ALLOW_TEXTUAL   },
+                { COMPARE_EQUAL,            "eq",  .valid_mask = COMPARE_ALLOW_TEXTUAL   },
+                { COMPARE_UNEQUAL,          "ne",  .valid_mask = COMPARE_ALLOW_TEXTUAL   },
+                { COMPARE_GREATER_OR_EQUAL, "ge",  .valid_mask = COMPARE_ALLOW_TEXTUAL   },
+                { COMPARE_GREATER,          "gt",  .valid_mask = COMPARE_ALLOW_TEXTUAL   },
+        };
+
+        assert(s);
+
+        /* Hmm, we already reached the end, for example because extract_first_word() and
+         * parse_compare_operator() are used on the same string? */
+        if (!*s)
+                return _COMPARE_OPERATOR_INVALID;
+
+        for (size_t i = 0; i < ELEMENTSOF(table); i++) {
+                const char *rest;
+
+                /* Entries with a need_mask only take part if all of those flags are set */
+                if (table[i].need_mask != 0 && !FLAGS_SET(flags, table[i].need_mask))
+                        continue;
+
+                rest = startswith(*s, table[i].str);
+                if (!rest)
+                        continue;
+
+                /* Recognized, but not allowed in this context: fail instead of trying further entries */
+                if (table[i].valid_mask != 0 && !FLAGS_SET(flags, table[i].valid_mask))
+                        return _COMPARE_OPERATOR_INVALID;
+
+                *s = rest;
+                return table[i].op;
+        }
+
+        return _COMPARE_OPERATOR_INVALID;
+}
+
+/* Interprets k as the result of a three-way comparison (negative, zero, positive) and checks it against
+ * the order operator 'op'. Returns 1 or 0, or -EINVAL if 'op' is not an order operator. */
+int test_order(int k, CompareOperator op) {
+
+        if (op == COMPARE_LOWER)
+                return k < 0;
+
+        if (op == COMPARE_LOWER_OR_EQUAL)
+                return k <= 0;
+
+        if (op == COMPARE_EQUAL)
+                return k == 0;
+
+        if (op == COMPARE_UNEQUAL)
+                return k != 0;
+
+        if (op == COMPARE_GREATER_OR_EQUAL)
+                return k >= 0;
+
+        if (op == COMPARE_GREATER)
+                return k > 0;
+
+        return -EINVAL;
+}
+
+/* Compares a against b as selected by 'op':
+ *
+ *   string operators  → plain string (in)equality via streq_ptr()
+ *   fnmatch operators → a is matched against the glob pattern b
+ *   order operators   → strverscmp_improved(a, b), evaluated by test_order()
+ *
+ * Returns 1 or 0, or -EINVAL if fnmatch() fails or 'op' is not a known operator. */
+int version_or_fnmatch_compare(
+                CompareOperator op,
+                const char *a,
+                const char *b) {
+
+        if (COMPARE_OPERATOR_IS_STRING(op)) {
+                bool same = streq_ptr(a, b);
+
+                return op == COMPARE_STRING_EQUAL ? same : !same;
+        }
+
+        if (COMPARE_OPERATOR_IS_FNMATCH(op)) {
+                int r = fnmatch(b, a, 0);
+
+                if (r == 0)
+                        return op == COMPARE_FNMATCH_EQUAL;
+                if (r == FNM_NOMATCH)
+                        return op == COMPARE_FNMATCH_UNEQUAL;
+
+                return -EINVAL;
+        }
+
+        if (COMPARE_OPERATOR_IS_ORDER(op))
+                return test_order(strverscmp_improved(a, b), op);
+
+        return -EINVAL;
+}
diff --git a/src/shared/compare-operator.h b/src/shared/compare-operator.h
new file mode 100644
index 0000000..900f3e5
--- /dev/null
+++ b/src/shared/compare-operator.h
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <errno.h>
+#include <stdbool.h>
+
+#define COMPARE_OPERATOR_CHARS "!<=>"
+#define COMPARE_OPERATOR_WITH_FNMATCH_CHARS COMPARE_OPERATOR_CHARS "$"
+
+/* Comparison operators as returned by parse_compare_operator(). The values are grouped into three
+ * contiguous ranges (string, fnmatch, order), each delimited by _FIRST/_LAST markers, so that the
+ * COMPARE_OPERATOR_IS_*() helpers can classify an operator with a range check. */
+typedef enum CompareOperator {
+ /* Listed in order of checking. Note that some comparators are prefixes of others, hence the longest
+ * should be listed first. */
+
+ /* Simple string compare operators */
+ _COMPARE_OPERATOR_STRING_FIRST,
+ COMPARE_STRING_EQUAL = _COMPARE_OPERATOR_STRING_FIRST,
+ COMPARE_STRING_UNEQUAL,
+ _COMPARE_OPERATOR_STRING_LAST = COMPARE_STRING_UNEQUAL,
+
+ /* fnmatch() compare operators */
+ _COMPARE_OPERATOR_FNMATCH_FIRST,
+ COMPARE_FNMATCH_EQUAL = _COMPARE_OPERATOR_FNMATCH_FIRST,
+ COMPARE_FNMATCH_UNEQUAL,
+ _COMPARE_OPERATOR_FNMATCH_LAST = COMPARE_FNMATCH_UNEQUAL,
+
+ /* Order compare operators */
+ _COMPARE_OPERATOR_ORDER_FIRST,
+ COMPARE_LOWER_OR_EQUAL = _COMPARE_OPERATOR_ORDER_FIRST,
+ COMPARE_GREATER_OR_EQUAL,
+ COMPARE_LOWER,
+ COMPARE_GREATER,
+ COMPARE_EQUAL,
+ COMPARE_UNEQUAL,
+ _COMPARE_OPERATOR_ORDER_LAST = COMPARE_UNEQUAL,
+
+ _COMPARE_OPERATOR_MAX,
+ /* Negative, so that "op < 0" can be used to detect a failed parse */
+ _COMPARE_OPERATOR_INVALID = -EINVAL,
+} CompareOperator;
+
+/* True if c is one of the plain string (in)equality operators. */
+static inline bool COMPARE_OPERATOR_IS_STRING(CompareOperator c) {
+        return _COMPARE_OPERATOR_STRING_FIRST <= c && c <= _COMPARE_OPERATOR_STRING_LAST;
+}
+
+/* True if c is one of the fnmatch() based operators. */
+static inline bool COMPARE_OPERATOR_IS_FNMATCH(CompareOperator c) {
+        return _COMPARE_OPERATOR_FNMATCH_FIRST <= c && c <= _COMPARE_OPERATOR_FNMATCH_LAST;
+}
+
+/* True if c is one of the order operators understood by test_order(). */
+static inline bool COMPARE_OPERATOR_IS_ORDER(CompareOperator c) {
+        return _COMPARE_OPERATOR_ORDER_FIRST <= c && c <= _COMPARE_OPERATOR_ORDER_LAST;
+}
+
+/* Flags controlling which operators parse_compare_operator() accepts. */
+typedef enum CompareOperatorParseFlags {
+ /* Permit the fnmatch operators "$=" and "!$="; without it they make parsing fail */
+ COMPARE_ALLOW_FNMATCH = 1 << 0,
+ /* Parse "=" and "!=" as string (in)equality rather than as order operators */
+ COMPARE_EQUAL_BY_STRING = 1 << 1,
+ /* Permit the textual operators "lt", "le", "eq", "ne", "ge", "gt"; without it they make parsing fail */
+ COMPARE_ALLOW_TEXTUAL = 1 << 2,
+} CompareOperatorParseFlags;
+
+/* Parses an operator at the start of *s and advances *s past it; returns a negative value if there is
+ * no (permitted) operator. */
+CompareOperator parse_compare_operator(const char **s, CompareOperatorParseFlags flags);
+
+/* Checks the three-way comparison result k against an order operator; -EINVAL for other operators. */
+int test_order(int k, CompareOperator op);
+
+/* Compares a with b (b is the pattern for fnmatch operators); returns 1, 0 or -EINVAL. */
+int version_or_fnmatch_compare(CompareOperator op, const char *a, const char *b);
diff --git a/src/shared/condition.c b/src/shared/condition.c
new file mode 100644
index 0000000..d3446e8
--- /dev/null
+++ b/src/shared/condition.c
@@ -0,0 +1,1360 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <fnmatch.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/utsname.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "sd-id128.h"
+
+#include "alloc-util.h"
+#include "apparmor-util.h"
+#include "architecture.h"
+#include "audit-util.h"
+#include "battery-util.h"
+#include "blockdev-util.h"
+#include "cap-list.h"
+#include "cgroup-util.h"
+#include "compare-operator.h"
+#include "condition.h"
+#include "confidential-virt.h"
+#include "cpu-set-util.h"
+#include "creds-util.h"
+#include "efi-api.h"
+#include "efi-loader.h"
+#include "env-file.h"
+#include "env-util.h"
+#include "extract-word.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "fs-util.h"
+#include "glob-util.h"
+#include "hostname-util.h"
+#include "ima-util.h"
+#include "initrd-util.h"
+#include "limits-util.h"
+#include "list.h"
+#include "macro.h"
+#include "mountpoint-util.h"
+#include "nulstr-util.h"
+#include "os-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "percent-util.h"
+#include "proc-cmdline.h"
+#include "process-util.h"
+#include "psi-util.h"
+#include "selinux-util.h"
+#include "smack-util.h"
+#include "special.h"
+#include "stat-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "tomoyo-util.h"
+#include "tpm2-util.h"
+#include "uid-alloc-range.h"
+#include "user-util.h"
+#include "virt.h"
+
+/* Allocates a new Condition of the given type, with a private copy of 'parameter'. All fields not
+ * passed in here start out zeroed. Returns NULL if memory allocation fails. */
+Condition* condition_new(ConditionType type, const char *parameter, bool trigger, bool negate) {
+        Condition *cond;
+
+        assert(type >= 0);
+        assert(type < _CONDITION_TYPE_MAX);
+        assert(parameter);
+
+        cond = new(Condition, 1);
+        if (!cond)
+                return NULL;
+
+        *cond = (Condition) {
+                .type = type,
+                .trigger = trigger,
+                .negate = negate,
+        };
+
+        if (!parameter)
+                return cond;
+
+        cond->parameter = strdup(parameter);
+        if (!cond->parameter)
+                return mfree(cond);
+
+        return cond;
+}
+
+/* Releases a Condition together with its parameter string. Always returns NULL, so that callers can
+ * write "c = condition_free(c);". */
+Condition* condition_free(Condition *c) {
+        assert(c);
+
+        free(c->parameter);
+        free(c);
+
+        return NULL;
+}
+
+/* Removes and frees all conditions of the given type from the list, or all conditions if type is
+ * negative. Returns the list head as left behind by the removals (NULL if everything was freed). */
+Condition* condition_free_list_type(Condition *head, ConditionType type) {
+        /* NOTE(review): entries are removed while iterating, which assumes LIST_FOREACH() tolerates
+         * removal of the current element — confirm against list.h. */
+        LIST_FOREACH(conditions, c, head) {
+                if (type >= 0 && c->type != type)
+                        continue;
+
+                LIST_REMOVE(conditions, head, c);
+                condition_free(c);
+        }
+
+        assert(type >= 0 || !head);
+        return head;
+}
+
+/* Checks whether the kernel command line contains the parameter. A parameter containing "=" must match
+ * a word exactly; otherwise the word must be either the bare parameter or "parameter=<anything>".
+ * Returns true/false, or a negative errno if the command line cannot be read. */
+static int condition_test_kernel_command_line(Condition *c, char **env) {
+        _cleanup_strv_free_ char **cmdline = NULL;
+        bool has_value;
+        int r;
+
+        assert(c);
+        assert(c->parameter);
+        assert(c->type == CONDITION_KERNEL_COMMAND_LINE);
+
+        r = proc_cmdline_strv(&cmdline);
+        if (r < 0)
+                return r;
+
+        has_value = strchr(c->parameter, '=');
+
+        STRV_FOREACH(arg, cmdline) {
+                const char *tail;
+
+                if (has_value) {
+                        if (streq(*arg, c->parameter))
+                                return true;
+
+                        continue;
+                }
+
+                /* Key-only match: the key must be followed by the end of the word or by "=" */
+                tail = startswith(*arg, c->parameter);
+                if (tail && IN_SET(*tail, 0, '='))
+                        return true;
+        }
+
+        return false;
+}
+
+/* Checks whether a credential with the given name exists in either the regular or the encrypted
+ * credentials directory. Returns true/false, or a negative errno on unexpected failures. */
+static int condition_test_credential(Condition *c, char **env) {
+        int (*get_dir)(const char **ret);
+        int r;
+
+        assert(c);
+        assert(c->parameter);
+        assert(c->type == CONDITION_CREDENTIAL);
+
+        /* For now we'll do a very simple existence check and are happy with either a regular or an encrypted
+         * credential. Given that we check the syntax of the argument we have the option to later maybe allow
+         * contents checks too without breaking compatibility, but for now let's be minimalistic. */
+
+        /* Credentials with invalid names cannot exist */
+        if (!credential_name_valid(c->parameter))
+                return false;
+
+        FOREACH_POINTER(get_dir, get_credentials_dir, get_encrypted_credentials_dir) {
+                _cleanup_free_ char *path = NULL;
+                const char *dir;
+
+                r = get_dir(&dir);
+                if (r == -ENXIO) /* no env var set */
+                        continue;
+                if (r < 0)
+                        return r;
+
+                path = path_join(dir, c->parameter);
+                if (!path)
+                        return -ENOMEM;
+
+                if (laccess(path, F_OK) < 0) {
+                        /* Only "does not exist" lets us go on to the next directory */
+                        if (errno != ENOENT)
+                                return -errno;
+                } else
+                        return true; /* yay! */
+        }
+
+        return false;
+}
+
+/* Tests the running kernel release (uname() "release" field) against a list of whitespace-separated
+ * expressions, each consisting of an optional operator and a value. All expressions must hold for the
+ * condition to be true. Returns true/false, or a negative errno on parse or comparison failure. */
+static int condition_test_kernel_version(Condition *c, char **env) {
+ CompareOperator operator;
+ struct utsname u;
+ bool first = true;
+
+ assert(c);
+ assert(c->parameter);
+ assert(c->type == CONDITION_KERNEL_VERSION);
+
+ assert_se(uname(&u) >= 0);
+
+ for (const char *p = c->parameter;;) {
+ _cleanup_free_ char *word = NULL;
+ const char *s;
+ int r;
+
+ r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to parse condition string \"%s\": %m", p);
+ if (r == 0)
+ break;
+
+ /* s points into word from here on; parse_compare_operator() advances it past the operator */
+ s = strstrip(word);
+ operator = parse_compare_operator(&s, COMPARE_ALLOW_FNMATCH|COMPARE_EQUAL_BY_STRING);
+ if (operator < 0) /* No prefix? Then treat as glob string */
+ operator = COMPARE_FNMATCH_EQUAL;
+
+ s += strspn(s, WHITESPACE);
+ if (isempty(s)) {
+ if (first) {
+ /* For backwards compatibility, allow whitespace between the operator and
+ * value, without quoting, but only in the first expression. */
+ /* word is released and refilled with the next word, which then serves as the value */
+ word = mfree(word);
+ r = extract_first_word(&p, &word, NULL, 0);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to parse condition string \"%s\": %m", p);
+ if (r == 0)
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Unexpected end of expression: %s", p);
+ s = word;
+ } else
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Unexpected end of expression: %s", p);
+ }
+
+ /* The first expression that does not hold makes the whole condition false */
+ r = version_or_fnmatch_compare(operator, u.release, s);
+ if (r < 0)
+ return r;
+ if (!r)
+ return false;
+
+ first = false;
+ }
+
+ return true;
+}
+
+/* Tests os-release fields against a list of whitespace-separated "KEY<operator>VALUE" expressions. All
+ * expressions must hold for the condition to be true.
+ *
+ * Returns true/false, -EINVAL if an expression is malformed, or another negative errno if os-release
+ * cannot be parsed or the comparison fails. */
+static int condition_test_osrelease(Condition *c, char **env) {
+        int r;
+
+        assert(c);
+        assert(c->type == CONDITION_OS_RELEASE);
+
+        for (const char *parameter = ASSERT_PTR(c->parameter);;) {
+                _cleanup_free_ char *key = NULL, *condition = NULL, *actual_value = NULL;
+                CompareOperator operator;
+                const char *word;
+
+                r = extract_first_word(&parameter, &condition, NULL, EXTRACT_UNQUOTE);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to parse parameter: %m");
+                if (r == 0)
+                        break;
+
+                /* parse_compare_operator() needs the string to start with the comparators */
+                word = condition;
+                r = extract_first_word(&word, &key, COMPARE_OPERATOR_WITH_FNMATCH_CHARS, EXTRACT_RETAIN_SEPARATORS);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to parse parameter: %m");
+                /* The os-release spec mandates env-var-like key names */
+                if (r == 0 || isempty(word) || !env_name_is_valid(key))
+                        return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "Failed to parse parameter, key/value format expected: %m");
+
+                /* Do not allow whitespace after the separator, as that's not a valid os-release format */
+                operator = parse_compare_operator(&word, COMPARE_ALLOW_FNMATCH|COMPARE_EQUAL_BY_STRING);
+                if (operator < 0 || isempty(word) || strchr(WHITESPACE, *word) != NULL)
+                        return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "Failed to parse parameter, key/value format expected: %m");
+
+                r = parse_os_release(NULL, key, &actual_value);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to parse os-release: %m");
+
+                /* actual_value may still be NULL here (presumably when the key is not set in os-release —
+                 * TODO confirm against parse_os_release()). Use "" in that case, so that missing and empty
+                 * assignments behave the same and fnmatch() is never handed a NULL string. */
+                r = version_or_fnmatch_compare(operator, strempty(actual_value), word);
+                if (r < 0)
+                        return r;
+                if (!r)
+                        return false;
+        }
+
+        return true;
+}
+
+/* Compares the amount of physical memory against "[operator]SIZE". Without an operator, ">=" is
+ * assumed. Returns true/false, or a negative errno if the size cannot be parsed. */
+static int condition_test_memory(Condition *c, char **env) {
+        uint64_t available, wanted;
+        CompareOperator op;
+        const char *spec;
+        int r;
+
+        assert(c);
+        assert(c->parameter);
+        assert(c->type == CONDITION_MEMORY);
+
+        available = physical_memory();
+
+        spec = c->parameter;
+        op = parse_compare_operator(&spec, 0);
+        if (op < 0) /* default to >= check, if nothing is specified. */
+                op = COMPARE_GREATER_OR_EQUAL;
+
+        /* Whatever follows the operator is the size, parsed with base 1024 */
+        r = parse_size(spec, 1024, &wanted);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to parse size '%s': %m", spec);
+
+        return test_order(CMP(available, wanted), op);
+}
+
+/* Compares the number of CPUs in our affinity mask against "[operator]NUMBER". Without an operator,
+ * ">=" is assumed. Returns true/false, or a negative errno on failure. */
+static int condition_test_cpus(Condition *c, char **env) {
+        CompareOperator op;
+        const char *spec;
+        unsigned wanted;
+        int r, ncpus;
+
+        assert(c);
+        assert(c->parameter);
+        assert(c->type == CONDITION_CPUS);
+
+        ncpus = cpus_in_affinity_mask();
+        if (ncpus < 0)
+                return log_debug_errno(ncpus, "Failed to determine CPUs in affinity mask: %m");
+
+        spec = c->parameter;
+        op = parse_compare_operator(&spec, 0);
+        if (op < 0) /* default to >= check, if nothing is specified. */
+                op = COMPARE_GREATER_OR_EQUAL;
+
+        r = safe_atou(spec, &wanted);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to parse number of CPUs: %m");
+
+        return test_order(CMP((unsigned) ncpus, wanted), op);
+}
+
+/* Checks whether we run as the specified user. The parameter may be "root", the nobody user name,
+ * "@system", a numeric UID or a user name; both the real and the effective UID are considered.
+ * Returns true/false, or -ENOMEM. */
+static int condition_test_user(Condition *c, char **env) {
+        _cleanup_free_ char *username = NULL;
+        const char *name;
+        uid_t ruid, euid, id;
+        int r;
+
+        assert(c);
+        assert(c->parameter);
+        assert(c->type == CONDITION_USER);
+
+        ruid = getuid();
+        euid = geteuid();
+
+        /* Do the quick&easy comparisons first, and only parse the UID later. */
+        if (streq(c->parameter, "root"))
+                return ruid == 0 || euid == 0;
+        if (streq(c->parameter, NOBODY_USER_NAME))
+                return ruid == UID_NOBODY || euid == UID_NOBODY;
+        if (streq(c->parameter, "@system"))
+                return uid_is_system(ruid) || uid_is_system(euid);
+
+        r = parse_uid(c->parameter, &id);
+        if (r >= 0)
+                return id == ruid || id == euid;
+
+        /* We already checked for "root" above, and we know that PID 1 is running as root, hence we
+         * know it cannot match. */
+        if (getpid_cached() == 1)
+                return false;
+
+        /* getusername_malloc() may do an nss lookup, which is not allowed in PID 1. */
+        username = getusername_malloc();
+        if (!username)
+                return -ENOMEM;
+
+        if (streq(username, c->parameter))
+                return 1;
+
+        /* Last resort: resolve the name and compare by UID; a failed lookup means no match */
+        name = c->parameter;
+        r = get_user_creds(&name, &id, NULL, NULL, NULL, USER_CREDS_ALLOW_MISSING);
+        if (r < 0)
+                return 0;
+
+        return id == ruid || id == euid;
+}
+
+/* Checks the cgroup setup: "v2" and "v1" test for the unified resp. non-unified hierarchy, anything
+ * else is taken as a list of controllers that must all be supported. A parameter that cannot be parsed
+ * into any known controller is treated as satisfied. */
+static int condition_test_control_group_controller(Condition *c, char **env) {
+        CGroupMask supported, wanted = 0;
+        int r;
+
+        assert(c);
+        assert(c->parameter);
+        assert(c->type == CONDITION_CONTROL_GROUP_CONTROLLER);
+
+        if (streq(c->parameter, "v2"))
+                return cg_all_unified();
+
+        if (streq(c->parameter, "v1")) {
+                r = cg_all_unified();
+                return r < 0 ? r : !r;
+        }
+
+        r = cg_mask_supported(&supported);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to determine supported controllers: %m");
+
+        r = cg_mask_from_string(c->parameter, &wanted);
+        if (r < 0 || wanted <= 0) {
+                /* This won't catch the case that we have an unknown controller
+                 * mixed in with valid ones -- these are only assessed on the
+                 * validity of the valid controllers found. */
+                log_debug("Failed to parse cgroup string: %s", c->parameter);
+                return 1;
+        }
+
+        return FLAGS_SET(supported, wanted);
+}
+
+/* Checks whether we are a member of the specified group, given as numeric GID or as group name. */
+static int condition_test_group(Condition *c, char **env) {
+        gid_t gid;
+
+        assert(c);
+        assert(c->parameter);
+        assert(c->type == CONDITION_GROUP);
+
+        if (parse_gid(c->parameter, &gid) >= 0)
+                return in_gid(gid);
+
+        /* Avoid any NSS lookups if we are PID1 */
+        if (getpid_cached() == 1)
+                return streq(c->parameter, "root");
+
+        return in_group(c->parameter) > 0;
+}
+
+/* Checks the detected virtualization against the parameter, which may be "private-users", a boolean,
+ * the categories "vm" / "container", or the name of a specific virtualization technology. */
+static int condition_test_virtualization(Condition *c, char **env) {
+        Virtualization detected;
+        int wanted;
+
+        assert(c);
+        assert(c->parameter);
+        assert(c->type == CONDITION_VIRTUALIZATION);
+
+        if (streq(c->parameter, "private-users"))
+                return running_in_userns();
+
+        detected = detect_virtualization();
+        if (detected < 0)
+                return detected;
+
+        /* First, compare with yes/no */
+        wanted = parse_boolean(c->parameter);
+        if (wanted >= 0)
+                return wanted == (detected != VIRTUALIZATION_NONE);
+
+        /* Then, compare categorization */
+        if (streq(c->parameter, "vm"))
+                return VIRTUALIZATION_IS_VM(detected);
+
+        if (streq(c->parameter, "container"))
+                return VIRTUALIZATION_IS_CONTAINER(detected);
+
+        /* Finally compare id */
+        if (detected == VIRTUALIZATION_NONE)
+                return false;
+
+        return streq(c->parameter, virtualization_to_string(detected));
+}
+
+/* Checks whether the architecture reported by uname matches the parameter, which is either an
+ * architecture name or "native". Unknown architecture names never match. */
+static int condition_test_architecture(Condition *c, char **env) {
+        Architecture running, wanted;
+
+        assert(c);
+        assert(c->parameter);
+        assert(c->type == CONDITION_ARCHITECTURE);
+
+        running = uname_architecture();
+        if (running < 0)
+                return running;
+
+        wanted = streq(c->parameter, "native") ?
+                native_architecture() :
+                architecture_from_string(c->parameter);
+        if (wanted < 0) /* unknown architecture? Then it's definitely not ours */
+                return false;
+
+        return running == wanted;
+}
+
+#define DTCOMPAT_FILE "/proc/device-tree/compatible"
+
+/* Checks whether dtcarg is one of the strings listed in DTCOMPAT_FILE. A missing, unreadable, empty or
+ * malformed file means "not compatible". Returns true/false, or -ENOMEM. */
+static int condition_test_firmware_devicetree_compatible(const char *dtcarg) {
+        _cleanup_strv_free_ char **entries = NULL;
+        _cleanup_free_ char *raw = NULL;
+        size_t raw_size;
+        int r;
+
+        r = read_full_virtual_file(DTCOMPAT_FILE, &raw, &raw_size);
+        if (r < 0) {
+                /* if the path doesn't exist it is incompatible */
+                if (r != -ENOENT)
+                        log_debug_errno(r, "Failed to open() '%s', assuming machine is incompatible: %m", DTCOMPAT_FILE);
+                return false;
+        }
+
+        /* Not sure this can happen, but play safe. */
+        if (raw_size == 0) {
+                log_debug("%s has zero length, assuming machine is incompatible", DTCOMPAT_FILE);
+                return false;
+        }
+
+        /* /proc/device-tree/compatible consists of one or more strings, each ending in '\0'.
+         * So the last character must be a '\0'. */
+        if (raw[raw_size - 1] != '\0') {
+                log_debug("%s is in an unknown format, assuming machine is incompatible", DTCOMPAT_FILE);
+                return false;
+        }
+
+        entries = strv_parse_nulstr(raw, raw_size);
+        if (!entries)
+                return -ENOMEM;
+
+        return strv_contains(entries, dtcarg);
+}
+
+/* Evaluates an expression of the form "FIELD <operator> VALUE" against the SMBIOS field of that name
+ * below /sys/class/dmi/id/. Returns true/false (false if the field does not exist), -EINVAL for
+ * malformed expressions, or another negative errno on failure. */
+static int condition_test_firmware_smbios_field(const char *expression) {
+        _cleanup_free_ char *name = NULL, *wanted = NULL, *found = NULL;
+        const char *sysfs_path;
+        CompareOperator op;
+        int r;
+
+        assert(expression);
+
+        /* The field name is everything up to the first operator character */
+        r = extract_first_word(&expression, &name, COMPARE_OPERATOR_WITH_FNMATCH_CHARS, EXTRACT_RETAIN_SEPARATORS);
+        if (r < 0)
+                return r;
+        if (r == 0 || isempty(expression))
+                return -EINVAL;
+
+        /* Drop whitespace between the field name and the operator */
+        delete_trailing_chars(name, WHITESPACE);
+
+        op = parse_compare_operator(&expression, COMPARE_ALLOW_FNMATCH|COMPARE_EQUAL_BY_STRING);
+        if (op < 0)
+                return op;
+
+        /* The expected value is exactly one (possibly quoted) word, nothing may follow it */
+        r = extract_first_word(&expression, &wanted, NULL, EXTRACT_UNQUOTE);
+        if (r < 0)
+                return r;
+        if (r == 0 || !isempty(expression))
+                return -EINVAL;
+
+        /* The name is appended to a sysfs directory below, hence it must be a plain file name */
+        if (!filename_is_valid(name))
+                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid SMBIOS field name");
+
+        sysfs_path = strjoina("/sys/class/dmi/id/", name);
+        r = read_virtual_file(sysfs_path, SIZE_MAX, &found, NULL);
+        if (r < 0) {
+                log_debug_errno(r, "Failed to read %s: %m", sysfs_path);
+                return r == -ENOENT ? false : r;
+        }
+
+        /* Remove trailing newline */
+        delete_trailing_chars(found, WHITESPACE);
+
+        return version_or_fnmatch_compare(op, found, wanted);
+}
+
+/* Evaluates ConditionFirmware=. Supported parameters are "device-tree", "device-tree-compatible(…)",
+ * "uefi" and "smbios-field(…)"; anything else is logged and evaluates to false. */
+static int condition_test_firmware(Condition *c, char **env) {
+        sd_char *arg;
+        int r;
+
+        assert(c);
+        assert(c->parameter);
+        assert(c->type == CONDITION_FIRMWARE);
+
+        if (streq(c->parameter, "device-tree")) {
+                if (access("/sys/firmware/devicetree/", F_OK) >= 0)
+                        return true;
+
+                if (errno != ENOENT)
+                        log_debug_errno(errno, "Unexpected error when checking for /sys/firmware/devicetree/: %m");
+                return false;
+        }
+
+        arg = startswith(c->parameter, "device-tree-compatible(");
+        if (arg) {
+                _cleanup_free_ char *dtc_arg = NULL;
+                char *close_paren;
+
+                /* The closing parenthesis must be the very last character */
+                close_paren = strrchr(arg, ')');
+                if (!close_paren || close_paren[1] != '\0') {
+                        log_debug("Malformed ConditionFirmware=%s", c->parameter);
+                        return false;
+                }
+
+                dtc_arg = strndup(arg, close_paren - arg);
+                if (!dtc_arg)
+                        return -ENOMEM;
+
+                return condition_test_firmware_devicetree_compatible(dtc_arg);
+        }
+
+        if (streq(c->parameter, "uefi"))
+                return is_efi_boot();
+
+        arg = startswith(c->parameter, "smbios-field(");
+        if (arg) {
+                _cleanup_free_ char *smbios_arg = NULL;
+                char *close_paren;
+
+                /* The closing parenthesis must be the very last character */
+                close_paren = strrchr(arg, ')');
+                if (!close_paren || close_paren[1] != '\0')
+                        return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Malformed ConditionFirmware=%s: %m", c->parameter);
+
+                smbios_arg = strndup(arg, close_paren - arg);
+                if (!smbios_arg)
+                        return log_oom_debug();
+
+                r = condition_test_firmware_smbios_field(smbios_arg);
+                if (r < 0)
+                        return log_debug_errno(r, "Malformed ConditionFirmware=%s: %m", c->parameter);
+                return r;
+        }
+
+        log_debug("Unsupported Firmware condition \"%s\"", c->parameter);
+        return false;
+}
+
+/* Checks whether this is the specified host. If the parameter parses as a 128-bit ID it is compared
+ * with the machine ID, otherwise it is used as a case-insensitive glob pattern for the hostname. */
+static int condition_test_host(Condition *c, char **env) {
+        _cleanup_free_ char *hostname = NULL;
+        sd_id128_t wanted, machine;
+        int r;
+
+        assert(c);
+        assert(c->parameter);
+        assert(c->type == CONDITION_HOST);
+
+        if (sd_id128_from_string(c->parameter, &wanted) >= 0) {
+                r = sd_id128_get_machine(&machine);
+                if (r < 0)
+                        return r;
+
+                return sd_id128_equal(wanted, machine);
+        }
+
+        hostname = gethostname_malloc();
+        if (!hostname)
+                return -ENOMEM;
+
+        switch (fnmatch(c->parameter, hostname, FNM_CASEFOLD)) {
+
+        case 0:
+                return true;
+
+        case FNM_NOMATCH:
+                return false;
+
+        default:
+                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "fnmatch() failed.");
+        }
+}
+
+/* Checks whether the AC power state matches the boolean parameter. Any non-zero result of
+ * on_ac_power(), including a negative one, counts as "on AC power". */
+static int condition_test_ac_power(Condition *c, char **env) {
+        bool wanted, on_ac;
+        int r;
+
+        assert(c);
+        assert(c->parameter);
+        assert(c->type == CONDITION_AC_POWER);
+
+        r = parse_boolean(c->parameter);
+        if (r < 0)
+                return r;
+
+        wanted = r;
+        on_ac = on_ac_power() != 0;
+
+        return on_ac == wanted;
+}
+
+/* Returns non-zero only if tpm2_support() reports both TPM2_SUPPORT_SUBSYSTEM and
+ * TPM2_SUPPORT_FIRMWARE. */
+static int has_tpm2(void) {
+ /* Checks whether the kernel has the TPM subsystem enabled and the firmware reports support. Note
+ * we don't check for actual TPM devices, since we might not have loaded the driver for it yet, i.e.
+ * during early boot where we very likely want to use this condition check).
+ *
+ * Note that we don't check if we ourselves are built with TPM2 support here! */
+
+ return FLAGS_SET(tpm2_support(), TPM2_SUPPORT_SUBSYSTEM|TPM2_SUPPORT_FIRMWARE);
+}
+
+/* Checks whether the security technology named by the parameter is in use or available. Unknown names
+ * evaluate to false. */
+static int condition_test_security(Condition *c, char **env) {
+        const char *what;
+
+        assert(c);
+        assert(c->parameter);
+        assert(c->type == CONDITION_SECURITY);
+
+        what = c->parameter;
+
+        /* Mandatory access control and integrity frameworks */
+        if (streq(what, "selinux"))
+                return mac_selinux_use();
+        if (streq(what, "smack"))
+                return mac_smack_use();
+        if (streq(what, "apparmor"))
+                return mac_apparmor_use();
+        if (streq(what, "audit"))
+                return use_audit();
+        if (streq(what, "ima"))
+                return use_ima();
+        if (streq(what, "tomoyo"))
+                return mac_tomoyo_use();
+
+        /* Firmware and platform features */
+        if (streq(what, "uefi-secureboot"))
+                return is_efi_secure_boot();
+        if (streq(what, "tpm2"))
+                return has_tpm2();
+        if (streq(what, "cvm"))
+                return detect_confidential_virtualization() > 0;
+        if (streq(what, "measured-uki"))
+                return efi_measured_uki(LOG_DEBUG);
+
+        return false;
+}
+
+/* Checks whether the named capability is in our bounding set, as listed in the "CapBnd:" line of
+ * /proc/self/status. Returns true/false, -EINVAL for unknown capability names, or another negative
+ * errno on failure. If no "CapBnd:" line is found, the capability is assumed to be available. */
+static int condition_test_capability(Condition *c, char **env) {
+        unsigned long long bounding = ULLONG_MAX;
+        _cleanup_fclose_ FILE *status = NULL;
+        int cap, r;
+
+        assert(c);
+        assert(c->parameter);
+        assert(c->type == CONDITION_CAPABILITY);
+
+        cap = capability_from_name(c->parameter);
+        if (cap < 0)
+                return -EINVAL;
+
+        /* If it's a valid capability we default to assume that we have it */
+
+        status = fopen("/proc/self/status", "re");
+        if (!status)
+                return -errno;
+
+        for (;;) {
+                _cleanup_free_ char *line = NULL;
+                const char *hex;
+
+                r = read_line(status, LONG_LINE_MAX, &line);
+                if (r < 0)
+                        return r;
+                if (r == 0)
+                        break;
+
+                hex = startswith(line, "CapBnd:");
+                if (!hex)
+                        continue;
+
+                if (sscanf(hex, "%llx", &bounding) != 1)
+                        return -EIO;
+
+                break;
+        }
+
+        return ((bounding >> cap) & 1ULL) != 0;
+}
+
+/* Checks whether the directory named by the parameter needs updating, by comparing the modification
+ * time of /usr/ with that of the ".updated" file inside that directory. Most failures along the way
+ * evaluate to true, so that we rather invoke too many update tools than too few. */
+static int condition_test_needs_update(Condition *c, char **env) {
+ struct stat usr, other;
+ const char *p;
+ bool b;
+ int r;
+
+ assert(c);
+ assert(c->parameter);
+ assert(c->type == CONDITION_NEEDS_UPDATE);
+
+ /* An explicit setting on the kernel command line overrides everything else */
+ r = proc_cmdline_get_bool("systemd.condition-needs-update", /* flags = */ 0, &b);
+ if (r < 0)
+ log_debug_errno(r, "Failed to parse systemd.condition-needs-update= kernel command line argument, ignoring: %m");
+ if (r > 0)
+ return b;
+
+ if (in_initrd()) {
+ log_debug("We are in an initrd, not doing any updates.");
+ return false;
+ }
+
+ if (!path_is_absolute(c->parameter)) {
+ log_debug("Specified condition parameter '%s' is not absolute, assuming an update is needed.", c->parameter);
+ return true;
+ }
+
+ /* If the file system is read-only we shouldn't suggest an update */
+ r = path_is_read_only_fs(c->parameter);
+ if (r < 0)
+ log_debug_errno(r, "Failed to determine if '%s' is read-only, ignoring: %m", c->parameter);
+ if (r > 0)
+ return false;
+
+ /* Any other failure means we should allow the condition to be true, so that we rather invoke too
+ * many update tools than too few. */
+
+ p = strjoina(c->parameter, "/.updated");
+ if (lstat(p, &other) < 0) {
+ if (errno != ENOENT)
+ log_debug_errno(errno, "Failed to stat() '%s', assuming an update is needed: %m", p);
+ return true;
+ }
+
+ if (lstat("/usr/", &usr) < 0) {
+ log_debug_errno(errno, "Failed to stat() /usr/, assuming an update is needed: %m");
+ return true;
+ }
+
+ /*
+ * First, compare seconds as they are always accurate...
+ */
+ if (usr.st_mtim.tv_sec != other.st_mtim.tv_sec)
+ return usr.st_mtim.tv_sec > other.st_mtim.tv_sec;
+
+ /*
+ * ...then compare nanoseconds.
+ *
+ * A false positive is only possible when /usr's nanoseconds > 0
+ * (otherwise /usr cannot be strictly newer than the target file)
+ * AND the target file's nanoseconds == 0
+ * (otherwise the filesystem supports nsec timestamps, see stat(2)).
+ */
+ if (usr.st_mtim.tv_nsec == 0 || other.st_mtim.tv_nsec > 0)
+ return usr.st_mtim.tv_nsec > other.st_mtim.tv_nsec;
+
+ /* Ambiguous case: fall back to the TIMESTAMP_NSEC= value stored inside the stamp file.
+ * NOTE(review): the messages below say "using mtime", but each of these paths returns true. */
+ _cleanup_free_ char *timestamp_str = NULL;
+ r = parse_env_file(NULL, p, "TIMESTAMP_NSEC", &timestamp_str);
+ if (r < 0) {
+ log_debug_errno(r, "Failed to parse timestamp file '%s', using mtime: %m", p);
+ return true;
+ }
+ if (isempty(timestamp_str)) {
+ log_debug("No data in timestamp file '%s', using mtime.", p);
+ return true;
+ }
+
+ uint64_t timestamp;
+ r = safe_atou64(timestamp_str, &timestamp);
+ if (r < 0) {
+ log_debug_errno(r, "Failed to parse timestamp value '%s' in file '%s', using mtime: %m", timestamp_str, p);
+ return true;
+ }
+
+ return timespec_load_nsec(&usr.st_mtim) > timestamp;
+}
+
+/* Tells whether the system is currently in its first boot. A valid boolean in $SYSTEMD_FIRST_BOOT wins and
+ * is remembered for later calls; without it the answer comes from the existence of
+ * /run/systemd/first-boot, which is looked up again on every call. */
+static bool in_first_boot(void) {
+        static int first_boot = -1;
+        const char *e;
+        int r;
+
+        if (first_boot >= 0)
+                return first_boot;
+
+        e = secure_getenv("SYSTEMD_FIRST_BOOT");
+        if (e) {
+                r = parse_boolean(e);
+                if (r >= 0) {
+                        first_boot = r;
+                        return first_boot;
+                }
+
+                log_debug_errno(r, "Failed to parse $SYSTEMD_FIRST_BOOT, ignoring: %m");
+        }
+
+        r = RET_NERRNO(access("/run/systemd/first-boot", F_OK));
+        if (r >= 0)
+                return true;
+
+        if (r != -ENOENT)
+                log_debug_errno(r, "Failed to check if /run/systemd/first-boot exists, assuming no: %m");
+        return false;
+}
+
+/* ConditionFirstBoot=: the parameter is a boolean that has to match the first-boot state. Returns a
+ * negative value if the parameter is not a valid boolean. */
+static int condition_test_first_boot(Condition *c, char **env) {
+        int wanted;
+
+        assert(c);
+        assert(c->parameter);
+        assert(c->type == CONDITION_FIRST_BOOT);
+
+        // TODO: Parse c->parameter immediately when reading the config.
+        //       Apply negation when parsing too.
+
+        wanted = parse_boolean(c->parameter);
+        if (wanted < 0)
+                return wanted;
+
+        return in_first_boot() == wanted;
+}
+
+/* ConditionEnvironment=: a parameter containing '=' must match an entry of env literally; otherwise it
+ * names a variable and matches any entry that is exactly that name or starts with "NAME=". */
+static int condition_test_environment(Condition *c, char **env) {
+        bool is_assignment;
+
+        assert(c);
+        assert(c->parameter);
+        assert(c->type == CONDITION_ENVIRONMENT);
+
+        is_assignment = !!strchr(c->parameter, '=');
+
+        STRV_FOREACH(entry, env) {
+                const char *rest;
+
+                if (is_assignment) {
+                        if (streq(c->parameter, *entry))
+                                return true;
+
+                        continue;
+                }
+
+                /* Name-only match: the name must be followed by the end of the string or by '='. */
+                rest = startswith(*entry, c->parameter);
+                if (rest && IN_SET(*rest, 0, '='))
+                        return true;
+        }
+
+        return false;
+}
+
+/* ConditionPathExists=: true if access(F_OK) succeeds on the parameter. */
+static int condition_test_path_exists(Condition *c, char **env) {
+        int r;
+
+        assert(c);
+        assert(c->parameter);
+        assert(c->type == CONDITION_PATH_EXISTS);
+
+        r = access(c->parameter, F_OK);
+        return r >= 0;
+}
+
+/* ConditionPathExistsGlob=: true if glob_exists() reports a positive result for the parameter; errors
+ * count as "no match". */
+static int condition_test_path_exists_glob(Condition *c, char **env) {
+        int r;
+
+        assert(c);
+        assert(c->parameter);
+        assert(c->type == CONDITION_PATH_EXISTS_GLOB);
+
+        r = glob_exists(c->parameter);
+        return r > 0;
+}
+
+/* ConditionPathIsDirectory=: true if is_dir() reports a positive result; errors count as "no". */
+static int condition_test_path_is_directory(Condition *c, char **env) {
+        int r;
+
+        assert(c);
+        assert(c->parameter);
+        assert(c->type == CONDITION_PATH_IS_DIRECTORY);
+
+        r = is_dir(c->parameter, true);
+        return r > 0;
+}
+
+/* ConditionPathIsSymbolicLink=: true if is_symlink() reports a positive result; errors count as "no". */
+static int condition_test_path_is_symbolic_link(Condition *c, char **env) {
+        int r;
+
+        assert(c);
+        assert(c->parameter);
+        assert(c->type == CONDITION_PATH_IS_SYMBOLIC_LINK);
+
+        r = is_symlink(c->parameter);
+        return r > 0;
+}
+
+/* ConditionPathIsMountPoint=: true if path_is_mount_point(), called with AT_SYMLINK_FOLLOW, reports a
+ * positive result; errors count as "no". */
+static int condition_test_path_is_mount_point(Condition *c, char **env) {
+        int r;
+
+        assert(c);
+        assert(c->parameter);
+        assert(c->type == CONDITION_PATH_IS_MOUNT_POINT);
+
+        r = path_is_mount_point(c->parameter, NULL, AT_SYMLINK_FOLLOW);
+        return r > 0;
+}
+
+/* ConditionPathIsReadWrite=: false if the path is on a read-only file system or does not exist
+ * (-ENOENT); any other outcome of the check, including other errors, counts as read-write. */
+static int condition_test_path_is_read_write(Condition *c, char **env) {
+        int r;
+
+        assert(c);
+        assert(c->parameter);
+        assert(c->type == CONDITION_PATH_IS_READ_WRITE);
+
+        r = path_is_read_only_fs(c->parameter);
+        if (r > 0)
+                return false;
+
+        return r != -ENOENT;
+}
+
+/* ConditionCPUFeature=: hands the lower-cased parameter to has_cpu_with_flag() and returns its result. */
+static int condition_test_cpufeature(Condition *c, char **env) {
+        const char *flag;
+
+        assert(c);
+        assert(c->parameter);
+        assert(c->type == CONDITION_CPU_FEATURE);
+
+        /* NOTE(review): ascii_strlower() presumably lower-cases c->parameter in place - confirm. */
+        flag = ascii_strlower(c->parameter);
+        return has_cpu_with_flag(flag);
+}
+
+/* ConditionPathIsEncrypted=: true only if path_is_encrypted() reports a positive result. Errors count
+ * as "not encrypted" and are logged at debug level, except for -ENOENT which is silent. */
+static int condition_test_path_is_encrypted(Condition *c, char **env) {
+        int r;
+
+        assert(c);
+        assert(c->parameter);
+        assert(c->type == CONDITION_PATH_IS_ENCRYPTED);
+
+        r = path_is_encrypted(c->parameter);
+        if (r > 0)
+                return true;
+
+        if (r < 0 && r != -ENOENT)
+                log_debug_errno(r, "Failed to determine if '%s' is encrypted: %m", c->parameter);
+
+        return false;
+}
+
+/* ConditionDirectoryNotEmpty=: false if the directory is empty, does not exist (-ENOENT) or is not a
+ * directory (-ENOTDIR); every other outcome of dir_is_empty(), including other errors, counts as true. */
+static int condition_test_directory_not_empty(Condition *c, char **env) {
+        int r;
+
+        assert(c);
+        assert(c->parameter);
+        assert(c->type == CONDITION_DIRECTORY_NOT_EMPTY);
+
+        r = dir_is_empty(c->parameter, /* ignore_hidden_or_backup= */ true);
+        if (r > 0)
+                return false;
+
+        return !IN_SET(r, -ENOENT, -ENOTDIR);
+}
+
+/* ConditionFileNotEmpty=: true if the parameter can be stat()ed, is a regular file and has a size
+ * greater than zero. */
+static int condition_test_file_not_empty(Condition *c, char **env) {
+        struct stat st;
+
+        assert(c);
+        assert(c->parameter);
+        assert(c->type == CONDITION_FILE_NOT_EMPTY);
+
+        if (stat(c->parameter, &st) < 0)
+                return false;
+
+        if (!S_ISREG(st.st_mode))
+                return false;
+
+        return st.st_size > 0;
+}
+
+/* ConditionFileIsExecutable=: true if the parameter can be stat()ed, is a regular file and has at
+ * least one of the execute permission bits (0111) set. */
+static int condition_test_file_is_executable(Condition *c, char **env) {
+        struct stat st;
+
+        assert(c);
+        assert(c->parameter);
+        assert(c->type == CONDITION_FILE_IS_EXECUTABLE);
+
+        if (stat(c->parameter, &st) < 0)
+                return false;
+
+        if (!S_ISREG(st.st_mode))
+                return false;
+
+        return (st.st_mode & 0111) != 0;
+}
+
+static int condition_test_psi(Condition *c, char **env) {
+ _cleanup_free_ char *first = NULL, *second = NULL, *third = NULL, *fourth = NULL, *pressure_path = NULL;
+ const char *p, *value, *pressure_type;
+ loadavg_t *current, limit; /* current: the average inside 'pressure' that is compared; limit: the parsed threshold */
+ ResourcePressure pressure;
+ int r;
+ /* Evaluates the memory, CPU and IO pressure conditions. The parameter is either "LIMIT[/TIMESPAN]"
+  * (system-wide pressure below /proc/pressure) or "SLICE:LIMIT[/TIMESPAN]" (pressure of that slice's
+  * cgroup), with TIMESPAN starting with "10sec", "1min" or "5min". Returns > 0 if the selected pressure
+  * average is at or below LIMIT, 0 if it is above, and a negative errno-style value if the parameter
+  * cannot be parsed or the pressure cannot be read. Whenever the check cannot be carried out at all (no
+  * PSI support, no unified cgroup hierarchy, controller unavailable, pressure file missing) it is
+  * skipped by returning 1. */
+ assert(c);
+ assert(c->parameter);
+ assert(IN_SET(c->type, CONDITION_MEMORY_PRESSURE, CONDITION_CPU_PRESSURE, CONDITION_IO_PRESSURE));
+
+ if (!is_pressure_supported()) {
+ log_debug("Pressure Stall Information (PSI) is not supported, skipping.");
+ return 1;
+ }
+ /* Name of the resource, used both below /proc/pressure and as prefix of the cgroup attribute. */
+ pressure_type = c->type == CONDITION_MEMORY_PRESSURE ? "memory" :
+ c->type == CONDITION_CPU_PRESSURE ? "cpu" :
+ "io";
+ /* Split the parameter at ':' into at most two fields; r is the number of fields found. */
+ p = c->parameter;
+ r = extract_many_words(&p, ":", 0, &first, &second, NULL);
+ if (r <= 0)
+ return log_debug_errno(r < 0 ? r : SYNTHETIC_ERRNO(EINVAL), "Failed to parse condition parameter %s: %m", c->parameter);
+ /* If only one parameter is passed, then we look at the global system pressure rather than a specific cgroup. */
+ if (r == 1) {
+ pressure_path = path_join("/proc/pressure", pressure_type);
+ if (!pressure_path)
+ return log_oom_debug();
+
+ value = first;
+ } else {
+ const char *controller = strjoina(pressure_type, ".pressure"); /* cgroup attribute name, e.g. "memory.pressure" */
+ _cleanup_free_ char *slice_path = NULL, *root_scope = NULL;
+ CGroupMask mask, required_mask;
+ char *slice, *e;
+ /* The cgroup controller that has to be available for this kind of pressure. */
+ required_mask = c->type == CONDITION_MEMORY_PRESSURE ? CGROUP_MASK_MEMORY :
+ c->type == CONDITION_CPU_PRESSURE ? CGROUP_MASK_CPU :
+ CGROUP_MASK_IO;
+
+ slice = strstrip(first);
+ if (!slice)
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to parse condition parameter %s: %m", c->parameter);
+
+ r = cg_all_unified();
+ if (r < 0)
+ return log_debug_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
+ if (r == 0) {
+ log_debug("PSI condition check requires the unified cgroups hierarchy, skipping.");
+ return 1;
+ }
+
+ r = cg_mask_supported(&mask);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to get supported cgroup controllers: %m");
+
+ if (!FLAGS_SET(mask, required_mask)) {
+ log_debug("Cgroup %s controller not available, skipping PSI condition check.", pressure_type);
+ return 1;
+ }
+ /* Translate the slice unit name into its cgroup path. */
+ r = cg_slice_to_path(slice, &slice_path);
+ if (r < 0)
+ return log_debug_errno(r, "Cannot determine slice \"%s\" cgroup path: %m", slice);
+
+ /* We might be running under the user manager, so get the root path and prefix it accordingly. */
+ r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, getpid_cached(), &root_scope);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to get root cgroup path: %m");
+
+ /* Drop init.scope, we want the parent. We could get an empty or / path, but that's fine,
+ * just skip it in that case. */
+ e = endswith(root_scope, "/" SPECIAL_INIT_SCOPE);
+ if (e)
+ *e = 0;
+ if (!empty_or_root(root_scope)) {
+ _cleanup_free_ char *slice_joined = NULL;
+
+ slice_joined = path_join(root_scope, slice_path);
+ if (!slice_joined)
+ return log_oom_debug();
+
+ free_and_replace(slice_path, slice_joined);
+ }
+ /* Path of the "<resource>.pressure" attribute of the slice's cgroup. */
+ r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, slice_path, controller, &pressure_path);
+ if (r < 0)
+ return log_debug_errno(r, "Error getting cgroup pressure path from %s: %m", slice_path);
+
+ value = second;
+ }
+
+ /* If the value includes a specific timespan (one of the intervals handled below), parse it,
+ * otherwise we assume just a plain limit, and the condition holds if the current pressure
+ * average over 5 minutes is smaller than or equal to that limit. */
+ r = extract_many_words(&value, "/", 0, &third, &fourth, NULL);
+ if (r <= 0)
+ return log_debug_errno(r < 0 ? r : SYNTHETIC_ERRNO(EINVAL), "Failed to parse condition parameter %s: %m", c->parameter);
+ if (r == 1)
+ current = &pressure.avg300;
+ else {
+ const char *timespan;
+
+ timespan = skip_leading_chars(fourth, NULL);
+ if (!timespan)
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to parse condition parameter %s: %m", c->parameter);
+ /* Only a prefix match is done, trailing characters after the interval are not rejected. */
+ if (startswith(timespan, "10sec"))
+ current = &pressure.avg10;
+ else if (startswith(timespan, "1min"))
+ current = &pressure.avg60;
+ else if (startswith(timespan, "5min"))
+ current = &pressure.avg300;
+ else
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to parse condition parameter %s: %m", c->parameter);
+ }
+
+ value = strstrip(third);
+ if (!value)
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to parse condition parameter %s: %m", c->parameter);
+
+ r = parse_permyriad(value);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to parse permyriad: %s", c->parameter);
+ /* NOTE(review): r presumably is the limit in units of 1/10000, split here into whole percent and
+  * hundredths of a percent - confirm against parse_permyriad() and store_loadavg_fixed_point(). */
+ r = store_loadavg_fixed_point(r / 100LU, r % 100LU, &limit);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to parse loadavg: %s", c->parameter);
+
+ r = read_resource_pressure(pressure_path, PRESSURE_TYPE_FULL, &pressure);
+ if (r == -ENODATA) /* cpu.pressure 'full' was added recently, fall back to 'some'. */
+ r = read_resource_pressure(pressure_path, PRESSURE_TYPE_SOME, &pressure);
+ if (r == -ENOENT) {
+ /* We already checked that /proc/pressure exists, so this means we were given a cgroup
+ * that doesn't exist or doesn't exist any longer. */
+ log_debug("\"%s\" not found, skipping PSI check.", pressure_path);
+ return 1;
+ }
+ if (r < 0)
+ return log_debug_errno(r, "Error parsing pressure from %s: %m", pressure_path);
+ /* The condition holds as long as the selected average does not exceed the configured limit. */
+ return *current <= limit;
+}
+
+int condition_test(Condition *c, char **env) {
+
+ static int (*const condition_tests[_CONDITION_TYPE_MAX])(Condition *c, char **env) = {
+ [CONDITION_PATH_EXISTS] = condition_test_path_exists,
+ [CONDITION_PATH_EXISTS_GLOB] = condition_test_path_exists_glob,
+ [CONDITION_PATH_IS_DIRECTORY] = condition_test_path_is_directory,
+ [CONDITION_PATH_IS_SYMBOLIC_LINK] = condition_test_path_is_symbolic_link,
+ [CONDITION_PATH_IS_MOUNT_POINT] = condition_test_path_is_mount_point,
+ [CONDITION_PATH_IS_READ_WRITE] = condition_test_path_is_read_write,
+ [CONDITION_PATH_IS_ENCRYPTED] = condition_test_path_is_encrypted,
+ [CONDITION_DIRECTORY_NOT_EMPTY] = condition_test_directory_not_empty,
+ [CONDITION_FILE_NOT_EMPTY] = condition_test_file_not_empty,
+ [CONDITION_FILE_IS_EXECUTABLE] = condition_test_file_is_executable,
+ [CONDITION_KERNEL_COMMAND_LINE] = condition_test_kernel_command_line,
+ [CONDITION_KERNEL_VERSION] = condition_test_kernel_version,
+ [CONDITION_CREDENTIAL] = condition_test_credential,
+ [CONDITION_VIRTUALIZATION] = condition_test_virtualization,
+ [CONDITION_SECURITY] = condition_test_security,
+ [CONDITION_CAPABILITY] = condition_test_capability,
+ [CONDITION_HOST] = condition_test_host,
+ [CONDITION_AC_POWER] = condition_test_ac_power,
+ [CONDITION_ARCHITECTURE] = condition_test_architecture,
+ [CONDITION_FIRMWARE] = condition_test_firmware,
+ [CONDITION_NEEDS_UPDATE] = condition_test_needs_update,
+ [CONDITION_FIRST_BOOT] = condition_test_first_boot,
+ [CONDITION_USER] = condition_test_user,
+ [CONDITION_GROUP] = condition_test_group,
+ [CONDITION_CONTROL_GROUP_CONTROLLER] = condition_test_control_group_controller,
+ [CONDITION_CPUS] = condition_test_cpus,
+ [CONDITION_MEMORY] = condition_test_memory,
+ [CONDITION_ENVIRONMENT] = condition_test_environment,
+ [CONDITION_CPU_FEATURE] = condition_test_cpufeature,
+ [CONDITION_OS_RELEASE] = condition_test_osrelease,
+ [CONDITION_MEMORY_PRESSURE] = condition_test_psi,
+ [CONDITION_CPU_PRESSURE] = condition_test_psi,
+ [CONDITION_IO_PRESSURE] = condition_test_psi,
+ };
+
+ int r, b;
+
+ assert(c);
+ assert(c->type >= 0);
+ assert(c->type < _CONDITION_TYPE_MAX);
+
+ r = condition_tests[c->type](c, env);
+ if (r < 0) {
+ c->result = CONDITION_ERROR;
+ return r;
+ }
+
+ b = (r > 0) == !c->negate;
+ c->result = b ? CONDITION_SUCCEEDED : CONDITION_FAILED;
+ return b;
+}
+
+/* Evaluates a whole list of conditions. An empty list is true. Otherwise every non-trigger condition
+ * has to hold, and, if the list contains trigger conditions, at least one of them has to hold too.
+ * Conditions whose result cannot be determined count as failed. If a logger is passed, the outcome
+ * of every tested condition is reported through it. */
+bool condition_test_list(
+                Condition *first,
+                char **env,
+                condition_to_string_t to_string,
+                condition_test_logger_t logger,
+                void *userdata) {
+
+        /* -1: no trigger condition seen yet, 0: none held so far, 1: at least one held */
+        int triggered = -1;
+
+        if (!first)
+                return true;
+
+        LIST_FOREACH(conditions, c, first) {
+                int r = condition_test(c, env);
+
+                if (logger && r < 0)
+                        logger(userdata, LOG_WARNING, r, PROJECT_FILE, __LINE__, __func__,
+                               "Couldn't determine result for %s=%s%s%s, assuming failed: %m",
+                               to_string(c->type),
+                               c->trigger ? "|" : "",
+                               c->negate ? "!" : "",
+                               c->parameter);
+                else if (logger)
+                        logger(userdata, LOG_DEBUG, 0, PROJECT_FILE, __LINE__, __func__,
+                               "%s=%s%s%s %s.",
+                               to_string(c->type),
+                               c->trigger ? "|" : "",
+                               c->negate ? "!" : "",
+                               c->parameter,
+                               condition_result_to_string(c->result));
+
+                if (c->trigger) {
+                        /* Once a trigger condition held, later ones cannot change that anymore. */
+                        if (triggered <= 0)
+                                triggered = r > 0;
+                } else if (r <= 0)
+                        return false;
+        }
+
+        return triggered != 0;
+}
+
+/* Writes one line describing the condition to f: the (optional) prefix, the type name as rendered
+ * by to_string, the "|" and "!" markers for trigger and negation, the parameter and the last result. */
+void condition_dump(Condition *c, FILE *f, const char *prefix, condition_to_string_t to_string) {
+        const char *trigger_mark, *negate_mark;
+
+        assert(c);
+        assert(f);
+        assert(to_string);
+
+        trigger_mark = c->trigger ? "|" : "";
+        negate_mark = c->negate ? "!" : "";
+
+        fprintf(f,
+                "%s\t%s: %s%s%s %s\n",
+                strempty(prefix),
+                to_string(c->type),
+                trigger_mark,
+                negate_mark,
+                c->parameter,
+                condition_result_to_string(c->result));
+}
+
+/* Dumps every condition of the list to f, one line each, via condition_dump(). */
+void condition_dump_list(Condition *first, FILE *f, const char *prefix, condition_to_string_t to_string) {
+        LIST_FOREACH(conditions, cond, first) {
+                condition_dump(cond, f, prefix, to_string);
+        }
+}
+
+/* Names of the Condition*= settings, indexed by ConditionType and listed in enum order. */
+static const char* const condition_type_table[_CONDITION_TYPE_MAX] = {
+        [CONDITION_ARCHITECTURE]             = "ConditionArchitecture",
+        [CONDITION_FIRMWARE]                 = "ConditionFirmware",
+        [CONDITION_VIRTUALIZATION]           = "ConditionVirtualization",
+        [CONDITION_HOST]                     = "ConditionHost",
+        [CONDITION_KERNEL_COMMAND_LINE]      = "ConditionKernelCommandLine",
+        [CONDITION_KERNEL_VERSION]           = "ConditionKernelVersion",
+        [CONDITION_CREDENTIAL]               = "ConditionCredential",
+        [CONDITION_SECURITY]                 = "ConditionSecurity",
+        [CONDITION_CAPABILITY]               = "ConditionCapability",
+        [CONDITION_AC_POWER]                 = "ConditionACPower",
+        [CONDITION_MEMORY]                   = "ConditionMemory",
+        [CONDITION_CPUS]                     = "ConditionCPUs",
+        [CONDITION_ENVIRONMENT]              = "ConditionEnvironment",
+        [CONDITION_CPU_FEATURE]              = "ConditionCPUFeature",
+        [CONDITION_OS_RELEASE]               = "ConditionOSRelease",
+        [CONDITION_MEMORY_PRESSURE]          = "ConditionMemoryPressure",
+        [CONDITION_CPU_PRESSURE]             = "ConditionCPUPressure",
+        [CONDITION_IO_PRESSURE]              = "ConditionIOPressure",
+        [CONDITION_NEEDS_UPDATE]             = "ConditionNeedsUpdate",
+        [CONDITION_FIRST_BOOT]               = "ConditionFirstBoot",
+        [CONDITION_PATH_EXISTS]              = "ConditionPathExists",
+        [CONDITION_PATH_EXISTS_GLOB]         = "ConditionPathExistsGlob",
+        [CONDITION_PATH_IS_DIRECTORY]        = "ConditionPathIsDirectory",
+        [CONDITION_PATH_IS_SYMBOLIC_LINK]    = "ConditionPathIsSymbolicLink",
+        [CONDITION_PATH_IS_MOUNT_POINT]      = "ConditionPathIsMountPoint",
+        [CONDITION_PATH_IS_READ_WRITE]       = "ConditionPathIsReadWrite",
+        [CONDITION_PATH_IS_ENCRYPTED]        = "ConditionPathIsEncrypted",
+        [CONDITION_DIRECTORY_NOT_EMPTY]      = "ConditionDirectoryNotEmpty",
+        [CONDITION_FILE_NOT_EMPTY]           = "ConditionFileNotEmpty",
+        [CONDITION_FILE_IS_EXECUTABLE]       = "ConditionFileIsExecutable",
+        [CONDITION_USER]                     = "ConditionUser",
+        [CONDITION_GROUP]                    = "ConditionGroup",
+        [CONDITION_CONTROL_GROUP_CONTROLLER] = "ConditionControlGroupController",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(condition_type, ConditionType);
+
+/* Names of the Assert*= settings, indexed by ConditionType and listed in enum order. */
+static const char* const assert_type_table[_CONDITION_TYPE_MAX] = {
+        [CONDITION_ARCHITECTURE]             = "AssertArchitecture",
+        [CONDITION_FIRMWARE]                 = "AssertFirmware",
+        [CONDITION_VIRTUALIZATION]           = "AssertVirtualization",
+        [CONDITION_HOST]                     = "AssertHost",
+        [CONDITION_KERNEL_COMMAND_LINE]      = "AssertKernelCommandLine",
+        [CONDITION_KERNEL_VERSION]           = "AssertKernelVersion",
+        [CONDITION_CREDENTIAL]               = "AssertCredential",
+        [CONDITION_SECURITY]                 = "AssertSecurity",
+        [CONDITION_CAPABILITY]               = "AssertCapability",
+        [CONDITION_AC_POWER]                 = "AssertACPower",
+        [CONDITION_MEMORY]                   = "AssertMemory",
+        [CONDITION_CPUS]                     = "AssertCPUs",
+        [CONDITION_ENVIRONMENT]              = "AssertEnvironment",
+        [CONDITION_CPU_FEATURE]              = "AssertCPUFeature",
+        [CONDITION_OS_RELEASE]               = "AssertOSRelease",
+        [CONDITION_MEMORY_PRESSURE]          = "AssertMemoryPressure",
+        [CONDITION_CPU_PRESSURE]             = "AssertCPUPressure",
+        [CONDITION_IO_PRESSURE]              = "AssertIOPressure",
+        [CONDITION_NEEDS_UPDATE]             = "AssertNeedsUpdate",
+        [CONDITION_FIRST_BOOT]               = "AssertFirstBoot",
+        [CONDITION_PATH_EXISTS]              = "AssertPathExists",
+        [CONDITION_PATH_EXISTS_GLOB]         = "AssertPathExistsGlob",
+        [CONDITION_PATH_IS_DIRECTORY]        = "AssertPathIsDirectory",
+        [CONDITION_PATH_IS_SYMBOLIC_LINK]    = "AssertPathIsSymbolicLink",
+        [CONDITION_PATH_IS_MOUNT_POINT]      = "AssertPathIsMountPoint",
+        [CONDITION_PATH_IS_READ_WRITE]       = "AssertPathIsReadWrite",
+        [CONDITION_PATH_IS_ENCRYPTED]        = "AssertPathIsEncrypted",
+        [CONDITION_DIRECTORY_NOT_EMPTY]      = "AssertDirectoryNotEmpty",
+        [CONDITION_FILE_NOT_EMPTY]           = "AssertFileNotEmpty",
+        [CONDITION_FILE_IS_EXECUTABLE]       = "AssertFileIsExecutable",
+        [CONDITION_USER]                     = "AssertUser",
+        [CONDITION_GROUP]                    = "AssertGroup",
+        [CONDITION_CONTROL_GROUP_CONTROLLER] = "AssertControlGroupController",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(assert_type, ConditionType);
+
+/* Human-readable names of the ConditionResult values, as printed by condition_dump() and in the
+ * debug messages of condition_test_list(). */
+static const char* const condition_result_table[_CONDITION_RESULT_MAX] = {
+        [CONDITION_UNTESTED]  = "untested",
+        [CONDITION_SUCCEEDED] = "succeeded",
+        [CONDITION_FAILED]    = "failed",
+        [CONDITION_ERROR]     = "error",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(condition_result, ConditionResult);
diff --git a/src/shared/condition.h b/src/shared/condition.h
new file mode 100644
index 0000000..54cc904
--- /dev/null
+++ b/src/shared/condition.h
@@ -0,0 +1,113 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+#include <stdio.h>
+
+#include "list.h"
+#include "macro.h"
+
+typedef enum ConditionType { /* Kind of a condition; the value is used as index into tables sized by _CONDITION_TYPE_MAX. */
+ CONDITION_ARCHITECTURE,
+ CONDITION_FIRMWARE,
+ CONDITION_VIRTUALIZATION,
+ CONDITION_HOST,
+ CONDITION_KERNEL_COMMAND_LINE,
+ CONDITION_KERNEL_VERSION,
+ CONDITION_CREDENTIAL,
+ CONDITION_SECURITY,
+ CONDITION_CAPABILITY,
+ CONDITION_AC_POWER,
+ CONDITION_MEMORY,
+ CONDITION_CPUS,
+ CONDITION_ENVIRONMENT,
+ CONDITION_CPU_FEATURE,
+ CONDITION_OS_RELEASE,
+ CONDITION_MEMORY_PRESSURE, /* the three *_PRESSURE types share one test function, condition_test_psi() */
+ CONDITION_CPU_PRESSURE,
+ CONDITION_IO_PRESSURE,
+
+ CONDITION_NEEDS_UPDATE, /* takes a path, see condition_takes_path() */
+ CONDITION_FIRST_BOOT, /* takes a boolean */
+ /* The following types all take a path, see condition_takes_path(). */
+ CONDITION_PATH_EXISTS,
+ CONDITION_PATH_EXISTS_GLOB,
+ CONDITION_PATH_IS_DIRECTORY,
+ CONDITION_PATH_IS_SYMBOLIC_LINK,
+ CONDITION_PATH_IS_MOUNT_POINT,
+ CONDITION_PATH_IS_READ_WRITE,
+ CONDITION_PATH_IS_ENCRYPTED,
+ CONDITION_DIRECTORY_NOT_EMPTY,
+ CONDITION_FILE_NOT_EMPTY,
+ CONDITION_FILE_IS_EXECUTABLE,
+
+ CONDITION_USER,
+ CONDITION_GROUP,
+
+ CONDITION_CONTROL_GROUP_CONTROLLER,
+
+ _CONDITION_TYPE_MAX, /* number of condition types, not a type itself */
+ _CONDITION_TYPE_INVALID = -EINVAL, /* negative marker for "no valid type" */
+} ConditionType;
+
+typedef enum ConditionResult { /* Outcome of the most recent evaluation of a condition. */
+ CONDITION_UNTESTED, /* NOTE(review): presumably the state before the first condition_test() call - confirm in condition_new() */
+ CONDITION_SUCCEEDED, /* the test, after applying negation, held */
+ CONDITION_FAILED, /* the test, after applying negation, did not hold */
+ CONDITION_ERROR, /* the test function returned a negative value */
+ _CONDITION_RESULT_MAX, /* number of results, sizes condition_result_table */
+ _CONDITION_RESULT_INVALID = -EINVAL,
+} ConditionResult;
+
+typedef struct Condition { /* One condition/assertion: a type plus its string argument and modifiers. */
+ ConditionType type:8; /* selects the test function in condition_test() */
+
+ bool trigger:1; /* shown as "|" prefix; condition_test_list() needs only one of the trigger conditions to hold */
+ bool negate:1; /* shown as "!" prefix; inverts the outcome in condition_test() */
+
+ ConditionResult result:6; /* written by condition_test() */
+
+ char *parameter; /* argument of the condition, interpreted by the per-type test function */
+
+ LIST_FIELDS(struct Condition, conditions); /* links the conditions of one list */
+} Condition;
+
+Condition* condition_new(ConditionType type, const char *parameter, bool trigger, bool negate);
+Condition* condition_free(Condition *c);
+Condition* condition_free_list_type(Condition *first, ConditionType type);
+static inline Condition* condition_free_list(Condition *first) { /* NOTE(review): passing the invalid type presumably means "free entries of every type" - confirm in condition_free_list_type() */
+ return condition_free_list_type(first, _CONDITION_TYPE_INVALID);
+}
+ /* Tests a single condition and stores the outcome in c->result. Returns > 0 if it holds, 0 if not,
+  * and a negative value if the result could not be determined. */
+int condition_test(Condition *c, char **env);
+ /* Callback types used by condition_test_list(): a printf-style logger, and a function mapping a
+  * condition type to the name to show for it (e.g. condition_type_to_string or assert_type_to_string). */
+typedef int (*condition_test_logger_t)(void *userdata, int level, int error, const char *file, int line, const char *func, const char *format, ...) _printf_(7, 8);
+typedef const char* (*condition_to_string_t)(ConditionType t) _const_;
+bool condition_test_list(Condition *first, char **env, condition_to_string_t to_string, condition_test_logger_t logger, void *userdata);
+ /* Print one condition, or a whole list of them, to f, one per line, each preceded by prefix. */
+void condition_dump(Condition *c, FILE *f, const char *prefix, condition_to_string_t to_string);
+void condition_dump_list(Condition *c, FILE *f, const char *prefix, condition_to_string_t to_string);
+ /* String conversions for the "Condition..." names of the types. */
+const char* condition_type_to_string(ConditionType t) _const_;
+ConditionType condition_type_from_string(const char *s) _pure_;
+ /* String conversions for the "Assert..." names of the same types. */
+const char* assert_type_to_string(ConditionType t) _const_;
+ConditionType assert_type_from_string(const char *s) _pure_;
+
+const char* condition_result_to_string(ConditionResult r) _const_;
+ConditionResult condition_result_from_string(const char *s) _pure_;
+
+/* Tells whether the parameter of the given condition type is a file system path. */
+static inline bool condition_takes_path(ConditionType t) {
+        switch (t) {
+
+        case CONDITION_PATH_EXISTS:
+        case CONDITION_PATH_EXISTS_GLOB:
+        case CONDITION_PATH_IS_DIRECTORY:
+        case CONDITION_PATH_IS_SYMBOLIC_LINK:
+        case CONDITION_PATH_IS_MOUNT_POINT:
+        case CONDITION_PATH_IS_READ_WRITE:
+        case CONDITION_PATH_IS_ENCRYPTED:
+        case CONDITION_DIRECTORY_NOT_EMPTY:
+        case CONDITION_FILE_NOT_EMPTY:
+        case CONDITION_FILE_IS_EXECUTABLE:
+        case CONDITION_NEEDS_UPDATE:
+                return true;
+
+        default:
+                return false;
+        }
+}
diff --git a/src/shared/conf-parser.c b/src/shared/conf-parser.c
new file mode 100644
index 0000000..e8ecd9b
--- /dev/null
+++ b/src/shared/conf-parser.c
@@ -0,0 +1,1984 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <limits.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+
+#include "alloc-util.h"
+#include "conf-files.h"
+#include "conf-parser.h"
+#include "constants.h"
+#include "dns-domain.h"
+#include "escape.h"
+#include "ether-addr-util.h"
+#include "extract-word.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "fs-util.h"
+#include "hash-funcs.h"
+#include "hostname-util.h"
+#include "id128-util.h"
+#include "in-addr-util.h"
+#include "log.h"
+#include "macro.h"
+#include "missing_network.h"
+#include "nulstr-util.h"
+#include "parse-helpers.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "percent-util.h"
+#include "process-util.h"
+#include "rlimit-util.h"
+#include "sd-id128.h"
+#include "set.h"
+#include "signal-util.h"
+#include "socket-util.h"
+#include "stat-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "syslog-util.h"
+#include "time-util.h"
+#include "utf8.h"
+
+DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR(config_file_hash_ops_fclose,
+ char, path_hash_func, path_compare,
+ FILE, safe_fclose); /* File-local hash ops: keys are paths (hashed and compared as such), values are FILE streams with
+  * safe_fclose as value destructor. Used for the drop-in map in config_parse_many_files(). */
+
+/* ConfigItemLookup implementation for plain tables: 'table' is an array of ConfigTableItem that is
+ * terminated by an entry without lvalue. Returns 1 and fills in the parser, ltype and data pointer of
+ * the entry matching both section and lvalue, or returns 0 with the outputs cleared if there is none. */
+int config_item_table_lookup(
+                const void *table,
+                const char *section,
+                const char *lvalue,
+                ConfigParserCallback *ret_func,
+                int *ret_ltype,
+                void **ret_data,
+                void *userdata) {
+
+        const ConfigTableItem *item;
+
+        assert(table);
+        assert(lvalue);
+        assert(ret_func);
+        assert(ret_ltype);
+        assert(ret_data);
+
+        for (item = table; item->lvalue; item++)
+                if (streq(lvalue, item->lvalue) && streq_ptr(section, item->section)) {
+                        *ret_func = item->parse;
+                        *ret_ltype = item->ltype;
+                        *ret_data = item->data;
+                        return 1;
+                }
+
+        *ret_func = NULL;
+        *ret_ltype = 0;
+        *ret_data = NULL;
+        return 0;
+}
+
+/* ConfigItemLookup implementation on top of a lookup function: 'table' is really a
+ * ConfigPerfItemLookup that is queried with "SECTION.LVALUE", or with just the lvalue when there is
+ * no section. On a hit returns 1 and yields the parser, the ltype and a data pointer located
+ * 'offset' bytes into userdata; otherwise returns 0 with the outputs cleared. */
+int config_item_perf_lookup(
+                const void *table,
+                const char *section,
+                const char *lvalue,
+                ConfigParserCallback *ret_func,
+                int *ret_ltype,
+                void **ret_data,
+                void *userdata) {
+
+        ConfigPerfItemLookup lookup = (ConfigPerfItemLookup) table;
+        const ConfigPerfItem *p;
+        const char *key = lvalue;
+
+        assert(table);
+        assert(lvalue);
+        assert(ret_func);
+        assert(ret_ltype);
+        assert(ret_data);
+
+        if (section)
+                key = strjoina(section, ".", lvalue);
+
+        p = lookup(key, strlen(key));
+        if (p) {
+                *ret_func = p->parse;
+                *ret_ltype = p->ltype;
+                *ret_data = (uint8_t*) userdata + p->offset;
+                return 1;
+        }
+
+        *ret_func = NULL;
+        *ret_ltype = 0;
+        *ret_data = NULL;
+        return 0;
+}
+
+/* Run the user supplied parser for an assignment.
+ *
+ * Looks up lvalue (within section, which may be NULL when the file is parsed without sections) via
+ * 'lookup'. If the lookup fails, its negative result is returned. If an entry is found, its parser
+ * callback is invoked and the callback's return value is returned; an entry without a callback is
+ * silently accepted. Unknown keys are ignored (return 0), with a warning unless CONFIG_PARSE_RELAXED
+ * is set or the key starts with "X-". */
+static int next_assignment(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                ConfigItemLookup lookup,
+                const void *table,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                const char *rvalue,
+                ConfigParseFlags flags,
+                void *userdata) {
+
+        ConfigParserCallback func = NULL;
+        int ltype = 0;
+        void *data = NULL;
+        int r;
+
+        assert(filename);
+        assert(line > 0);
+        assert(lookup);
+        assert(lvalue);
+        assert(rvalue);
+
+        r = lookup(table, section, lvalue, &func, &ltype, &data, userdata);
+        if (r < 0)
+                return r;
+        if (r > 0) {
+                if (!func)
+                        return 0;
+
+                return func(unit, filename, line, section, section_line,
+                            lvalue, ltype, rvalue, data, userdata);
+        }
+
+        /* Warn about unknown non-extension fields. The section is only mentioned if there is one, so
+         * that a NULL pointer is never handed to a %s conversion. */
+        if (!(flags & CONFIG_PARSE_RELAXED) && !startswith(lvalue, "X-"))
+                log_syntax(unit, LOG_WARNING, filename, line, 0,
+                           "Unknown key name '%s'%s%s%s, ignoring.",
+                           lvalue,
+                           section ? " in section '" : "",
+                           strempty(section),
+                           section ? "'" : "");
+
+        return 0;
+}
+
+/* Parse a single logical line, i.e. a line with all continuations already joined.
+ *
+ * A "[...]" line switches the current section (*section, *section_line, *section_ignored are
+ * updated); any other non-empty line is split at the first '=' and handed to next_assignment() with
+ * both sides stripped of surrounding whitespace. The buffer l is modified in place. Returns 0 for
+ * lines that are empty or ignored, a negative value on allocation failure or a bad section header,
+ * and otherwise whatever log_syntax() or next_assignment() returned. */
+static int parse_line(
+ const char* unit,
+ const char *filename,
+ unsigned line,
+ const char *sections,
+ ConfigItemLookup lookup,
+ const void *table,
+ ConfigParseFlags flags,
+ char **section,
+ unsigned *section_line,
+ bool *section_ignored,
+ char *l, /* is modified */
+ void *userdata) {
+
+ char *e;
+
+ assert(filename);
+ assert(line > 0);
+ assert(lookup);
+ assert(l);
+
+ l = strstrip(l);
+ if (isempty(l))
+ return 0;
+ /* NOTE(review): presumably unreachable if strstrip() removes leading newlines - confirm. */
+ if (l[0] == '\n')
+ return 0;
+
+ if (!utf8_is_valid(l))
+ return log_syntax_invalid_utf8(unit, LOG_WARNING, filename, line, l);
+ /* Section header */
+ if (l[0] == '[') {
+ _cleanup_free_ char *n = NULL;
+ size_t k;
+
+ k = strlen(l);
+ assert(k > 0);
+
+ if (l[k-1] != ']')
+ return log_syntax(unit, LOG_ERR, filename, line, SYNTHETIC_ERRNO(EBADMSG), "Invalid section header '%s'", l);
+ /* Copy the name between the brackets. */
+ n = strndup(l+1, k-2);
+ if (!n)
+ return log_oom();
+
+ if (!string_is_safe(n))
+ return log_syntax(unit, LOG_ERR, filename, line, SYNTHETIC_ERRNO(EBADMSG), "Bad characters in section header '%s'", l);
+ /* A section that is not in the list of known ones is skipped as a whole. */
+ if (sections && !nulstr_contains(sections, n)) {
+ bool ignore; /* whether the unknown section is skipped without a warning */
+
+ ignore = (flags & CONFIG_PARSE_RELAXED) || startswith(n, "X-");
+
+ if (!ignore)
+ NULSTR_FOREACH(t, sections)
+ if (streq_ptr(n, startswith(t, "-"))) { /* Ignore sections prefixed with "-" in valid section list */
+ ignore = true;
+ break;
+ }
+
+ if (!ignore)
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Unknown section '%s'. Ignoring.", n);
+ /* No current section anymore: the check further down drops all assignments until the next header. */
+ *section = mfree(*section);
+ *section_line = 0;
+ *section_ignored = true;
+ } else {
+ free_and_replace(*section, n);
+ *section_line = line;
+ *section_ignored = false;
+ }
+
+ return 0;
+ }
+ /* Assignment while no section is active: either before the first header or inside a skipped section. */
+ if (sections && !*section) {
+ if (!(flags & CONFIG_PARSE_RELAXED) && !*section_ignored)
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Assignment outside of section. Ignoring.");
+
+ return 0;
+ }
+
+ e = strchr(l, '=');
+ if (!e)
+ return log_syntax(unit, LOG_WARNING, filename, line, 0,
+ "Missing '=', ignoring line.");
+ if (e == l)
+ return log_syntax(unit, LOG_WARNING, filename, line, 0,
+ "Missing key name before '=', ignoring line.");
+ /* Terminate the key at the '=' and let e point to the value. */
+ *e = 0;
+ e++;
+
+ return next_assignment(unit,
+ filename,
+ line,
+ lookup,
+ table,
+ *section,
+ *section_line,
+ strstrip(l),
+ strstrip(e),
+ flags,
+ userdata);
+}
+
+/* Go through the file and parse each line.
+ *
+ * If f is NULL the file is opened here from filename and closed again on return. Returns 1 once the
+ * whole file has been parsed, 0 if filename does not exist (ENOENT; *ret_stat is zeroed then), and a
+ * negative errno-style value on failure. On success *ret_stat, if given, receives the fstat() data
+ * of the stream, or zeroes when the stream has no file descriptor. Most failures are only logged
+ * when CONFIG_PARSE_WARN is set in flags. */
+int config_parse(
+ const char *unit,
+ const char *filename,
+ FILE *f,
+ const char *sections,
+ ConfigItemLookup lookup,
+ const void *table,
+ ConfigParseFlags flags,
+ void *userdata,
+ struct stat *ret_stat) {
+
+ _cleanup_free_ char *section = NULL, *continuation = NULL; /* current section name; text of an unfinished continued line */
+ _cleanup_fclose_ FILE *ours = NULL; /* set only if the stream was opened here, so that it gets closed on return */
+ unsigned line = 0, section_line = 0;
+ bool section_ignored = false, bom_seen = false;
+ struct stat st;
+ int r, fd;
+
+ assert(filename);
+ assert(lookup);
+
+ if (!f) {
+ f = ours = fopen(filename, "re");
+ if (!f) {
+ /* Only log on request, except for ENOENT,
+ * since we return 0 to the caller. */
+ if ((flags & CONFIG_PARSE_WARN) || errno == ENOENT)
+ log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_ERR, errno,
+ "Failed to open configuration file '%s': %m", filename);
+
+ if (errno == ENOENT) {
+ if (ret_stat)
+ *ret_stat = (struct stat) {};
+
+ return 0;
+ }
+
+ return -errno;
+ }
+ }
+
+ fd = fileno(f);
+ if (fd >= 0) { /* stream might not have an fd, let's be careful hence */
+
+ if (fstat(fd, &st) < 0)
+ return log_full_errno(FLAGS_SET(flags, CONFIG_PARSE_WARN) ? LOG_ERR : LOG_DEBUG, errno,
+ "Failed to fstat(%s): %m", filename);
+
+ (void) stat_warn_permissions(filename, &st);
+ } else
+ st = (struct stat) {};
+ /* Read the file line by line until read_line() reports the end of the file. */
+ for (;;) {
+ _cleanup_free_ char *buf = NULL;
+ bool escaped = false; /* true while the previous character was an unescaped backslash */
+ char *l, *p, *e;
+
+ r = read_line(f, LONG_LINE_MAX, &buf);
+ if (r == 0)
+ break;
+ if (r == -ENOBUFS) {
+ if (flags & CONFIG_PARSE_WARN)
+ log_error_errno(r, "%s:%u: Line too long", filename, line);
+
+ return r;
+ }
+ if (r < 0) {
+ if (FLAGS_SET(flags, CONFIG_PARSE_WARN))
+ log_error_errno(r, "%s:%u: Error while reading configuration file: %m", filename, line);
+
+ return r;
+ }
+
+ line++;
+ /* Skip comment lines. This happens before the continuation handling, hence a comment line
+  * in the middle of a continued line is dropped without ending the continuation. */
+ l = skip_leading_chars(buf, WHITESPACE);
+ if (*l != '\0' && strchr(COMMENTS, *l))
+ continue;
+ /* Strip a UTF-8 byte order mark from the start of the line, but only as long as none was seen yet. */
+ l = buf;
+ if (!bom_seen) {
+ char *q;
+
+ q = startswith(buf, UTF8_BYTE_ORDER_MARK);
+ if (q) {
+ l = q;
+ bom_seen = true;
+ }
+ }
+ /* If the previous line ended in a backslash, append this line to the text collected so far. */
+ if (continuation) {
+ if (strlen(continuation) + strlen(l) > LONG_LINE_MAX) {
+ if (flags & CONFIG_PARSE_WARN)
+ log_error("%s:%u: Continuation line too long", filename, line);
+ return -ENOBUFS;
+ }
+
+ if (!strextend(&continuation, l)) {
+ if (flags & CONFIG_PARSE_WARN)
+ log_oom();
+ return -ENOMEM;
+ }
+
+ p = continuation;
+ } else
+ p = l;
+ /* Scan to the end of the text; afterwards 'escaped' tells whether it ends in an unescaped backslash. */
+ for (e = p; *e; e++) {
+ if (escaped)
+ escaped = false;
+ else if (*e == '\\')
+ escaped = true;
+ }
+ /* Trailing backslash: replace it by a space, keep the text and continue with the next line. */
+ if (escaped) {
+ *(e-1) = ' ';
+
+ if (!continuation) {
+ continuation = strdup(l);
+ if (!continuation) {
+ if (flags & CONFIG_PARSE_WARN)
+ log_oom();
+ return -ENOMEM;
+ }
+ }
+
+ continue;
+ }
+ /* The logical line is complete, parse it. */
+ r = parse_line(unit,
+ filename,
+ line,
+ sections,
+ lookup,
+ table,
+ flags,
+ &section,
+ &section_line,
+ &section_ignored,
+ p,
+ userdata);
+ if (r < 0) {
+ if (flags & CONFIG_PARSE_WARN)
+ log_warning_errno(r, "%s:%u: Failed to parse file: %m", filename, line);
+ return r;
+ }
+
+ continuation = mfree(continuation);
+ }
+ /* The file ended right after a line with a trailing backslash: parse what was collected so far. */
+ if (continuation) {
+ r = parse_line(unit,
+ filename,
+ ++line,
+ sections,
+ lookup,
+ table,
+ flags,
+ &section,
+ &section_line,
+ &section_ignored,
+ continuation,
+ userdata);
+ if (r < 0) {
+ if (flags & CONFIG_PARSE_WARN)
+ log_warning_errno(r, "%s:%u: Failed to parse file: %m", filename, line);
+ return r;
+ }
+ }
+
+ if (ret_stat)
+ *ret_stat = st;
+
+ return 1;
+}
+
+/* Stores a copy of *st in *stats_by_path under a copy of path, allocating the hashmap first if
+ * necessary. Returns 0 on success and a negative errno-style value on failure, in which case both
+ * copies are freed again. */
+int hashmap_put_stats_by_path(Hashmap **stats_by_path, const char *path, const struct stat *st) {
+        _cleanup_free_ struct stat *value = NULL;
+        _cleanup_free_ char *key = NULL;
+        int r;
+
+        assert(stats_by_path);
+        assert(path);
+        assert(st);
+
+        r = hashmap_ensure_allocated(stats_by_path, &path_hash_ops_free_free);
+        if (r < 0)
+                return r;
+
+        value = newdup(struct stat, st, 1);
+        if (!value)
+                return -ENOMEM;
+
+        key = strdup(path);
+        if (!key)
+                return -ENOMEM;
+
+        r = hashmap_put(*stats_by_path, key, value);
+        if (r < 0)
+                return r;
+        assert(r > 0);
+
+        /* The hashmap holds both copies now, do not free them when leaving the function. */
+        TAKE_PTR(key);
+        TAKE_PTR(value);
+
+        return 0;
+}
+
+/* Parses the first main config file from 'conf_files' that can be opened, followed by every drop-in
+ * listed in 'files'. Files that do not exist (ENOENT) are skipped. A main config file whose inode
+ * matches one of the opened drop-ins is not parsed directly, as it is read as drop-in anyway. If
+ * 'ret_stats_by_path' is non-NULL, the stat data of each parsed file is returned in a hashmap keyed
+ * by path. Returns 0 on success, a negative errno-style value on failure. */
+static int config_parse_many_files(
+ const char* const* conf_files,
+ char **files,
+ const char *sections,
+ ConfigItemLookup lookup,
+ const void *table,
+ ConfigParseFlags flags,
+ void *userdata,
+ Hashmap **ret_stats_by_path) {
+
+ _cleanup_hashmap_free_ Hashmap *stats_by_path = NULL;
+ _cleanup_ordered_hashmap_free_ OrderedHashmap *dropins = NULL;
+ _cleanup_set_free_ Set *inodes = NULL;
+ struct stat st;
+ int r;
+
+ /* Only collect stat data if the caller asked for it. */
+ if (ret_stats_by_path) {
+ stats_by_path = hashmap_new(&path_hash_ops_free_free);
+ if (!stats_by_path)
+ return -ENOMEM;
+ }
+
+ /* Open all drop-ins up front, and remember the stream and the inode data of each. */
+ STRV_FOREACH(fn, files) {
+ _cleanup_free_ struct stat *st_dropin = NULL;
+ _cleanup_fclose_ FILE *f = NULL;
+ int fd;
+
+ f = fopen(*fn, "re");
+ if (!f) {
+ if (errno == ENOENT)
+ continue;
+
+ return -errno;
+ }
+
+ /* Fetch the descriptor now, as 'f' is reset by TAKE_PTR() below once the stream is
+ * stored in 'dropins'. */
+ fd = fileno(f);
+
+ r = ordered_hashmap_ensure_put(&dropins, &config_file_hash_ops_fclose, *fn, f);
+ if (r < 0) {
+ assert(r != -EEXIST);
+ return r;
+ }
+ assert(r > 0);
+ TAKE_PTR(f);
+
+ /* Get inodes for all drop-ins. Later we'll verify if main config is a symlink to or is
+ * symlinked as one of them. If so, we skip reading main config file directly. */
+
+ st_dropin = new(struct stat, 1);
+ if (!st_dropin)
+ return -ENOMEM;
+
+ if (fstat(fd, st_dropin) < 0)
+ return -errno;
+
+ r = set_ensure_consume(&inodes, &inode_hash_ops, TAKE_PTR(st_dropin));
+ if (r < 0)
+ return r;
+ }
+
+ /* First read the first found main config file. */
+ STRV_FOREACH(fn, conf_files) {
+ _cleanup_fclose_ FILE *f = NULL;
+
+ f = fopen(*fn, "re");
+ if (!f) {
+ if (errno == ENOENT)
+ continue;
+
+ return -errno;
+ }
+
+ /* 'inodes' is only allocated if at least one drop-in was opened above. */
+ if (inodes) {
+ if (fstat(fileno(f), &st) < 0)
+ return -errno;
+
+ if (set_contains(inodes, &st)) {
+ log_debug("%s: symlink to/symlinked as drop-in, will be read later.", *fn);
+ break;
+ }
+ }
+
+ r = config_parse(NULL, *fn, f, sections, lookup, table, flags, userdata, &st);
+ if (r < 0)
+ return r;
+ assert(r > 0);
+
+ if (ret_stats_by_path) {
+ r = hashmap_put_stats_by_path(&stats_by_path, *fn, &st);
+ if (r < 0)
+ return r;
+ }
+
+ /* Stop after the first main config file that could be opened. */
+ break;
+ }
+
+ /* Then read all the drop-ins. */
+
+ const char *path_dropin;
+ FILE *f_dropin;
+ ORDERED_HASHMAP_FOREACH_KEY(f_dropin, path_dropin, dropins) {
+ r = config_parse(NULL, path_dropin, f_dropin, sections, lookup, table, flags, userdata, &st);
+ if (r < 0)
+ return r;
+ assert(r > 0);
+
+ if (ret_stats_by_path) {
+ r = hashmap_put_stats_by_path(&stats_by_path, path_dropin, &st);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ /* Hand the collected stat data to the caller, if requested. */
+ if (ret_stats_by_path)
+ *ret_stats_by_path = TAKE_PTR(stats_by_path);
+
+ return 0;
+}
+
+/* Parse one main config file located in /etc/systemd and its drop-ins, which is what all systemd daemons
+ * do.
+ *
+ * The main file is PKGSYSCONFDIR "/" conf_file. The drop-ins are the ".conf" files found in the
+ * "<prefix>systemd/<conf_file>.d" directory of each prefix in CONF_PATHS_STRV(""). Allocation failures
+ * while building the directory list are logged only if CONFIG_PARSE_WARN is set in 'flags'. Returns the
+ * result of config_parse_many_files(), or a negative errno-style value on earlier failure. */
+int config_parse_config_file(
+ const char *conf_file,
+ const char *sections,
+ ConfigItemLookup lookup,
+ const void *table,
+ ConfigParseFlags flags,
+ void *userdata) {
+
+ _cleanup_strv_free_ char **dropins = NULL, **dropin_dirs = NULL;
+ char **conf_paths = CONF_PATHS_STRV("");
+ int r;
+
+ assert(conf_file);
+
+ /* build the dropin dir list */
+ /* One slot per prefix plus one more, zero-initialized, so the array stays NULL-terminated while
+ * it is being filled. */
+ dropin_dirs = new0(char*, strv_length(conf_paths) + 1);
+ if (!dropin_dirs) {
+ if (flags & CONFIG_PARSE_WARN)
+ return log_oom();
+ return -ENOMEM;
+ }
+
+ size_t i = 0;
+ STRV_FOREACH(p, conf_paths) {
+ char *d;
+
+ d = strjoin(*p, "systemd/", conf_file, ".d");
+ if (!d) {
+ if (flags & CONFIG_PARSE_WARN)
+ return log_oom();
+ return -ENOMEM;
+ }
+
+ dropin_dirs[i++] = d;
+ }
+
+ r = conf_files_list_strv(&dropins, ".conf", NULL, 0, (const char**) dropin_dirs);
+ if (r < 0)
+ return r;
+
+ const char *sysconf_file = strjoina(PKGSYSCONFDIR, "/", conf_file);
+
+ /* No stat data is requested here, hence the trailing NULL. */
+ return config_parse_many_files(STRV_MAKE_CONST(sysconf_file), dropins,
+ sections, lookup, table, flags, userdata, NULL);
+}
+
+/* Parse each config file in the directories specified as strv: the drop-ins are enumerated with
+ * conf_files_list_dropins() and then parsed together with 'conf_files' by config_parse_many_files().
+ * On success the list of drop-in files is handed to the caller if 'ret_dropin_files' is non-NULL. */
+int config_parse_many(
+                const char* const* conf_files,
+                const char* const* conf_file_dirs,
+                const char *dropin_dirname,
+                const char *root,
+                const char *sections,
+                ConfigItemLookup lookup,
+                const void *table,
+                ConfigParseFlags flags,
+                void *userdata,
+                Hashmap **ret_stats_by_path,
+                char ***ret_dropin_files) {
+
+        _cleanup_strv_free_ char **dropin_files = NULL;
+        int k;
+
+        assert(conf_file_dirs);
+        assert(dropin_dirname);
+        assert(sections);
+        assert(table);
+
+        k = conf_files_list_dropins(&dropin_files, dropin_dirname, root, conf_file_dirs);
+        if (k < 0)
+                return k;
+
+        k = config_parse_many_files(
+                        conf_files,
+                        dropin_files,
+                        sections,
+                        lookup,
+                        table,
+                        flags,
+                        userdata,
+                        ret_stats_by_path);
+        if (k < 0)
+                return k;
+
+        if (ret_dropin_files)
+                *ret_dropin_files = TAKE_PTR(dropin_files);
+
+        return 0;
+}
+
+/* Adds the stat data of every drop-in of 'conf_file' to '*stats_by_path'. The drop-ins are looked up
+ * in the directory named "<filename of conf_file>.d" via conf_files_list_dropins(). Drop-ins that
+ * vanished in the meantime (ENOENT) are skipped. */
+static int dropins_get_stats_by_path(
+                const char* conf_file,
+                const char* const* conf_file_dirs,
+                Hashmap **stats_by_path) {
+
+        _cleanup_strv_free_ char **dropins = NULL;
+        _cleanup_free_ char *dirname = NULL;
+        int k;
+
+        assert(conf_file);
+        assert(conf_file_dirs);
+        assert(stats_by_path);
+
+        k = path_extract_filename(conf_file, &dirname);
+        if (k < 0)
+                return k;
+        if (k == O_DIRECTORY) /* presumably: the path refers to a directory — not a config file */
+                return -EINVAL;
+
+        if (!strextend(&dirname, ".d"))
+                return -ENOMEM;
+
+        k = conf_files_list_dropins(&dropins, dirname, /* root = */ NULL, conf_file_dirs);
+        if (k < 0)
+                return k;
+
+        STRV_FOREACH(fn, dropins) {
+                struct stat st;
+
+                if (stat(*fn, &st) < 0) {
+                        if (errno != ENOENT)
+                                return -errno;
+
+                        continue;
+                }
+
+                k = hashmap_put_stats_by_path(stats_by_path, *fn, &st);
+                if (k < 0)
+                        return k;
+        }
+
+        return 0;
+}
+
+/* Collects the stat data of all config files matching 'suffix' in 'dirs' — and, if 'check_dropins' is
+ * true, of their drop-ins too — in a newly allocated hashmap keyed by path, returned in '*ret'. */
+int config_get_stats_by_path(
+                const char *suffix,
+                const char *root,
+                unsigned flags,
+                const char* const* dirs,
+                bool check_dropins,
+                Hashmap **ret) {
+
+        _cleanup_hashmap_free_ Hashmap *collected = NULL;
+        _cleanup_strv_free_ char **conf_files = NULL;
+        int k;
+
+        assert(suffix);
+        assert(dirs);
+        assert(ret);
+
+        /* Unlike config_parse(), this does not support stream. */
+
+        k = conf_files_list_strv(&conf_files, suffix, root, flags, dirs);
+        if (k < 0)
+                return k;
+
+        STRV_FOREACH(f, conf_files) {
+                struct stat st;
+
+                /* The main config file comes first … */
+                if (stat(*f, &st) < 0) {
+                        if (errno != ENOENT)
+                                return -errno;
+
+                        continue;
+                }
+
+                k = hashmap_put_stats_by_path(&collected, *f, &st);
+                if (k < 0)
+                        return k;
+
+                /* … followed by all of its drop-ins, if requested. */
+                if (check_dropins) {
+                        k = dropins_get_stats_by_path(*f, dirs, &collected);
+                        if (k < 0)
+                                return k;
+                }
+        }
+
+        *ret = TAKE_PTR(collected);
+        return 0;
+}
+
+/* Returns true if both hashmaps have the same size and every path in 'a' is also present in 'b' with
+ * stat data for which stat_inode_unmodified() reports no change. */
+bool stats_by_path_equal(Hashmap *a, Hashmap *b) {
+        struct stat *lhs;
+        const char *path;
+
+        if (hashmap_size(a) != hashmap_size(b))
+                return false;
+
+        HASHMAP_FOREACH_KEY(lhs, path, a) {
+                struct stat *rhs = hashmap_get(b, path);
+
+                if (!rhs || !stat_inode_unmodified(lhs, rhs))
+                        return false;
+        }
+
+        return true;
+}
+
+/* Hashes a ConfigSection by feeding first its filename, then its line number into 'state'. */
+static void config_section_hash_func(const ConfigSection *c, struct siphash *state) {
+        siphash24_compress_string(c->filename, state);
+        siphash24_compress(&c->line, sizeof c->line, state);
+}
+
+/* Orders two ConfigSection objects by filename first, and by line number if the filenames match. */
+static int config_section_compare_func(const ConfigSection *x, const ConfigSection *y) {
+        int by_name = strcmp(x->filename, y->filename);
+
+        return by_name != 0 ? by_name : CMP(x->line, y->line);
+}
+
+/* Hash operations for tables keyed by ConfigSection, built from the two helpers above. */
+DEFINE_HASH_OPS(config_section_hash_ops, ConfigSection, config_section_hash_func, config_section_compare_func);
+
+/* Allocates a ConfigSection carrying a copy of 'filename' and the given (non-zero) 'line', and returns
+ * it in '*ret'. Returns 0 on success, -ENOMEM if the allocation fails. */
+int config_section_new(const char *filename, unsigned line, ConfigSection **ret) {
+        ConfigSection *cs;
+        size_t name_size;
+
+        assert(filename);
+        assert(line > 0);
+        assert(ret);
+
+        /* The filename is stored inline at the end of the structure, NUL terminator included. */
+        name_size = strlen(filename) + 1;
+
+        cs = malloc0(offsetof(ConfigSection, filename) + name_size);
+        if (!cs)
+                return -ENOMEM;
+
+        cs->line = line;
+        strcpy(cs->filename, filename);
+
+        *ret = TAKE_PTR(cs);
+        return 0;
+}
+
+/* Determines a line number that is one larger than the largest line of any ConfigSection key in
+ * 'entries_by_section'. If 'filename' is non-NULL only keys for that file are considered. Returns
+ * -EFBIG if the largest line found is already UINT_MAX. */
+int _hashmap_by_section_find_unused_line(
+                HashmapBase *entries_by_section,
+                const char *filename,
+                unsigned *ret) {
+
+        ConfigSection *key;
+        unsigned highest = 0;
+        void *value;
+
+        HASHMAP_BASE_FOREACH_KEY(value, key, entries_by_section)
+                if (!filename || streq(key->filename, filename))
+                        highest = MAX(highest, key->line);
+
+        /* overflow? */
+        if (highest >= UINT_MAX)
+                return -EFBIG;
+
+        *ret = highest + 1;
+        return 0;
+}
+
+/* Shorthand around DEFINE_CONFIG_PARSE_PTR(): DEFINE_PARSER(type, vartype, conv_func) defines a parser
+ * named config_parse_<type> for a variable of type 'vartype' that converts with 'conv_func' and uses
+ * "Failed to parse <type> value" as error message. */
+#define DEFINE_PARSER(type, vartype, conv_func) \
+ DEFINE_CONFIG_PARSE_PTR(config_parse_##type, conv_func, vartype, "Failed to parse " #type " value")
+
+/* Parsers for plain integer and floating point types. */
+DEFINE_PARSER(int, int, safe_atoi);
+DEFINE_PARSER(long, long, safe_atoli);
+DEFINE_PARSER(uint8, uint8_t, safe_atou8);
+DEFINE_PARSER(uint16, uint16_t, safe_atou16);
+DEFINE_PARSER(uint32, uint32_t, safe_atou32);
+DEFINE_PARSER(int32, int32_t, safe_atoi32);
+DEFINE_PARSER(uint64, uint64_t, safe_atou64);
+DEFINE_PARSER(unsigned, unsigned, safe_atou);
+DEFINE_PARSER(double, double, safe_atod);
+/* Parsers for time spans, file modes and PIDs. */
+DEFINE_PARSER(nsec, nsec_t, parse_nsec);
+DEFINE_PARSER(sec, usec_t, parse_sec);
+DEFINE_PARSER(sec_def_infinity, usec_t, parse_sec_def_infinity);
+DEFINE_PARSER(mode, mode_t, parse_mode);
+DEFINE_PARSER(pid, pid_t, parse_pid);
+
+/* Parses 'rvalue' with parse_size() (base 1024) and stores the result in the size_t that 'data' points
+ * to. Values that fail to parse or do not fit into a size_t are logged and ignored. Always returns 0. */
+int config_parse_iec_size(
+                const char* unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        size_t *sz = ASSERT_PTR(data);
+        uint64_t parsed;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        r = parse_size(rvalue, 1024, &parsed);
+        if (r >= 0) {
+                size_t narrowed = (size_t) parsed;
+
+                /* Only accept the value if narrowing it to size_t did not truncate it. */
+                if ((uint64_t) narrowed == parsed) {
+                        *sz = narrowed;
+                        return 0;
+                }
+
+                r = -ERANGE;
+        }
+
+        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse size value '%s', ignoring: %m", rvalue);
+        return 0;
+}
+
+/* Parses 'rvalue' with parse_size() (base 1000) directly into the uint64_t that 'data' points to.
+ * Parse failures are logged and ignored. Always returns 0. */
+int config_parse_si_uint64(
+                const char* unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        uint64_t *target = ASSERT_PTR(data);
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        r = parse_size(rvalue, 1000, target);
+        if (r >= 0)
+                return 0;
+
+        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse size value '%s', ignoring: %m", rvalue);
+        return 0;
+}
+
+/* Parses 'rvalue' with parse_size() (base 1024) directly into the uint64_t that 'data' points to.
+ * Parse failures are logged and ignored. Always returns 0. */
+int config_parse_iec_uint64(
+                const char* unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        uint64_t *target = ASSERT_PTR(data);
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        r = parse_size(rvalue, 1024, target);
+        if (r >= 0)
+                return 0;
+
+        log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse size value, ignoring: %s", rvalue);
+        return 0;
+}
+
+/* Like config_parse_iec_uint64(), but additionally maps the literal string "infinity" to
+ * UINT64_MAX. */
+int config_parse_iec_uint64_infinity(
+                const char* unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        uint64_t *target = ASSERT_PTR(data);
+
+        assert(rvalue);
+
+        if (!streq(rvalue, "infinity"))
+                return config_parse_iec_uint64(
+                                unit, filename, line, section, section_line, lvalue, ltype, rvalue, data, userdata);
+
+        *target = UINT64_MAX;
+        return 0;
+}
+
+/* Parses 'rvalue' as boolean into the bool that 'data' points to. A non-zero 'ltype' makes a parse
+ * failure fatal: it is then logged at LOG_ERR and -ENOEXEC is returned. Otherwise the failure is
+ * logged at LOG_WARNING, the value is ignored and 0 is returned. */
+int config_parse_bool(
+                const char* unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        int parsed;
+        bool *target = ASSERT_PTR(data);
+        bool fatal = ltype;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        parsed = parse_boolean(rvalue);
+        if (parsed >= 0) {
+                *target = parsed;
+                return 0;
+        }
+
+        log_syntax(unit, fatal ? LOG_ERR : LOG_WARNING, filename, line, parsed,
+                   "Failed to parse boolean value%s: %s",
+                   fatal ? "" : ", ignoring", rvalue);
+
+        return fatal ? -ENOEXEC : 0;
+}
+
+/* Parses 'rvalue' with id128_from_string_nonzero(), passing 'data' as destination. Both all-zero IDs
+ * (-ENXIO) and other parse failures are logged and ignored. Always returns 0. */
+int config_parse_id128(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        sd_id128_t *target = data;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        r = id128_from_string_nonzero(rvalue, target);
+        if (r >= 0)
+                return 0;
+
+        if (r == -ENXIO)
+                log_syntax(unit, LOG_WARNING, filename, line, r, "128-bit ID/UUID is all 0, ignoring: %s", rvalue);
+        else
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse 128-bit ID/UUID, ignoring: %s", rvalue);
+
+        return 0;
+}
+
+/* Parses 'rvalue' into the tristate int that 'data' points to. Parse failures are logged and ignored.
+ * Always returns 0. */
+int config_parse_tristate(
+                const char* unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        int r, *tristate = ASSERT_PTR(data);
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        /* A tristate is pretty much a boolean, except that it can also take an empty string,
+         * indicating "uninitialized", much like NULL is for a pointer type. */
+
+        if (isempty(rvalue)) {
+                *tristate = -1;
+                return 0;
+        }
+
+        r = parse_tristate(rvalue, tristate);
+        if (r < 0)
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to parse boolean value for %s=, ignoring: %s", lvalue, rvalue);
+
+        return 0;
+}
+
+/* Stores a copy of 'rvalue' in the string that 'data' points to; an empty 'rvalue' resets it to NULL.
+ * Depending on the CONFIG_PARSE_STRING_SAFE and CONFIG_PARSE_STRING_ASCII bits in 'ltype', strings
+ * failing string_is_safe() or ascii_is_valid() are logged (in escaped form) and ignored. */
+int config_parse_string(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        char **target = ASSERT_PTR(data);
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                *target = mfree(*target);
+                return 0;
+        }
+
+        if (FLAGS_SET(ltype, CONFIG_PARSE_STRING_SAFE) && !string_is_safe(rvalue)) {
+                _cleanup_free_ char *printable = cescape(rvalue);
+
+                log_syntax(unit, LOG_WARNING, filename, line, 0,
+                           "Specified string contains unsafe characters, ignoring: %s", strna(printable));
+                return 0;
+        }
+
+        if (FLAGS_SET(ltype, CONFIG_PARSE_STRING_ASCII) && !ascii_is_valid(rvalue)) {
+                _cleanup_free_ char *printable = cescape(rvalue);
+
+                log_syntax(unit, LOG_WARNING, filename, line, 0,
+                           "Specified string contains invalid ASCII characters, ignoring: %s", strna(printable));
+                return 0;
+        }
+
+        return free_and_strdup_warn(target, empty_to_null(rvalue));
+}
+
+/* Stores a copy of 'rvalue' in the string that 'data' points to if dns_name_is_valid() accepts it; an
+ * empty 'rvalue' resets the string to NULL. Invalid names are logged and ignored. */
+int config_parse_dns_name(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        char **target = ASSERT_PTR(data);
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                *target = mfree(*target);
+                return 0;
+        }
+
+        r = dns_name_is_valid(rvalue);
+        if (r <= 0) {
+                /* Negative: the check itself failed. Zero: the name is invalid. */
+                if (r < 0)
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Failed to check validity of DNS domain name '%s', ignoring assignment: %m", rvalue);
+                else
+                        log_syntax(unit, LOG_WARNING, filename, line, 0,
+                                   "Specified invalid DNS domain name, ignoring assignment: %s", rvalue);
+
+                return 0;
+        }
+
+        return free_and_strdup_warn(target, rvalue);
+}
+
+/* Like config_parse_dns_name(), but first requires 'rvalue' to pass hostname_is_valid(); an empty
+ * 'rvalue' resets the string that 'data' points to. Invalid hostnames are logged and ignored. */
+int config_parse_hostname(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        char **target = ASSERT_PTR(data);
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                *target = mfree(*target);
+                return 0;
+        }
+
+        if (hostname_is_valid(rvalue, 0))
+                return config_parse_dns_name(
+                                unit, filename, line, section, section_line,
+                                lvalue, ltype, rvalue, data, userdata);
+
+        log_syntax(unit, LOG_WARNING, filename, line, 0,
+                   "Specified invalid hostname, ignoring assignment: %s", rvalue);
+        return 0;
+}
+
+/* Stores a copy of 'rvalue', processed by path_simplify_and_warn() with PATH_CHECK_ABSOLUTE, in the
+ * string that 'data' points to; an empty 'rvalue' resets the string to NULL. A non-zero 'ltype' makes
+ * a rejected path fatal (-ENOEXEC), otherwise the assignment is ignored and 0 is returned. */
+int config_parse_path(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        _cleanup_free_ char *path = NULL;
+        bool fatal = ltype;
+        char **target = ASSERT_PTR(data);
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (!isempty(rvalue)) {
+                path = strdup(rvalue);
+                if (!path)
+                        return log_oom();
+
+                r = path_simplify_and_warn(
+                                path,
+                                PATH_CHECK_ABSOLUTE | (fatal ? PATH_CHECK_FATAL : 0),
+                                unit, filename, line, lvalue);
+                if (r < 0)
+                        return fatal ? -ENOEXEC : 0;
+        }
+
+        /* For an empty rvalue 'path' is still NULL here, which resets the setting. */
+        return free_and_replace(*target, path);
+}
+
+/* Splits 'rvalue' into words with extract_first_word() and appends them to the strv that 'data' points
+ * to; an empty 'rvalue' frees the whole strv. On a syntax error the remainder of the line is logged
+ * and ignored; words appended before the error stay in the list. */
+int config_parse_strv(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        char ***list = ASSERT_PTR(data);
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                *list = strv_free(*list);
+                return 0;
+        }
+
+        for (const char *cursor = rvalue;;) {
+                char *item = NULL;
+
+                r = extract_first_word(&cursor, &item, NULL, EXTRACT_UNQUOTE|EXTRACT_RETAIN_ESCAPE);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue);
+                        return 0;
+                }
+                if (r == 0) /* end of input */
+                        return 0;
+
+                r = strv_consume(list, item);
+                if (r < 0)
+                        return log_oom();
+        }
+}
+
+/* Does not parse anything: only logs that the option 'lvalue' is ignored, with a message and log level
+ * selected by the Disabled reason passed in 'ltype'. Always returns 0. */
+int config_parse_warn_compat(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        Disabled why = ltype;
+
+        if (why == DISABLED_CONFIGURATION)
+                log_syntax(unit, LOG_DEBUG, filename, line, 0,
+                           "Support for option %s= has been disabled at compile time and it is ignored", lvalue);
+        else if (why == DISABLED_LEGACY)
+                log_syntax(unit, LOG_INFO, filename, line, 0,
+                           "Support for option %s= has been removed and it is ignored", lvalue);
+        else if (why == DISABLED_EXPERIMENTAL)
+                log_syntax(unit, LOG_INFO, filename, line, 0,
+                           "Support for option %s= has not yet been enabled and it is ignored", lvalue);
+
+        return 0;
+}
+
+/* Parses 'rvalue' as log facility name and replaces the facility part of the int that 'data' points
+ * to, keeping its LOG_PRI() part. Unknown names are logged and ignored. Always returns 0. */
+int config_parse_log_facility(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        int *target = data, facility;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+        assert(data);
+
+        facility = log_facility_unshifted_from_string(rvalue);
+        if (facility >= 0)
+                *target = (facility << 3) | LOG_PRI(*target);
+        else
+                log_syntax(unit, LOG_WARNING, filename, line, facility, "Failed to parse log facility, ignoring: %s", rvalue);
+
+        return 0;
+}
+
+/* Parses 'rvalue' as log level name and replaces the level part of the int that 'data' points to,
+ * keeping its facility bits. Unknown names are logged and ignored. Always returns 0. */
+int config_parse_log_level(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        int *target = data, level;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+        assert(data);
+
+        level = log_level_from_string(rvalue);
+        if (level < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, level, "Failed to parse log level, ignoring: %s", rvalue);
+                return 0;
+        }
+
+        /* if it wasn't initialized so far, assume zero facility */
+        *target = *target < 0 ? level : ((*target & LOG_FACMASK) | level);
+
+        return 0;
+}
+
+/* Parses 'rvalue' with signal_from_string() into the int that 'data' points to. Results that are zero
+ * or negative are logged and ignored. Always returns 0. */
+int config_parse_signal(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        int *sig = data, signo;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+        assert(sig);
+
+        signo = signal_from_string(rvalue);
+        if (signo > 0)
+                *sig = signo;
+        else
+                log_syntax(unit, LOG_WARNING, filename, line, signo, "Failed to parse signal name, ignoring: %s", rvalue);
+
+        return 0;
+}
+
+/* Parses 'rvalue' with personality_from_string() into the unsigned long that 'data' points to. An
+ * empty 'rvalue' stores PERSONALITY_INVALID; unknown names are logged and ignored. Always returns 0. */
+int config_parse_personality(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        unsigned long *personality = data, parsed = PERSONALITY_INVALID;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+        assert(personality);
+
+        if (!isempty(rvalue)) {
+                parsed = personality_from_string(rvalue);
+                if (parsed == PERSONALITY_INVALID) {
+                        log_syntax(unit, LOG_WARNING, filename, line, 0, "Failed to parse personality, ignoring: %s", rvalue);
+                        return 0;
+                }
+        }
+
+        *personality = parsed;
+        return 0;
+}
+
+/* Stores a copy of 'rvalue' in the string that 'data' points to if ifname_valid() accepts it; an empty
+ * 'rvalue' resets the string to NULL. Invalid names are logged and ignored. */
+int config_parse_ifname(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        char **target = ASSERT_PTR(data);
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                *target = mfree(*target);
+                return 0;
+        }
+
+        if (!ifname_valid(rvalue)) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0, "Interface name is not valid or too long, ignoring assignment: %s", rvalue);
+                return 0;
+        }
+
+        if (free_and_strdup(target, rvalue) < 0)
+                return log_oom();
+
+        return 0;
+}
+
+/* Splits 'rvalue' into words and appends those accepted by ifname_valid_full() (with 'ltype' as flags)
+ * to the strv that 'data' points to; an empty 'rvalue' frees the strv. Invalid words are logged and
+ * skipped; if word extraction itself fails, the whole assignment is logged and ignored. */
+int config_parse_ifnames(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        _cleanup_strv_free_ char **accepted = NULL;
+        char ***target = ASSERT_PTR(data);
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                *target = strv_free(*target);
+                return 0;
+        }
+
+        for (const char *cursor = rvalue;;) {
+                _cleanup_free_ char *name = NULL;
+
+                r = extract_first_word(&cursor, &name, NULL, 0);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Failed to extract interface name, ignoring assignment: %s",
+                                   rvalue);
+                        return 0;
+                }
+                if (r == 0)
+                        break;
+
+                if (ifname_valid_full(name, ltype)) {
+                        r = strv_consume(&accepted, TAKE_PTR(name));
+                        if (r < 0)
+                                return log_oom();
+                } else
+                        log_syntax(unit, LOG_WARNING, filename, line, 0,
+                                   "Interface name is not valid or too long, ignoring assignment: %s",
+                                   name);
+        }
+
+        /* Names are collected first and only merged into the destination once the whole line has
+         * been processed. */
+        r = strv_extend_strv(target, accepted, true);
+        if (r < 0)
+                return log_oom();
+
+        return 0;
+}
+
+/* Parses 'rvalue' with parse_ip_port() into the uint16_t that 'data' points to; an empty 'rvalue'
+ * stores 0. Parse failures are logged and leave the destination untouched. Always returns 0. */
+int config_parse_ip_port(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        uint16_t *target = ASSERT_PTR(data);
+        uint16_t parsed;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                *target = 0;
+                return 0;
+        }
+
+        r = parse_ip_port(rvalue, &parsed);
+        if (r >= 0)
+                *target = parsed;
+        else
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse port '%s'.", rvalue);
+
+        return 0;
+}
+
+/* Parses 'rvalue' with parse_mtu() for the address family given in 'ltype', directly into the uint32_t
+ * that 'data' points to. Out-of-range and otherwise unparsable values are logged and ignored. Always
+ * returns 0. */
+int config_parse_mtu(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        uint32_t *target = ASSERT_PTR(data);
+        int r;
+
+        assert(rvalue);
+
+        r = parse_mtu(ltype, rvalue, target);
+        if (r >= 0)
+                return 0;
+
+        if (r == -ERANGE)
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Maximum transfer unit (MTU) value out of range. Permitted range is %" PRIu32 "…%" PRIu32 ", ignoring: %s",
+                           (uint32_t) (ltype == AF_INET6 ? IPV6_MIN_MTU : IPV4_MIN_MTU), (uint32_t) UINT32_MAX,
+                           rvalue);
+        else
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to parse MTU value '%s', ignoring: %m", rvalue);
+
+        return 0;
+}
+
+/* Parses 'rvalue' with rlimit_parse() for the resource 'ltype' and stores the result in slot 'ltype'
+ * of the array of struct rlimit pointers that 'data' points to, allocating the slot if it is still
+ * NULL. Parse failures are logged and ignored. */
+int config_parse_rlimit(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        struct rlimit **limits = data, parsed = {};
+        int r;
+
+        assert(rvalue);
+        assert(limits);
+
+        r = rlimit_parse(ltype, rvalue, &parsed);
+        if (r == -EILSEQ) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Soft resource limit chosen higher than hard limit, ignoring: %s", rvalue);
+                return 0;
+        }
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse resource value, ignoring: %s", rvalue);
+                return 0;
+        }
+
+        if (!limits[ltype]) {
+                limits[ltype] = newdup(struct rlimit, &parsed, 1);
+                if (!limits[ltype])
+                        return log_oom();
+        } else
+                *limits[ltype] = parsed;
+
+        return 0;
+}
+
+/* Parses 'rvalue' with parse_permille() into the unsigned that 'data' points to. Parse failures are
+ * logged and ignored. Always returns 0. */
+int config_parse_permille(
+                const char* unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        unsigned *target = ASSERT_PTR(data);
+        int parsed;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        parsed = parse_permille(rvalue);
+        if (parsed >= 0)
+                *target = (unsigned) parsed;
+        else
+                log_syntax(unit, LOG_WARNING, filename, line, parsed,
+                           "Failed to parse permille value, ignoring: %s", rvalue);
+
+        return 0;
+}
+
+/* Parses a VLAN protocol name into the int that 'data' points to: "802.1ad"/"802.1AD" is stored as
+ * ETH_P_8021AD, "802.1q"/"802.1Q" as ETH_P_8021Q, and an empty (or NULL) 'rvalue' as -1. Any other
+ * string is logged and ignored. Always returns 0.
+ *
+ * 'data' is written to on every successful path, hence it is asserted to be non-NULL up front, like
+ * the other parsers in this file do. */
+int config_parse_vlanprotocol(
+                const char* unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        int *vlan_protocol = ASSERT_PTR(data);
+
+        assert(filename);
+        assert(lvalue);
+
+        if (isempty(rvalue)) {
+                *vlan_protocol = -1;
+                return 0;
+        }
+
+        if (STR_IN_SET(rvalue, "802.1ad", "802.1AD"))
+                *vlan_protocol = ETH_P_8021AD;
+        else if (STR_IN_SET(rvalue, "802.1q", "802.1Q"))
+                *vlan_protocol = ETH_P_8021Q;
+        else
+                log_syntax(unit, LOG_WARNING, filename, line, 0,
+                           "Failed to parse VLAN protocol value, ignoring: %s", rvalue);
+
+        return 0;
+}
+
+/* Parses 'rvalue' with parse_hw_addr_full() (expected length passed in 'ltype') into the struct
+ * hw_addr_data that 'data' points to; an empty 'rvalue' stores HW_ADDR_NULL. Invalid addresses are
+ * logged and leave the destination untouched. Always returns 0. */
+int config_parse_hw_addr(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        struct hw_addr_data parsed, *target = ASSERT_PTR(data);
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                *target = HW_ADDR_NULL;
+                return 0;
+        }
+
+        r = parse_hw_addr_full(rvalue, ltype, &parsed);
+        if (r >= 0)
+                *target = parsed;
+        else
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Not a valid hardware address, ignoring assignment: %s", rvalue);
+
+        return 0;
+}
+
+/* Splits 'rvalue' into words, parses each with parse_hw_addr_full() and adds the results to the Set
+ * that 'data' points to. Invalid words are logged and skipped; a syntax error ends processing of the
+ * line, keeping what was added before. */
+int config_parse_hw_addrs(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        Set **target = ASSERT_PTR(data);
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* Empty assignment resets the list */
+                *target = set_free(*target);
+                return 0;
+        }
+
+        for (const char *cursor = rvalue;;) {
+                _cleanup_free_ char *token = NULL;
+                _cleanup_free_ struct hw_addr_data *addr = NULL;
+
+                r = extract_first_word(&cursor, &token, NULL, 0);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Invalid syntax, ignoring: %s", rvalue);
+                        return 0;
+                }
+                if (r == 0) /* end of input */
+                        return 0;
+
+                addr = new(struct hw_addr_data, 1);
+                if (!addr)
+                        return log_oom();
+
+                r = parse_hw_addr_full(token, ltype, addr);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Not a valid hardware address, ignoring: %s", token);
+                        continue;
+                }
+
+                r = set_ensure_consume(target, &hw_addr_hash_ops_free, TAKE_PTR(addr));
+                if (r < 0)
+                        return log_oom();
+        }
+}
+
+/* Parses 'rvalue' with parse_ether_addr() into a newly allocated struct ether_addr, which replaces the
+ * one that 'data' points to; an empty 'rvalue' frees it and stores NULL. Invalid addresses are logged
+ * and ignored. */
+int config_parse_ether_addr(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        _cleanup_free_ struct ether_addr *parsed = NULL;
+        struct ether_addr **target = ASSERT_PTR(data);
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                *target = mfree(*target);
+                return 0;
+        }
+
+        parsed = new0(struct ether_addr, 1);
+        if (!parsed)
+                return log_oom();
+
+        r = parse_ether_addr(rvalue, parsed);
+        if (r >= 0)
+                free_and_replace(*target, parsed);
+        else
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Not a valid MAC address, ignoring assignment: %s", rvalue);
+
+        return 0;
+}
+
+/* Splits 'rvalue' into words, parses each with parse_ether_addr() and adds the results to the Set that
+ * 'data' points to. Invalid words are logged and skipped; a syntax error ends processing of the line,
+ * keeping what was added before. */
+int config_parse_ether_addrs(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        Set **target = ASSERT_PTR(data);
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* Empty assignment resets the list */
+                *target = set_free(*target);
+                return 0;
+        }
+
+        for (const char *cursor = rvalue;;) {
+                _cleanup_free_ char *token = NULL;
+                _cleanup_free_ struct ether_addr *addr = NULL;
+
+                r = extract_first_word(&cursor, &token, NULL, 0);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Invalid syntax, ignoring: %s", rvalue);
+                        return 0;
+                }
+                if (r == 0) /* end of input */
+                        return 0;
+
+                addr = new(struct ether_addr, 1);
+                if (!addr)
+                        return log_oom();
+
+                r = parse_ether_addr(token, addr);
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Not a valid MAC address, ignoring: %s", token);
+                        continue;
+                }
+
+                r = set_ensure_consume(target, &ether_addr_hash_ops_free, TAKE_PTR(addr));
+                if (r < 0)
+                        return log_oom();
+        }
+}
+
+/* Parses 'rvalue' as IP address of the family given in 'ltype' (AF_INET or AF_INET6) and stores it in
+ * the address that 'data' points to. An empty 'rvalue' zeroes the destination. Unparsable addresses,
+ * and addresses for which in_addr_is_set() is false, are logged and ignored. Always returns 0. */
+int config_parse_in_addr_non_null(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* data must be a pointer to struct in_addr or in6_addr, and the type is determined by ltype. */
+        struct in_addr *ipv4 = ASSERT_PTR(data);
+        struct in6_addr *ipv6 = ASSERT_PTR(data);
+        union in_addr_union parsed;
+        bool is_ipv4;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+        assert(IN_SET(ltype, AF_INET, AF_INET6));
+
+        is_ipv4 = ltype == AF_INET;
+
+        if (isempty(rvalue)) {
+                if (is_ipv4)
+                        *ipv4 = (struct in_addr) {};
+                else
+                        *ipv6 = (struct in6_addr) {};
+                return 0;
+        }
+
+        r = in_addr_from_string(ltype, rvalue, &parsed);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to parse %s=, ignoring assignment: %s", lvalue, rvalue);
+                return 0;
+        }
+
+        if (!in_addr_is_set(ltype, &parsed)) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0,
+                           "%s= cannot be the ANY address, ignoring: %s", lvalue, rvalue);
+                return 0;
+        }
+
+        if (is_ipv4)
+                *ipv4 = parsed.in;
+        else
+                *ipv6 = parsed.in6;
+        return 0;
+}
+
+/* Parses 'value' with safe_atou_bounded() into '*ret', accepting only the range 'min'..'max'. Returns
+ * 1 if a value was stored. On failure a warning is logged; the function then returns 0 if 'ignoring'
+ * is true, and the negative error otherwise. */
+int config_parse_unsigned_bounded(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *name,
+                const char *value,
+                unsigned min,
+                unsigned max,
+                bool ignoring,
+                unsigned *ret) {
+
+        int r;
+
+        assert(filename);
+        assert(name);
+        assert(value);
+        assert(ret);
+
+        r = safe_atou_bounded(value, min, max, ret);
+        if (r >= 0)
+                return 1; /* Return 1 if something was set */
+
+        if (r == -ERANGE)
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Invalid '%s=%s', allowed range is %u..%u%s.",
+                           name, value, min, max, ignoring ? ", ignoring" : "");
+        else
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to parse '%s=%s'%s: %m",
+                           name, value, ignoring ? ", ignoring" : "");
+
+        return ignoring ? 0 : r;
+}
+
+/* Parsers generated from the DEFINE_CONFIG_PARSE()/DEFINE_CONFIG_PARSE_PTR() macros: each line names
+ * the parser to define, the conversion function it uses and the error message for failures. */
+DEFINE_CONFIG_PARSE(config_parse_percent, parse_percent, "Failed to parse percent value");
+DEFINE_CONFIG_PARSE(config_parse_permyriad, parse_permyriad, "Failed to parse permyriad value");
+DEFINE_CONFIG_PARSE_PTR(config_parse_sec_fix_0, parse_sec_fix_0, usec_t, "Failed to parse time value");
diff --git a/src/shared/conf-parser.h b/src/shared/conf-parser.h
new file mode 100644
index 0000000..a1768cd
--- /dev/null
+++ b/src/shared/conf-parser.h
@@ -0,0 +1,481 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <syslog.h>
+#include <sys/stat.h>
+
+#include "alloc-util.h"
+#include "hashmap.h"
+#include "log.h"
+#include "macro.h"
+#include "time-util.h"
+
+/* An abstract parser for simple, line based, shallow configuration files consisting of variable assignments only. */
+
+typedef enum ConfigParseFlags {
+ CONFIG_PARSE_RELAXED = 1 << 0, /* Do not warn about unknown non-extension fields */
+ CONFIG_PARSE_WARN = 1 << 1, /* Emit non-debug messages */
+} ConfigParseFlags;
+
+/* Argument list for parsers of specific configuration settings. */
+#define CONFIG_PARSER_ARGUMENTS \
+ const char *unit, \
+ const char *filename, \
+ unsigned line, \
+ const char *section, \
+ unsigned section_line, \
+ const char *lvalue, \
+ int ltype, \
+ const char *rvalue, \
+ void *data, \
+ void *userdata
+
+/* Prototype for a parser for a specific configuration setting */
+typedef int (*ConfigParserCallback)(CONFIG_PARSER_ARGUMENTS);
+
+/* A macro declaring a function prototype, following the typedef above, simply because it's so cumbersomely long
+ * otherwise. (And current emacs gets irritatingly slow when editing files that contain lots of very long function
+ * prototypes on the same screen…) */
+#define CONFIG_PARSER_PROTOTYPE(name) int name(CONFIG_PARSER_ARGUMENTS)
+
+/* Wraps information for parsing a specific configuration variable, to
+ * be stored in a simple array */
+typedef struct ConfigTableItem {
+ const char *section; /* Section */
+ const char *lvalue; /* Name of the variable */
+ ConfigParserCallback parse; /* Function that is called to parse the variable's value */
+ int ltype; /* Distinguish different variables passed to the same callback */
+ void *data; /* Where to store the variable's data */
+} ConfigTableItem;
+
+/* Wraps information for parsing a specific configuration variable, to
+ * be stored in a gperf perfect hashtable */
+typedef struct ConfigPerfItem {
+ const char *section_and_lvalue; /* Section + "." + name of the variable */
+ ConfigParserCallback parse; /* Function that is called to parse the variable's value */
+ int ltype; /* Distinguish different variables passed to the same callback */
+ size_t offset; /* Offset where to store data, from the beginning of userdata */
+} ConfigPerfItem;
+
+/* Prototype for a low-level gperf lookup function */
+typedef const ConfigPerfItem* (*ConfigPerfItemLookup)(const char *section_and_lvalue, GPERF_LEN_TYPE length);
+
+/* Prototype for a generic high-level lookup function */
+typedef int (*ConfigItemLookup)(
+ const void *table,
+ const char *section,
+ const char *lvalue,
+ ConfigParserCallback *ret_func,
+ int *ret_ltype,
+ void **ret_data,
+ void *userdata);
+
+/* Linear table search implementation of ConfigItemLookup, based on
+ * ConfigTableItem arrays */
+int config_item_table_lookup(const void *table, const char *section, const char *lvalue, ConfigParserCallback *ret_func, int *ret_ltype, void **ret_data, void *userdata);
+
+/* gperf implementation of ConfigItemLookup, based on gperf
+ * ConfigPerfItem tables */
+int config_item_perf_lookup(const void *table, const char *section, const char *lvalue, ConfigParserCallback *ret_func, int *ret_ltype, void **ret_data, void *userdata);
+
+int config_parse(
+ const char *unit,
+ const char *filename,
+ FILE *f,
+ const char *sections, /* nulstr */
+ ConfigItemLookup lookup,
+ const void *table,
+ ConfigParseFlags flags,
+ void *userdata,
+ struct stat *ret_stat); /* possibly NULL */
+
+int config_parse_config_file(
+ const char *conf_file,
+ const char *sections, /* nulstr */
+ ConfigItemLookup lookup,
+ const void *table,
+ ConfigParseFlags flags,
+ void *userdata);
+
+int config_parse_many(
+ const char* const* conf_files, /* possibly empty */
+ const char* const* conf_file_dirs,
+ const char *dropin_dirname,
+ const char *root,
+ const char *sections, /* nulstr */
+ ConfigItemLookup lookup,
+ const void *table,
+ ConfigParseFlags flags,
+ void *userdata,
+ Hashmap **ret_stats_by_path, /* possibly NULL */
+ char ***ret_drop_in_files); /* possibly NULL */
+
+int config_get_stats_by_path(
+ const char *suffix,
+ const char *root,
+ unsigned flags,
+ const char* const* dirs,
+ bool check_dropins,
+ Hashmap **ret);
+
+int hashmap_put_stats_by_path(Hashmap **stats_by_path, const char *path, const struct stat *st);
+bool stats_by_path_equal(Hashmap *a, Hashmap *b);
+
+typedef struct ConfigSection {
+ unsigned line;
+ bool invalid;
+ char filename[];
+} ConfigSection;
+
+static inline ConfigSection* config_section_free(ConfigSection *cs) {
+ return mfree(cs);
+}
+DEFINE_TRIVIAL_CLEANUP_FUNC(ConfigSection*, config_section_free);
+
+int config_section_new(const char *filename, unsigned line, ConfigSection **ret);
+extern const struct hash_ops config_section_hash_ops;
+int _hashmap_by_section_find_unused_line(
+ HashmapBase *entries_by_section,
+ const char *filename,
+ unsigned *ret);
+static inline int hashmap_by_section_find_unused_line(
+ Hashmap *entries_by_section,
+ const char *filename,
+ unsigned *ret) {
+ return _hashmap_by_section_find_unused_line(HASHMAP_BASE(entries_by_section), filename, ret);
+}
+static inline int ordered_hashmap_by_section_find_unused_line(
+ OrderedHashmap *entries_by_section,
+ const char *filename,
+ unsigned *ret) {
+ return _hashmap_by_section_find_unused_line(HASHMAP_BASE(entries_by_section), filename, ret);
+}
+
+static inline bool section_is_invalid(ConfigSection *section) {
+        /* Reports the 'invalid' marker of the section; a NULL section is reported as not invalid.
+         * Note that a false return value does _not_ mean the section is valid. */
+
+        return section ? section->invalid : false;
+}
+
+/* Defines free_func##_or_set_invalid() for 'type', plus trivial cleanup helpers for both free_func and the
+ * new function. The generated function marks the object's section as invalid if it has one, and frees
+ * the object with free_func otherwise. It always returns NULL. */
+#define DEFINE_SECTION_CLEANUP_FUNCTIONS(type, free_func) \
+        static inline type* free_func##_or_set_invalid(type *p) { \
+                assert(p); \
+                \
+                if (!p->section) \
+                        free_func(p); \
+                else \
+                        p->section->invalid = true; \
+                \
+                return NULL; \
+        } \
+        DEFINE_TRIVIAL_CLEANUP_FUNC(type*, free_func); \
+        DEFINE_TRIVIAL_CLEANUP_FUNC(type*, free_func##_or_set_invalid);
+
+CONFIG_PARSER_PROTOTYPE(config_parse_int);
+CONFIG_PARSER_PROTOTYPE(config_parse_unsigned);
+CONFIG_PARSER_PROTOTYPE(config_parse_long);
+CONFIG_PARSER_PROTOTYPE(config_parse_uint8);
+CONFIG_PARSER_PROTOTYPE(config_parse_uint16);
+CONFIG_PARSER_PROTOTYPE(config_parse_uint32);
+CONFIG_PARSER_PROTOTYPE(config_parse_int32);
+CONFIG_PARSER_PROTOTYPE(config_parse_uint64);
+CONFIG_PARSER_PROTOTYPE(config_parse_double);
+CONFIG_PARSER_PROTOTYPE(config_parse_iec_size);
+CONFIG_PARSER_PROTOTYPE(config_parse_si_uint64);
+CONFIG_PARSER_PROTOTYPE(config_parse_iec_uint64);
+CONFIG_PARSER_PROTOTYPE(config_parse_iec_uint64_infinity);
+CONFIG_PARSER_PROTOTYPE(config_parse_bool);
+CONFIG_PARSER_PROTOTYPE(config_parse_id128);
+CONFIG_PARSER_PROTOTYPE(config_parse_tristate);
+CONFIG_PARSER_PROTOTYPE(config_parse_string);
+CONFIG_PARSER_PROTOTYPE(config_parse_dns_name);
+CONFIG_PARSER_PROTOTYPE(config_parse_hostname);
+CONFIG_PARSER_PROTOTYPE(config_parse_path);
+CONFIG_PARSER_PROTOTYPE(config_parse_strv);
+CONFIG_PARSER_PROTOTYPE(config_parse_sec);
+CONFIG_PARSER_PROTOTYPE(config_parse_sec_def_infinity);
+CONFIG_PARSER_PROTOTYPE(config_parse_sec_def_unset);
+CONFIG_PARSER_PROTOTYPE(config_parse_nsec);
+CONFIG_PARSER_PROTOTYPE(config_parse_mode);
+CONFIG_PARSER_PROTOTYPE(config_parse_warn_compat);
+CONFIG_PARSER_PROTOTYPE(config_parse_log_facility);
+CONFIG_PARSER_PROTOTYPE(config_parse_log_level);
+CONFIG_PARSER_PROTOTYPE(config_parse_signal);
+CONFIG_PARSER_PROTOTYPE(config_parse_personality);
+CONFIG_PARSER_PROTOTYPE(config_parse_permille);
+CONFIG_PARSER_PROTOTYPE(config_parse_ifname);
+CONFIG_PARSER_PROTOTYPE(config_parse_ifnames);
+CONFIG_PARSER_PROTOTYPE(config_parse_ip_port);
+CONFIG_PARSER_PROTOTYPE(config_parse_mtu);
+CONFIG_PARSER_PROTOTYPE(config_parse_rlimit);
+CONFIG_PARSER_PROTOTYPE(config_parse_vlanprotocol);
+CONFIG_PARSER_PROTOTYPE(config_parse_hw_addr);
+CONFIG_PARSER_PROTOTYPE(config_parse_hw_addrs);
+CONFIG_PARSER_PROTOTYPE(config_parse_ether_addr);
+CONFIG_PARSER_PROTOTYPE(config_parse_ether_addrs);
+CONFIG_PARSER_PROTOTYPE(config_parse_in_addr_non_null);
+CONFIG_PARSER_PROTOTYPE(config_parse_percent);
+CONFIG_PARSER_PROTOTYPE(config_parse_permyriad);
+CONFIG_PARSER_PROTOTYPE(config_parse_pid);
+CONFIG_PARSER_PROTOTYPE(config_parse_sec_fix_0);
+
+typedef enum Disabled {
+ DISABLED_CONFIGURATION,
+ DISABLED_LEGACY,
+ DISABLED_EXPERIMENTAL,
+} Disabled;
+
+typedef enum ConfigParseStringFlags {
+ CONFIG_PARSE_STRING_SAFE = 1 << 0,
+ CONFIG_PARSE_STRING_ASCII = 1 << 1,
+
+ CONFIG_PARSE_STRING_SAFE_AND_ASCII = CONFIG_PARSE_STRING_SAFE | CONFIG_PARSE_STRING_ASCII,
+} ConfigParseStringFlags;
+
+/* Defines config parser 'function' that runs parser(rvalue) and, if the result is non-negative, stores it
+ * in the int that 'data' points to. A negative result is logged as warning (prefixed with 'msg') and the
+ * assignment is ignored. The generated function always returns 0. */
+#define DEFINE_CONFIG_PARSE(function, parser, msg) \
+        CONFIG_PARSER_PROTOTYPE(function) { \
+                int *target = data, v; \
+                \
+                assert(filename); \
+                assert(lvalue); \
+                assert(rvalue); \
+                assert(data); \
+                \
+                v = parser(rvalue); \
+                if (v >= 0) \
+                        *target = v; \
+                else \
+                        log_syntax(unit, LOG_WARNING, filename, line, v, \
+                                   msg ", ignoring: %s", rvalue); \
+                \
+                return 0; \
+        }
+
+/* Defines config parser 'function' that calls parser(rvalue, data), where 'data' is interpreted as a
+ * 'type*' output argument. A negative result is logged as warning (prefixed with 'msg'). The generated
+ * function always returns 0. */
+#define DEFINE_CONFIG_PARSE_PTR(function, parser, type, msg) \
+        CONFIG_PARSER_PROTOTYPE(function) { \
+                type *out = ASSERT_PTR(data); \
+                int k; \
+                \
+                assert(filename); \
+                assert(lvalue); \
+                assert(rvalue); \
+                \
+                k = parser(rvalue, out); \
+                if (k >= 0) \
+                        return 0; \
+                \
+                log_syntax(unit, LOG_WARNING, filename, line, k, \
+                           msg ", ignoring: %s", rvalue); \
+                return 0; \
+        }
+
+/* Defines config parser 'function' that translates rvalue with from_string() and stores the result in the
+ * 'type' object that 'data' points to. A negative result is logged as warning (prefixed with 'msg') and
+ * the assignment is ignored. The generated function always returns 0. */
+#define DEFINE_CONFIG_PARSE_ENUM_FULL(function, from_string, type, msg) \
+        CONFIG_PARSER_PROTOTYPE(function) { \
+                type *dest = data, value; \
+                \
+                assert(filename); \
+                assert(lvalue); \
+                assert(rvalue); \
+                assert(data); \
+                \
+                value = from_string(rvalue); \
+                if (value >= 0) \
+                        *dest = value; \
+                else \
+                        log_syntax(unit, LOG_WARNING, filename, line, value, \
+                                   msg ", ignoring: %s", rvalue); \
+                \
+                return 0; \
+        }
+
+/* Same as DEFINE_CONFIG_PARSE_ENUM_FULL(), using name##_from_string() as translation function. */
+#define DEFINE_CONFIG_PARSE_ENUM(function, name, type, msg) \
+        DEFINE_CONFIG_PARSE_ENUM_FULL(function, name##_from_string, type, msg)
+
+/* Defines config parser 'function' for an enum setting: an empty rvalue selects 'default_value', anything
+ * else is translated with name##_from_string(). A negative translation result is logged as warning
+ * (prefixed with 'msg') and the assignment is ignored. The generated function always returns 0. */
+#define DEFINE_CONFIG_PARSE_ENUM_WITH_DEFAULT(function, name, type, default_value, msg) \
+        CONFIG_PARSER_PROTOTYPE(function) { \
+                type *dest = data, parsed; \
+                \
+                assert(filename); \
+                assert(lvalue); \
+                assert(rvalue); \
+                assert(data); \
+                \
+                if (isempty(rvalue)) \
+                        parsed = default_value; \
+                else { \
+                        parsed = name##_from_string(rvalue); \
+                        if (parsed < 0) { \
+                                log_syntax(unit, LOG_WARNING, filename, line, parsed, \
+                                           msg ", ignoring: %s", rvalue); \
+                                return 0; \
+                        } \
+                } \
+                \
+                *dest = parsed; \
+                return 0; \
+        }
+
+/* Defines config parser 'function' for a list of enum values. Each word extracted from rvalue is
+ * translated with name##_from_string(); the results are collected in a freshly allocated array that is
+ * terminated by 'invalid' and finally replaces *data (interpreted as 'type**') via free_and_replace().
+ *
+ * Words that fail to translate are skipped with a warning, repeated values with a notice. An empty rvalue
+ * hence results in an array that holds only the 'invalid' terminator. If splitting rvalue fails the whole
+ * assignment is ignored and *data is left untouched. Returns -ENOMEM/log_oom() on allocation failure. */
+#define DEFINE_CONFIG_PARSE_ENUMV(function, name, type, invalid, msg) \
+        CONFIG_PARSER_PROTOTYPE(function) { \
+                type **enums = ASSERT_PTR(data); \
+                _cleanup_free_ type *xs = NULL; \
+                size_t i = 0; \
+                int r; \
+                \
+                assert(filename); \
+                assert(lvalue); \
+                assert(rvalue); \
+                \
+                /* Start out with an empty list, i.e. just the terminator. */ \
+                xs = new0(type, 1); \
+                if (!xs) \
+                        return -ENOMEM; \
+                \
+                *xs = invalid; \
+                \
+                for (const char *p = rvalue;;) { \
+                        _cleanup_free_ char *en = NULL; \
+                        type x, *new_xs; \
+                        \
+                        r = extract_first_word(&p, &en, NULL, 0); \
+                        if (r == -ENOMEM) \
+                                return log_oom(); \
+                        if (r < 0) { \
+                                /* 'en' is still NULL here unless the callee assigned it, hence log \
+                                 * the full value instead of passing a possibly-NULL pointer to %s. */ \
+                                log_syntax(unit, LOG_WARNING, filename, line, r, \
+                                           msg ", ignoring: %s", rvalue); \
+                                return 0; \
+                        } \
+                        if (r == 0) \
+                                break; \
+                        \
+                        x = name##_from_string(en); \
+                        if (x < 0) { \
+                                log_syntax(unit, LOG_WARNING, filename, line, x, \
+                                           msg ", ignoring: %s", en); \
+                                continue; \
+                        } \
+                        \
+                        /* Drop values we already collected; x is reset to 'invalid' to mark that. */ \
+                        for (type *ys = xs; x != invalid && *ys != invalid; ys++) \
+                                if (*ys == x) { \
+                                        log_syntax(unit, LOG_NOTICE, filename, line, 0, \
+                                                   "Duplicate entry, ignoring: %s", \
+                                                   en); \
+                                        x = invalid; \
+                                } \
+                        \
+                        if (x == invalid) \
+                                continue; \
+                        \
+                        /* Store the value in the slot of the old terminator, then grow the array by \
+                         * one element to make room for the new terminator. */ \
+                        *(xs + i) = x; \
+                        new_xs = realloc(xs, (++i + 1) * sizeof(type)); \
+                        if (new_xs) \
+                                xs = new_xs; \
+                        else \
+                                return log_oom(); \
+                        \
+                        *(xs + i) = invalid; \
+                } \
+                \
+                return free_and_replace(*enums, xs); \
+        }
+
+int config_parse_unsigned_bounded(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *name,
+ const char *value,
+ unsigned min,
+ unsigned max,
+ bool ignoring,
+ unsigned *ret);
+
+/* uint32_t flavour of config_parse_unsigned_bounded(): *ret is only written (and 1 returned) if the
+ * wrapped call returned a positive value; any other return value is passed through unchanged. */
+static inline int config_parse_uint32_bounded(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *name,
+                const char *value,
+                uint32_t min,
+                uint32_t max,
+                bool ignoring,
+                uint32_t *ret) {
+
+        unsigned parsed;
+        int r = config_parse_unsigned_bounded(
+                        unit, filename, line, section, section_line, name, value,
+                        min, max, ignoring,
+                        &parsed);
+
+        if (r > 0) {
+                assert(parsed <= UINT32_MAX);
+                *ret = parsed;
+                return 1;
+        }
+
+        return r;
+}
+
+/* uint16_t flavour of config_parse_unsigned_bounded(): *ret is only written (and 1 returned) if the
+ * wrapped call returned a positive value; any other return value is passed through unchanged. */
+static inline int config_parse_uint16_bounded(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *name,
+                const char *value,
+                uint16_t min,
+                uint16_t max,
+                bool ignoring,
+                uint16_t *ret) {
+
+        unsigned parsed;
+        int r = config_parse_unsigned_bounded(
+                        unit, filename, line, section, section_line, name, value,
+                        min, max, ignoring,
+                        &parsed);
+
+        if (r > 0) {
+                assert(parsed <= UINT16_MAX);
+                *ret = parsed;
+                return 1;
+        }
+
+        return r;
+}
+
+/* uint8_t flavour of config_parse_unsigned_bounded(): *ret is only written (and 1 returned) if the
+ * wrapped call returned a positive value; any other return value is passed through unchanged. */
+static inline int config_parse_uint8_bounded(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *name,
+                const char *value,
+                uint8_t min,
+                uint8_t max,
+                bool ignoring,
+                uint8_t *ret) {
+
+        unsigned parsed;
+        int r = config_parse_unsigned_bounded(
+                        unit, filename, line, section, section_line, name, value,
+                        min, max, ignoring,
+                        &parsed);
+
+        if (r > 0) {
+                assert(parsed <= UINT8_MAX);
+                *ret = parsed;
+                return 1;
+        }
+
+        return r;
+}
diff --git a/src/shared/copy.c b/src/shared/copy.c
new file mode 100644
index 0000000..c0e30cd
--- /dev/null
+++ b/src/shared/copy.c
@@ -0,0 +1,1635 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/btrfs.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/file.h>
+#include <sys/ioctl.h>
+#include <sys/sendfile.h>
+#include <sys/xattr.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "btrfs-util.h"
+#include "chattr-util.h"
+#include "copy.h"
+#include "dirent-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "fs-util.h"
+#include "io-util.h"
+#include "macro.h"
+#include "missing_fs.h"
+#include "missing_syscall.h"
+#include "mkdir-label.h"
+#include "mountpoint-util.h"
+#include "nulstr-util.h"
+#include "rm-rf.h"
+#include "selinux-util.h"
+#include "signal-util.h"
+#include "stat-util.h"
+#include "stdio-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "sync-util.h"
+#include "time-util.h"
+#include "tmpfile-util.h"
+#include "umask-util.h"
+#include "user-util.h"
+#include "xattr-util.h"
+
+#define COPY_BUFFER_SIZE (16U*1024U)
+
+/* A safety net for descending recursively into file system trees to copy. On Linux PATH_MAX is 4096, which means the
+ * deepest valid directory nesting one can build is around 2048 levels, which we hence use as a safety net here, to
+ * not spin endlessly in case of bind mount cycles and suchlike. */
+#define COPY_DEPTH_MAX 2048U
+
+/* Wrapper around copy_file_range() that remembers whether the call is available: once it failed with
+ * ENOSYS on its first invocation, all later calls return -ENOSYS without trying again. Returns the number
+ * of bytes copied, or a negative errno-style error. */
+static ssize_t try_copy_file_range(
+                int fd_in, loff_t *off_in,
+                int fd_out, loff_t *off_out,
+                size_t len,
+                unsigned flags) {
+
+        static int supported = -1; /* < 0: not probed yet, 0: unavailable, > 0: available */
+        ssize_t copied;
+
+        if (supported == 0)
+                return -ENOSYS;
+
+        copied = copy_file_range(fd_in, off_in, fd_out, off_out, len, flags);
+        if (copied >= 0) {
+                if (supported < 0)
+                        supported = 1;
+
+                return copied;
+        }
+
+        /* Any failure other than ENOSYS still proves that the call itself exists. */
+        if (supported < 0)
+                supported = errno != ENOSYS;
+
+        return -errno;
+}
+
+enum { /* Non-error results of fd_is_nonblock_pipe() */
+ FD_IS_NO_PIPE, /* The fd does not refer to a FIFO */
+ FD_IS_BLOCKING_PIPE, /* The fd refers to a FIFO that has O_NONBLOCK unset */
+ FD_IS_NONBLOCKING_PIPE, /* The fd refers to a FIFO that has O_NONBLOCK set */
+};
+
+static int fd_is_nonblock_pipe(int fd) {
+        struct stat sb;
+        int fl;
+
+        /* Classifies the file descriptor: returns FD_IS_NO_PIPE if it is not a FIFO, and otherwise
+         * FD_IS_NONBLOCKING_PIPE or FD_IS_BLOCKING_PIPE depending on whether O_NONBLOCK is set. Returns a
+         * negative errno-style error if fstat() or fcntl() fail. */
+
+        if (fstat(fd, &sb) < 0)
+                return -errno;
+
+        if (S_ISFIFO(sb.st_mode)) {
+                fl = fcntl(fd, F_GETFL);
+                if (fl < 0)
+                        return -errno;
+
+                if (FLAGS_SET(fl, O_NONBLOCK))
+                        return FD_IS_NONBLOCKING_PIPE;
+
+                return FD_IS_BLOCKING_PIPE;
+        }
+
+        return FD_IS_NO_PIPE;
+}
+
+/* If COPY_SIGINT and/or COPY_SIGTERM are set in copy_flags, checks via pop_pending_signal() whether one
+ * of the selected signals is pending. Returns -EINTR (after logging at debug level) if so, a negative
+ * error if the check itself failed, and 0 otherwise. */
+static int look_for_signals(CopyFlags copy_flags) {
+        int sig;
+
+        if (!(copy_flags & (COPY_SIGINT|COPY_SIGTERM)))
+                return 0;
+
+        sig = pop_pending_signal(copy_flags & COPY_SIGINT ? SIGINT : 0,
+                                 copy_flags & COPY_SIGTERM ? SIGTERM : 0);
+        if (sig < 0)
+                return sig;
+        if (sig == 0)
+                return 0;
+
+        return log_debug_errno(SYNTHETIC_ERRNO(EINTR),
+                               "Got %s, cancelling copy operation.", signal_to_string(sig));
+}
+
+/* Creates a hole of 'size' bytes in 'fd', starting at the current file offset, and leaves the file offset
+ * right behind the hole. Returns 0 on success and a negative errno-style error otherwise. */
+static int create_hole(int fd, off_t size) {
+        off_t pos, eof, avail;
+
+        pos = lseek(fd, 0, SEEK_CUR);
+        if (pos < 0)
+                return -errno;
+
+        eof = lseek(fd, 0, SEEK_END);
+        if (eof < 0)
+                return -errno;
+
+        /* Number of bytes between the original offset and the current end of the file. */
+        avail = eof - pos;
+
+        /* If we're not at the end of the target file, try to punch a hole in the existing space using
+         * fallocate(). File systems that do not support this are tolerated. */
+        if (pos < eof) {
+                if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, pos, MIN(size, avail)) < 0 &&
+                    !ERRNO_IS_NOT_SUPPORTED(errno))
+                        return -errno;
+        }
+
+        if (avail < size) {
+                /* The existing space did not suffice for the full hole: use ftruncate() to grow the file
+                 * (and the hole) to the required size and move the file pointer to the end of the file. */
+                if (ftruncate(fd, eof + (size - avail)) < 0)
+                        return -errno;
+
+                if (lseek(fd, 0, SEEK_END) < 0)
+                        return -errno;
+
+                return 0;
+        }
+
+        /* The full hole fits into the existing file: set the file pointer to the end of the hole. */
+        if (lseek(fd, pos + size, SEEK_SET) < 0)
+                return -errno;
+
+        return 0;
+}
+
+int copy_bytes_full(
+ int fdf, int fdt,
+ uint64_t max_bytes,
+ CopyFlags copy_flags,
+ void **ret_remains,
+ size_t *ret_remains_size,
+ copy_progress_bytes_t progress,
+ void *userdata) {
+
+ _cleanup_close_ int fdf_opened = -EBADF, fdt_opened = -EBADF;
+ bool try_cfr = true, try_sendfile = true, try_splice = true, copied_something = false;
+ int r, nonblock_pipe = -1;
+ size_t m = SSIZE_MAX; /* that is the maximum that sendfile and c_f_r accept */
+
+ assert(fdf >= 0);
+ assert(fdt >= 0);
+ assert(!FLAGS_SET(copy_flags, COPY_LOCK_BSD));
+
+ /* Tries to copy bytes from the file descriptor 'fdf' to 'fdt' in the smartest possible way. Copies a maximum
+ * of 'max_bytes', which may be specified as UINT64_MAX, in which case no maximum is applied. Returns negative
+ * on error, zero if EOF is hit before the bytes limit is hit and positive otherwise. If the copy fails for
+ * some reason but we read but didn't yet write some data and ret_remains/ret_remains_size is not NULL, then
+ * it will be initialized with an allocated buffer containing this "remaining" data. Note that these two
+ * parameters are initialized with a valid buffer only on failure and only if there's actually data already
+ * read. Otherwise, if non-NULL, *ret_remains is set to NULL and *ret_remains_size to 0.
+ *
+ * The following mechanisms are tried, in this order: a reflink (only if COPY_REFLINK is set),
+ * copy_file_range(), sendfile(), splice() and finally a plain read()/write() loop. If 'progress' is
+ * non-NULL it is invoked with the number of bytes copied and 'userdata' after each copied chunk; a negative
+ * return value aborts the copy and is returned as is. If COPY_TRUNCATE is set the destination is truncated
+ * at its file offset once copying is complete. */
+
+ if (ret_remains)
+ *ret_remains = NULL;
+ if (ret_remains_size)
+ *ret_remains_size = 0;
+
+ fdf = fd_reopen_condition(fdf, O_CLOEXEC | O_NOCTTY | O_RDONLY, O_PATH, &fdf_opened);
+ if (fdf < 0)
+ return fdf;
+ fdt = fd_reopen_condition(fdt, O_CLOEXEC | O_NOCTTY | O_RDWR, O_PATH, &fdt_opened);
+ if (fdt < 0)
+ return fdt;
+
+ /* Try btrfs reflinks first. This only works on regular, seekable files, hence let's check the file offsets of
+ * source and destination first. */
+ if ((copy_flags & COPY_REFLINK)) {
+ off_t foffset;
+
+ foffset = lseek(fdf, 0, SEEK_CUR);
+ if (foffset >= 0) {
+ off_t toffset;
+
+ toffset = lseek(fdt, 0, SEEK_CUR);
+ if (toffset >= 0) {
+
+ if (foffset == 0 && toffset == 0 && max_bytes == UINT64_MAX)
+ r = reflink(fdf, fdt); /* full file reflink */
+ else
+ r = reflink_range(fdf, foffset, fdt, toffset, max_bytes == UINT64_MAX ? 0 : max_bytes); /* partial reflink */
+ if (r >= 0) {
+ off_t t;
+
+ /* This worked, yay! Now — to be fully correct — let's adjust the file pointers */
+ if (max_bytes == UINT64_MAX) {
+
+ /* We cloned to the end of the source file, let's position the read
+ * pointer there, and query it at the same time. */
+ t = lseek(fdf, 0, SEEK_END);
+ if (t < 0)
+ return -errno;
+ if (t < foffset)
+ return -ESPIPE;
+
+ /* Let's adjust the destination file write pointer by the same number
+ * of bytes. */
+ t = lseek(fdt, toffset + (t - foffset), SEEK_SET);
+ if (t < 0)
+ return -errno;
+
+ return 0; /* we copied the whole thing, hence hit EOF, return 0 */
+ } else {
+ t = lseek(fdf, foffset + max_bytes, SEEK_SET);
+ if (t < 0)
+ return -errno;
+
+ t = lseek(fdt, toffset + max_bytes, SEEK_SET);
+ if (t < 0)
+ return -errno;
+
+ return 1; /* we copied only some number of bytes, which worked, but this means we didn't hit EOF, return 1 */
+ }
+ }
+ }
+ }
+ }
+
+ for (;;) {
+ ssize_t n;
+
+ if (max_bytes <= 0)
+ break;
+
+ r = look_for_signals(copy_flags);
+ if (r < 0)
+ return r;
+
+ if (max_bytes != UINT64_MAX && m > max_bytes)
+ m = max_bytes;
+
+ if (copy_flags & COPY_HOLES) {
+ off_t c, e;
+
+ c = lseek(fdf, 0, SEEK_CUR);
+ if (c < 0)
+ return -errno;
+
+ /* To see if we're in a hole, we search for the next data offset. */
+ e = lseek(fdf, c, SEEK_DATA);
+ if (e < 0 && errno == ENXIO)
+ /* If errno == ENXIO, that means we've reached the final hole of the file and
+ * that hole isn't followed by more data. */
+ e = lseek(fdf, 0, SEEK_END);
+ if (e < 0)
+ return -errno;
+
+ /* If we're in a hole (current offset is not a data offset), create a hole of the
+ * same size in the target file. */
+ if (e > c) {
+ /* Make sure our new hole doesn't go over the maximum size we're allowed to copy. */
+ n = MIN(max_bytes, (uint64_t) e - c);
+ r = create_hole(fdt, n);
+ if (r < 0)
+ return r;
+
+ /* Make sure holes are taken into account in the maximum size we're supposed to copy. */
+ if (max_bytes != UINT64_MAX) {
+ max_bytes -= n;
+ if (max_bytes <= 0)
+ break;
+ }
+
+ /* Update the size we're supposed to copy in this iteration if needed. */
+ if (m > max_bytes)
+ m = max_bytes;
+ }
+
+ c = e; /* Set c to the start of the data segment. */
+
+ /* After copying a potential hole, find the end of the data segment by looking for
+ * the next hole. If we get ENXIO, we're at EOF. */
+ e = lseek(fdf, c, SEEK_HOLE);
+ if (e < 0) {
+ if (errno == ENXIO)
+ break;
+ return -errno;
+ }
+
+ /* SEEK_HOLE modifies the file offset so we need to move back to the initial offset. */
+ if (lseek(fdf, c, SEEK_SET) < 0)
+ return -errno;
+
+ /* Make sure we're not copying more than the current data segment. */
+ m = MIN(m, (size_t) e - c);
+ }
+
+ /* First try copy_file_range(), unless we already tried */
+ if (try_cfr) {
+ n = try_copy_file_range(fdf, NULL, fdt, NULL, m, 0u);
+ if (n < 0) {
+ if (!IN_SET(n, -EINVAL, -ENOSYS, -EXDEV, -EBADF))
+ return n;
+
+ try_cfr = false;
+ /* use fallback below */
+ } else if (n == 0) { /* likely EOF */
+
+ if (copied_something)
+ break;
+
+ /* So, we hit EOF immediately, without having copied a single byte. This
+ * could indicate two things: the file is actually empty, or we are on some
+ * virtual file system such as procfs/sysfs where the syscall actually
+ * doesn't work but doesn't return an error. Try to handle that, by falling
+ * back to simple read()s in case we encounter empty files.
+ *
+ * See: https://lwn.net/Articles/846403/ */
+ try_cfr = try_sendfile = try_splice = false;
+ } else
+ /* Success! */
+ goto next;
+ }
+
+ /* Then try sendfile(), unless we already tried */
+ if (try_sendfile) {
+ n = sendfile(fdt, fdf, NULL, m);
+ if (n < 0) {
+ if (!IN_SET(errno, EINVAL, ENOSYS))
+ return -errno;
+
+ try_sendfile = false;
+ /* use fallback below */
+ } else if (n == 0) { /* likely EOF */
+
+ if (copied_something)
+ break;
+
+ try_sendfile = try_splice = false; /* same logic as above for copy_file_range() */
+ } else
+ /* Success! */
+ goto next;
+ }
+
+ /* Then try splice, unless we already tried. */
+ if (try_splice) {
+
+ /* splice()'s asynchronous I/O support is a bit weird. When it encounters a pipe file
+ * descriptor, then it will ignore its O_NONBLOCK flag and instead only honour the
+ * SPLICE_F_NONBLOCK flag specified in its flag parameter. Let's hide this behaviour
+ * here, and check if either of the specified fds are a pipe, and if so, let's pass
+ * the flag automatically, depending on O_NONBLOCK being set.
+ *
+ * Here's a twist though: when we use it to move data between two pipes of which one
+ * has O_NONBLOCK set and the other has not, then we have no individual control over
+ * O_NONBLOCK behaviour. Hence in that case we can't use splice() and still guarantee
+ * systematic O_NONBLOCK behaviour, hence don't. */
+
+ if (nonblock_pipe < 0) {
+ int a, b;
+
+ /* Check if either of these fds is a pipe, and if so non-blocking or not */
+ a = fd_is_nonblock_pipe(fdf);
+ if (a < 0)
+ return a;
+
+ b = fd_is_nonblock_pipe(fdt);
+ if (b < 0)
+ return b;
+
+ if ((a == FD_IS_NO_PIPE && b == FD_IS_NO_PIPE) ||
+ (a == FD_IS_BLOCKING_PIPE && b == FD_IS_NONBLOCKING_PIPE) ||
+ (a == FD_IS_NONBLOCKING_PIPE && b == FD_IS_BLOCKING_PIPE))
+
+ /* splice() only works if one of the fds is a pipe. If neither is,
+ * let's skip this step right-away. As mentioned above, if one of the
+ * two fds refers to a blocking pipe and the other to a non-blocking
+ * pipe, we can't use splice() either, hence don't try either. This
+ * hence means we can only use splice() if either only one of the two
+ * fds is a pipe, or if both are pipes with the same nonblocking flag
+ * setting. */
+
+ try_splice = false;
+ else
+ nonblock_pipe = a == FD_IS_NONBLOCKING_PIPE || b == FD_IS_NONBLOCKING_PIPE;
+ }
+ }
+
+ if (try_splice) {
+ n = splice(fdf, NULL, fdt, NULL, m, nonblock_pipe ? SPLICE_F_NONBLOCK : 0);
+ if (n < 0) {
+ if (!IN_SET(errno, EINVAL, ENOSYS))
+ return -errno;
+
+ try_splice = false;
+ /* use fallback below */
+ } else if (n == 0) { /* likely EOF */
+
+ if (copied_something)
+ break;
+
+ try_splice = false; /* same logic as above for copy_file_range() + sendfile() */
+ } else
+ /* Success! */
+ goto next;
+ }
+
+ /* As a fallback just copy bits by hand */
+ {
+ uint8_t buf[MIN(m, COPY_BUFFER_SIZE)], *p = buf;
+ ssize_t z;
+
+ n = read(fdf, buf, sizeof buf);
+ if (n < 0)
+ return -errno;
+ if (n == 0) /* EOF */
+ break;
+
+ z = (size_t) n;
+ do {
+ ssize_t k;
+
+ k = write(fdt, p, z);
+ if (k < 0) {
+ r = -errno;
+
+ if (ret_remains) {
+ void *copy;
+
+ copy = memdup(p, z);
+ if (!copy)
+ return -ENOMEM;
+
+ *ret_remains = copy;
+ }
+
+ if (ret_remains_size)
+ *ret_remains_size = z;
+
+ return r;
+ }
+
+ assert(k <= z);
+ z -= k;
+ p += k;
+ } while (z > 0);
+ }
+
+ next:
+ if (progress) {
+ r = progress(n, userdata);
+ if (r < 0)
+ return r;
+ }
+
+ if (max_bytes != UINT64_MAX) {
+ assert(max_bytes >= (uint64_t) n);
+ max_bytes -= n;
+ }
+
+ /* sendfile accepts at most SSIZE_MAX-offset bytes to copy, so reduce our maximum by the
+ * amount we already copied, but don't go below our copy buffer size, unless we are close to the
+ * limit of bytes we are allowed to copy. */
+ m = MAX(MIN(COPY_BUFFER_SIZE, max_bytes), m - n);
+
+ copied_something = true;
+ }
+
+ if (copy_flags & COPY_TRUNCATE) {
+ off_t off = lseek(fdt, 0, SEEK_CUR);
+ if (off < 0)
+ return -errno;
+
+ if (ftruncate(fdt, off) < 0)
+ return -errno;
+ }
+
+ return max_bytes <= 0; /* return 0 if we hit EOF earlier than the size limit */
+}
+
+/* Recreates the symlink 'from' (relative to 'df') as 'to' (relative to 'dt'). Ownership is taken from
+ * 'st' unless overridden by a valid override_uid/override_gid. Copying of extended attributes and
+ * timestamps is best-effort. With COPY_GRACEFUL_WARN a symlinkat() failure due to missing privileges or
+ * missing support is only logged and 0 is returned. Otherwise returns a negative errno-style error if
+ * reading the link, preparing the label, creating the link or changing its ownership failed. */
+static int fd_copy_symlink(
+                int df,
+                const char *from,
+                const struct stat *st,
+                int dt,
+                const char *to,
+                uid_t override_uid,
+                gid_t override_gid,
+                CopyFlags copy_flags) {
+
+        _cleanup_free_ char *link_target = NULL;
+        bool mac_create;
+        uid_t uid;
+        gid_t gid;
+        int r;
+
+        assert(from);
+        assert(st);
+        assert(to);
+
+        r = readlinkat_malloc(df, from, &link_target);
+        if (r < 0)
+                return r;
+
+        mac_create = (copy_flags & COPY_MAC_CREATE) != 0;
+
+        if (mac_create) {
+                r = mac_selinux_create_file_prepare_at(dt, to, S_IFLNK);
+                if (r < 0)
+                        return r;
+        }
+
+        r = RET_NERRNO(symlinkat(link_target, dt, to));
+
+        if (mac_create)
+                mac_selinux_create_file_clear();
+
+        if (r < 0) {
+                if (!FLAGS_SET(copy_flags, COPY_GRACEFUL_WARN))
+                        return r;
+
+                if (!ERRNO_IS_PRIVILEGE(r) && !ERRNO_IS_NOT_SUPPORTED(r))
+                        return r;
+
+                log_notice_errno(r, "Failed to copy symlink '%s', ignoring: %m", from);
+                return 0;
+        }
+
+        uid = uid_is_valid(override_uid) ? override_uid : st->st_uid;
+        gid = gid_is_valid(override_gid) ? override_gid : st->st_gid;
+
+        /* A failure to change ownership is reported to the caller, but the remaining metadata is
+         * still copied. */
+        if (fchownat(dt, to, uid, gid, AT_SYMLINK_NOFOLLOW) < 0)
+                r = -errno;
+
+        (void) copy_xattr(df, from, dt, to, copy_flags);
+        (void) utimensat(dt, to, (struct timespec[]) { st->st_atim, st->st_mtim }, AT_SYMLINK_NOFOLLOW);
+        return r;
+}
+
+/* Encapsulates the database we store potential hardlink targets in */
+typedef struct HardlinkContext {
+ int dir_fd; /* An fd to the directory we use as lookup table. Never AT_FDCWD. Lazily created, when
+ * we add the first entry. */
+
+ /* These two fields are used to create the hardlink repository directory above — via
+ * mkdirat(parent_fd, subdir) — and are kept so that we can automatically remove the directory again
+ * when we are done. */
+ int parent_fd; /* Possibly AT_FDCWD */
+ char *subdir;
+} HardlinkContext;
+
+/* Prepares (but does not yet create) the on-disk hardlink lookup table below 'dt'. Returns 0 if
+ * COPY_HARDLINKS is not set, 1 if the context was configured, and a negative errno-style error
+ * otherwise. */
+static int hardlink_context_setup(
+                HardlinkContext *c,
+                int dt,
+                const char *to,
+                CopyFlags copy_flags) {
+
+        _cleanup_close_ int parent = -EBADF;
+        int r;
+
+        assert(c);
+        assert(c->dir_fd < 0 && c->dir_fd != AT_FDCWD);
+        assert(c->parent_fd < 0);
+        assert(!c->subdir);
+
+        /* If hardlink recreation is requested we have to maintain a database of inodes that are potential
+         * hardlink sources. Given that generally disk sizes have to be assumed to be larger than what fits
+         * into physical RAM we cannot maintain that database in dynamic memory alone. Here we opt to
+         * maintain it on disk, to simplify things: inside the destination directory we'll maintain a
+         * temporary directory consisting of hardlinks of every inode we copied that might be subject of
+         * hardlinks. We can then use that as hardlink source later on. Yes, this means additional disk IO
+         * but thankfully Linux is optimized for this kind of thing. If this ever becomes a performance
+         * bottleneck we can certainly place an in-memory hash table in front of this, but for the beginning,
+         * let's keep things simple, and just use the disk as lookup table for inodes.
+         *
+         * Note that this should have zero performance impact as long as .n_link of all files copied remains
+         * <= 1, because in that case we will not actually allocate the hardlink inode lookup table directory
+         * on disk (we do so lazily, when the first candidate with .n_link > 1 is seen). This means, in the
+         * common case where hardlinks are not used at all or only for few files the fact that we store the
+         * table on disk shouldn't matter performance-wise. */
+
+        if (!FLAGS_SET(copy_flags, COPY_HARDLINKS))
+                return 0;
+
+        if (dt != AT_FDCWD) {
+                if (dt < 0)
+                        return -EBADF;
+
+                /* Keep our own duplicate of the directory fd, so that the context does not depend on
+                 * the caller's fd. */
+                parent = fcntl(dt, F_DUPFD_CLOEXEC, 3);
+                if (parent < 0)
+                        return -errno;
+        } else
+                parent = AT_FDCWD;
+
+        r = tempfn_random_child(to, "hardlink", &c->subdir);
+        if (r < 0)
+                return r;
+
+        c->parent_fd = TAKE_FD(parent);
+
+        /* We don't actually create the directory we keep the table in here, that's done on-demand when the
+         * first entry is added, using hardlink_context_realize() below. */
+        return 1;
+}
+
+/* Creates the hardlink lookup table directory if that has not happened yet. Returns 1 if the directory
+ * is available, 0 if there is no context or it is not configured, and a negative error if creating the
+ * directory failed. */
+static int hardlink_context_realize(HardlinkContext *c) {
+        int fd;
+
+        if (!c)
+                return 0;
+
+        if (c->dir_fd >= 0) /* Already realized */
+                return 1;
+
+        if (c->parent_fd != AT_FDCWD && c->parent_fd < 0) /* Not configured */
+                return 0;
+
+        assert(c->subdir);
+
+        fd = open_mkdir_at(c->parent_fd, c->subdir, O_EXCL|O_CLOEXEC, 0700);
+        c->dir_fd = fd;
+
+        return fd < 0 ? fd : 1;
+}
+
+static void hardlink_context_destroy(HardlinkContext *c) {
+        assert(c);
+
+        /* Automatically remove the hardlink lookup table directory again after we are done. This is used via
+         * _cleanup_() so that we really delete this, even on failure. */
+
+        if (c->dir_fd >= 0) {
+                int fd, r;
+
+                /* <dir_fd> might have already been used for reading, so we need to rewind it. */
+                if (lseek(c->dir_fd, 0, SEEK_SET) < 0)
+                        log_debug_errno(errno, "Failed to lseek on file descriptor, ignoring: %m");
+
+                fd = TAKE_FD(c->dir_fd);
+                r = rm_rf_children(fd, REMOVE_PHYSICAL, NULL); /* consumes dir_fd in all cases, even on failure */
+                if (r < 0)
+                        log_debug_errno(r, "Failed to remove hardlink store (%s) contents, ignoring: %m", c->subdir);
+
+                assert(c->parent_fd >= 0 || c->parent_fd == AT_FDCWD);
+                assert(c->subdir);
+
+                if (unlinkat(c->parent_fd, c->subdir, AT_REMOVEDIR) < 0)
+                        log_debug_errno(errno, "Failed to remove hardlink store (%s) directory, ignoring: %m", c->subdir);
+        }
+
+        /* AT_FDCWD is negative, hence handing it to safe_close() is fine. */
+        assert_cc(AT_FDCWD < 0);
+        c->parent_fd = safe_close(c->parent_fd);
+
+        c->subdir = mfree(c->subdir);
+}
+
+/* Tries to create 'to' (relative to 'dt') as a hardlink to an inode that an earlier copy recorded in
+ * the lookup directory under the name "<major>:<minor>:<inode>".
+ *
+ * Returns 1 if the hardlink was created and 0 in every other case: no context, source not hardlinked,
+ * lookup directory not created yet, or linkat() failed. This function never returns an error. */
+static int try_hardlink(
+                HardlinkContext *c,
+                const struct stat *st,
+                int dt,
+                const char *to) {
+
+        char key[DECIMAL_STR_MAX(dev_t)*2 + DECIMAL_STR_MAX(uint64_t) + 4];
+
+        assert(st);
+        assert(dt >= 0 || dt == AT_FDCWD);
+        assert(to);
+
+        /* Nothing to look up without a context, for inodes with a single link, or while the store
+         * directory has not been created (it is then necessarily empty). */
+        if (!c || st->st_nlink <= 1 || c->dir_fd < 0)
+                return 0;
+
+        xsprintf(key, "%u:%u:%" PRIu64, major(st->st_dev), minor(st->st_dev), (uint64_t) st->st_ino);
+
+        if (linkat(c->dir_fd, key, dt, to, 0) >= 0)
+                return 1;
+
+        /* ENOENT just means this inode has not been recorded yet; anything else is worth a debug line. */
+        if (errno != ENOENT)
+                log_debug_errno(errno, "Failed to hardlink %s to %s, ignoring: %m", key, to);
+
+        return 0;
+}
+
+/* Records the freshly copied inode 'to' (relative to 'dt') in the hardlink lookup directory, under
+ * the name "<major>:<minor>:<inode>" of the *source* inode, so that later copies of the same source
+ * inode can be turned into hardlinks by try_hardlink().
+ *
+ * Returns 1 if the entry was recorded, 0 if there was nothing to do or linkat() failed (logged at
+ * debug level only), and a negative value if the lookup directory could not be created. */
+static int memorize_hardlink(
+                HardlinkContext *c,
+                const struct stat *st,
+                int dt,
+                const char *to) {
+
+        char dev_ino[DECIMAL_STR_MAX(dev_t)*2 + DECIMAL_STR_MAX(uint64_t) + 4];
+        int r;
+
+        assert(st);
+        assert(dt >= 0 || dt == AT_FDCWD);
+        assert(to);
+
+        if (!c) /* No temporary hardlink directory, don't bother */
+                return 0;
+
+        if (st->st_nlink <= 1) /* Source not hardlinked, don't bother */
+                return 0;
+
+        r = hardlink_context_realize(c); /* Create the hardlink store lazily */
+        if (r < 0)
+                return r;
+        if (r == 0) /* Context not configured: there is no store directory, hence no valid dir_fd to link into */
+                return 0;
+
+        xsprintf(dev_ino, "%u:%u:%" PRIu64, major(st->st_dev), minor(st->st_dev), (uint64_t) st->st_ino);
+        if (linkat(dt, to, c->dir_fd, dev_ino, 0) < 0) {
+                log_debug_errno(errno, "Failed to hardlink %s to %s, ignoring: %m", to, dev_ino);
+                return 0;
+        }
+
+        return 1;
+}
+
+static int fd_copy_tree_generic(
+ int df,
+ const char *from,
+ const struct stat *st,
+ int dt,
+ const char *to,
+ dev_t original_device,
+ unsigned depth_left,
+ uid_t override_uid,
+ gid_t override_gid,
+ CopyFlags copy_flags,
+ Hashmap *denylist,
+ Set *subvolumes,
+ HardlinkContext *hardlink_context,
+ const char *display_path,
+ copy_progress_path_t progress_path,
+ copy_progress_bytes_t progress_bytes,
+ void *userdata);
+
+static int fd_copy_regular(
+                int df,
+                const char *from,
+                const struct stat *st,
+                int dt,
+                const char *to,
+                uid_t override_uid,
+                gid_t override_gid,
+                CopyFlags copy_flags,
+                HardlinkContext *hardlink_context,
+                copy_progress_bytes_t progress,
+                void *userdata) {
+        /* Copies the regular file 'from' (relative to 'df') into a newly created file 'to' (relative to
+         * 'dt'); 'st' is the stat data of the source. If the source inode was already copied and recorded
+         * in 'hardlink_context', a hardlink is created instead and 0 is returned.
+         *
+         * Failures of copy_bytes_full(), fsync() or close() remove the target again and are returned.
+         * Failures of fchown()/fchmod() are returned too, but the target is kept. Timestamps and xattrs
+         * are applied on a best-effort basis. */
+        _cleanup_close_ int fdf = -EBADF, fdt = -EBADF;
+        int r, q;
+
+        assert(from);
+        assert(st);
+        assert(to);
+
+        r = try_hardlink(hardlink_context, st, dt, to);
+        if (r < 0)
+                return r;
+        if (r > 0) /* worked! */
+                return 0;
+
+        fdf = openat(df, from, O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW);
+        if (fdf < 0)
+                return -errno;
+
+        if (copy_flags & COPY_MAC_CREATE) {
+                r = mac_selinux_create_file_prepare_at(dt, to, S_IFREG);
+                if (r < 0)
+                        return r;
+        }
+        fdt = openat(dt, to, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW, st->st_mode & 07777);
+        if (copy_flags & COPY_MAC_CREATE) /* clear the label state before looking at the openat() result */
+                mac_selinux_create_file_clear();
+        if (fdt < 0)
+                return -errno;
+
+        r = copy_bytes_full(fdf, fdt, UINT64_MAX, copy_flags, NULL, NULL, progress, userdata);
+        if (r < 0)
+                goto fail;
+
+        if (fchown(fdt,
+                   uid_is_valid(override_uid) ? override_uid : st->st_uid,
+                   gid_is_valid(override_gid) ? override_gid : st->st_gid) < 0)
+                r = -errno; /* remembered, but we carry on with the remaining metadata */
+
+        if (fchmod(fdt, st->st_mode & 07777) < 0)
+                r = -errno;
+
+        (void) futimens(fdt, (struct timespec[]) { st->st_atim, st->st_mtim });
+        (void) copy_xattr(fdf, NULL, fdt, NULL, copy_flags);
+
+        if (copy_flags & COPY_FSYNC) {
+                if (fsync(fdt) < 0) {
+                        r = -errno;
+                        goto fail;
+                }
+        }
+
+        q = close_nointr(TAKE_FD(fdt)); /* even if this fails, the fd is now invalidated */
+        if (q < 0) {
+                r = q;
+                goto fail;
+        }
+
+        (void) memorize_hardlink(hardlink_context, st, dt, to); /* let later copies of this inode become hardlinks */
+        return r;
+
+fail:
+        (void) unlinkat(dt, to, 0); /* we created 'to' with O_EXCL above, hence it is ours to remove */
+        return r;
+}
+
+static int fd_copy_fifo(
+                int df,
+                const char *from,
+                const struct stat *st,
+                int dt,
+                const char *to,
+                uid_t override_uid,
+                gid_t override_gid,
+                CopyFlags copy_flags,
+                HardlinkContext *hardlink_context) {
+        int r;
+        /* Recreates the FIFO described by 'st' as 'to' (relative to 'dt'), or hardlinks it to an earlier
+         * copy of the same inode if one was recorded. 'df' is not used here and 'from' only shows up in
+         * the log message. With COPY_GRACEFUL_WARN, privilege and not-supported errors from mkfifoat() are
+         * logged and 0 is returned. fchownat()/fchmodat() failures are returned but the FIFO is kept. */
+        assert(from);
+        assert(st);
+        assert(to);
+
+        r = try_hardlink(hardlink_context, st, dt, to);
+        if (r < 0)
+                return r;
+        if (r > 0) /* worked! */
+                return 0;
+
+        if (copy_flags & COPY_MAC_CREATE) {
+                r = mac_selinux_create_file_prepare_at(dt, to, S_IFIFO);
+                if (r < 0)
+                        return r;
+        }
+        r = RET_NERRNO(mkfifoat(dt, to, st->st_mode & 07777));
+        if (copy_flags & COPY_MAC_CREATE) /* clear the label state before evaluating the mkfifoat() result */
+                mac_selinux_create_file_clear();
+        if (FLAGS_SET(copy_flags, COPY_GRACEFUL_WARN) && (ERRNO_IS_NEG_PRIVILEGE(r) || ERRNO_IS_NEG_NOT_SUPPORTED(r))) {
+                log_notice_errno(r, "Failed to copy fifo '%s', ignoring: %m", from);
+                return 0;
+        } else if (r < 0)
+                return r;
+
+        if (fchownat(dt, to,
+                     uid_is_valid(override_uid) ? override_uid : st->st_uid,
+                     gid_is_valid(override_gid) ? override_gid : st->st_gid,
+                     AT_SYMLINK_NOFOLLOW) < 0)
+                r = -errno;
+
+        if (fchmodat(dt, to, st->st_mode & 07777, 0) < 0)
+                r = -errno;
+
+        (void) utimensat(dt, to, (struct timespec[]) { st->st_atim, st->st_mtim }, AT_SYMLINK_NOFOLLOW);
+
+        (void) memorize_hardlink(hardlink_context, st, dt, to);
+        return r;
+}
+
+static int fd_copy_node(
+                int df,
+                const char *from,
+                const struct stat *st,
+                int dt,
+                const char *to,
+                uid_t override_uid,
+                gid_t override_gid,
+                CopyFlags copy_flags,
+                HardlinkContext *hardlink_context) {
+        int r;
+        /* Recreates the inode described by 'st' as 'to' (relative to 'dt') via mknodat(), using the mode
+         * and device number from 'st', or hardlinks it to an earlier copy of the same inode if one was
+         * recorded. fd_copy_leaf() calls this for block devices, character devices and sockets. 'df' is
+         * not used here and 'from' only shows up in the log message. With COPY_GRACEFUL_WARN, privilege
+         * and not-supported errors from mknodat() are logged and 0 is returned. */
+        assert(from);
+        assert(st);
+        assert(to);
+
+        r = try_hardlink(hardlink_context, st, dt, to);
+        if (r < 0)
+                return r;
+        if (r > 0) /* worked! */
+                return 0;
+
+        if (copy_flags & COPY_MAC_CREATE) {
+                r = mac_selinux_create_file_prepare_at(dt, to, st->st_mode & S_IFMT);
+                if (r < 0)
+                        return r;
+        }
+        r = RET_NERRNO(mknodat(dt, to, st->st_mode, st->st_rdev));
+        if (copy_flags & COPY_MAC_CREATE) /* clear the label state before evaluating the mknodat() result */
+                mac_selinux_create_file_clear();
+        if (FLAGS_SET(copy_flags, COPY_GRACEFUL_WARN) && (ERRNO_IS_NEG_PRIVILEGE(r) || ERRNO_IS_NEG_NOT_SUPPORTED(r))) {
+                log_notice_errno(r, "Failed to copy node '%s', ignoring: %m", from);
+                return 0;
+        } else if (r < 0)
+                return r;
+
+        if (fchownat(dt, to,
+                     uid_is_valid(override_uid) ? override_uid : st->st_uid,
+                     gid_is_valid(override_gid) ? override_gid : st->st_gid,
+                     AT_SYMLINK_NOFOLLOW) < 0)
+                r = -errno;
+
+        if (fchmodat(dt, to, st->st_mode & 07777, 0) < 0)
+                r = -errno;
+
+        (void) utimensat(dt, to, (struct timespec[]) { st->st_atim, st->st_mtim }, AT_SYMLINK_NOFOLLOW);
+
+        (void) memorize_hardlink(hardlink_context, st, dt, to);
+        return r;
+}
+
+/* Recursively copies the directory 'from' (relative to 'df'; if 'from' is NULL, 'df' itself is the
+ * source directory) to 'to' (relative to 'dt'). 'st' is the stat data of the source directory.
+ *
+ * Failures on individual entries do not stop the copy: the most recent such error is remembered and
+ * returned once the whole directory has been processed. Signals (-EINTR), progress callback errors,
+ * mount point check errors and allocation failures abort immediately.
+ *
+ * Returns a negative error on failure. On success returns 0, or, if COPY_LOCK_BSD is set, the file
+ * descriptor of the target directory, whose ownership passes to the caller. */
+static int fd_copy_directory(
+                int df,
+                const char *from,
+                const struct stat *st,
+                int dt,
+                const char *to,
+                dev_t original_device,
+                unsigned depth_left,
+                uid_t override_uid,
+                gid_t override_gid,
+                CopyFlags copy_flags,
+                Hashmap *denylist,
+                Set *subvolumes,
+                HardlinkContext *hardlink_context,
+                const char *display_path,
+                copy_progress_path_t progress_path,
+                copy_progress_bytes_t progress_bytes,
+                void *userdata) {
+
+        _cleanup_(hardlink_context_destroy) HardlinkContext our_hardlink_context = {
+                .dir_fd = -EBADF,
+                .parent_fd = -EBADF,
+        };
+
+        _cleanup_close_ int fdf = -EBADF, fdt = -EBADF;
+        _cleanup_closedir_ DIR *d = NULL;
+        bool exists;
+        int r, ret = 0; /* 'r' is the result of the call just made, 'ret' the error to report at the end */
+
+        assert(st);
+        assert(to);
+
+        if (depth_left == 0)
+                return -ENAMETOOLONG;
+
+        if (from)
+                fdf = openat(df, from, O_RDONLY|O_DIRECTORY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW);
+        else
+                fdf = fcntl(df, F_DUPFD_CLOEXEC, 3);
+        if (fdf < 0)
+                return -errno;
+
+        if (!hardlink_context) {
+                /* If recreating hardlinks is requested let's set up a context for that now. */
+                r = hardlink_context_setup(&our_hardlink_context, dt, to, copy_flags);
+                if (r < 0)
+                        return r;
+                if (r > 0) /* It's enabled and allocated, let's now use the same context for all recursive
+                            * invocations from here down */
+                        hardlink_context = &our_hardlink_context;
+        }
+
+        d = take_fdopendir(&fdf);
+        if (!d)
+                return -errno;
+
+        r = dir_is_empty_at(dt, to, /* ignore_hidden_or_backup= */ false);
+        if (r < 0 && r != -ENOENT)
+                return r;
+        if ((r > 0 && !(copy_flags & (COPY_MERGE|COPY_MERGE_EMPTY))) || (r == 0 && !FLAGS_SET(copy_flags, COPY_MERGE)))
+                return -EEXIST;
+
+        exists = r >= 0;
+
+        fdt = xopenat_lock(dt, to,
+                           O_RDONLY|O_DIRECTORY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW|(exists ? 0 : O_CREAT|O_EXCL),
+                           (copy_flags & COPY_MAC_CREATE ? XO_LABEL : 0)|(set_contains(subvolumes, st) ? XO_SUBVOLUME : 0),
+                           st->st_mode & 07777,
+                           copy_flags & COPY_LOCK_BSD ? LOCK_BSD : LOCK_NONE,
+                           LOCK_EX);
+        if (fdt < 0)
+                return fdt;
+
+        if (PTR_TO_INT(hashmap_get(denylist, st)) == DENY_CONTENTS) {
+                /* 'from' may be NULL here, never hand that to %s */
+                log_debug("%s is in the denylist, not recursing", strempty(from));
+                goto finish;
+        }
+
+        FOREACH_DIRENT_ALL(de, d, return -errno) {
+                const char *child_display_path = NULL;
+                _cleanup_free_ char *dp = NULL;
+                struct stat buf;
+                int q;
+
+                if (dot_or_dot_dot(de->d_name))
+                        continue;
+
+                r = look_for_signals(copy_flags);
+                if (r < 0)
+                        return r;
+
+                if (fstatat(dirfd(d), de->d_name, &buf, AT_SYMLINK_NOFOLLOW) < 0) {
+                        /* Remember the failure in 'ret' (not 'r'), so that the calls made for the
+                         * following entries cannot overwrite it. */
+                        ret = -errno;
+                        continue;
+                }
+
+                if (progress_path) {
+                        if (display_path) {
+                                dp = path_join(display_path, de->d_name);
+                                if (!dp) /* presumably an allocation failure; don't pass NULL to the callback */
+                                        return -ENOMEM;
+
+                                child_display_path = dp;
+                        } else
+                                child_display_path = de->d_name;
+
+                        r = progress_path(child_display_path, &buf, userdata);
+                        if (r < 0)
+                                return r;
+                }
+
+                if (PTR_TO_INT(hashmap_get(denylist, &buf)) == DENY_INODE) {
+                        log_debug("%s/%s is in the denylist, ignoring", strempty(from), de->d_name);
+                        continue;
+                }
+
+                if (S_ISDIR(buf.st_mode)) {
+                        /*
+                         * Don't descend into directories on other file systems, if this is requested. We do a simple
+                         * .st_dev check here, which basically comes for free. Note that we do this check only on
+                         * directories, not other kind of file system objects, for two reason:
+                         *
+                         * • The kernel's overlayfs pseudo file system that overlays multiple real file systems
+                         *   propagates the .st_dev field of the file system a file originates from all the way up
+                         *   through the stack to stat(). It doesn't do that for directories however. This means that
+                         *   comparing .st_dev on non-directories suggests that they all are mount points. To avoid
+                         *   confusion we hence avoid relying on this check for regular files.
+                         *
+                         * • The main reason we do this check at all is to protect ourselves from bind mount cycles,
+                         *   where we really want to avoid descending down in all eternity. However the .st_dev check
+                         *   is usually not sufficient for this protection anyway, as bind mount cycles from the same
+                         *   file system onto itself can't be detected that way. (Note we also do a recursion depth
+                         *   check, which is probably the better protection in this regard, which is why
+                         *   COPY_SAME_MOUNT is optional).
+                         */
+
+                        if (FLAGS_SET(copy_flags, COPY_SAME_MOUNT)) {
+                                if (buf.st_dev != original_device)
+                                        continue;
+
+                                r = fd_is_mount_point(dirfd(d), de->d_name, 0);
+                                if (r < 0)
+                                        return r;
+                                if (r > 0)
+                                        continue;
+                        }
+                }
+
+                q = fd_copy_tree_generic(dirfd(d), de->d_name, &buf, fdt, de->d_name, original_device,
+                                         depth_left-1, override_uid, override_gid, copy_flags & ~COPY_LOCK_BSD,
+                                         denylist, subvolumes, hardlink_context, child_display_path, progress_path,
+                                         progress_bytes, userdata);
+
+                if (q == -EINTR) /* Propagate SIGINT/SIGTERM up instantly */
+                        return q;
+                if (q == -EEXIST && (copy_flags & COPY_MERGE))
+                        q = 0;
+                if (q < 0)
+                        ret = q;
+        }
+
+finish:
+        if (!exists) {
+                /* We created the target directory ourselves, hence apply the source's metadata to it. */
+                if (fchown(fdt,
+                           uid_is_valid(override_uid) ? override_uid : st->st_uid,
+                           gid_is_valid(override_gid) ? override_gid : st->st_gid) < 0)
+                        ret = -errno;
+
+                if (fchmod(fdt, st->st_mode & 07777) < 0)
+                        ret = -errno;
+
+                (void) copy_xattr(dirfd(d), NULL, fdt, NULL, copy_flags);
+                (void) futimens(fdt, (struct timespec[]) { st->st_atim, st->st_mtim });
+        }
+
+        if (copy_flags & COPY_FSYNC_FULL) {
+                if (fsync(fdt) < 0)
+                        return -errno;
+        }
+
+        if (ret < 0)
+                return ret;
+
+        return copy_flags & COPY_LOCK_BSD ? TAKE_FD(fdt) : 0;
+}
+
+/* Copies a single non-directory inode by dispatching on its file type to the matching helper.
+ * Returns whatever that helper returns, or -EOPNOTSUPP for file types none of them handles.
+ * 'display_path' is accepted but not used here. */
+static int fd_copy_leaf(
+                int df,
+                const char *from,
+                const struct stat *st,
+                int dt,
+                const char *to,
+                uid_t override_uid,
+                gid_t override_gid,
+                CopyFlags copy_flags,
+                HardlinkContext *hardlink_context,
+                const char *display_path,
+                copy_progress_bytes_t progress_bytes,
+                void *userdata) {
+
+        if (S_ISREG(st->st_mode))
+                return fd_copy_regular(df, from, st, dt, to, override_uid, override_gid, copy_flags, hardlink_context, progress_bytes, userdata);
+
+        if (S_ISLNK(st->st_mode))
+                return fd_copy_symlink(df, from, st, dt, to, override_uid, override_gid, copy_flags);
+
+        if (S_ISFIFO(st->st_mode))
+                return fd_copy_fifo(df, from, st, dt, to, override_uid, override_gid, copy_flags, hardlink_context);
+
+        /* Device nodes and sockets are all recreated through mknodat() */
+        if (S_ISBLK(st->st_mode) || S_ISCHR(st->st_mode) || S_ISSOCK(st->st_mode))
+                return fd_copy_node(df, from, st, dt, to, override_uid, override_gid, copy_flags, hardlink_context);
+
+        return -EOPNOTSUPP;
+}
+
+/* Copies one inode of any type: directories are handed to fd_copy_directory() (which recurses back
+ * into this function for each entry), everything else goes to fd_copy_leaf().
+ *
+ * Non-directories listed in 'denylist' as DENY_INODE are skipped and 0 is returned. If copying a leaf
+ * fails with -EEXIST and COPY_REPLACE is set, the existing target is unlinked and the copy is retried
+ * once. Returns -ENAMETOOLONG if the recursion depth budget is used up. COPY_LOCK_BSD must not be
+ * set. */
+static int fd_copy_tree_generic(
+                int df,
+                const char *from,
+                const struct stat *st,
+                int dt,
+                const char *to,
+                dev_t original_device,
+                unsigned depth_left,
+                uid_t override_uid,
+                gid_t override_gid,
+                CopyFlags copy_flags,
+                Hashmap *denylist,
+                Set *subvolumes,
+                HardlinkContext *hardlink_context,
+                const char *display_path,
+                copy_progress_path_t progress_path,
+                copy_progress_bytes_t progress_bytes,
+                void *userdata) {
+
+        int r;
+
+        assert(!FLAGS_SET(copy_flags, COPY_LOCK_BSD));
+
+        if (S_ISDIR(st->st_mode)) {
+                /* 'depth_left' is unsigned: without this check, 0 - 1 would wrap around to UINT_MAX and
+                 * fd_copy_directory() would see a practically unlimited depth budget instead of an
+                 * exhausted one. */
+                if (depth_left == 0)
+                        return -ENAMETOOLONG;
+
+                return fd_copy_directory(df, from, st, dt, to, original_device, depth_left-1, override_uid,
+                                         override_gid, copy_flags, denylist, subvolumes, hardlink_context,
+                                         display_path, progress_path, progress_bytes, userdata);
+        }
+
+        DenyType t = PTR_TO_INT(hashmap_get(denylist, st));
+        if (t == DENY_INODE) {
+                log_debug("%s is in the denylist, ignoring", from);
+                return 0;
+        } else if (t == DENY_CONTENTS)
+                log_debug("%s is configured to have its contents excluded, but is not a directory", from);
+
+        r = fd_copy_leaf(df, from, st, dt, to, override_uid, override_gid, copy_flags, hardlink_context, display_path, progress_bytes, userdata);
+        /* We just tried to copy a leaf node of the tree. If it failed because the node already exists *and*
+         * the COPY_REPLACE flag has been provided, we should unlink the node and re-copy. */
+        if (r == -EEXIST && (copy_flags & COPY_REPLACE)) {
+                /* This codepath is us trying to address an error to copy, if the unlink fails, let's just
+                 * return the original error. */
+                if (unlinkat(dt, to, 0) < 0)
+                        return r;
+
+                r = fd_copy_leaf(df, from, st, dt, to, override_uid, override_gid, copy_flags, hardlink_context, display_path, progress_bytes, userdata);
+        }
+
+        return r;
+}
+
+int copy_tree_at_full(
+                int fdf,
+                const char *from,
+                int fdt,
+                const char *to,
+                uid_t override_uid,
+                gid_t override_gid,
+                CopyFlags copy_flags,
+                Hashmap *denylist,
+                Set *subvolumes,
+                copy_progress_path_t progress_path,
+                copy_progress_bytes_t progress_bytes,
+                void *userdata) {
+        /* Copies the inode 'from' (relative to 'fdf') to 'to' (relative to 'fdt'); if it is a directory
+         * the whole tree below it is copied. The source is examined without following a final symlink.
+         * COPY_LOCK_BSD is not supported here. Returns 0 on success, a negative error otherwise. */
+        struct stat st;
+        int r;
+
+        assert(from);
+        assert(to);
+        assert(!FLAGS_SET(copy_flags, COPY_LOCK_BSD));
+
+        if (fstatat(fdf, from, &st, AT_SYMLINK_NOFOLLOW) < 0)
+                return -errno;
+
+        r = fd_copy_tree_generic(fdf, from, &st, fdt, to, st.st_dev, COPY_DEPTH_MAX, override_uid,
+                                 override_gid, copy_flags, denylist, subvolumes, NULL, NULL, progress_path,
+                                 progress_bytes, userdata); /* the two NULLs: no hardlink context yet, no display path prefix */
+        if (r < 0)
+                return r;
+
+        if (S_ISDIR(st.st_mode) && (copy_flags & COPY_SYNCFS)) {
+                /* If the top-level inode is a directory run syncfs() now. */
+                r = syncfs_path(fdt, to);
+                if (r < 0)
+                        return r;
+        } else if ((copy_flags & (COPY_FSYNC_FULL|COPY_SYNCFS)) != 0) {
+                /* fsync() the parent dir of what we just copied if COPY_FSYNC_FULL is set. Also do this in
+                 * case COPY_SYNCFS is set but the top-level inode wasn't actually a directory. We do this so that
+                 * COPY_SYNCFS provides reasonable synchronization semantics on any kind of inode: when the
+                 * copy operation is done the whole inode — regardless of its type — and all its children
+                 * will be synchronized to disk. */
+                r = fsync_parent_at(fdt, to);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+/* Synchronizes a copied directory to disk according to the sync flags in 'copy_flags': COPY_SYNCFS
+ * selects syncfs_path(), otherwise COPY_FSYNC_FULL selects fsync_parent_at(). Without either flag
+ * this does nothing and returns 0. */
+static int sync_dir_by_flags(int dir_fd, const char *path, CopyFlags copy_flags) {
+        assert(dir_fd >= 0 || dir_fd == AT_FDCWD);
+        assert(path);
+
+        /* COPY_SYNCFS is checked first, so it wins if both flags are set. */
+        return (copy_flags & COPY_SYNCFS) ? syncfs_path(dir_fd, path) :
+               (copy_flags & COPY_FSYNC_FULL) ? fsync_parent_at(dir_fd, path) :
+               0;
+}
+
+/* Copies the directory 'from' (relative to 'dir_fdf') to 'to' (relative to 'dir_fdt'). If 'from' is
+ * NULL or empty, the directory referred to by 'dir_fdf' itself is the source. Fails if the source is
+ * not a directory.
+ *
+ * Returns a negative error on failure. On success returns 0, or, if COPY_LOCK_BSD is set, the file
+ * descriptor that fd_copy_directory() returned for the target directory; the caller owns it. */
+int copy_directory_at_full(
+                int dir_fdf,
+                const char *from,
+                int dir_fdt,
+                const char *to,
+                CopyFlags copy_flags,
+                copy_progress_path_t progress_path,
+                copy_progress_bytes_t progress_bytes,
+                void *userdata) {
+
+        _cleanup_close_ int fdt = -EBADF;
+        struct stat st;
+        int r;
+
+        assert(dir_fdf >= 0 || dir_fdf == AT_FDCWD);
+        assert(dir_fdt >= 0 || dir_fdt == AT_FDCWD);
+        assert(to);
+
+        /* Treat "" like NULL from here on. fd_copy_directory() only checks for NULL, and would otherwise
+         * try to openat() an empty path, which cannot succeed. */
+        if (isempty(from))
+                from = NULL;
+
+        if (fstatat(dir_fdf, strempty(from), &st, AT_SYMLINK_NOFOLLOW|(from ? 0 : AT_EMPTY_PATH)) < 0)
+                return -errno;
+
+        r = stat_verify_directory(&st);
+        if (r < 0)
+                return r;
+
+        r = fd_copy_directory(
+                        dir_fdf, from,
+                        &st,
+                        dir_fdt, to,
+                        st.st_dev,
+                        COPY_DEPTH_MAX,
+                        UID_INVALID, GID_INVALID,
+                        copy_flags,
+                        NULL, NULL, NULL, NULL,
+                        progress_path,
+                        progress_bytes,
+                        userdata);
+        if (r < 0)
+                return r;
+
+        /* With COPY_LOCK_BSD the return value is an fd we now own; park it in the _cleanup_close_
+         * variable so that it is closed if syncing fails below. */
+        if (FLAGS_SET(copy_flags, COPY_LOCK_BSD))
+                fdt = r;
+
+        r = sync_dir_by_flags(dir_fdt, to, copy_flags);
+        if (r < 0)
+                return r;
+
+        return FLAGS_SET(copy_flags, COPY_LOCK_BSD) ? TAKE_FD(fdt) : 0;
+}
+
+int copy_file_fd_at_full(
+                int dir_fdf,
+                const char *from,
+                int fdt,
+                CopyFlags copy_flags,
+                copy_progress_bytes_t progress_bytes,
+                void *userdata) {
+        /* Copies the contents of the regular file 'from' (relative to 'dir_fdf') into the already open
+         * file descriptor 'fdt'. Timestamps and xattrs are copied too, but only if 'fdt' refers to a
+         * regular file. COPY_LOCK_BSD is not supported here. Returns 0 on success, negative on error. */
+        _cleanup_close_ int fdf = -EBADF;
+        struct stat st;
+        int r;
+
+        assert(dir_fdf >= 0 || dir_fdf == AT_FDCWD);
+        assert(from);
+        assert(fdt >= 0);
+        assert(!FLAGS_SET(copy_flags, COPY_LOCK_BSD));
+
+        fdf = openat(dir_fdf, from, O_RDONLY|O_CLOEXEC|O_NOCTTY);
+        if (fdf < 0)
+                return -errno;
+
+        r = fd_verify_regular(fdf);
+        if (r < 0)
+                return r;
+
+        if (fstat(fdt, &st) < 0) /* stat the *target*: its type decides below whether attributes are copied */
+                return -errno;
+
+        r = copy_bytes_full(fdf, fdt, UINT64_MAX, copy_flags, NULL, NULL, progress_bytes, userdata);
+        if (r < 0)
+                return r;
+
+        /* Make sure to copy file attributes only over if target is a regular
+         * file (so that copying a file to /dev/null won't alter the access
+         * mode/ownership of that device node...) */
+        if (S_ISREG(st.st_mode)) {
+                (void) copy_times(fdf, fdt, copy_flags);
+                (void) copy_xattr(fdf, NULL, fdt, NULL, copy_flags);
+        }
+
+        if (copy_flags & COPY_FSYNC_FULL) {
+                r = fsync_full(fdt);
+                if (r < 0)
+                        return r;
+        } else if (copy_flags & COPY_FSYNC) {
+                if (fsync(fdt) < 0)
+                        return -errno;
+        }
+
+        return 0;
+}
+
+int copy_file_at_full(
+                int dir_fdf,
+                const char *from,
+                int dir_fdt,
+                const char *to,
+                int flags,
+                mode_t mode,
+                unsigned chattr_flags,
+                unsigned chattr_mask,
+                CopyFlags copy_flags,
+                copy_progress_bytes_t progress_bytes,
+                void *userdata) {
+        /* Copies the regular file 'from' (relative to 'dir_fdf') to 'to' (relative to 'dir_fdt'). 'flags'
+         * are extra open() flags for the target; 'mode' is its access mode, with MODE_INVALID meaning
+         * "use the source's mode". If 'chattr_mask' is non-zero, chattr_fd() is applied to the target in
+         * two steps: the CHATTR_EARLY_FL part of the mask before the data is written, the rest after.
+         *
+         * Returns a negative error on failure. On success returns 0, or, if COPY_LOCK_BSD is set, the
+         * still open target fd, which the caller then owns. */
+        _cleanup_close_ int fdf = -EBADF, fdt = -EBADF;
+        struct stat st;
+        int r;
+
+        assert(dir_fdf >= 0 || dir_fdf == AT_FDCWD);
+        assert(dir_fdt >= 0 || dir_fdt == AT_FDCWD);
+        assert(from);
+        assert(to);
+
+        fdf = openat(dir_fdf, from, O_RDONLY|O_CLOEXEC|O_NOCTTY);
+        if (fdf < 0)
+                return -errno;
+
+        if (fstat(fdf, &st) < 0)
+                return -errno;
+
+        r = stat_verify_regular(&st);
+        if (r < 0)
+                return r;
+
+        WITH_UMASK(0000) { /* so that the mode passed below is not masked by the process umask */
+                fdt = xopenat_lock(dir_fdt, to,
+                                   flags|O_WRONLY|O_CREAT|O_CLOEXEC|O_NOCTTY,
+                                   (copy_flags & COPY_MAC_CREATE ? XO_LABEL : 0),
+                                   mode != MODE_INVALID ? mode : st.st_mode,
+                                   copy_flags & COPY_LOCK_BSD ? LOCK_BSD : LOCK_NONE, LOCK_EX);
+                if (fdt < 0)
+                        return fdt;
+        }
+
+        if (!FLAGS_SET(flags, O_EXCL)) { /* if O_EXCL was used we created the thing as regular file, no need to check again */
+                r = fd_verify_regular(fdt);
+                if (r < 0)
+                        goto fail;
+        }
+
+        if (chattr_mask != 0)
+                (void) chattr_fd(fdt, chattr_flags, chattr_mask & CHATTR_EARLY_FL, NULL);
+
+        r = copy_bytes_full(fdf, fdt, UINT64_MAX, copy_flags & ~COPY_LOCK_BSD, NULL, NULL, progress_bytes, userdata);
+        if (r < 0)
+                goto fail;
+
+        (void) copy_times(fdf, fdt, copy_flags);
+        (void) copy_xattr(fdf, NULL, fdt, NULL, copy_flags);
+
+        if (chattr_mask != 0)
+                (void) chattr_fd(fdt, chattr_flags, chattr_mask & ~CHATTR_EARLY_FL, NULL);
+
+        if (copy_flags & (COPY_FSYNC|COPY_FSYNC_FULL)) {
+                if (fsync(fdt) < 0) {
+                        r = -errno;
+                        goto fail;
+                }
+        }
+
+        if (!FLAGS_SET(copy_flags, COPY_LOCK_BSD)) { /* with COPY_LOCK_BSD the fd stays open and is returned below */
+                r = close_nointr(TAKE_FD(fdt)); /* even if this fails, the fd is now invalidated */
+                if (r < 0)
+                        goto fail;
+        }
+
+        if (copy_flags & COPY_FSYNC_FULL) {
+                r = fsync_parent_at(dir_fdt, to);
+                if (r < 0)
+                        goto fail;
+        }
+
+        return copy_flags & COPY_LOCK_BSD ? TAKE_FD(fdt) : 0;
+
+fail:
+        /* Only unlink if we definitely are the ones who created the file */
+        if (FLAGS_SET(flags, O_EXCL))
+                (void) unlinkat(dir_fdt, to, 0);
+
+        return r;
+}
+
+int copy_file_atomic_at_full(
+                int dir_fdf,
+                const char *from,
+                int dir_fdt,
+                const char *to,
+                mode_t mode,
+                unsigned chattr_flags,
+                unsigned chattr_mask,
+                CopyFlags copy_flags,
+                copy_progress_bytes_t progress_bytes,
+                void *userdata) {
+        /* Copies 'from' (relative to 'dir_fdf') to 'to' (relative to 'dir_fdt') via a temporary file:
+         * the data is written to a file obtained from open_tmpfile_linkable_at() and only linked into
+         * place under the name 'to' once it is complete. With COPY_REPLACE an existing 'to' is replaced.
+         * COPY_LOCK_BSD is not supported here. Returns 0 on success, a negative error otherwise. */
+        _cleanup_(unlink_and_freep) char *t = NULL;
+        _cleanup_close_ int fdt = -EBADF;
+        int r;
+
+        assert(from);
+        assert(to);
+        assert(!FLAGS_SET(copy_flags, COPY_LOCK_BSD));
+
+        if (copy_flags & COPY_MAC_CREATE) {
+                r = mac_selinux_create_file_prepare_at(dir_fdt, to, S_IFREG);
+                if (r < 0)
+                        return r;
+        }
+        fdt = open_tmpfile_linkable_at(dir_fdt, to, O_WRONLY|O_CLOEXEC, &t);
+        if (copy_flags & COPY_MAC_CREATE)
+                mac_selinux_create_file_clear();
+        if (fdt < 0)
+                return fdt;
+
+        if (chattr_mask != 0)
+                (void) chattr_fd(fdt, chattr_flags, chattr_mask & CHATTR_EARLY_FL, NULL);
+
+        r = copy_file_fd_at_full(dir_fdf, from, fdt, copy_flags, progress_bytes, userdata);
+        if (r < 0)
+                return r;
+
+        if (fchmod(fdt, mode) < 0)
+                return -errno;
+
+        if ((copy_flags & (COPY_FSYNC|COPY_FSYNC_FULL))) {
+                /* Sync the file */
+                if (fsync(fdt) < 0)
+                        return -errno;
+        }
+
+        r = link_tmpfile_at(fdt, dir_fdt, t, to, (copy_flags & COPY_REPLACE) ? LINK_TMPFILE_REPLACE : 0);
+        if (r < 0)
+                return r;
+
+        t = mfree(t); /* linked into place: free the temporary name so the cleanup handler has nothing left to act on */
+
+        if (chattr_mask != 0)
+                (void) chattr_fd(fdt, chattr_flags, chattr_mask & ~CHATTR_EARLY_FL, NULL);
+
+        r = close_nointr(TAKE_FD(fdt)); /* even if this fails, the fd is now invalidated */
+        if (r < 0)
+                goto fail;
+
+        if (copy_flags & COPY_FSYNC_FULL) {
+                /* Sync the parent directory */
+                r = fsync_parent_at(dir_fdt, to);
+                if (r < 0)
+                        goto fail;
+        }
+
+        return 0;
+
+fail:
+        (void) unlinkat(dir_fdt, to, 0); /* only reached after 'to' was linked into place above */
+        return r;
+}
+
+/* Applies the access and modification timestamps of 'fdf' to 'fdt'. With COPY_CRTIME the creation
+ * time is carried over as well, if it can be read from the source; failing to set it is ignored.
+ * Returns 0 on success, or a negative errno if fstat() or futimens() fails. */
+int copy_times(int fdf, int fdt, CopyFlags flags) {
+        struct stat st;
+        usec_t crtime;
+
+        assert(fdf >= 0);
+        assert(fdt >= 0);
+
+        if (fstat(fdf, &st) < 0)
+                return -errno;
+
+        if (futimens(fdt, (struct timespec[2]) { st.st_atim, st.st_mtim }) < 0)
+                return -errno;
+
+        if (!FLAGS_SET(flags, COPY_CRTIME))
+                return 0;
+
+        /* Best effort only: neither a missing nor an unsettable creation time is an error. */
+        if (fd_getcrtime(fdf, &crtime) >= 0)
+                (void) fd_setcrtime(fdt, crtime);
+
+        return 0;
+}
+
+/* Copies only the access mode (permission bits, masked with 07777) from 'fdf' to 'fdt'; ownership is
+ * left untouched. Returns 0 on success, negative errno on failure. */
+int copy_access(int fdf, int fdt) {
+        struct stat src;
+        mode_t perms;
+
+        assert(fdf >= 0);
+        assert(fdt >= 0);
+
+        if (fstat(fdf, &src) < 0)
+                return -errno;
+
+        perms = src.st_mode & 07777;
+
+        return RET_NERRNO(fchmod(fdt, perms));
+}
+
+/* Copies both the access mode (masked with 07777) and the owning user/group from 'fdf' to 'fdt'.
+ * The actual change is delegated to fchmod_and_chown_with_fallback(), which also receives 'patht';
+ * its result is returned. Returns negative errno if the source cannot be stat'ed. */
+int copy_rights_with_fallback(int fdf, int fdt, const char *patht) {
+        struct stat src;
+
+        assert(fdf >= 0);
+        assert(fdt >= 0);
+
+        if (fstat(fdf, &src) < 0)
+                return -errno;
+
+        return fchmod_and_chown_with_fallback(
+                        fdt,
+                        patht,
+                        src.st_mode & 07777,
+                        src.st_uid,
+                        src.st_gid);
+}
+
+int copy_xattr(int df, const char *from, int dt, const char *to, CopyFlags copy_flags) { /* Copies extended
+         * attributes from the inode 'from' (relative to 'df') to 'to' (relative to 'dt'). Only
+         * attributes whose name starts with "user." are copied, unless COPY_ALL_XATTRS is set. Failing to
+         * list or read attributes aborts with that error; failing to set one is remembered and the
+         * remaining attributes are still tried. Returns 0 or the last such error. */
+        _cleanup_free_ char *names = NULL;
+        int ret = 0, r;
+
+        r = listxattr_at_malloc(df, from, 0, &names);
+        if (r < 0)
+                return r;
+
+        NULSTR_FOREACH(p, names) {
+                _cleanup_free_ char *value = NULL;
+
+                if (!FLAGS_SET(copy_flags, COPY_ALL_XATTRS) && !startswith(p, "user."))
+                        continue;
+
+                r = getxattr_at_malloc(df, from, p, 0, &value); /* on success 'r' is passed on below as the value size */
+                if (r == -ENODATA)
+                        continue; /* gone by now */
+                if (r < 0)
+                        return r;
+
+                if (xsetxattr(dt, to, p, value, r, 0) < 0) /* NOTE(review): this relies on errno still being set after xsetxattr() fails — confirm against its return convention */
+                        ret = -errno;
+        }
+
+        return ret;
+}
+
+/* Asks the kernel to make 'outfd' a clone (reflink) of the whole of 'infd' via the FICLONE ioctl.
+ * Returns the error from fd_verify_regular() if 'outfd' fails that check, otherwise the ioctl result
+ * with failures turned into negative errno. */
+int reflink(int infd, int outfd) {
+        int verdict;
+
+        /* FICLONE was introduced in Linux 4.5 but it uses the same number as BTRFS_IOC_CLONE introduced earlier */
+        assert_cc(FICLONE == BTRFS_IOC_CLONE);
+
+        assert(infd >= 0);
+        assert(outfd >= 0);
+
+        /* Make sure we invoke the ioctl on a regular file, so that no device driver accidentally gets it. */
+        verdict = fd_verify_regular(outfd);
+        if (verdict < 0)
+                return verdict;
+
+        return RET_NERRNO(ioctl(outfd, FICLONE, infd));
+}
+
+assert_cc(sizeof(struct file_clone_range) == sizeof(struct btrfs_ioctl_clone_range_args));
+
+int reflink_range(int infd, uint64_t in_offset, int outfd, uint64_t out_offset, uint64_t sz) { /* Clones
+         * 'sz' bytes starting at 'in_offset' of 'infd' to 'out_offset' of 'outfd' via the FICLONERANGE
+         * ioctl. If both offsets are zero and 'sz' is 0 or UINT64_MAX this is a whole-file clone and is
+         * delegated to reflink(). Returns the error from fd_verify_regular() if 'outfd' fails that check,
+         * otherwise the ioctl result with failures turned into negative errno. */
+        struct file_clone_range args = {
+                .src_fd = infd,
+                .src_offset = in_offset,
+                .src_length = sz,
+                .dest_offset = out_offset,
+        };
+        int r;
+
+        assert(infd >= 0);
+        assert(outfd >= 0);
+
+        /* Inside the kernel, FICLONE is identical to FICLONERANGE with offsets and size set to zero, let's
+         * simplify things and use the simple ioctl in that case. Also, do the same if the size is
+         * UINT64_MAX, which is how we usually encode "everything". */
+        if (in_offset == 0 && out_offset == 0 && IN_SET(sz, 0, UINT64_MAX))
+                return reflink(infd, outfd);
+
+        r = fd_verify_regular(outfd); /* same precaution as in reflink(): never send the ioctl to a non-regular file */
+        if (r < 0)
+                return r;
+
+        assert_cc(FICLONERANGE == BTRFS_IOC_CLONE_RANGE);
+
+        return RET_NERRNO(ioctl(outfd, FICLONERANGE, &args));
+}
diff --git a/src/shared/copy.h b/src/shared/copy.h
new file mode 100644
index 0000000..d842edd
--- /dev/null
+++ b/src/shared/copy.h
@@ -0,0 +1,106 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <fcntl.h>
+#include <inttypes.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "set.h"
+
+typedef enum CopyFlags { /* Bit flags controlling the copy_*() calls; each value is a distinct bit, combine them with | */
+        COPY_REFLINK       = 1 << 0,  /* Try to reflink */
+        COPY_MERGE         = 1 << 1,  /* Merge existing trees with our new one to copy */
+        COPY_REPLACE       = 1 << 2,  /* Replace an existing file if there's one */
+        COPY_SAME_MOUNT    = 1 << 3,  /* Don't descend recursively into other file systems, across mount point boundaries */
+        COPY_MERGE_EMPTY   = 1 << 4,  /* Merge an existing, empty directory with our new tree to copy */
+        COPY_CRTIME        = 1 << 5,  /* Generate a user.crtime_usec xattr off the source crtime if there is one, on copying */
+        COPY_SIGINT        = 1 << 6,  /* Check for SIGINT regularly and return EINTR if seen (caller needs to block SIGINT) */
+        COPY_SIGTERM       = 1 << 7,  /* ditto, but for SIGTERM */
+        COPY_MAC_CREATE    = 1 << 8,  /* Create files with the correct MAC label (currently SELinux only) */
+        COPY_HARDLINKS     = 1 << 9,  /* Try to reproduce hard links */
+        COPY_FSYNC         = 1 << 10, /* fsync() after we are done */
+        COPY_FSYNC_FULL    = 1 << 11, /* fsync_full() after we are done */
+        COPY_SYNCFS        = 1 << 12, /* syncfs() the *top-level* dir after we are done */
+        COPY_ALL_XATTRS    = 1 << 13, /* Preserve all xattrs when copying, not just those in the user namespace */
+        COPY_HOLES         = 1 << 14, /* Copy holes */
+        COPY_GRACEFUL_WARN = 1 << 15, /* Skip copying file types that aren't supported by the target filesystem */
+        COPY_TRUNCATE      = 1 << 16, /* Truncate to current file offset after copying */
+        COPY_LOCK_BSD      = 1 << 17, /* Return a BSD exclusively locked file descriptor referring to the copied image/directory. */
+} CopyFlags;
+
+typedef enum DenyType { /* Values stored (as pointers) in the 'denylist' hashmap passed to the tree copy functions */
+        DENY_DONT = 0, /* we want INT_TO_PTR(DENY_DONT) to map to NULL */
+        DENY_INODE, /* the inode is skipped entirely by the tree copy */
+        DENY_CONTENTS, /* for directories: the tree copy does not recurse into the contents */
+        _DENY_TYPE_MAX,
+        _DENY_TYPE_INVALID = -EINVAL,
+} DenyType;
+
+typedef int (*copy_progress_bytes_t)(uint64_t n_bytes, void *userdata); /* byte progress callback, handed through to copy_bytes_full() */
+typedef int (*copy_progress_path_t)(const char *path, const struct stat *st, void *userdata); /* invoked by the tree copy for each directory entry; a negative return aborts the copy */
+        /* Copy a regular file, addressed by path, into an already open file descriptor. The wrappers
+         * below only fill in defaults: AT_FDCWD as base directory and/or no progress callback. */
+int copy_file_fd_at_full(int dir_fdf, const char *from, int to, CopyFlags copy_flags, copy_progress_bytes_t progress, void *userdata);
+static inline int copy_file_fd_at(int dir_fdf, const char *from, int to, CopyFlags copy_flags, copy_progress_bytes_t progress, void *userdata) {
+        return copy_file_fd_at_full(dir_fdf, from, to, copy_flags, progress, userdata);
+}
+static inline int copy_file_fd_full(const char *from, int to, CopyFlags copy_flags) {
+        return copy_file_fd_at_full(AT_FDCWD, from, to, copy_flags, NULL, NULL);
+}
+static inline int copy_file_fd(const char *from, int to, CopyFlags copy_flags) {
+        return copy_file_fd_at(AT_FDCWD, from, to, copy_flags, NULL, NULL);
+}
+
+int copy_file_at_full(int dir_fdf, const char *from, int dir_fdt, const char *to, int open_flags, mode_t mode, unsigned chattr_flags, unsigned chattr_mask, CopyFlags copy_flags, copy_progress_bytes_t progress, void *userdata); /* path-to-path
+         * copy of a regular file. The wrappers below fill in defaults: no chattr changes (flags and mask
+         * 0), no progress callback, and/or AT_FDCWD for both base directories. */
+static inline int copy_file_at(int dir_fdf, const char *from, int dir_fdt, const char *to, int open_flags, mode_t mode, CopyFlags copy_flags) {
+        return copy_file_at_full(dir_fdf, from, dir_fdt, to, open_flags, mode, 0, 0, copy_flags, NULL, NULL);
+}
+static inline int copy_file_full(const char *from, const char *to, int open_flags, mode_t mode, unsigned chattr_flags, unsigned chattr_mask, CopyFlags copy_flags, copy_progress_bytes_t progress, void *userdata) {
+        return copy_file_at_full(AT_FDCWD, from, AT_FDCWD, to, open_flags, mode, chattr_flags, chattr_mask, copy_flags, progress, userdata);
+}
+static inline int copy_file(const char *from, const char *to, int open_flags, mode_t mode, CopyFlags copy_flags) {
+        return copy_file_at(AT_FDCWD, from, AT_FDCWD, to, open_flags, mode, copy_flags);
+}
+        /* Same, but the target is written to a temporary file first and only then linked into place. */
+int copy_file_atomic_at_full(int dir_fdf, const char *from, int dir_fdt, const char *to, mode_t mode, unsigned chattr_flags, unsigned chattr_mask, CopyFlags copy_flags, copy_progress_bytes_t progress, void *userdata);
+static inline int copy_file_atomic_at(int dir_fdf, const char *from, int dir_fdt, const char *to, mode_t mode, CopyFlags copy_flags) {
+        return copy_file_atomic_at_full(dir_fdf, from, dir_fdt, to, mode, 0, 0, copy_flags, NULL, NULL);
+}
+static inline int copy_file_atomic_full(const char *from, const char *to, mode_t mode, unsigned chattr_flags, unsigned chattr_mask, CopyFlags copy_flags, copy_progress_bytes_t progress, void *userdata) {
+        return copy_file_atomic_at_full(AT_FDCWD, from, AT_FDCWD, to, mode, chattr_flags, chattr_mask, copy_flags, progress, userdata);
+}
+static inline int copy_file_atomic(const char *from, const char *to, mode_t mode, CopyFlags copy_flags) {
+        return copy_file_atomic_full(from, to, mode, 0, 0, copy_flags, NULL, NULL);
+}
+
+int copy_tree_at_full(int fdf, const char *from, int fdt, const char *to, uid_t override_uid, gid_t override_gid, CopyFlags copy_flags, Hashmap *denylist, Set *subvolumes, copy_progress_path_t progress_path, copy_progress_bytes_t progress_bytes, void *userdata); /* copies
+         * an inode of any type, recursing if it is a directory. The wrappers below pass no progress
+         * callbacks; copy_tree() additionally uses AT_FDCWD for both base directories. */
+static inline int copy_tree_at(int fdf, const char *from, int fdt, const char *to, uid_t override_uid, gid_t override_gid, CopyFlags copy_flags, Hashmap *denylist, Set *subvolumes) {
+        return copy_tree_at_full(fdf, from, fdt, to, override_uid, override_gid, copy_flags, denylist, subvolumes, NULL, NULL, NULL);
+}
+static inline int copy_tree(const char *from, const char *to, uid_t override_uid, gid_t override_gid, CopyFlags copy_flags, Hashmap *denylist, Set *subvolumes) {
+        return copy_tree_at_full(AT_FDCWD, from, AT_FDCWD, to, override_uid, override_gid, copy_flags, denylist, subvolumes, NULL, NULL, NULL);
+}
+        /* Like the tree copy, but the source must be a directory; takes no ownership overrides, denylist or subvolume set. */
+int copy_directory_at_full(int dir_fdf, const char *from, int dir_fdt, const char *to, CopyFlags copy_flags, copy_progress_path_t progress_path, copy_progress_bytes_t progress_bytes, void *userdata);
+static inline int copy_directory_at(int dir_fdf, const char *from, int dir_fdt, const char *to, CopyFlags copy_flags) {
+        return copy_directory_at_full(dir_fdf, from, dir_fdt, to, copy_flags, NULL, NULL, NULL);
+}
+        /* fd-to-fd byte copy; the short form passes NULL for the 'remains' outputs and for the progress callback. */
+int copy_bytes_full(int fdf, int fdt, uint64_t max_bytes, CopyFlags copy_flags, void **ret_remains, size_t *ret_remains_size, copy_progress_bytes_t progress, void *userdata);
+static inline int copy_bytes(int fdf, int fdt, uint64_t max_bytes, CopyFlags copy_flags) {
+        return copy_bytes_full(fdf, fdt, max_bytes, copy_flags, NULL, NULL, NULL, NULL);
+}
+
+int copy_times(int fdf, int fdt, CopyFlags flags); /* access/modification times, plus creation time with COPY_CRTIME */
+int copy_access(int fdf, int fdt); /* access mode only, not ownership */
+int copy_rights_with_fallback(int fdf, int fdt, const char *patht); /* access mode and ownership */
+static inline int copy_rights(int fdf, int fdt) {
+        return copy_rights_with_fallback(fdf, fdt, NULL); /* no fallback */
+}
+int copy_xattr(int df, const char *from, int dt, const char *to, CopyFlags copy_flags); /* "user." xattrs only, unless COPY_ALL_XATTRS is set */
+        /* Clone (reflink) a whole file, or a byte range of it, by ioctl */
+int reflink(int infd, int outfd);
+int reflink_range(int infd, uint64_t in_offset, int outfd, uint64_t out_offset, uint64_t sz);
diff --git a/src/shared/coredump-util.c b/src/shared/coredump-util.c
new file mode 100644
index 0000000..805503f
--- /dev/null
+++ b/src/shared/coredump-util.c
@@ -0,0 +1,179 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <elf.h>
+
+#include "coredump-util.h"
+#include "extract-word.h"
+#include "fileio.h"
+#include "string-table.h"
+#include "unaligned.h"
+#include "virt.h"
+
+/* Maps every CoredumpFilter enum value to the textual name used when parsing and formatting filter lists.
+ * The table is indexed by the enum value itself. */
+static const char *const coredump_filter_table[_COREDUMP_FILTER_MAX] = {
+ [COREDUMP_FILTER_PRIVATE_ANONYMOUS] = "private-anonymous",
+ [COREDUMP_FILTER_SHARED_ANONYMOUS] = "shared-anonymous",
+ [COREDUMP_FILTER_PRIVATE_FILE_BACKED] = "private-file-backed",
+ [COREDUMP_FILTER_SHARED_FILE_BACKED] = "shared-file-backed",
+ [COREDUMP_FILTER_ELF_HEADERS] = "elf-headers",
+ [COREDUMP_FILTER_PRIVATE_HUGE] = "private-huge",
+ [COREDUMP_FILTER_SHARED_HUGE] = "shared-huge",
+ [COREDUMP_FILTER_PRIVATE_DAX] = "private-dax",
+ [COREDUMP_FILTER_SHARED_DAX] = "shared-dax",
+};
+
+/* Presumably expands to coredump_filter_to_string() and coredump_filter_from_string(), which are declared
+ * in coredump-util.h; the macro definition itself lives in string-table.h. */
+DEFINE_STRING_TABLE_LOOKUP(coredump_filter, CoredumpFilter);
+
+/* Turns a list of coredump filter specifiers into a bit mask.
+ *
+ * The string is split into words by extract_first_word() (default separators, no flags). Each word is
+ * handled as follows:
+ *   "default"            → COREDUMP_FILTER_MASK_DEFAULT is ORed into the mask
+ *   "all"                → the mask collected so far is replaced by COREDUMP_FILTER_MASK_ALL
+ *   a known filter name  → the bit with the index of the matching CoredumpFilter value is set
+ *   anything else        → parsed with safe_atoux64() and ORed into the mask verbatim
+ *
+ * Returns 0 and stores the mask in *ret on success. On failure the negative error of
+ * extract_first_word() or safe_atoux64() is returned and *ret is not touched. */
+int coredump_filter_mask_from_string(const char *s, uint64_t *ret) {
+        uint64_t mask = 0;
+        int r;
+
+        assert(s);
+        assert(ret);
+
+        for (;;) {
+                _cleanup_free_ char *word = NULL;
+
+                r = extract_first_word(&s, &word, NULL, 0);
+                if (r < 0)
+                        return r;
+                if (r == 0) /* end of string */
+                        break;
+
+                if (streq(word, "default"))
+                        mask |= COREDUMP_FILTER_MASK_DEFAULT;
+                else if (streq(word, "all"))
+                        mask = COREDUMP_FILTER_MASK_ALL;
+                else {
+                        CoredumpFilter filter = coredump_filter_from_string(word);
+
+                        if (filter >= 0)
+                                mask |= 1u << filter;
+                        else {
+                                uint64_t raw;
+
+                                /* Not a symbolic name, hence must be a plain number */
+                                r = safe_atoux64(word, &raw);
+                                if (r < 0)
+                                        return r;
+
+                                mask |= raw;
+                        }
+                }
+        }
+
+        *ret = mask;
+        return 0;
+}
+
+/* Generates a static function parse_auxv<size>() that walks an auxiliary vector made up of (key, value)
+ * pairs, each element being of the given integer type and fetched with the given unaligned read helper.
+ *
+ * The values of AT_SECURE (reduced to 0/1), AT_UID, AT_EUID, AT_GID and AT_EGID are stored in the
+ * respective output parameters as they are encountered; all other keys are ignored. Parsing succeeds
+ * (return value 0) once an AT_NULL key with a zero value is seen.
+ *
+ * Errors are logged at log_level via log_full_errno(): EIO if size_bytes is not a multiple of the size of
+ * one pair, ENODATA if the buffer ends without an AT_NULL entry or if AT_NULL carries a non-zero value.
+ * Output parameters may already have been modified when an error is returned. */
+#define _DEFINE_PARSE_AUXV(size, type, unaligned_read) \
+ static int parse_auxv##size( \
+ int log_level, \
+ const void *auxv, \
+ size_t size_bytes, \
+ int *at_secure, \
+ uid_t *uid, \
+ uid_t *euid, \
+ gid_t *gid, \
+ gid_t *egid) { \
+ \
+ assert(auxv || size_bytes == 0); \
+ assert(at_secure); \
+ assert(uid); \
+ assert(euid); \
+ assert(gid); \
+ assert(egid); \
+ \
+ if (size_bytes % (2 * sizeof(type)) != 0) \
+ return log_full_errno(log_level, \
+ SYNTHETIC_ERRNO(EIO), \
+ "Incomplete auxv structure (%zu bytes).", \
+ size_bytes); \
+ \
+ size_t words = size_bytes / sizeof(type); \
+ \
+ /* Note that we set output variables even on error. */ \
+ \
+ for (size_t i = 0; i + 1 < words; i += 2) { \
+ type key, val; \
+ \
+ key = unaligned_read((uint8_t*) auxv + i * sizeof(type)); \
+ val = unaligned_read((uint8_t*) auxv + (i + 1) * sizeof(type)); \
+ \
+ switch (key) { \
+ case AT_SECURE: \
+ *at_secure = val != 0; \
+ break; \
+ case AT_UID: \
+ *uid = val; \
+ break; \
+ case AT_EUID: \
+ *euid = val; \
+ break; \
+ case AT_GID: \
+ *gid = val; \
+ break; \
+ case AT_EGID: \
+ *egid = val; \
+ break; \
+ case AT_NULL: \
+ if (val != 0) \
+ goto error; \
+ return 0; \
+ } \
+ } \
+ error: \
+ return log_full_errno(log_level, \
+ SYNTHETIC_ERRNO(ENODATA), \
+ "AT_NULL terminator not found, cannot parse auxv structure."); \
+ }
+
+/* Shorthand that derives the element type (uint<size>_t) and the native-endian unaligned read helper
+ * (unaligned_read_ne<size>) from the word size alone. */
+#define DEFINE_PARSE_AUXV(size) \
+ _DEFINE_PARSE_AUXV(size, uint##size##_t, unaligned_read_ne##size)
+
+/* Instantiate parse_auxv32() and parse_auxv64(), the two variants parse_auxv() dispatches to. */
+DEFINE_PARSE_AUXV(32);
+DEFINE_PARSE_AUXV(64);
+
+/* Parses an auxiliary vector of the given ELF class by handing it to the matching word-size specific
+ * parser (parse_auxv64() for ELFCLASS64, parse_auxv32() for ELFCLASS32) and returns that parser's result.
+ * Any other class is logged at log_level as EPROTONOSUPPORT via log_full_errno(). */
+int parse_auxv(int log_level,
+               uint8_t elf_class,
+               const void *auxv,
+               size_t size_bytes,
+               int *at_secure,
+               uid_t *uid,
+               uid_t *euid,
+               gid_t *gid,
+               gid_t *egid) {
+
+        if (elf_class == ELFCLASS64)
+                return parse_auxv64(log_level, auxv, size_bytes, at_secure, uid, euid, gid, egid);
+
+        if (elf_class == ELFCLASS32)
+                return parse_auxv32(log_level, auxv, size_bytes, at_secure, uid, euid, gid, egid);
+
+        return log_full_errno(log_level, SYNTHETIC_ERRNO(EPROTONOSUPPORT),
+                              "Unknown ELF class %d.", elf_class);
+}
+
+/* Formats the mask as "0x"-prefixed lowercase hexadecimal and writes it to /proc/self/coredump_filter.
+ * Returns whatever write_string_file() returns. */
+int set_coredump_filter(uint64_t value) {
+        char hex[HEXADECIMAL_STR_MAX(uint64_t)];
+
+        xsprintf(hex, "0x%"PRIx64, value);
+
+        return write_string_file(
+                        "/proc/self/coredump_filter",
+                        hex,
+                        WRITE_STRING_FILE_VERIFY_ON_FAILURE|WRITE_STRING_FILE_DISABLE_BUFFER);
+}
+
+/* Disables core dumps by pointing /proc/sys/kernel/core_pattern at "|/bin/false". Nothing is done when
+ * detect_container() reports a positive result. A failure to write the file is only logged at debug
+ * level and otherwise ignored. */
+void disable_coredumps(void) {
+        int r;
+
+        if (detect_container() <= 0) {
+                r = write_string_file("/proc/sys/kernel/core_pattern", "|/bin/false", WRITE_STRING_FILE_DISABLE_BUFFER);
+                if (r < 0)
+                        log_debug_errno(r, "Failed to turn off coredumps, ignoring: %m");
+        }
+}
diff --git a/src/shared/coredump-util.h b/src/shared/coredump-util.h
new file mode 100644
index 0000000..4f54bb9
--- /dev/null
+++ b/src/shared/coredump-util.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "macro.h"
+
+/* Symbolic names for the individual coredump filter flags. Each value is a bit index: the corresponding
+ * mask bit is "1u << value" (see COREDUMP_FILTER_MASK_DEFAULT and coredump_filter_mask_from_string()).
+ * The resulting mask is what set_coredump_filter() writes to /proc/self/coredump_filter. */
+typedef enum CoredumpFilter {
+ COREDUMP_FILTER_PRIVATE_ANONYMOUS = 0,
+ COREDUMP_FILTER_SHARED_ANONYMOUS,
+ COREDUMP_FILTER_PRIVATE_FILE_BACKED,
+ COREDUMP_FILTER_SHARED_FILE_BACKED,
+ COREDUMP_FILTER_ELF_HEADERS,
+ COREDUMP_FILTER_PRIVATE_HUGE,
+ COREDUMP_FILTER_SHARED_HUGE,
+ COREDUMP_FILTER_PRIVATE_DAX,
+ COREDUMP_FILTER_SHARED_DAX,
+ _COREDUMP_FILTER_MAX, /* number of valid values, used as array size of the name table */
+ _COREDUMP_FILTER_INVALID = -EINVAL, /* negative marker for "no such filter" */
+} CoredumpFilter;
+
+/* The mask selected by the "default" keyword: private and shared anonymous memory, ELF headers and
+ * private huge pages. */
+#define COREDUMP_FILTER_MASK_DEFAULT (1u << COREDUMP_FILTER_PRIVATE_ANONYMOUS | \
+ 1u << COREDUMP_FILTER_SHARED_ANONYMOUS | \
+ 1u << COREDUMP_FILTER_ELF_HEADERS | \
+ 1u << COREDUMP_FILTER_PRIVATE_HUGE)
+
+/* The kernel doesn't like UINT64_MAX and returns ERANGE, use UINT32_MAX to support future new flags */
+#define COREDUMP_FILTER_MASK_ALL UINT32_MAX
+
+/* Conversion between CoredumpFilter values and their textual names. */
+const char* coredump_filter_to_string(CoredumpFilter i) _const_;
+CoredumpFilter coredump_filter_from_string(const char *s) _pure_;
+/* Parses a list of filter names, "default", "all" or plain numbers into a bit mask. */
+int coredump_filter_mask_from_string(const char *s, uint64_t *ret);
+
+/* Extracts AT_SECURE and the real/effective UID/GID entries from an auxiliary vector of the given ELF
+ * class (ELFCLASS32 or ELFCLASS64). Problems are logged at log_level. */
+int parse_auxv(int log_level,
+ uint8_t elf_class,
+ const void *auxv,
+ size_t size_bytes,
+ int *at_secure,
+ uid_t *uid,
+ uid_t *euid,
+ gid_t *gid,
+ gid_t *egid);
+
+/* Writes the mask to /proc/self/coredump_filter. */
+int set_coredump_filter(uint64_t value);
+/* Redirects kernel.core_pattern to /bin/false, unless running in a container. */
+void disable_coredumps(void);
diff --git a/src/shared/cpu-set-util.c b/src/shared/cpu-set-util.c
new file mode 100644
index 0000000..d096576
--- /dev/null
+++ b/src/shared/cpu-set-util.c
@@ -0,0 +1,292 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <syslog.h>
+
+#include "alloc-util.h"
+#include "cpu-set-util.h"
+#include "dirent-util.h"
+#include "errno-util.h"
+#include "extract-word.h"
+#include "fd-util.h"
+#include "log.h"
+#include "macro.h"
+#include "memory-util.h"
+#include "parse-util.h"
+#include "stat-util.h"
+#include "string-util.h"
+#include "strv.h"
+
+/* Formats the set as a list of decimal CPU indexes in ascending order, separated by single spaces.
+ * Returns a newly allocated string (the empty string if no CPU is set), or NULL if memory allocation
+ * fails. */
+char* cpu_set_to_string(const CPUSet *a) {
+        _cleanup_free_ char *buf = NULL;
+        size_t used = 0;
+
+        for (int cpu = 0; (size_t) cpu < a->allocated * 8; cpu++) {
+                int k;
+
+                if (CPU_ISSET_S(cpu, a->allocated, a->set)) {
+                        /* Room for a separator, the largest possible number and the trailing NUL */
+                        if (!GREEDY_REALLOC(buf, used + 1 + DECIMAL_STR_MAX(int)))
+                                return NULL;
+
+                        k = sprintf(buf + used, used > 0 ? " %d" : "%d", cpu);
+                        assert_se(k > 0);
+                        used += k;
+                }
+        }
+
+        return TAKE_PTR(buf) ?: strdup("");
+}
+
+/* Formats the set as a space-separated list in which runs of consecutive CPUs are collapsed into
+ * "first-last" ranges and isolated CPUs are printed as a single number. Returns a newly allocated string
+ * (the empty string if no CPU is set), or NULL if memory allocation fails. */
+char *cpu_set_to_range_string(const CPUSet *set) {
+        unsigned first = 0, last;
+        _cleanup_free_ char *buf = NULL;
+        bool open = false;
+        size_t used = 0;
+        int k;
+
+        for (unsigned cpu = 0; cpu < set->allocated * 8; cpu++) {
+
+                if (CPU_ISSET_S(cpu, set->allocated, set->set)) {
+                        /* Either extend the range we are currently in, or start a new one */
+                        if (open)
+                                last++;
+                        else {
+                                first = last = cpu;
+                                open = true;
+                        }
+
+                        continue;
+                }
+
+                if (!open)
+                        continue;
+
+                /* The range ended right before this CPU, flush it out */
+                open = false;
+
+                if (!GREEDY_REALLOC(buf, used + 2 + 2 * DECIMAL_STR_MAX(unsigned)))
+                        return NULL;
+
+                if (last > first)
+                        k = sprintf(buf + used, used > 0 ? " %u-%u" : "%u-%u", first, last);
+                else
+                        k = sprintf(buf + used, used > 0 ? " %u" : "%u", first);
+                assert_se(k > 0);
+                used += k;
+        }
+
+        /* A range that reaches up to the very last bit has not been flushed out yet */
+        if (open) {
+                if (!GREEDY_REALLOC(buf, used + 2 + 2 * DECIMAL_STR_MAX(int)))
+                        return NULL;
+
+                if (last > first)
+                        k = sprintf(buf + used, used > 0 ? " %u-%u" : "%u-%u", first, last);
+                else
+                        k = sprintf(buf + used, used > 0 ? " %u" : "%u", first);
+                assert_se(k > 0);
+        }
+
+        return TAKE_PTR(buf) ?: strdup("");
+}
+
+/* Makes sure the set has room for at least ncpus CPUs. The set is only ever grown, never shrunk; newly
+ * added bytes are zeroed. Returns 0 on success and -ENOMEM if the allocation fails, in which case the set
+ * is left as it was. */
+int cpu_set_realloc(CPUSet *cpu_set, unsigned ncpus) {
+        cpu_set_t *grown;
+        size_t need;
+
+        assert(cpu_set);
+
+        need = CPU_ALLOC_SIZE(ncpus);
+        if (need <= cpu_set->allocated)
+                return 0; /* already large enough */
+
+        grown = realloc(cpu_set->set, need);
+        if (!grown)
+                return -ENOMEM;
+
+        /* realloc() leaves the tail uninitialized, clear it so that no stray CPUs show up */
+        memzero((uint8_t*) grown + cpu_set->allocated, need - cpu_set->allocated);
+
+        cpu_set->set = grown;
+        cpu_set->allocated = need;
+        return 0;
+}
+
+/* Adds a single CPU to the set, growing the set if needed. Returns 0 on success, -ERANGE if the index is
+ * 8192 or larger, or the error of cpu_set_realloc(). */
+int cpu_set_add(CPUSet *cpu_set, unsigned cpu) {
+        int r;
+
+        /* As of kernel 5.1, CONFIG_NR_CPUS can be set to 8192 on PowerPC */
+        if (cpu >= 8192)
+                return -ERANGE;
+
+        r = cpu_set_realloc(cpu_set, cpu + 1);
+        if (r >= 0) {
+                CPU_SET_S(cpu, cpu_set->allocated, cpu_set->set);
+                r = 0;
+        }
+
+        return r;
+}
+
+int cpu_set_add_all(CPUSet *a, const CPUSet *b) {
+ int r;
+
+ /* Do this backwards, so if we fail, we fail before changing anything. */
+ for (unsigned cpu_p1 = b->allocated * 8; cpu_p1 > 0; cpu_p1--)
+ if (CPU_ISSET_S(cpu_p1 - 1, b->allocated, b->set)) {
+ r = cpu_set_add(a, cpu_p1 - 1);
+ if (r < 0)
+ return r;
+ }
+
+ return 1;
+}
+
+/* Parses a list of CPU indexes and index ranges, separated by whitespace or commas, into a CPUSet.
+ *
+ * rvalue     the string to parse, must not be NULL
+ * cpu_set    receives the parsed set on success; it is overwritten, not merged into
+ * warn       if true, problems are logged (via log_syntax()/log_oom()), otherwise errors are only returned
+ * unit, filename, line, lvalue
+ *            context passed on to log_syntax(), only used when warn is true
+ *
+ * An inverted range (lower bound larger than upper bound) is not an error: it is skipped (with a warning
+ * if requested), but still causes the resulting set to be allocated, so that callers can tell it apart
+ * from an entirely empty assignment.
+ *
+ * Returns 0 on success, a negative error otherwise. On error *cpu_set is not modified. */
+int parse_cpu_set_full(
+                const char *rvalue,
+                CPUSet *cpu_set,
+                bool warn,
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *lvalue) {
+
+        _cleanup_(cpu_set_reset) CPUSet c = {};
+        const char *p = ASSERT_PTR(rvalue);
+
+        assert(cpu_set);
+
+        for (;;) {
+                _cleanup_free_ char *word = NULL;
+                unsigned cpu_lower, cpu_upper;
+                int r;
+
+                r = extract_first_word(&p, &word, WHITESPACE ",", EXTRACT_UNQUOTE);
+                if (r == -ENOMEM)
+                        return warn ? log_oom() : -ENOMEM;
+                if (r < 0)
+                        return warn ? log_syntax(unit, LOG_ERR, filename, line, r, "Invalid value for %s: %s", lvalue, rvalue) : r;
+                if (r == 0)
+                        break;
+
+                r = parse_range(word, &cpu_lower, &cpu_upper);
+                if (r < 0)
+                        return warn ? log_syntax(unit, LOG_ERR, filename, line, r, "Failed to parse CPU affinity '%s'", word) : r;
+
+                if (cpu_lower > cpu_upper) {
+                        if (warn)
+                                log_syntax(unit, LOG_WARNING, filename, line, 0, "Range '%s' is invalid, %u > %u, ignoring.",
+                                           word, cpu_lower, cpu_upper);
+
+                        /* Make sure something is allocated, to distinguish this from the empty case */
+                        r = cpu_set_realloc(&c, 1);
+                        if (r < 0)
+                                /* Log here too if requested, like the other out-of-memory path above */
+                                return warn ? log_oom() : r;
+                }
+
+                /* Add from the top down, so that the set is sized once by the highest index. The MIN()
+                 * keeps the "+ 1" from wrapping around to zero if the upper bound is UINT_MAX. */
+                for (unsigned cpu_p1 = MIN(cpu_upper, UINT_MAX-1) + 1; cpu_p1 > cpu_lower; cpu_p1--) {
+                        r = cpu_set_add(&c, cpu_p1 - 1);
+                        if (r < 0)
+                                return warn ? log_syntax(unit, LOG_ERR, filename, line, r,
+                                                         "Cannot add CPU %u to set: %m", cpu_p1 - 1) : r;
+                }
+        }
+
+        *cpu_set = TAKE_STRUCT(c);
+
+        return 0;
+}
+
+/* Parses rvalue like parse_cpu_set_full() and merges the result into an existing set.
+ *
+ * rvalue     the string to parse
+ * old        the set to update, must not be NULL
+ * warn       passed on to parse_cpu_set_full(): whether parse problems shall be logged
+ * unit, filename, line, lvalue
+ *            logging context, passed on to parse_cpu_set_full()
+ *
+ * Returns a negative error if parsing or merging fails, 0 if rvalue was an empty assignment (in which
+ * case *old is reset), and 1 if *old was replaced by or extended with the parsed CPUs. */
+int parse_cpu_set_extend(
+                const char *rvalue,
+                CPUSet *old,
+                bool warn,
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *lvalue) {
+
+        _cleanup_(cpu_set_reset) CPUSet cpuset = {};
+        int r;
+
+        assert(old);
+
+        /* Honour the caller's choice instead of unconditionally enabling logging */
+        r = parse_cpu_set_full(rvalue, &cpuset, warn, unit, filename, line, lvalue);
+        if (r < 0)
+                return r;
+
+        if (!cpuset.set) {
+                /* An empty assignment resets the CPU list */
+                cpu_set_reset(old);
+                return 0;
+        }
+
+        if (!old->set) {
+                /* Nothing to merge with, simply take over what we parsed */
+                *old = TAKE_STRUCT(cpuset);
+                return 1;
+        }
+
+        return cpu_set_add_all(old, &cpuset);
+}
+
+/* Returns the number of CPUs in the affinity mask that sched_getaffinity() reports for PID 0.
+ *
+ * The size of the mask is not known up front, hence start with room for 16 CPUs and keep doubling for as
+ * long as sched_getaffinity() fails with EINVAL. Returns -EINVAL if the mask comes back without any CPU
+ * set, -ENOMEM if allocation fails or the size cannot be doubled any further, or the negative errno of
+ * sched_getaffinity() for any other failure. */
+int cpus_in_affinity_mask(void) {
+
+        for (size_t n = 16;; n *= 2) {
+                cpu_set_t *c;
+                bool ok;
+                int r;
+
+                c = CPU_ALLOC(n);
+                if (!c)
+                        return -ENOMEM;
+
+                /* Collect either the CPU count or the error before releasing the buffer */
+                ok = sched_getaffinity(0, CPU_ALLOC_SIZE(n), c) >= 0;
+                r = ok ? CPU_COUNT_S(CPU_ALLOC_SIZE(n), c) : -errno;
+                CPU_FREE(c);
+
+                if (ok)
+                        return r > 0 ? r : -EINVAL;
+
+                if (r != -EINVAL)
+                        return r;
+                if (n > SIZE_MAX/2)
+                        return -ENOMEM;
+        }
+}
+
+/* Serializes the set into a plain byte array in which CPU n is represented by bit (n % 8) of byte (n / 8).
+ *
+ * set        the set to serialize, must not be NULL
+ * ret        receives the newly allocated array, to be freed by the caller
+ * allocated  receives the size of the array in bytes, which equals set->allocated
+ *
+ * Returns 0 on success, -ENOMEM if the array cannot be allocated (the outputs are not touched then). */
+int cpu_set_to_dbus(const CPUSet *set, uint8_t **ret, size_t *allocated) {
+        uint8_t *out;
+
+        assert(set);
+        assert(ret);
+        assert(allocated); /* written unconditionally below, hence must be valid like the others */
+
+        out = new0(uint8_t, set->allocated);
+        if (!out)
+                return -ENOMEM;
+
+        for (unsigned cpu = 0; cpu < set->allocated * 8; cpu++)
+                if (CPU_ISSET_S(cpu, set->allocated, set->set))
+                        out[cpu / 8] |= 1u << (cpu % 8);
+
+        *ret = out;
+        *allocated = set->allocated;
+        return 0;
+}
+
+/* Builds a CPUSet from a byte array in which CPU n is represented by bit (n % 8) of byte (n / 8), i.e.
+ * the format written by cpu_set_to_dbus(). Returns 0 and stores the new set in *set on success; on
+ * failure the error of cpu_set_add() is returned and *set is not modified. */
+int cpu_set_from_dbus(const uint8_t *bits, size_t size, CPUSet *set) {
+        _cleanup_(cpu_set_reset) CPUSet parsed = {};
+        int r;
+
+        assert(bits);
+        assert(set);
+
+        /* Highest bit first, so that the set is sized by the first CPU we add */
+        for (unsigned cpu = size * 8; cpu-- > 0; ) {
+                if (!(bits[cpu / 8] & (1u << (cpu % 8))))
+                        continue;
+
+                r = cpu_set_add(&parsed, cpu);
+                if (r < 0)
+                        return r;
+        }
+
+        *set = TAKE_STRUCT(parsed);
+        return 0;
+}
diff --git a/src/shared/cpu-set-util.h b/src/shared/cpu-set-util.h
new file mode 100644
index 0000000..3c63a58
--- /dev/null
+++ b/src/shared/cpu-set-util.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <sched.h>
+
+#include "macro.h"
+#include "missing_syscall.h"
+
+/* This wraps the libc interface with a variable to keep the allocated size. */
+/* Invariant (checked by cpu_set_reset()): "set" is non-NULL exactly when "allocated" is non-zero. */
+typedef struct CPUSet {
+ cpu_set_t *set;
+ size_t allocated; /* in bytes */
+} CPUSet;
+
+/* Releases the memory owned by the set and returns it to the empty state (NULL pointer, zero size).
+ * Also usable as cleanup handler via _cleanup_(cpu_set_reset). */
+static inline void cpu_set_reset(CPUSet *a) {
+ /* Pointer and size must agree: either both are set or neither is */
+ assert((a->allocated > 0) == !!a->set);
+ if (a->set)
+ CPU_FREE(a->set);
+ *a = (CPUSet) {};
+}
+
+/* Adds all CPUs of b to a; returns 1 on success. */
+int cpu_set_add_all(CPUSet *a, const CPUSet *b);
+/* Adds one CPU to the set, growing it as needed; fails with -ERANGE for indexes of 8192 and above. */
+int cpu_set_add(CPUSet *a, unsigned cpu);
+
+/* Format the set as space-separated list of CPU indexes, or of indexes and "first-last" ranges. Both
+ * return a newly allocated string, or NULL on allocation failure. */
+char* cpu_set_to_string(const CPUSet *a);
+char *cpu_set_to_range_string(const CPUSet *a);
+/* Grows the set so that it can hold at least ncpus CPUs. */
+int cpu_set_realloc(CPUSet *cpu_set, unsigned ncpus);
+
+/* Parses a whitespace or comma separated list of CPU indexes and ranges, replacing *cpu_set. */
+int parse_cpu_set_full(
+ const char *rvalue,
+ CPUSet *cpu_set,
+ bool warn,
+ const char *unit,
+ const char *filename, unsigned line,
+ const char *lvalue);
+/* Like parse_cpu_set_full(), but merges into *old; an empty assignment resets *old. */
+int parse_cpu_set_extend(
+ const char *rvalue,
+ CPUSet *old,
+ bool warn,
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *lvalue);
+
+/* Quiet variant of parse_cpu_set_full(): logging is disabled (warn is false) and no unit, file name,
+ * line or setting name is supplied. */
+static inline int parse_cpu_set(const char *rvalue, CPUSet *cpu_set){
+ return parse_cpu_set_full(rvalue, cpu_set, false, NULL, NULL, 0, NULL);
+}
+
+/* Conversion between a CPUSet and a plain byte array with one bit per CPU (bit n % 8 of byte n / 8). */
+int cpu_set_to_dbus(const CPUSet *set, uint8_t **ret, size_t *allocated);
+int cpu_set_from_dbus(const uint8_t *bits, size_t size, CPUSet *set);
+
+/* Returns the number of CPUs in the affinity mask reported by sched_getaffinity() for PID 0. */
+int cpus_in_affinity_mask(void);
diff --git a/src/shared/creds-util.c b/src/shared/creds-util.c
new file mode 100644
index 0000000..7cc8889
--- /dev/null
+++ b/src/shared/creds-util.c
@@ -0,0 +1,1395 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/file.h>
+
+#if HAVE_OPENSSL
+#include <openssl/err.h>
+#endif
+
+#include "sd-id128.h"
+
+#include "blockdev-util.h"
+#include "capability-util.h"
+#include "chattr-util.h"
+#include "constants.h"
+#include "creds-util.h"
+#include "efi-api.h"
+#include "env-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "fs-util.h"
+#include "io-util.h"
+#include "memory-util.h"
+#include "mkdir.h"
+#include "openssl-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "random-util.h"
+#include "sparse-endian.h"
+#include "stat-util.h"
+#include "tpm2-util.h"
+#include "virt.h"
+
+/* 1 MiB. Presumably the upper bound for the size of a public key; its users are outside this view. */
+#define PUBLIC_KEY_MAX (UINT32_C(1024) * UINT32_C(1024))
+
+/* Returns true if s is acceptable as credential name, i.e. passes both filename_is_valid() and
+ * fdname_is_valid(). */
+bool credential_name_valid(const char *s) {
+ /* We want that credential names are both valid in filenames (since that's our primary way to pass
+ * them around) and as fdnames (which is how we might want to pass them around eventually) */
+ return filename_is_valid(s) && fdname_is_valid(s);
+}
+
+bool credential_glob_valid(const char *s) {
+        const char *tail, *prefix;
+        size_t prefix_len;
+
+        /* Checks whether a credential glob expression is valid. This is deliberately stricter than
+         * fnmatch(): the only wildcard accepted is a single asterisk at the very end of the string. That
+         * keeps us free to extend patterns automatically in a systematic way, for example to cover unit
+         * instances that get per-instance credentials. Glob expressions are also more restrictive than
+         * credential names: the characters *, ?, [ and ] may not appear anywhere else in them, so that
+         * nothing is ambiguous should full globbing be added one day. */
+
+        if (isempty(s))
+                return false;
+
+        /* Split at the first glob character, or at the end of the string if there is none */
+        prefix_len = strcspn(s, "*?[]");
+        tail = s + prefix_len;
+
+        /* No glob character at all: this is a plain name */
+        if (isempty(tail))
+                return credential_name_valid(s);
+
+        /* Whatever follows the plain prefix must be exactly one trailing asterisk */
+        if (!streq(tail, "*"))
+                return false;
+
+        /* A lone "*" matches everything and is explicitly permitted */
+        if (prefix_len == 0)
+                return true;
+
+        /* The prefix is copied onto the stack below, refuse overly long ones first */
+        if (prefix_len > NAME_MAX + strlen(tail))
+                return false;
+
+        /* What remains once the asterisk is cut off must be a valid credential name by itself */
+        prefix = strndupa_safe(s, prefix_len);
+        return credential_name_valid(prefix);
+}
+
+/* Looks up a credentials directory in the given environment variable, using secure_getenv(). Returns
+ * -ENXIO if the variable is not available and -EINVAL if its value is not an absolute, normalized path.
+ * On success 0 is returned and *ret points at the value of the variable (no copy is made). */
+static int get_credentials_dir_internal(const char *envvar, const char **ret) {
+        const char *dir;
+
+        assert(ret);
+
+        dir = secure_getenv(envvar);
+        if (!dir)
+                return -ENXIO;
+
+        if (!(path_is_absolute(dir) && path_is_normalized(dir)))
+                return -EINVAL;
+
+        *ret = dir;
+        return 0;
+}
+
+/* Returns the directory named by $CREDENTIALS_DIRECTORY; see get_credentials_dir_internal() for the
+ * error codes. */
+int get_credentials_dir(const char **ret) {
+ return get_credentials_dir_internal("CREDENTIALS_DIRECTORY", ret);
+}
+
+/* Returns the directory named by $ENCRYPTED_CREDENTIALS_DIRECTORY; see get_credentials_dir_internal()
+ * for the error codes. */
+int get_encrypted_credentials_dir(const char **ret) {
+ return get_credentials_dir_internal("ENCRYPTED_CREDENTIALS_DIRECTORY", ret);
+}
+
+/* Reads the credential of the given name from the directory returned by get_credentials_dir().
+ *
+ * Returns -EINVAL if the name is not a valid credential name, the error of get_credentials_dir() if the
+ * directory cannot be determined, -ENOMEM if the path cannot be allocated, and otherwise the result of
+ * read_full_file_full(), which stores the data in *ret and its size in *ret_size. This function does
+ * not log. */
+int read_credential(const char *name, void **ret, size_t *ret_size) {
+        _cleanup_free_ char *path = NULL;
+        const char *dir;
+        int r;
+
+        assert(ret);
+
+        if (!credential_name_valid(name))
+                return -EINVAL;
+
+        r = get_credentials_dir(&dir);
+        if (r < 0)
+                return r;
+
+        path = path_join(dir, name);
+        if (!path)
+                return -ENOMEM;
+
+        /* No limit on offset or size; the data is read with READ_FULL_FILE_SECURE set */
+        return read_full_file_full(
+                        AT_FDCWD, path,
+                        UINT64_MAX, SIZE_MAX,
+                        READ_FULL_FILE_SECURE,
+                        NULL,
+                        (char**) ret, ret_size);
+}
+
+/* Reads a credential, looking first among the plaintext credentials and then among the encrypted ones.
+ *
+ * name       name of the credential
+ * ret        receives the (decrypted) credential data, or NULL if the credential does not exist
+ * ret_size   if non-NULL, receives the size of the data, or 0 if the credential does not exist
+ *
+ * Returns 1 if the credential was found, 0 if it was not found, and a negative error otherwise. Errors
+ * are logged by this function. */
+int read_credential_with_decryption(const char *name, void **ret, size_t *ret_size) {
+        _cleanup_(erase_and_freep) void *data = NULL;
+        _cleanup_free_ char *fn = NULL;
+        size_t sz = 0;
+        const char *d;
+        int r;
+
+        assert(ret);
+
+        /* Just like read_credential() but will also look for encrypted credentials. Note that services only
+         * receive decrypted credentials, hence use read_credential() for those. This helper here is for
+         * generators, i.e. code that runs outside of service context, and thus has no decrypted credentials
+         * yet.
+         *
+         * Note that this function logs on its own, while read_credential() does not! (It's a lot more
+         * complex and error prone given its TPM2 connectivity, and is generally called from generators
+         * only where logging is OK).
+         *
+         * Error handling is also a bit different: if we can't find a credential we'll return 0 and NULL
+         * pointers/zero size, rather than -ENXIO/-ENOENT. */
+
+        if (!credential_name_valid(name))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid credential name: %s", name);
+
+        /* First choice: a credential that is available in plaintext */
+        r = read_credential(name, ret, ret_size);
+        if (r >= 0)
+                return 1; /* found */
+        if (!IN_SET(r, -ENXIO, -ENOENT))
+                return log_error_errno(r, "Failed to read unencrypted credential '%s': %m", name);
+
+        /* Second choice: an encrypted credential, which we have to decrypt ourselves */
+        r = get_encrypted_credentials_dir(&d);
+        if (r == -ENXIO)
+                goto not_found;
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine encrypted credentials directory: %m");
+
+        fn = path_join(d, name);
+        if (!fn)
+                return log_oom();
+
+        r = read_full_file_full(
+                        AT_FDCWD, fn,
+                        UINT64_MAX, SIZE_MAX,
+                        READ_FULL_FILE_SECURE,
+                        NULL,
+                        (char**) &data, &sz);
+        if (r == -ENOENT)
+                goto not_found;
+        if (r < 0)
+                return log_error_errno(r, "Failed to read encrypted credential data: %m");
+
+        r = decrypt_credential_and_warn(
+                        name,
+                        now(CLOCK_REALTIME),
+                        /* tpm2_device = */ NULL,
+                        /* tpm2_signature_path = */ NULL,
+                        data,
+                        sz,
+                        ret,
+                        ret_size);
+        if (r < 0)
+                return r;
+
+        return 1; /* found */
+
+not_found:
+        *ret = NULL;
+
+        if (ret_size)
+                *ret_size = 0;
+
+        return 0; /* not found */
+}
+
+/* Variadic helper taking (name, char **value) pairs, terminated by a NULL name. Returns 0 if nothing
+ * went wrong (including the case that credentials are simply missing), or the first error other than
+ * -ENOENT that read_credential() returned. */
+int read_credential_strings_many_internal(
+ const char *first_name, char **first_value,
+ ...) {
+
+ _cleanup_free_ void *b = NULL;
+ int r, ret = 0;
+
+ /* Reads a bunch of credentials into the specified buffers. If the specified buffers are already
+ * non-NULL frees them if a credential is found. Only supports string-based credentials
+ * (i.e. refuses embedded NUL bytes).
+ *
+ * 0 is returned when some or all credentials are missing.
+ */
+
+ if (!first_name)
+ return 0;
+
+ r = read_credential(first_name, &b, NULL);
+ if (r == -ENXIO) /* No creds passed at all? Bail immediately. */
+ return 0;
+ if (r < 0) {
+ if (r != -ENOENT)
+ ret = r;
+ } else
+ free_and_replace(*first_value, b);
+
+ va_list ap;
+ va_start(ap, first_value);
+
+ for (;;) {
+ _cleanup_free_ void *bb = NULL;
+ const char *name;
+ char **value;
+
+ name = va_arg(ap, const char *);
+ if (!name)
+ break;
+
+ value = va_arg(ap, char **);
+ /* NOTE(review): a buffer that is already set is skipped here, whereas the first pair above
+ * is replaced unconditionally. That does not match the comment at the top of the function;
+ * confirm which behaviour is intended. */
+ if (*value)
+ continue;
+
+ r = read_credential(name, &bb, NULL);
+ if (r < 0) {
+ /* Remember only the first real error; a missing credential is not an error */
+ if (ret >= 0 && r != -ENOENT)
+ ret = r;
+ } else
+ free_and_replace(*value, bb);
+ }
+
+ va_end(ap);
+ return ret;
+}
+
+/* Reads the named credential and interprets it with parse_boolean(), whose result is returned. A
+ * credential that cannot be found (-ENXIO or -ENOENT from read_credential()) counts as false, i.e. 0 is
+ * returned; any other read error is passed on. */
+int read_credential_bool(const char *name) {
+        _cleanup_free_ void *value = NULL;
+        int r;
+
+        r = read_credential(name, &value, NULL);
+        if (r >= 0)
+                return parse_boolean(value);
+
+        if (r == -ENXIO || r == -ENOENT)
+                return 0;
+
+        return r;
+}
+
+/* Tries to pick up the password of an account from the credentials passed to us: first from
+ * "passwd.hashed-password.<username>" and, only if that credential does not exist, from
+ * "passwd.plaintext-password.<username>".
+ *
+ * Returns -ENOMEM if a credential name cannot be allocated (the outputs are not touched then).
+ * Otherwise the result of the last read_credential() call is returned and *ret_password is always set:
+ * to the password on success, to NULL on failure. *ret_is_hashed is only written on success. Read
+ * failures are logged at debug level. */
+int get_credential_user_password(const char *username, char **ret_password, bool *ret_is_hashed) {
+        _cleanup_(erase_and_freep) char *password = NULL;
+        _cleanup_free_ char *cn = NULL;
+        bool hashed = true;
+        int r;
+
+        cn = strjoin("passwd.hashed-password.", username);
+        if (!cn)
+                return -ENOMEM;
+
+        r = read_credential(cn, (void**) &password, NULL);
+        if (r == -ENOENT) {
+                /* No hashed password around, fall back to the plaintext one */
+                hashed = false;
+
+                cn = mfree(cn);
+                cn = strjoin("passwd.plaintext-password.", username);
+                if (!cn)
+                        return -ENOMEM;
+
+                r = read_credential(cn, (void**) &password, NULL);
+        }
+
+        if (r < 0)
+                log_debug_errno(r, "Couldn't read credential '%s', ignoring: %m", cn);
+        else
+                *ret_is_hashed = hashed;
+
+        *ret_password = TAKE_PTR(password);
+
+        return r;
+}
+
+#if HAVE_OPENSSL
+
+/* Number of random bytes that make up a newly generated host secret */
+#define CREDENTIAL_HOST_SECRET_SIZE 4096
+
+/* Application ID passed to sd_id128_get_machine_app_specific() to derive the hashed machine ID that is
+ * stored alongside the host secret */
+static const sd_id128_t credential_app_id =
+ SD_ID128_MAKE(d3,ac,ec,ba,0d,ad,4c,df,b8,c9,38,15,28,93,6c,58);
+
+/* On-disk layout of the host secret file: the hashed machine ID followed directly by the key material
+ * (the structure is packed). */
+struct credential_host_secret_format {
+ /* The hashed machine ID of the machine this belongs to. Why? We want to ensure that each machine
+ * gets its own secret, even if people forget to flush out this secret file. Hence we bind it to the
+ * machine ID, for which there's hopefully a better chance it will be flushed out. We use a hashed
+ * machine ID instead of the literal one, because it's trivial to, and it might be a good idea not
+ * being able to directly associate a secret key file with a host. */
+ sd_id128_t machine_id;
+
+ /* The actual secret key */
+ uint8_t data[CREDENTIAL_HOST_SECRET_SIZE];
+} _packed_;
+
+/* If CREDENTIAL_SECRET_WARN_NOT_ENCRYPTED is set in flags, checks with fd_is_encrypted() whether the
+ * secret file behind fd is located on encrypted media, and logs a warning if it is not. A failure of
+ * the check itself is only logged at debug level. dirname and filename are used for the log messages
+ * only. */
+static void warn_not_encrypted(int fd, CredentialSecretFlags flags, const char *dirname, const char *filename) {
+        int r;
+
+        assert(fd >= 0);
+        assert(dirname);
+        assert(filename);
+
+        if (FLAGS_SET(flags, CREDENTIAL_SECRET_WARN_NOT_ENCRYPTED)) {
+                r = fd_is_encrypted(fd);
+                if (r == 0)
+                        log_warning("Credential secret file '%s/%s' is not located on encrypted media, using anyway.",
+                                    dirname, filename);
+                else if (r < 0)
+                        log_debug_errno(r, "Failed to determine if credential secret file '%s/%s' is encrypted.",
+                                        dirname, filename);
+        }
+}
+
+/* Creates a new host secret file named fn inside the directory dfd.
+ *
+ * The secret is written to a temporary file first (an O_TMPFILE file if we have CAP_DAC_READ_SEARCH,
+ * otherwise a randomly named regular file), synced to disk, and only then moved to its final name in a
+ * way that fails if that name exists already. If requested via ret_data/ret_size, a copy of the key
+ * material and its size are returned.
+ *
+ * dirname is used for log messages only. Returns 0 on success and a negative error otherwise; on
+ * failure a named temporary file is removed again. */
+static int make_credential_host_secret(
+ int dfd,
+ const sd_id128_t machine_id,
+ CredentialSecretFlags flags,
+ const char *dirname,
+ const char *fn,
+ void **ret_data,
+ size_t *ret_size) {
+
+ _cleanup_free_ char *t = NULL;
+ _cleanup_close_ int fd = -EBADF;
+ int r;
+
+ assert(dfd >= 0);
+ assert(fn);
+
+ /* For non-root users creating a temporary file using the openat(2) over "." will fail later, in the
+ * linkat(2) step at the end. The reason is that linkat(2) requires the CAP_DAC_READ_SEARCH
+ * capability when it uses the AT_EMPTY_PATH flag. */
+ if (have_effective_cap(CAP_DAC_READ_SEARCH) > 0) {
+ fd = openat(dfd, ".", O_CLOEXEC|O_WRONLY|O_TMPFILE, 0400);
+ if (fd < 0)
+ log_debug_errno(errno, "Failed to create temporary credential file with O_TMPFILE, proceeding without: %m");
+ }
+ /* Fallback: a regular temporary file with a random name. "t" being set is what later tells us
+ * that this path was taken. */
+ if (fd < 0) {
+ if (asprintf(&t, "credential.secret.%016" PRIx64, random_u64()) < 0)
+ return -ENOMEM;
+
+ fd = openat(dfd, t, O_CLOEXEC|O_WRONLY|O_CREAT|O_EXCL|O_NOFOLLOW, 0400);
+ if (fd < 0)
+ return -errno;
+ }
+
+ r = chattr_secret(fd, 0);
+ if (r < 0)
+ log_debug_errno(r, "Failed to set file attributes for secrets file, ignoring: %m");
+
+ struct credential_host_secret_format buf = {
+ .machine_id = machine_id,
+ };
+
+ /* Make sure the key material is wiped from the stack when we leave this function */
+ CLEANUP_ERASE(buf);
+
+ r = crypto_random_bytes(buf.data, sizeof(buf.data));
+ if (r < 0)
+ goto fail;
+
+ r = loop_write(fd, &buf, sizeof(buf));
+ if (r < 0)
+ goto fail;
+
+ if (fsync(fd) < 0) {
+ r = -errno;
+ goto fail;
+ }
+
+ warn_not_encrypted(fd, flags, dirname, fn);
+
+ /* Move the file to its final name, without replacing a file that might have appeared there */
+ if (t) {
+ r = rename_noreplace(dfd, t, dfd, fn);
+ if (r < 0)
+ goto fail;
+
+ /* The temporary name is gone now, nothing left to unlink on failure */
+ t = mfree(t);
+ } else if (linkat(fd, "", dfd, fn, AT_EMPTY_PATH) < 0) {
+ r = -errno;
+ goto fail;
+ }
+
+ if (fsync(dfd) < 0) {
+ r = -errno;
+ goto fail;
+ }
+
+ if (ret_data) {
+ void *copy;
+
+ copy = memdup(buf.data, sizeof(buf.data));
+ if (!copy) {
+ r = -ENOMEM;
+ goto fail;
+ }
+
+ *ret_data = copy;
+ }
+
+ if (ret_size)
+ *ret_size = sizeof(buf.data);
+
+ return 0;
+
+fail:
+ if (t && unlinkat(dfd, t, 0) < 0)
+ log_debug_errno(errno, "Failed to remove temporary credential key: %m");
+
+ return r;
+}
+
+/* Acquires the host secret used for credential encryption.
+ *
+ * The secret is read from the file named by $SYSTEMD_CREDENTIAL_SECRET (which must be an absolute,
+ * normalized path), or from /var/lib/systemd/credential.secret if that variable is not set. The
+ * directory is created if missing. If the file does not exist and CREDENTIAL_SECRET_GENERATE is set in
+ * flags, a new secret is generated. A file that carries the hashed machine ID of a different machine is
+ * deleted, and the whole procedure is retried; after three attempts we give up with EIO.
+ *
+ * If ret is non-NULL it receives a newly allocated copy of the key material; if ret_size is non-NULL it
+ * receives the size of the key material. Returns 0 on success, a negative error otherwise. Failures are
+ * logged at debug level. */
+int get_credential_host_secret(CredentialSecretFlags flags, void **ret, size_t *ret_size) {
+ _cleanup_free_ char *_dirname = NULL, *_filename = NULL;
+ _cleanup_close_ int dfd = -EBADF;
+ sd_id128_t machine_id;
+ const char *dirname, *filename;
+ int r;
+
+ r = sd_id128_get_machine_app_specific(credential_app_id, &machine_id);
+ if (r < 0)
+ return r;
+
+ const char *e = secure_getenv("SYSTEMD_CREDENTIAL_SECRET");
+ if (e) {
+ if (!path_is_normalized(e))
+ return -EINVAL;
+ if (!path_is_absolute(e))
+ return -EINVAL;
+
+ r = path_extract_directory(e, &_dirname);
+ if (r < 0)
+ return r;
+
+ r = path_extract_filename(e, &_filename);
+ if (r < 0)
+ return r;
+
+ dirname = _dirname;
+ filename = _filename;
+ } else {
+ dirname = "/var/lib/systemd";
+ filename = "credential.secret";
+ }
+
+ assert(dirname);
+ assert(filename);
+
+ /* The result of mkdir_parents() is not checked; a failure surfaces in open_mkdir_at() below */
+ mkdir_parents(dirname, 0755);
+ dfd = open_mkdir_at(AT_FDCWD, dirname, O_CLOEXEC, 0755);
+ if (dfd < 0)
+ return log_debug_errno(dfd, "Failed to create or open directory '%s': %m", dirname);
+
+ if (FLAGS_SET(flags, CREDENTIAL_SECRET_FAIL_ON_TEMPORARY_FS)) {
+ r = fd_is_temporary_fs(dfd);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to check directory '%s': %m", dirname);
+ if (r > 0)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOMEDIUM),
+ "Directory '%s' is on a temporary file system, refusing.", dirname);
+ }
+
+ for (unsigned attempt = 0;; attempt++) {
+ _cleanup_(erase_and_freep) struct credential_host_secret_format *f = NULL;
+ _cleanup_close_ int fd = -EBADF;
+ size_t l = 0;
+ ssize_t n = 0;
+ struct stat st;
+
+ if (attempt >= 3) /* Somebody is playing games with us */
+ return log_debug_errno(SYNTHETIC_ERRNO(EIO),
+ "All attempts to create secret store in %s failed.", dirname);
+
+ fd = openat(dfd, filename, O_CLOEXEC|O_RDONLY|O_NOCTTY|O_NOFOLLOW);
+ if (fd < 0) {
+ /* Only a missing file is recoverable, and only if we are allowed to generate one */
+ if (errno != ENOENT || !FLAGS_SET(flags, CREDENTIAL_SECRET_GENERATE))
+ return log_debug_errno(errno,
+ "Failed to open %s/%s: %m", dirname, filename);
+
+
+ r = make_credential_host_secret(dfd, machine_id, flags, dirname, filename, ret, ret_size);
+ if (r == -EEXIST) {
+ log_debug_errno(r, "Credential secret %s/%s appeared while we were creating it, rereading.",
+ dirname, filename);
+ continue;
+ }
+ if (r < 0)
+ return log_debug_errno(r, "Failed to create credential secret %s/%s: %m",
+ dirname, filename);
+ return 0;
+ }
+
+ if (fstat(fd, &st) < 0)
+ return log_debug_errno(errno, "Failed to stat %s/%s: %m", dirname, filename);
+
+ r = stat_verify_regular(&st);
+ if (r < 0)
+ return log_debug_errno(r, "%s/%s is not a regular file: %m", dirname, filename);
+ if (st.st_nlink == 0) /* Deleted by now, try again */
+ continue;
+ if (st.st_nlink > 1)
+ /* Our deletion check won't work if hardlinked somewhere else */
+ return log_debug_errno(SYNTHETIC_ERRNO(EPERM),
+ "%s/%s has too many links, refusing.",
+ dirname, filename);
+ if ((st.st_mode & 07777) != 0400)
+ /* Don't use file if not 0400 access mode */
+ return log_debug_errno(SYNTHETIC_ERRNO(EPERM),
+ "%s/%s has permissive access mode, refusing.",
+ dirname, filename);
+ /* The file must hold the machine ID plus at least one byte of key material */
+ l = st.st_size;
+ if (l < offsetof(struct credential_host_secret_format, data) + 1)
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+ "%s/%s is too small, refusing.", dirname, filename);
+ if (l > 16*1024*1024)
+ return log_debug_errno(SYNTHETIC_ERRNO(E2BIG),
+ "%s/%s is too big, refusing.", dirname, filename);
+
+ /* Ask for one byte more than the file size, so that we notice if the file grew meanwhile */
+ f = malloc(l+1);
+ if (!f)
+ return log_oom_debug();
+
+ n = read(fd, f, l+1);
+ if (n < 0)
+ return log_debug_errno(errno,
+ "Failed to read %s/%s: %m", dirname, filename);
+ if ((size_t) n != l) /* What? The size changed? */
+ return log_debug_errno(SYNTHETIC_ERRNO(EIO),
+ "Failed to read %s/%s: %m", dirname, filename);
+
+ if (sd_id128_equal(machine_id, f->machine_id)) {
+ size_t sz;
+
+ warn_not_encrypted(fd, flags, dirname, filename);
+
+ sz = l - offsetof(struct credential_host_secret_format, data);
+ assert(sz > 0);
+
+ if (ret) {
+ void *copy;
+
+ /* NOTE(review): the size check above admits files of up to 16 MiB, i.e. more
+ * than sizeof(f->data); such a file would trip this assertion. */
+ assert(sz <= sizeof(f->data)); /* Ensure we don't read past f->data bounds */
+
+ copy = memdup(f->data, sz);
+ if (!copy)
+ return log_oom_debug();
+
+ *ret = copy;
+ }
+
+ if (ret_size)
+ *ret_size = sz;
+
+ return 0;
+ }
+
+ /* Hmm, this secret is from somewhere else. Let's delete the file. Let's first acquire a lock
+ * to ensure we are the only ones accessing the file while we delete it. */
+
+ if (flock(fd, LOCK_EX) < 0)
+ return log_debug_errno(errno,
+ "Failed to flock %s/%s: %m", dirname, filename);
+
+ /* Before we delete it check that the file is still linked into the file system */
+ if (fstat(fd, &st) < 0)
+ return log_debug_errno(errno, "Failed to stat %s/%s: %m", dirname, filename);
+ if (st.st_nlink == 0) /* Already deleted by now? */
+ continue;
+ if (st.st_nlink != 1) /* Safety check, someone is playing games with us */
+ return log_debug_errno(SYNTHETIC_ERRNO(EPERM),
+ "%s/%s unexpectedly has too many links.",
+ dirname, filename);
+ if (unlinkat(dfd, filename, 0) < 0)
+ return log_debug_errno(errno, "Failed to unlink %s/%s: %m", dirname, filename);
+
+ /* And now try again */
+ }
+}
+
+/* Construction is like this:
+ *
+ * A symmetric encryption key is derived from:
+ *
+ * 1. Either the "host" key (a key stored in /var/lib/systemd/credential.secret)
+ *
+ * 2. A key generated by letting the TPM2 calculate an HMAC hash of some nonce we pass to it, keyed
+ * by a key derived from its internal seed key.
+ *
+ * 3. The concatenation of the above.
+ *
+ * 4. Or a fixed "empty" key. This will not provide confidentiality or authenticity, of course, but is
+ * useful to encode credentials for the initrd on TPM-less systems, where we simply have no better
+ * concept to bind things to. Note that decryption of a key set up like this will be refused on
+ * systems that have a TPM and have SecureBoot enabled.
+ *
+ * The above is hashed with SHA256 which is then used as encryption key for AES256-GCM. The encrypted
+ * credential is a short (unencrypted) header describing which of the above keys to use, the IV to use for
+ * AES256-GCM and some more meta information (sizes of certain objects) that is strictly speaking redundant,
+ * but kinda nice to have since we can have a more generic parser. If the TPM2 key is used this is followed
+ * by another (unencrypted) header, with information about the TPM2 policy used (specifically: the PCR mask
+ * to bind against, and a hash of the resulting policy — the latter being redundant, but speeding up things a
+ * bit, since we can more quickly refuse PCR state), followed by a sealed/exported TPM2 HMAC key. This is
+ * then followed by the encrypted data, which begins with a metadata header (which contains validity
+ * timestamps as well as the credential name), followed by the actual credential payload. The file ends in
+ * the AES256-GCM tag. To make things simple, the AES256-GCM AAD covers the main and the TPM2 header in
+ * full. This means the whole file is either protected by AAD, or is ciphertext, or is the tag. No
+ * unprotected data is included.
+ */
+
+struct _packed_ encrypted_credential_header { /* Unencrypted main header placed at the very start of the credential; covered by the AES256-GCM AAD */
+ sd_id128_t id; /* Key type: one of the CRED_AES256_GCM_BY_* IDs */
+ le32_t key_size; /* Cipher key length in bytes (little-endian); the decoder requires it to equal the SHA256 digest size */
+ le32_t block_size; /* Cipher block size in bytes (little-endian) */
+ le32_t iv_size; /* Number of bytes stored in iv[] (little-endian) */
+ le32_t tag_size; /* GCM tag length in bytes (little-endian); encoder and decoder currently both insist on 16 */
+ uint8_t iv[]; /* The IV, iv_size bytes long */
+ /* Followed by NUL bytes until next 8 byte boundary (the next header starts at ALIGN8() of the offset) */
+};
+
+struct _packed_ tpm2_credential_header { /* Unencrypted header written only when a TPM2 key is used; covered by the AES256-GCM AAD */
+ le64_t pcr_mask; /* Note that the spec for PC Clients only mandates 24 PCRs, and that's what systems
+ * generally have. But keep the door open for more. */
+ le16_t pcr_bank; /* For now, either TPM2_ALG_SHA256 or TPM2_ALG_SHA1 */
+ le16_t primary_alg; /* Primary key algorithm (either TPM2_ALG_RSA or TPM2_ALG_ECC for now) */
+ le32_t blob_size; /* Size in bytes of the sealed TPM2 blob (little-endian) */
+ le32_t policy_hash_size; /* Size in bytes of the policy hash (little-endian) */
+ uint8_t policy_hash_and_blob[]; /* Despite the name, the blob is stored first, immediately followed by the policy hash */
+ /* Followed by NUL bytes until next 8 byte boundary */
+};
+
+struct _packed_ tpm2_public_key_credential_header { /* Unencrypted header written only when a PCR public key is used; covered by the AES256-GCM AAD */
+ le64_t pcr_mask; /* PCRs used for the public key PCR policy (usually just PCR 11, i.e. the unified kernel) */
+ le32_t size; /* Size of DER public key */
+ uint8_t data[]; /* DER public key. NOTE(review): the encoder copies here the same buffer it passes to tpm2_tpm2b_public_from_pem(), so this may actually be PEM — confirm */
+ /* Followed by NUL bytes until next 8 byte boundary */
+};
+
+struct _packed_ metadata_credential_header { /* Header that is encrypted together with (and placed right in front of) the payload */
+ le64_t timestamp; /* usec timestamp (little-endian); USEC_INFINITY is treated as "none" */
+ le64_t not_after; /* usec expiry time (little-endian); USEC_INFINITY is treated as "none" */
+ le32_t name_size; /* Length of name[] in bytes, without trailing NUL (little-endian); 0 if no name is embedded */
+ char name[]; /* Credential name, not NUL terminated */
+ /* Followed by NUL bytes until next 8 byte boundary */
+};
+
+/* Some generic limit for parts of the encrypted credential for which we don't know the right size ahead of
+ * time, but where we are really sure it won't be larger than this. Should be larger than any possible IV,
+ * padding, tag size and so on. This is purely used for early filtering out of invalid sizes. */
+#define CREDENTIAL_FIELD_SIZE_MAX (16U*1024U) /* i.e. 16 KiB */
+
+static int sha256_hash_host_and_tpm2_key(
+                const void *host_key,
+                size_t host_key_size,
+                const void *tpm2_key,
+                size_t tpm2_key_size,
+                uint8_t ret[static SHA256_DIGEST_LENGTH]) {
+
+        /* Derives the symmetric encryption key: a single SHA256 run over the host key followed by the TPM2
+         * HMAC key. Either input may be NULL, in which case it simply does not contribute to the hash.
+         * Writes the digest to 'ret' and returns 0 on success; the failure paths return whatever
+         * log_oom()/log_error_errno() yield (callers treat r < 0 as failure). */
+
+        _cleanup_(EVP_MD_CTX_freep) EVP_MD_CTX *ctx = NULL;
+        unsigned digest_size;
+
+        assert(host_key_size == 0 || host_key);
+        assert(tpm2_key_size == 0 || tpm2_key);
+        assert(ret);
+
+        ctx = EVP_MD_CTX_new();
+        if (!ctx)
+                return log_oom();
+
+        if (EVP_DigestInit_ex(ctx, EVP_sha256(), NULL) != 1)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to initial SHA256 context.");
+
+        /* Feed in the host key first, if we have one... */
+        if (host_key) {
+                if (EVP_DigestUpdate(ctx, host_key, host_key_size) != 1)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to hash host key.");
+        }
+
+        /* ...then the TPM2 key, if we have one. */
+        if (tpm2_key) {
+                if (EVP_DigestUpdate(ctx, tpm2_key, tpm2_key_size) != 1)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to hash TPM2 key.");
+        }
+
+        /* The caller's buffer is sized for SHA256, make sure that is what we are about to write */
+        assert(EVP_MD_CTX_size(ctx) == SHA256_DIGEST_LENGTH);
+
+        if (EVP_DigestFinal_ex(ctx, ret, &digest_size) != 1)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to finalize SHA256 hash.");
+
+        assert(digest_size == SHA256_DIGEST_LENGTH);
+        return 0;
+}
+
+int encrypt_credential_and_warn( /* Encrypts 'input' into a credential blob: unencrypted headers (used as AAD), AES256-GCM ciphertext of metadata header + payload, then the tag. On success returns 0 and hands a newly allocated buffer to *ret, with its length in *ret_size. */
+ sd_id128_t with_key,
+ const char *name,
+ usec_t timestamp,
+ usec_t not_after,
+ const char *tpm2_device,
+ uint32_t tpm2_hash_pcr_mask,
+ const char *tpm2_pubkey_path,
+ uint32_t tpm2_pubkey_pcr_mask,
+ const void *input,
+ size_t input_size,
+ void **ret,
+ size_t *ret_size) {
+
+ _cleanup_(EVP_CIPHER_CTX_freep) EVP_CIPHER_CTX *context = NULL;
+ _cleanup_(erase_and_freep) void *host_key = NULL, *tpm2_key = NULL;
+ size_t host_key_size = 0, tpm2_key_size = 0, tpm2_blob_size = 0, tpm2_policy_hash_size = 0, output_size, p, ml;
+ _cleanup_free_ void *tpm2_blob = NULL, *tpm2_policy_hash = NULL, *iv = NULL, *output = NULL;
+ _cleanup_free_ struct metadata_credential_header *m = NULL;
+ uint16_t tpm2_pcr_bank = 0, tpm2_primary_alg = 0;
+ struct encrypted_credential_header *h;
+ int ksz, bsz, ivsz, tsz, added, r; /* ksz/bsz/ivsz/tsz: key, block, IV and tag size of the cipher */
+ _cleanup_free_ void *pubkey = NULL;
+ size_t pubkey_size = 0;
+ uint8_t md[SHA256_DIGEST_LENGTH];
+ const EVP_CIPHER *cc;
+ sd_id128_t id;
+
+ assert(input || input_size == 0);
+ assert(ret);
+ assert(ret_size);
+
+ if (!sd_id128_in_set(with_key,
+ _CRED_AUTO,
+ _CRED_AUTO_INITRD,
+ CRED_AES256_GCM_BY_HOST,
+ CRED_AES256_GCM_BY_TPM2_HMAC,
+ CRED_AES256_GCM_BY_TPM2_HMAC_WITH_PK,
+ CRED_AES256_GCM_BY_HOST_AND_TPM2_HMAC,
+ CRED_AES256_GCM_BY_HOST_AND_TPM2_HMAC_WITH_PK,
+ CRED_AES256_GCM_BY_TPM2_ABSENT))
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid key type: " SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(with_key));
+
+ if (name && !credential_name_valid(name))
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid credential name: %s", name);
+
+ if (not_after != USEC_INFINITY && timestamp != USEC_INFINITY && not_after < timestamp)
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential is invalidated before it is valid (" USEC_FMT " < " USEC_FMT ").", not_after, timestamp);
+
+ if (DEBUG_LOGGING) {
+ char buf[FORMAT_TIMESTAMP_MAX];
+
+ if (name)
+ log_debug("Including credential name '%s' in encrypted credential.", name);
+ if (timestamp != USEC_INFINITY)
+ log_debug("Including timestamp '%s' in encrypted credential.", format_timestamp(buf, sizeof(buf), timestamp));
+ if (not_after != USEC_INFINITY)
+ log_debug("Including not-after timestamp '%s' in encrypted credential.", format_timestamp(buf, sizeof(buf), not_after));
+ }
+
+ if (sd_id128_in_set(with_key, /* These are the key types for which a host key is acquired (note: _CRED_AUTO_INITRD is not among them) */
+ _CRED_AUTO,
+ CRED_AES256_GCM_BY_HOST,
+ CRED_AES256_GCM_BY_HOST_AND_TPM2_HMAC,
+ CRED_AES256_GCM_BY_HOST_AND_TPM2_HMAC_WITH_PK)) {
+
+ r = get_credential_host_secret(
+ CREDENTIAL_SECRET_GENERATE|
+ CREDENTIAL_SECRET_WARN_NOT_ENCRYPTED|
+ (sd_id128_equal(with_key, _CRED_AUTO) ? CREDENTIAL_SECRET_FAIL_ON_TEMPORARY_FS : 0),
+ &host_key,
+ &host_key_size);
+ if (r == -ENOMEDIUM && sd_id128_equal(with_key, _CRED_AUTO))
+ log_debug_errno(r, "Credential host secret location on temporary file system, not using."); /* not fatal in automatic mode, we carry on */
+ else if (r < 0)
+ return log_error_errno(r, "Failed to determine local credential host secret: %m");
+ }
+
+#if HAVE_TPM2
+ bool try_tpm2;
+ if (sd_id128_in_set(with_key, _CRED_AUTO, _CRED_AUTO_INITRD)) {
+ /* If automatic mode is selected let's see if a TPM2 is present. If we are running in a
+ * container tpm2_support() will detect this, and will return a flag combination different from
+ * TPM2_SUPPORT_FULL, effectively skipping the use of TPM2 when inside one. */
+
+ try_tpm2 = tpm2_support() == TPM2_SUPPORT_FULL;
+ if (!try_tpm2)
+ log_debug("System lacks TPM2 support or running in a container, not attempting to use TPM2.");
+ } else
+ try_tpm2 = sd_id128_in_set(with_key,
+ CRED_AES256_GCM_BY_TPM2_HMAC,
+ CRED_AES256_GCM_BY_TPM2_HMAC_WITH_PK,
+ CRED_AES256_GCM_BY_HOST_AND_TPM2_HMAC,
+ CRED_AES256_GCM_BY_HOST_AND_TPM2_HMAC_WITH_PK);
+
+ if (try_tpm2) {
+ if (sd_id128_in_set(with_key,
+ _CRED_AUTO,
+ _CRED_AUTO_INITRD,
+ CRED_AES256_GCM_BY_TPM2_HMAC_WITH_PK,
+ CRED_AES256_GCM_BY_HOST_AND_TPM2_HMAC_WITH_PK)) {
+
+ /* Load public key for PCR policies, if one is specified, or explicitly requested */
+
+ r = tpm2_load_pcr_public_key(tpm2_pubkey_path, &pubkey, &pubkey_size);
+ if (r < 0) {
+ if (tpm2_pubkey_path || r != -ENOENT || !sd_id128_in_set(with_key, _CRED_AUTO, _CRED_AUTO_INITRD)) /* a missing key is only tolerated in automatic mode when no explicit path was given */
+ return log_error_errno(r, "Failed read TPM PCR public key: %m");
+
+ log_debug_errno(r, "Failed to read TPM2 PCR public key, proceeding without: %m");
+ }
+ }
+
+ if (!pubkey)
+ tpm2_pubkey_pcr_mask = 0; /* without a public key the public key PCR mask is not used */
+
+ _cleanup_(tpm2_context_unrefp) Tpm2Context *tpm2_context = NULL;
+ r = tpm2_context_new(tpm2_device, &tpm2_context);
+ if (r < 0)
+ return log_error_errno(r, "Failed to create TPM2 context: %m");
+
+ r = tpm2_get_best_pcr_bank(tpm2_context, tpm2_hash_pcr_mask | tpm2_pubkey_pcr_mask, &tpm2_pcr_bank);
+ if (r < 0)
+ return log_error_errno(r, "Could not find best pcr bank: %m");
+
+ TPML_PCR_SELECTION tpm2_hash_pcr_selection;
+ tpm2_tpml_pcr_selection_from_mask(tpm2_hash_pcr_mask, tpm2_pcr_bank, &tpm2_hash_pcr_selection);
+
+ _cleanup_free_ Tpm2PCRValue *tpm2_hash_pcr_values = NULL;
+ size_t tpm2_n_hash_pcr_values;
+ r = tpm2_pcr_read(tpm2_context, &tpm2_hash_pcr_selection, &tpm2_hash_pcr_values, &tpm2_n_hash_pcr_values);
+ if (r < 0)
+ return log_error_errno(r, "Could not read PCR values: %m");
+
+ TPM2B_PUBLIC public; /* only initialized, and only passed on below, if we have a public key */
+ if (pubkey) {
+ r = tpm2_tpm2b_public_from_pem(pubkey, pubkey_size, &public);
+ if (r < 0)
+ return log_error_errno(r, "Could not convert public key to TPM2B_PUBLIC: %m");
+ }
+
+ TPM2B_DIGEST tpm2_policy = TPM2B_DIGEST_MAKE(NULL, TPM2_SHA256_DIGEST_SIZE);
+ r = tpm2_calculate_sealing_policy(
+ tpm2_hash_pcr_values,
+ tpm2_n_hash_pcr_values,
+ pubkey ? &public : NULL,
+ /* use_pin= */ false,
+ /* pcrlock_policy= */ NULL,
+ &tpm2_policy);
+ if (r < 0)
+ return log_error_errno(r, "Could not calculate sealing policy digest: %m");
+
+ r = tpm2_seal(tpm2_context,
+ /* seal_key_handle= */ 0,
+ &tpm2_policy,
+ /* pin= */ NULL,
+ &tpm2_key, &tpm2_key_size,
+ &tpm2_blob, &tpm2_blob_size,
+ &tpm2_primary_alg,
+ /* ret_srk_buf= */ NULL,
+ /* ret_srk_buf_size= */ NULL);
+ if (r < 0) {
+ if (sd_id128_equal(with_key, _CRED_AUTO_INITRD))
+ log_warning("TPM2 present and used, but we didn't manage to talk to it. Credential will be refused if SecureBoot is enabled.");
+ else if (!sd_id128_equal(with_key, _CRED_AUTO))
+ return log_error_errno(r, "Failed to seal to TPM2: %m"); /* explicitly requested TPM2 key types fail hard */
+
+ log_notice_errno(r, "TPM2 sealing didn't work, continuing without TPM2: %m"); /* only reached in the two automatic modes */
+ }
+
+ tpm2_policy_hash_size = tpm2_policy.size;
+ tpm2_policy_hash = malloc(tpm2_policy_hash_size);
+ if (!tpm2_policy_hash)
+ return log_oom();
+ memcpy(tpm2_policy_hash, tpm2_policy.buffer, tpm2_policy_hash_size);
+
+ assert(tpm2_blob_size <= CREDENTIAL_FIELD_SIZE_MAX);
+ assert(tpm2_policy_hash_size <= CREDENTIAL_FIELD_SIZE_MAX);
+ }
+#endif
+
+ if (sd_id128_in_set(with_key, _CRED_AUTO, _CRED_AUTO_INITRD)) {
+ /* Let's settle the key type in auto mode now. */
+
+ if (host_key && tpm2_key)
+ id = pubkey ? CRED_AES256_GCM_BY_HOST_AND_TPM2_HMAC_WITH_PK : CRED_AES256_GCM_BY_HOST_AND_TPM2_HMAC;
+ else if (tpm2_key)
+ id = pubkey ? CRED_AES256_GCM_BY_TPM2_HMAC_WITH_PK : CRED_AES256_GCM_BY_TPM2_HMAC;
+ else if (host_key)
+ id = CRED_AES256_GCM_BY_HOST;
+ else if (sd_id128_equal(with_key, _CRED_AUTO_INITRD))
+ id = CRED_AES256_GCM_BY_TPM2_ABSENT; /* only the initrd flavour of automatic mode may fall back to the fixed empty key */
+ else
+ return log_error_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+ "TPM2 not available and host key located on temporary file system, no encryption key available.");
+ } else
+ id = with_key;
+
+ if (sd_id128_equal(id, CRED_AES256_GCM_BY_TPM2_ABSENT))
+ log_warning("Using a null key for encryption and signing. Confidentiality or authenticity will not be provided.");
+
+ /* Let's now take the host key and the TPM2 key and hash it together, to use as encryption key for the data */
+ r = sha256_hash_host_and_tpm2_key(host_key, host_key_size, tpm2_key, tpm2_key_size, md);
+ if (r < 0)
+ return r;
+
+ assert_se(cc = EVP_aes_256_gcm());
+
+ ksz = EVP_CIPHER_key_length(cc);
+ assert(ksz == sizeof(md)); /* the SHA256 digest is used directly as the cipher key */
+
+ bsz = EVP_CIPHER_block_size(cc);
+ assert(bsz > 0);
+ assert((size_t) bsz <= CREDENTIAL_FIELD_SIZE_MAX);
+
+ ivsz = EVP_CIPHER_iv_length(cc);
+ if (ivsz > 0) {
+ assert((size_t) ivsz <= CREDENTIAL_FIELD_SIZE_MAX);
+
+ iv = malloc(ivsz);
+ if (!iv)
+ return log_oom();
+
+ r = crypto_random_bytes(iv, ivsz);
+ if (r < 0)
+ return log_error_errno(r, "Failed to acquired randomized IV: %m");
+ }
+
+ tsz = 16; /* FIXME: On OpenSSL 3 there is EVP_CIPHER_CTX_get_tag_length(), until then let's hardcode this */
+
+ context = EVP_CIPHER_CTX_new();
+ if (!context)
+ return log_error_errno(SYNTHETIC_ERRNO(ENOMEM), "Failed to allocate encryption object: %s",
+ ERR_error_string(ERR_get_error(), NULL));
+
+ if (EVP_EncryptInit_ex(context, cc, NULL, md, iv) != 1)
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to initialize encryption context: %s",
+ ERR_error_string(ERR_get_error(), NULL));
+
+ /* Just an upper estimate (the actual number of bytes used is tracked in 'p' and returned via *ret_size) */
+ output_size =
+ ALIGN8(offsetof(struct encrypted_credential_header, iv) + ivsz) +
+ ALIGN8(tpm2_key ? offsetof(struct tpm2_credential_header, policy_hash_and_blob) + tpm2_blob_size + tpm2_policy_hash_size : 0) +
+ ALIGN8(pubkey ? offsetof(struct tpm2_public_key_credential_header, data) + pubkey_size : 0) +
+ ALIGN8(offsetof(struct metadata_credential_header, name) + strlen_ptr(name)) +
+ input_size + 2U * (size_t) bsz +
+ tsz;
+
+ output = malloc0(output_size);
+ if (!output)
+ return log_oom();
+
+ h = (struct encrypted_credential_header*) output;
+ h->id = id;
+ h->block_size = htole32(bsz);
+ h->key_size = htole32(ksz);
+ h->tag_size = htole32(tsz);
+ h->iv_size = htole32(ivsz);
+ memcpy(h->iv, iv, ivsz);
+
+ p = ALIGN8(offsetof(struct encrypted_credential_header, iv) + ivsz); /* 'p' is the write offset into 'output' from here on */
+
+ if (tpm2_key) {
+ struct tpm2_credential_header *t;
+
+ t = (struct tpm2_credential_header*) ((uint8_t*) output + p);
+ t->pcr_mask = htole64(tpm2_hash_pcr_mask);
+ t->pcr_bank = htole16(tpm2_pcr_bank);
+ t->primary_alg = htole16(tpm2_primary_alg);
+ t->blob_size = htole32(tpm2_blob_size);
+ t->policy_hash_size = htole32(tpm2_policy_hash_size);
+ memcpy(t->policy_hash_and_blob, tpm2_blob, tpm2_blob_size); /* the blob goes first... */
+ memcpy(t->policy_hash_and_blob + tpm2_blob_size, tpm2_policy_hash, tpm2_policy_hash_size); /* ...directly followed by the policy hash */
+
+ p += ALIGN8(offsetof(struct tpm2_credential_header, policy_hash_and_blob) + tpm2_blob_size + tpm2_policy_hash_size);
+ }
+
+ if (pubkey) {
+ struct tpm2_public_key_credential_header *z;
+
+ z = (struct tpm2_public_key_credential_header*) ((uint8_t*) output + p);
+ z->pcr_mask = htole64(tpm2_pubkey_pcr_mask);
+ z->size = htole32(pubkey_size);
+ memcpy(z->data, pubkey, pubkey_size);
+
+ p += ALIGN8(offsetof(struct tpm2_public_key_credential_header, data) + pubkey_size);
+ }
+
+ /* Pass everything written so far (the main header plus, if present, the TPM2 and public key headers) as AAD */
+ if (EVP_EncryptUpdate(context, NULL, &added, output, p) != 1)
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to write AAD data: %s",
+ ERR_error_string(ERR_get_error(), NULL));
+
+ /* Now construct the metadata header */
+ ml = strlen_ptr(name);
+ m = malloc0(ALIGN8(offsetof(struct metadata_credential_header, name) + ml));
+ if (!m)
+ return log_oom();
+
+ m->timestamp = htole64(timestamp);
+ m->not_after = htole64(not_after);
+ m->name_size = htole32(ml);
+ memcpy_safe(m->name, name, ml);
+
+ /* And encrypt the metadata header */
+ if (EVP_EncryptUpdate(context, (uint8_t*) output + p, &added, (const unsigned char*) m, ALIGN8(offsetof(struct metadata_credential_header, name) + ml)) != 1)
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to encrypt metadata header: %s",
+ ERR_error_string(ERR_get_error(), NULL));
+
+ assert(added >= 0);
+ assert((size_t) added <= output_size - p);
+ p += added;
+
+ /* Then encrypt the plaintext */
+ if (EVP_EncryptUpdate(context, (uint8_t*) output + p, &added, input, input_size) != 1)
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to encrypt data: %s",
+ ERR_error_string(ERR_get_error(), NULL));
+
+ assert(added >= 0);
+ assert((size_t) added <= output_size - p);
+ p += added;
+
+ /* Finalize */
+ if (EVP_EncryptFinal_ex(context, (uint8_t*) output + p, &added) != 1)
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to finalize data encryption: %s",
+ ERR_error_string(ERR_get_error(), NULL));
+
+ assert(added >= 0);
+ assert((size_t) added <= output_size - p);
+ p += added;
+
+ assert(p <= output_size - tsz);
+
+ /* Append tag */
+ if (EVP_CIPHER_CTX_ctrl(context, EVP_CTRL_GCM_GET_TAG, tsz, (uint8_t*) output + p) != 1)
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to get tag: %s",
+ ERR_error_string(ERR_get_error(), NULL));
+
+ p += tsz;
+ assert(p <= output_size);
+
+ if (DEBUG_LOGGING && input_size > 0) {
+ size_t base64_size;
+
+ base64_size = DIV_ROUND_UP(p * 4, 3); /* Include base64 size increase in debug output */
+ assert(base64_size >= input_size);
+ log_debug("Input of %zu bytes grew to output of %zu bytes (+%2zu%%).", input_size, base64_size, base64_size * 100 / input_size - 100);
+ }
+
+ *ret = TAKE_PTR(output);
+ *ret_size = p; /* the bytes actually used, which may be less than the output_size estimate */
+
+ return 0;
+}
+
+/* Decrypts and authenticates a credential blob in the format written by encrypt_credential_and_warn().
+ *
+ * validate_name:       if non-NULL and the credential embeds a name, the two must match, unless
+ *                      $SYSTEMD_CREDENTIAL_VALIDATE_NAME evaluates to false.
+ * validate_timestamp:  if not USEC_INFINITY, the embedded not-after time is checked against it, unless
+ *                      $SYSTEMD_CREDENTIAL_VALIDATE_NOT_AFTER evaluates to false.
+ * tpm2_device:         passed on to tpm2_context_new() when the credential is TPM2-bound.
+ * tpm2_signature_path: passed on to tpm2_load_pcr_signature() for the *_WITH_PK key types.
+ * input/input_size:    the encrypted credential.
+ * ret/ret_size:        on success receive a newly allocated copy of the payload (metadata header stripped)
+ *                      and its size.
+ *
+ * Returns 0 on success. Every failure path logs; malformed or unauthenticated input is reported with
+ * EBADMSG, an unknown key type or missing TPM2 support with EOPNOTSUPP, a name mismatch with EREMOTE
+ * and an expired credential with ESTALE. */
+int decrypt_credential_and_warn(
+                const char *validate_name,
+                usec_t validate_timestamp,
+                const char *tpm2_device,
+                const char *tpm2_signature_path,
+                const void *input,
+                size_t input_size,
+                void **ret,
+                size_t *ret_size) {
+
+        _cleanup_(erase_and_freep) void *host_key = NULL, *tpm2_key = NULL, *plaintext = NULL;
+        _cleanup_(json_variant_unrefp) JsonVariant *signature_json = NULL;
+        _cleanup_(EVP_CIPHER_CTX_freep) EVP_CIPHER_CTX *context = NULL;
+        size_t host_key_size = 0, tpm2_key_size = 0, plaintext_size, p, hs;
+        struct encrypted_credential_header *h;
+        struct metadata_credential_header *m;
+        uint8_t md[SHA256_DIGEST_LENGTH];
+        bool with_tpm2, with_host_key, is_tpm2_absent, with_tpm2_pk;
+        const EVP_CIPHER *cc;
+        int r, added;
+
+        assert(input || input_size == 0);
+        assert(ret);
+        assert(ret_size);
+
+        h = (struct encrypted_credential_header*) input;
+
+        /* The ID must fit in, for the current and all future formats */
+        if (input_size < sizeof(h->id))
+                return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Encrypted file too short.");
+
+        with_host_key = sd_id128_in_set(h->id, CRED_AES256_GCM_BY_HOST, CRED_AES256_GCM_BY_HOST_AND_TPM2_HMAC, CRED_AES256_GCM_BY_HOST_AND_TPM2_HMAC_WITH_PK);
+        with_tpm2_pk = sd_id128_in_set(h->id, CRED_AES256_GCM_BY_TPM2_HMAC_WITH_PK, CRED_AES256_GCM_BY_HOST_AND_TPM2_HMAC_WITH_PK);
+        with_tpm2 = sd_id128_in_set(h->id, CRED_AES256_GCM_BY_TPM2_HMAC, CRED_AES256_GCM_BY_HOST_AND_TPM2_HMAC) || with_tpm2_pk;
+        is_tpm2_absent = sd_id128_equal(h->id, CRED_AES256_GCM_BY_TPM2_ABSENT);
+
+        if (!with_host_key && !with_tpm2 && !is_tpm2_absent)
+                return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Unknown encryption format, or corrupted data: %m");
+
+        if (with_tpm2_pk) {
+                r = tpm2_load_pcr_signature(tpm2_signature_path, &signature_json);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to load pcr signature: %m");
+        }
+
+        if (is_tpm2_absent) {
+                /* So this is a credential encrypted with a zero length key. We support this to cover for the
+                 * case where neither a host key nor a TPM2 are available (specifically: initrd environments
+                 * where the host key is not yet accessible and no TPM2 chip exists at all), to minimize
+                 * different codeflow for TPM2 and non-TPM2 codepaths. Of course, credentials encoded this
+                 * way offer no confidentiality nor authenticity. Because of that it's important we refuse to
+                 * use them on systems that actually *do* have a TPM2 chip – if we are in SecureBoot
+                 * mode. Otherwise an attacker could hand us credentials like this and we'd use them thinking
+                 * they are trusted, even though they are not. */
+
+                if (efi_has_tpm2()) {
+                        if (is_efi_secure_boot())
+                                return log_error_errno(SYNTHETIC_ERRNO(EBADMSG),
+                                                       "Credential uses fixed key for fallback use when TPM2 is absent — but TPM2 is present, and SecureBoot is enabled, refusing.");
+
+                        log_warning("Credential uses fixed key for use when TPM2 is absent, but TPM2 is present! Accepting anyway, since SecureBoot is disabled.");
+                } else
+                        log_debug("Credential uses fixed key for use when TPM2 is absent, and TPM2 indeed is absent. Accepting.");
+        }
+
+        /* Now we know the minimum header size */
+        if (input_size < offsetof(struct encrypted_credential_header, iv))
+                return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Encrypted file too short.");
+
+        /* Verify some basic header values */
+        if (le32toh(h->key_size) != sizeof(md))
+                return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Unexpected key size in header.");
+        if (le32toh(h->block_size) <= 0 || le32toh(h->block_size) > CREDENTIAL_FIELD_SIZE_MAX)
+                return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Unexpected block size in header.");
+        if (le32toh(h->iv_size) > CREDENTIAL_FIELD_SIZE_MAX)
+                return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "IV size too large.");
+        if (le32toh(h->tag_size) != 16) /* FIXME: On OpenSSL 3, let's verify via EVP_CIPHER_CTX_get_tag_length() */
+                return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Unexpected tag size in header.");
+
+        /* Ensure we have space for the full header now (we don't know the size of the name hence this is a
+         * lower limit only) */
+        if (input_size <
+            ALIGN8(offsetof(struct encrypted_credential_header, iv) + le32toh(h->iv_size)) +
+            ALIGN8(with_tpm2 ? offsetof(struct tpm2_credential_header, policy_hash_and_blob) : 0) +
+            ALIGN8(with_tpm2_pk ? offsetof(struct tpm2_public_key_credential_header, data) : 0) +
+            ALIGN8(offsetof(struct metadata_credential_header, name)) +
+            le32toh(h->tag_size))
+                return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Encrypted file too short.");
+
+        /* 'p' tracks the end of the unencrypted headers, i.e. the amount of AAD and the start of the ciphertext */
+        p = ALIGN8(offsetof(struct encrypted_credential_header, iv) + le32toh(h->iv_size));
+
+        if (with_tpm2) {
+#if HAVE_TPM2
+                struct tpm2_credential_header* t = (struct tpm2_credential_header*) ((uint8_t*) input + p);
+                struct tpm2_public_key_credential_header *z = NULL;
+
+                /* All multi-byte header fields are stored little-endian, convert before validating */
+                if (!TPM2_PCR_MASK_VALID(le64toh(t->pcr_mask)))
+                        return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "TPM2 PCR mask out of range.");
+                if (!tpm2_hash_alg_to_string(le16toh(t->pcr_bank)))
+                        return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "TPM2 PCR bank invalid or not supported");
+                if (!tpm2_asym_alg_to_string(le16toh(t->primary_alg)))
+                        return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "TPM2 primary key algorithm invalid or not supported.");
+                if (le32toh(t->blob_size) > CREDENTIAL_FIELD_SIZE_MAX)
+                        return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Unexpected TPM2 blob size.");
+                if (le32toh(t->policy_hash_size) > CREDENTIAL_FIELD_SIZE_MAX)
+                        return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Unexpected TPM2 policy hash size.");
+
+                /* Ensure we have space for the full TPM2 header now (still don't know the name, and its size
+                 * though, hence still just a lower limit test only) */
+                if (input_size <
+                    ALIGN8(offsetof(struct encrypted_credential_header, iv) + le32toh(h->iv_size)) +
+                    ALIGN8(offsetof(struct tpm2_credential_header, policy_hash_and_blob) + le32toh(t->blob_size) + le32toh(t->policy_hash_size)) +
+                    ALIGN8(with_tpm2_pk ? offsetof(struct tpm2_public_key_credential_header, data) : 0) +
+                    ALIGN8(offsetof(struct metadata_credential_header, name)) +
+                    le32toh(h->tag_size))
+                        return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Encrypted file too short.");
+
+                p += ALIGN8(offsetof(struct tpm2_credential_header, policy_hash_and_blob) +
+                            le32toh(t->blob_size) +
+                            le32toh(t->policy_hash_size));
+
+                if (with_tpm2_pk) {
+                        z = (struct tpm2_public_key_credential_header*) ((uint8_t*) input + p);
+
+                        if (!TPM2_PCR_MASK_VALID(le64toh(z->pcr_mask)) || le64toh(z->pcr_mask) == 0)
+                                return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "TPM2 PCR mask out of range.");
+                        if (le32toh(z->size) > PUBLIC_KEY_MAX)
+                                return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Unexpected public key size.");
+
+                        if (input_size <
+                            ALIGN8(offsetof(struct encrypted_credential_header, iv) + le32toh(h->iv_size)) +
+                            ALIGN8(offsetof(struct tpm2_credential_header, policy_hash_and_blob) + le32toh(t->blob_size) + le32toh(t->policy_hash_size)) +
+                            ALIGN8(offsetof(struct tpm2_public_key_credential_header, data) + le32toh(z->size)) +
+                            ALIGN8(offsetof(struct metadata_credential_header, name)) +
+                            le32toh(h->tag_size))
+                                return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Encrypted file too short.");
+
+                        p += ALIGN8(offsetof(struct tpm2_public_key_credential_header, data) +
+                                    le32toh(z->size));
+                }
+
+                _cleanup_(tpm2_context_unrefp) Tpm2Context *tpm2_context = NULL;
+                r = tpm2_context_new(tpm2_device, &tpm2_context);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to create TPM2 context: %m");
+
+                // TODO: Add the SRK data to the credential structure so it can be plumbed
+                // through and used to verify the TPM session.
+                r = tpm2_unseal(tpm2_context,
+                                le64toh(t->pcr_mask),
+                                le16toh(t->pcr_bank),
+                                z ? z->data : NULL,
+                                z ? le32toh(z->size) : 0,
+                                z ? le64toh(z->pcr_mask) : 0,
+                                signature_json,
+                                /* pin= */ NULL,
+                                /* pcrlock_policy= */ NULL,
+                                le16toh(t->primary_alg),
+                                t->policy_hash_and_blob, /* the blob is stored first... */
+                                le32toh(t->blob_size),
+                                t->policy_hash_and_blob + le32toh(t->blob_size), /* ...followed by the policy hash */
+                                le32toh(t->policy_hash_size),
+                                /* srk_buf= */ NULL,
+                                /* srk_buf_size= */ 0,
+                                &tpm2_key,
+                                &tpm2_key_size);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to unseal secret using TPM2: %m");
+#else
+                return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Credential requires TPM2 support, but TPM2 support not available.");
+#endif
+        }
+
+        if (with_host_key) {
+                r = get_credential_host_secret(
+                                0,
+                                &host_key,
+                                &host_key_size);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to determine local credential key: %m");
+        }
+
+        if (is_tpm2_absent)
+                log_warning("Warning: using a null key for decryption and authentication. Confidentiality or authenticity are not provided.");
+
+        /* Derive the symmetric key. This can fail, and if it does 'md' is not initialized, hence never
+         * continue with it in that case. (The helper already logged the reason.) */
+        r = sha256_hash_host_and_tpm2_key(host_key, host_key_size, tpm2_key, tpm2_key_size, md);
+        if (r < 0)
+                return r;
+
+        assert_se(cc = EVP_aes_256_gcm());
+
+        /* Make sure cipher expectations match the header */
+        if (EVP_CIPHER_key_length(cc) != (int) le32toh(h->key_size))
+                return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Unexpected key size in header.");
+        if (EVP_CIPHER_block_size(cc) != (int) le32toh(h->block_size))
+                return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Unexpected block size in header.");
+
+        context = EVP_CIPHER_CTX_new();
+        if (!context)
+                return log_error_errno(SYNTHETIC_ERRNO(ENOMEM), "Failed to allocate decryption object: %s",
+                                       ERR_error_string(ERR_get_error(), NULL));
+
+        /* Select the cipher first, so that the IV length can be configured before key and IV are set */
+        if (EVP_DecryptInit_ex(context, cc, NULL, NULL, NULL) != 1)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to initialize decryption context: %s",
+                                       ERR_error_string(ERR_get_error(), NULL));
+
+        if (EVP_CIPHER_CTX_ctrl(context, EVP_CTRL_GCM_SET_IVLEN, le32toh(h->iv_size), NULL) != 1)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to set IV size on decryption context: %s",
+                                       ERR_error_string(ERR_get_error(), NULL));
+
+        if (EVP_DecryptInit_ex(context, NULL, NULL, md, h->iv) != 1)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to set IV and key: %s",
+                                       ERR_error_string(ERR_get_error(), NULL));
+
+        /* All unencrypted headers are authenticated as AAD */
+        if (EVP_DecryptUpdate(context, NULL, &added, input, p) != 1)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to write AAD data: %s",
+                                       ERR_error_string(ERR_get_error(), NULL));
+
+        /* Everything between the headers and the trailing tag is ciphertext */
+        plaintext = malloc(input_size - p - le32toh(h->tag_size));
+        if (!plaintext)
+                return log_oom();
+
+        if (EVP_DecryptUpdate(
+                            context,
+                            plaintext,
+                            &added,
+                            (uint8_t*) input + p,
+                            input_size - p - le32toh(h->tag_size)) != 1)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to decrypt data: %s",
+                                       ERR_error_string(ERR_get_error(), NULL));
+
+        assert(added >= 0);
+        assert((size_t) added <= input_size - p - le32toh(h->tag_size));
+        plaintext_size = added;
+
+        /* The tag sits at the very end of the file */
+        if (EVP_CIPHER_CTX_ctrl(context, EVP_CTRL_GCM_SET_TAG, le32toh(h->tag_size), (uint8_t*) input + input_size - le32toh(h->tag_size)) != 1)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to set tag: %s",
+                                       ERR_error_string(ERR_get_error(), NULL));
+
+        /* This is where the tag is verified, i.e. nothing decrypted above may be trusted before this succeeds */
+        if (EVP_DecryptFinal_ex(context, (uint8_t*) plaintext + plaintext_size, &added) != 1)
+                return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Decryption failed (incorrect key?): %s",
+                                       ERR_error_string(ERR_get_error(), NULL));
+
+        plaintext_size += added;
+
+        if (plaintext_size < ALIGN8(offsetof(struct metadata_credential_header, name)))
+                return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Metadata header incomplete.");
+
+        m = plaintext;
+
+        if (le64toh(m->timestamp) != USEC_INFINITY &&
+            le64toh(m->not_after) != USEC_INFINITY &&
+            le64toh(m->timestamp) >= le64toh(m->not_after))
+                return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Timestamps of credential are not in order, refusing.");
+
+        if (le32toh(m->name_size) > CREDENTIAL_NAME_MAX)
+                return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Embedded credential name too long, refusing.");
+
+        /* 'hs' is the full (padded) size of the metadata header, the payload starts right after it */
+        hs = ALIGN8(offsetof(struct metadata_credential_header, name) + le32toh(m->name_size));
+        if (plaintext_size < hs)
+                return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Metadata header incomplete.");
+
+        if (le32toh(m->name_size) > 0) {
+                _cleanup_free_ char *embedded_name = NULL;
+
+                r = make_cstring(m->name, le32toh(m->name_size), MAKE_CSTRING_REFUSE_TRAILING_NUL, &embedded_name);
+                if (r < 0)
+                        return log_error_errno(r, "Unable to convert embedded credential name to C string: %m");
+
+                if (!credential_name_valid(embedded_name))
+                        return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Embedded credential name is not valid, refusing.");
+
+                if (validate_name && !streq(embedded_name, validate_name)) {
+
+                        /* Refuse unless the environment variable explicitly evaluates to false (r == 0);
+                         * an unset or unparsable variable keeps the check enforced. */
+                        r = getenv_bool_secure("SYSTEMD_CREDENTIAL_VALIDATE_NAME");
+                        if (r < 0 && r != -ENXIO)
+                                log_debug_errno(r, "Failed to parse $SYSTEMD_CREDENTIAL_VALIDATE_NAME: %m");
+                        if (r != 0)
+                                return log_error_errno(SYNTHETIC_ERRNO(EREMOTE), "Embedded credential name '%s' does not match filename '%s', refusing.", embedded_name, validate_name);
+
+                        log_debug("Embedded credential name '%s' does not match expected name '%s', but configured to use credential anyway.", embedded_name, validate_name);
+                }
+        }
+
+        if (validate_timestamp != USEC_INFINITY) {
+                if (le64toh(m->timestamp) != USEC_INFINITY && le64toh(m->timestamp) > validate_timestamp)
+                        log_debug("Credential timestamp is from the future, assuming clock skew.");
+
+                if (le64toh(m->not_after) != USEC_INFINITY && le64toh(m->not_after) < validate_timestamp) {
+
+                        /* Same logic as for the name check above: only an explicit "false" disables this */
+                        r = getenv_bool_secure("SYSTEMD_CREDENTIAL_VALIDATE_NOT_AFTER");
+                        if (r < 0 && r != -ENXIO)
+                                log_debug_errno(r, "Failed to parse $SYSTEMD_CREDENTIAL_VALIDATE_NOT_AFTER: %m");
+                        if (r != 0)
+                                return log_error_errno(SYNTHETIC_ERRNO(ESTALE), "Credential's time passed, refusing to use.");
+
+                        log_debug("Credential not-after timestamp has passed, but configured to use credential anyway.");
+                }
+        }
+
+        if (ret) {
+                char *without_metadata;
+
+                /* Hand out a copy of just the payload, the buffer including the metadata header stays ours */
+                without_metadata = memdup((uint8_t*) plaintext + hs, plaintext_size - hs);
+                if (!without_metadata)
+                        return log_oom();
+
+                *ret = without_metadata;
+        }
+
+        if (ret_size)
+                *ret_size = plaintext_size - hs;
+
+        return 0;
+}
+
+#else
+
+int get_credential_host_secret(CredentialSecretFlags flags, void **ret, size_t *ret_size) { /* Stub from the #else branch: ignores its arguments, only logs and fails with a synthetic EOPNOTSUPP */
+ return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Support for encrypted credentials not available.");
+}
+
+int encrypt_credential_and_warn(sd_id128_t with_key, const char *name, usec_t timestamp, usec_t not_after, const char *tpm2_device, uint32_t tpm2_hash_pcr_mask, const char *tpm2_pubkey_path, uint32_t tpm2_pubkey_pcr_mask, const void *input, size_t input_size, void **ret, size_t *ret_size) { /* Stub from the #else branch: ignores its arguments, only logs and fails with a synthetic EOPNOTSUPP */
+ return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Support for encrypted credentials not available.");
+}
+
+int decrypt_credential_and_warn(const char *validate_name, usec_t validate_timestamp, const char *tpm2_device, const char *tpm2_signature_path, const void *input, size_t input_size, void **ret, size_t *ret_size) { /* Stub from the #else branch: ignores its arguments, only logs and fails with a synthetic EOPNOTSUPP */
+ return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Support for encrypted credentials not available.");
+}
+
+#endif
diff --git a/src/shared/creds-util.h b/src/shared/creds-util.h
new file mode 100644
index 0000000..5e39a6a
--- /dev/null
+++ b/src/shared/creds-util.h
@@ -0,0 +1,79 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <inttypes.h>
+#include <stdbool.h>
+#include <sys/types.h>
+
+#include "sd-id128.h"
+
+#include "fd-util.h"
+#include "time-util.h"
+
+#define CREDENTIAL_NAME_MAX FDNAME_MAX
+
+/* Put a size limit on the individual credential */
+#define CREDENTIAL_SIZE_MAX (1024U*1024U)
+
+/* Refuse to store more than 1M per service, after all this is unswappable memory. Note that for now we put
+ * this to the same limit as the per-credential limit, i.e. if the user has n > 1 credentials instead of 1 it
+ * won't get them more space. */
+#define CREDENTIALS_TOTAL_SIZE_MAX CREDENTIAL_SIZE_MAX
+
+/* Put a size limit on encrypted credentials (which is the same as the unencrypted size plus a spacious 128K of extra
+ * space for headers, IVs, exported TPM2 key material and so on). */
+#define CREDENTIAL_ENCRYPTED_SIZE_MAX (CREDENTIAL_SIZE_MAX + 128U*1024U)
+
+bool credential_name_valid(const char *s);
+bool credential_glob_valid(const char *s);
+
+/* Where creds have been passed to the local execution context */
+int get_credentials_dir(const char **ret);
+int get_encrypted_credentials_dir(const char **ret);
+
+/* Where creds have been passed to the system */
+#define SYSTEM_CREDENTIALS_DIRECTORY "/run/credentials/@system"
+#define ENCRYPTED_SYSTEM_CREDENTIALS_DIRECTORY "/run/credentials/@encrypted"
+
+int read_credential(const char *name, void **ret, size_t *ret_size); /* use in services! */
+int read_credential_with_decryption(const char *name, void **ret, size_t *ret_size); /* use in generators + pid1! */
+
+int read_credential_strings_many_internal(const char *first_name, char **first_value, ...);
+
+#define read_credential_strings_many(first_name, first_value, ...) \
+ read_credential_strings_many_internal(first_name, first_value, __VA_ARGS__, NULL)
+
+int read_credential_bool(const char *name);
+
+/* Flags taken by get_credential_host_secret(). Each constant occupies its own bit, so several of them can be
+ * OR-ed together. NOTE(review): the precise effect of each flag is implemented in creds-util.c and is not
+ * visible from this header; the names suggest "generate the secret if missing", "warn if the secret is not
+ * stored encrypted" and "fail if the backing file system is temporary" - confirm against the implementation. */
+typedef enum CredentialSecretFlags {
+ CREDENTIAL_SECRET_GENERATE = 1 << 0,
+ CREDENTIAL_SECRET_WARN_NOT_ENCRYPTED = 1 << 1,
+ CREDENTIAL_SECRET_FAIL_ON_TEMPORARY_FS = 1 << 2,
+} CredentialSecretFlags;
+
+int get_credential_host_secret(CredentialSecretFlags flags, void **ret, size_t *ret_size);
+
+int get_credential_user_password(const char *username, char **ret_password, bool *ret_is_hashed);
+
+/* The four modes we support: keyed only by on-disk key, only by TPM2 HMAC key, and by the combination of
+ * both, as well as one with a fixed zero length key if TPM2 is missing (the latter of course provides no
+ * authenticity or confidentiality, but is still useful for integrity protection, and makes things simpler
+ * for us to handle). */
+#define CRED_AES256_GCM_BY_HOST SD_ID128_MAKE(5a,1c,6a,86,df,9d,40,96,b1,d5,a6,5e,08,62,f1,9a)
+#define CRED_AES256_GCM_BY_TPM2_HMAC SD_ID128_MAKE(0c,7c,c0,7b,11,76,45,91,9c,4b,0b,ea,08,bc,20,fe)
+#define CRED_AES256_GCM_BY_TPM2_HMAC_WITH_PK SD_ID128_MAKE(fa,f7,eb,93,41,e3,41,2c,a1,a4,36,f9,5a,29,36,2f)
+#define CRED_AES256_GCM_BY_HOST_AND_TPM2_HMAC SD_ID128_MAKE(93,a8,94,09,48,74,44,90,90,ca,f2,fc,93,ca,b5,53)
+#define CRED_AES256_GCM_BY_HOST_AND_TPM2_HMAC_WITH_PK \
+ SD_ID128_MAKE(af,49,50,a8,49,13,4e,b1,a7,38,46,30,4f,f3,0c,05)
+#define CRED_AES256_GCM_BY_TPM2_ABSENT SD_ID128_MAKE(05,84,69,da,f6,f5,43,24,80,05,49,da,0f,8e,a2,fb)
+
+/* Two special IDs to pick a general automatic mode (i.e. tpm2+host if TPM2 exists, only host otherwise) or
+ * an initrd-specific automatic mode (i.e. tpm2 if firmware can do it, otherwise fixed zero-length key, and
+ * never involve host keys). These IDs will never be stored on disk, but are useful only internally while
+ * figuring out what precisely to write to disk. To mark that these aren't a "real" type, we'll prefix them
+ * with an underscore. */
+#define _CRED_AUTO SD_ID128_MAKE(a2,19,cb,07,85,b2,4c,04,b1,6d,18,ca,b9,d2,ee,01)
+#define _CRED_AUTO_INITRD SD_ID128_MAKE(02,dc,8e,de,3a,02,43,ab,a9,ec,54,9c,05,e6,a0,71)
+
+int encrypt_credential_and_warn(sd_id128_t with_key, const char *name, usec_t timestamp, usec_t not_after, const char *tpm2_device, uint32_t tpm2_hash_pcr_mask, const char *tpm2_pubkey_path, uint32_t tpm2_pubkey_pcr_mask, const void *input, size_t input_size, void **ret, size_t *ret_size);
+int decrypt_credential_and_warn(const char *validate_name, usec_t validate_timestamp, const char *tpm2_device, const char *tpm2_signature_path, const void *input, size_t input_size, void **ret, size_t *ret_size);
diff --git a/src/shared/cryptsetup-fido2.c b/src/shared/cryptsetup-fido2.c
new file mode 100644
index 0000000..285b82a
--- /dev/null
+++ b/src/shared/cryptsetup-fido2.c
@@ -0,0 +1,276 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "ask-password-api.h"
+#include "cryptsetup-fido2.h"
+#include "env-util.h"
+#include "fileio.h"
+#include "hexdecoct.h"
+#include "json.h"
+#include "libfido2-util.h"
+#include "parse-util.h"
+#include "random-util.h"
+#include "strv.h"
+
+/* Asks a FIDO2 token for key material via fido2_use_hmac_hash(), prompting for a PIN when the token asks for
+ * one.
+ *
+ * The salt handed to the token is 'key_data'/'key_data_size' if 'key_data' is set, and is otherwise read
+ * from 'key_file' (which may be an AF_UNIX socket, see READ_FULL_FILE_CONNECT_SOCKET below). A PIN may be
+ * supplied up front through the $PIN environment variable. 'friendly_name' is not used by this function.
+ *
+ * Returns:
+ *   - the ENOPKG error if local verification (PIN/UP/UV) is required or a PIN has to be queried, but
+ *     'headless' is set
+ *   - -EAGAIN if fido2_have_device() reports that no device is present
+ *   - otherwise whatever fido2_use_hmac_hash() returns, except for -ENOANO/-ENOLCK which trigger a (new) PIN
+ *     prompt and another attempt
+ */
+int acquire_fido2_key(
+ const char *volume_name,
+ const char *friendly_name,
+ const char *device,
+ const char *rp_id,
+ const void *cid,
+ size_t cid_size,
+ const char *key_file,
+ size_t key_file_size,
+ uint64_t key_file_offset,
+ const void *key_data,
+ size_t key_data_size,
+ usec_t until,
+ bool headless,
+ Fido2EnrollFlags required,
+ void **ret_decrypted_key,
+ size_t *ret_decrypted_key_size,
+ AskPasswordFlags ask_password_flags) {
+
+ _cleanup_(erase_and_freep) char *envpw = NULL;
+ _cleanup_strv_free_erase_ char **pins = NULL;
+ _cleanup_free_ void *loaded_salt = NULL;
+ bool device_exists = false;
+ const char *salt;
+ size_t salt_size;
+ int r;
+
+ /* Any explicitly required user interaction is incompatible with headless operation. */
+ if ((required & (FIDO2ENROLL_PIN | FIDO2ENROLL_UP | FIDO2ENROLL_UV)) && headless)
+ return log_error_errno(SYNTHETIC_ERRNO(ENOPKG),
+ "Local verification is required to unlock this volume, but the 'headless' parameter was set.");
+
+ ask_password_flags |= ASK_PASSWORD_PUSH_CACHE | ASK_PASSWORD_ACCEPT_CACHED;
+
+ assert(cid);
+ assert(key_file || key_data);
+
+ if (key_data) {
+ /* Salt was passed in directly, use it as is (not copied, not owned by us). */
+ salt = key_data;
+ salt_size = key_data_size;
+ } else {
+ _cleanup_free_ char *bindname = NULL;
+
+ /* If we read the salt via AF_UNIX, make this client recognizable */
+ if (asprintf(&bindname, "@%" PRIx64"/cryptsetup-fido2/%s", random_u64(), volume_name) < 0)
+ return log_oom();
+
+ /* An offset/size of zero is translated to UINT64_MAX/SIZE_MAX before being passed on.
+ * NOTE(review): presumably those values mean "no offset"/"no size limit" - confirm against
+ * read_full_file_full(). Failures here are returned without logging. */
+ r = read_full_file_full(
+ AT_FDCWD, key_file,
+ key_file_offset == 0 ? UINT64_MAX : key_file_offset,
+ key_file_size == 0 ? SIZE_MAX : key_file_size,
+ READ_FULL_FILE_CONNECT_SOCKET,
+ bindname,
+ (char**) &loaded_salt, &salt_size);
+ if (r < 0)
+ return r;
+
+ salt = loaded_salt;
+ }
+
+ /* A PIN passed via $PIN becomes the initial (single entry) PIN list. */
+ r = getenv_steal_erase("PIN", &envpw);
+ if (r < 0)
+ return log_error_errno(r, "Failed to acquire password from environment: %m");
+ if (r > 0) {
+ pins = strv_new(envpw);
+ if (!pins)
+ return log_oom();
+ }
+
+ for (;;) {
+ if (!device_exists) {
+ /* Before we inquire for the PIN we'll need, if we never talked to the device, check
+ * if the device actually is plugged in. Otherwise we'll ask for the PIN already when
+ * the device is not plugged in, which is confusing. */
+
+ r = fido2_have_device(device);
+ if (r < 0)
+ return r;
+ if (r == 0) /* no device found, return EAGAIN so that caller will wait/watch udev */
+ return -EAGAIN;
+
+ device_exists = true; /* now we know for sure, a device exists, no need to ask again */
+ }
+
+ /* Always make an attempt before asking for PIN.
+ * fido2_use_hmac_hash() will perform a pre-flight check for whether the credential
+ * can be found on one of the connected devices. This way, we can avoid prompting the user
+ * for a PIN when we are sure that no device can be used. */
+ r = fido2_use_hmac_hash(
+ device,
+ rp_id ?: "io.systemd.cryptsetup",
+ salt, salt_size,
+ cid, cid_size,
+ pins,
+ required,
+ ret_decrypted_key,
+ ret_decrypted_key_size);
+ /* Success and all errors other than "PIN needed"/"PIN wrong" end the loop right here. */
+ if (!IN_SET(r,
+ -ENOANO, /* needs pin */
+ -ENOLCK)) /* pin incorrect */
+ return r;
+
+ device_exists = true; /* that a PIN is needed/wasn't correct means that we managed to
+ * talk to a device */
+
+ if (headless)
+ return log_error_errno(SYNTHETIC_ERRNO(ENOPKG), "PIN querying disabled via 'headless' option. Use the '$PIN' environment variable.");
+
+ /* Drop (and erase) the PINs tried so far before asking for new ones. */
+ pins = strv_free_erase(pins);
+ r = ask_password_auto("Please enter security token PIN:", "drive-harddisk", NULL, "fido2-pin", "cryptsetup.fido2-pin", until, ask_password_flags, &pins);
+ if (r < 0)
+ return log_error_errno(r, "Failed to ask for user password: %m");
+
+ /* Only the first prompt may be answered from the cache. NOTE(review): presumably so that a
+ * cached PIN that was just rejected is not offered again - confirm. */
+ ask_password_flags &= ~ASK_PASSWORD_ACCEPT_CACHED;
+ }
+}
+
+/* Tries to unlock a volume with FIDO2 enrollment data discovered in the LUKS2 header of 'cd'.
+ *
+ * Walks all LUKS2 token slots, picks those of type "systemd-fido2", extracts credential ID, salt, optional
+ * relying party ID and the PIN/UP/UV requirements from the token's JSON, and calls acquire_fido2_key() for
+ * each of them until one succeeds.
+ *
+ * Returns 0 with the key in 'ret_decrypted_key'/'ret_decrypted_key_size' on success. Fails with the ENXIO
+ * error if no token got as far as having its credential ID decoded, with -EAGAIN (logged at debug level
+ * only) if the last attempt reported that, with an error for malformed token data, and otherwise with the
+ * error of the last acquire_fido2_key() attempt. */
+int acquire_fido2_key_auto(
+                struct crypt_device *cd,
+                const char *name,
+                const char *friendly_name,
+                const char *fido2_device,
+                usec_t until,
+                bool headless,
+                void **ret_decrypted_key,
+                size_t *ret_decrypted_key_size,
+                AskPasswordFlags ask_password_flags) {
+
+        _cleanup_free_ void *cid = NULL;
+        size_t cid_size = 0;
+        int r, ret = -ENOENT;
+
+        assert(cd);
+        assert(name);
+        assert(ret_decrypted_key);
+        assert(ret_decrypted_key_size);
+
+        /* Loads FIDO2 metadata from LUKS2 JSON token headers. */
+
+        for (int token = 0; token < sym_crypt_token_max(CRYPT_LUKS2); token ++) {
+                _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+                JsonVariant *w;
+                _cleanup_free_ void *salt = NULL;
+                _cleanup_free_ char *rp = NULL;
+                /* The requirements are a property of the individual token, hence start from scratch for
+                 * each one, so that no bits set for a previously tried token carry over. */
+                Fido2EnrollFlags required = 0;
+                size_t salt_size = 0;
+                int ks;
+
+                r = cryptsetup_get_token_as_json(cd, token, "systemd-fido2", &v);
+                if (IN_SET(r, -ENOENT, -EINVAL, -EMEDIUMTYPE))
+                        continue;
+                if (r < 0)
+                        return log_error_errno(r, "Failed to read JSON token data off disk: %m");
+
+                ks = cryptsetup_get_keyslot_from_token(v);
+                if (ks < 0) {
+                        /* Handle parsing errors of the keyslots field gracefully, since it's not 'owned' by
+                         * us, but by the LUKS2 spec */
+                        log_warning_errno(ks, "Failed to extract keyslot index from FIDO2 JSON data token %i, skipping: %m", token);
+                        continue;
+                }
+
+                w = json_variant_by_key(v, "fido2-credential");
+                if (!w || !json_variant_is_string(w))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "FIDO2 token data lacks 'fido2-credential' field.");
+
+                /* 'cid' outlives the loop body (it is checked after the loop), hence release whatever was
+                 * decoded for a previously tried token before the pointer is overwritten, to not leak it. */
+                cid = mfree(cid);
+                cid_size = 0;
+
+                r = unbase64mem(json_variant_string(w), SIZE_MAX, &cid, &cid_size);
+                if (r < 0)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "Invalid base64 data in 'fido2-credential' field.");
+
+                w = json_variant_by_key(v, "fido2-salt");
+                if (!w || !json_variant_is_string(w))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "FIDO2 token data lacks 'fido2-salt' field.");
+
+                assert(!salt);
+                assert(salt_size == 0);
+                r = unbase64mem(json_variant_string(w), SIZE_MAX, &salt, &salt_size);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to decode base64 encoded salt.");
+
+                w = json_variant_by_key(v, "fido2-rp");
+                if (w) {
+                        /* The "rp" field is optional. */
+
+                        if (!json_variant_is_string(w))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "FIDO2 token data's 'fido2-rp' field is not a string.");
+
+                        assert(!rp);
+                        rp = strdup(json_variant_string(w));
+                        if (!rp)
+                                return log_oom();
+                }
+
+                w = json_variant_by_key(v, "fido2-clientPin-required");
+                if (w) {
+                        /* The "fido2-clientPin-required" field is optional. */
+
+                        if (!json_variant_is_boolean(w))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "FIDO2 token data's 'fido2-clientPin-required' field is not a boolean.");
+
+                        SET_FLAG(required, FIDO2ENROLL_PIN, json_variant_boolean(w));
+                } else
+                        required |= FIDO2ENROLL_PIN_IF_NEEDED; /* compat with 248, where the field was unset */
+
+                w = json_variant_by_key(v, "fido2-up-required");
+                if (w) {
+                        /* The "fido2-up-required" field is optional. */
+
+                        if (!json_variant_is_boolean(w))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "FIDO2 token data's 'fido2-up-required' field is not a boolean.");
+
+                        SET_FLAG(required, FIDO2ENROLL_UP, json_variant_boolean(w));
+                } else
+                        required |= FIDO2ENROLL_UP_IF_NEEDED; /* compat with 248 */
+
+                w = json_variant_by_key(v, "fido2-uv-required");
+                if (w) {
+                        /* The "fido2-uv-required" field is optional. */
+
+                        if (!json_variant_is_boolean(w))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "FIDO2 token data's 'fido2-uv-required' field is not a boolean.");
+
+                        SET_FLAG(required, FIDO2ENROLL_UV, json_variant_boolean(w));
+                } else
+                        required |= FIDO2ENROLL_UV_OMIT; /* compat with 248 */
+
+                ret = acquire_fido2_key(
+                                name,
+                                friendly_name,
+                                fido2_device,
+                                rp,
+                                cid, cid_size,
+                                /* key_file= */ NULL, /* salt is read from LUKS header instead of key_file */
+                                /* key_file_size= */ 0,
+                                /* key_file_offset= */ 0,
+                                salt, salt_size,
+                                until,
+                                headless,
+                                required,
+                                ret_decrypted_key, ret_decrypted_key_size,
+                                ask_password_flags);
+                if (ret == 0)
+                        break;
+
+                /* Any failure (including -EAGAIN) makes us move on to the next token; the error of the last
+                 * attempt is what gets reported below. */
+        }
+
+        /* 'cid' is only ever set once a systemd-fido2 token with a usable keyslot and credential field was
+         * seen, hence it doubles as the "did we find anything" marker. */
+        if (!cid)
+                return log_error_errno(SYNTHETIC_ERRNO(ENXIO),
+                                       "No valid FIDO2 token data found.");
+
+        if (ret == -EAGAIN) /* fido2 device does not exist, or UV is blocked; caller will prompt for retry */
+                return log_debug_errno(ret, "FIDO2 token does not exist, or UV is blocked.");
+        if (ret < 0)
+                return log_error_errno(ret, "Failed to unlock LUKS volume with FIDO2 token: %m");
+
+        log_info("Unlocked volume via automatically discovered security FIDO2 token.");
+        return ret;
+}
diff --git a/src/shared/cryptsetup-fido2.h b/src/shared/cryptsetup-fido2.h
new file mode 100644
index 0000000..d96bb40
--- /dev/null
+++ b/src/shared/cryptsetup-fido2.h
@@ -0,0 +1,82 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <sys/types.h>
+
+#include "cryptsetup-util.h"
+#include "libfido2-util.h"
+#include "log.h"
+#include "time-util.h"
+
+#if HAVE_LIBFIDO2
+
+int acquire_fido2_key(
+ const char *volume_name,
+ const char *friendly_name,
+ const char *device,
+ const char *rp_id,
+ const void *cid,
+ size_t cid_size,
+ const char *key_file,
+ size_t key_file_size,
+ uint64_t key_file_offset,
+ const void *key_data,
+ size_t key_data_size,
+ usec_t until,
+ bool headless,
+ Fido2EnrollFlags required,
+ void **ret_decrypted_key,
+ size_t *ret_decrypted_key_size,
+ AskPasswordFlags ask_password_flags);
+
+int acquire_fido2_key_auto(
+ struct crypt_device *cd,
+ const char *name,
+ const char *friendly_name,
+ const char *fido2_device,
+ usec_t until,
+ bool headless,
+ void **ret_decrypted_key,
+ size_t *ret_decrypted_key_size,
+ AskPasswordFlags ask_password_flags);
+
+#else
+
+/* Stand-in used when HAVE_LIBFIDO2 is off: same signature as the real function, but all arguments are
+ * ignored and the call only logs and fails with the EOPNOTSUPP error. */
+static inline int acquire_fido2_key(
+ const char *volume_name,
+ const char *friendly_name,
+ const char *device,
+ const char *rp_id,
+ const void *cid,
+ size_t cid_size,
+ const char *key_file,
+ size_t key_file_size,
+ uint64_t key_file_offset,
+ const void *key_data,
+ size_t key_data_size,
+ usec_t until,
+ bool headless,
+ Fido2EnrollFlags required,
+ void **ret_decrypted_key,
+ size_t *ret_decrypted_key_size,
+ AskPasswordFlags ask_password_flags) {
+
+ return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "FIDO2 token support not available.");
+}
+
+/* Stand-in used when HAVE_LIBFIDO2 is off: same signature as the real function, but all arguments are
+ * ignored and the call only logs and fails with the EOPNOTSUPP error. */
+static inline int acquire_fido2_key_auto(
+ struct crypt_device *cd,
+ const char *name,
+ const char *friendly_name,
+ const char *fido2_device,
+ usec_t until,
+ bool headless,
+ void **ret_decrypted_key,
+ size_t *ret_decrypted_key_size,
+ AskPasswordFlags ask_password_flags) {
+
+ return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "FIDO2 token support not available.");
+}
+#endif
diff --git a/src/shared/cryptsetup-util.c b/src/shared/cryptsetup-util.c
new file mode 100644
index 0000000..ab5764d
--- /dev/null
+++ b/src/shared/cryptsetup-util.c
@@ -0,0 +1,349 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "cryptsetup-util.h"
+#include "dlfcn-util.h"
+#include "log.h"
+#include "parse-util.h"
+
+#if HAVE_LIBCRYPTSETUP
+static void *cryptsetup_dl = NULL;
+
+int (*sym_crypt_activate_by_passphrase)(struct crypt_device *cd, const char *name, int keyslot, const char *passphrase, size_t passphrase_size, uint32_t flags);
+#if HAVE_CRYPT_ACTIVATE_BY_SIGNED_KEY
+int (*sym_crypt_activate_by_signed_key)(struct crypt_device *cd, const char *name, const char *volume_key, size_t volume_key_size, const char *signature, size_t signature_size, uint32_t flags);
+#endif
+int (*sym_crypt_activate_by_volume_key)(struct crypt_device *cd, const char *name, const char *volume_key, size_t volume_key_size, uint32_t flags);
+int (*sym_crypt_deactivate_by_name)(struct crypt_device *cd, const char *name, uint32_t flags);
+int (*sym_crypt_format)(struct crypt_device *cd, const char *type, const char *cipher, const char *cipher_mode, const char *uuid, const char *volume_key, size_t volume_key_size, void *params);
+void (*sym_crypt_free)(struct crypt_device *cd);
+const char *(*sym_crypt_get_cipher)(struct crypt_device *cd);
+const char *(*sym_crypt_get_cipher_mode)(struct crypt_device *cd);
+uint64_t (*sym_crypt_get_data_offset)(struct crypt_device *cd);
+const char *(*sym_crypt_get_device_name)(struct crypt_device *cd);
+const char *(*sym_crypt_get_dir)(void);
+const char *(*sym_crypt_get_type)(struct crypt_device *cd);
+const char *(*sym_crypt_get_uuid)(struct crypt_device *cd);
+int (*sym_crypt_get_verity_info)(struct crypt_device *cd, struct crypt_params_verity *vp);
+int (*sym_crypt_get_volume_key_size)(struct crypt_device *cd);
+int (*sym_crypt_init)(struct crypt_device **cd, const char *device);
+int (*sym_crypt_init_by_name)(struct crypt_device **cd, const char *name);
+int (*sym_crypt_keyslot_add_by_volume_key)(struct crypt_device *cd, int keyslot, const char *volume_key, size_t volume_key_size, const char *passphrase, size_t passphrase_size);
+int (*sym_crypt_keyslot_destroy)(struct crypt_device *cd, int keyslot);
+int (*sym_crypt_keyslot_max)(const char *type);
+int (*sym_crypt_load)(struct crypt_device *cd, const char *requested_type, void *params);
+int (*sym_crypt_resize)(struct crypt_device *cd, const char *name, uint64_t new_size);
+int (*sym_crypt_resume_by_passphrase)(struct crypt_device *cd, const char *name, int keyslot, const char *passphrase, size_t passphrase_size);
+int (*sym_crypt_set_data_device)(struct crypt_device *cd, const char *device);
+void (*sym_crypt_set_debug_level)(int level);
+void (*sym_crypt_set_log_callback)(struct crypt_device *cd, void (*log)(int level, const char *msg, void *usrptr), void *usrptr);
+#if HAVE_CRYPT_SET_METADATA_SIZE
+int (*sym_crypt_set_metadata_size)(struct crypt_device *cd, uint64_t metadata_size, uint64_t keyslots_size);
+#endif
+int (*sym_crypt_set_pbkdf_type)(struct crypt_device *cd, const struct crypt_pbkdf_type *pbkdf);
+int (*sym_crypt_suspend)(struct crypt_device *cd, const char *name);
+int (*sym_crypt_token_json_get)(struct crypt_device *cd, int token, const char **json);
+int (*sym_crypt_token_json_set)(struct crypt_device *cd, int token, const char *json);
+#if HAVE_CRYPT_TOKEN_MAX
+int (*sym_crypt_token_max)(const char *type);
+#endif
+crypt_token_info (*sym_crypt_token_status)(struct crypt_device *cd, int token, const char **type);
+int (*sym_crypt_volume_key_get)(struct crypt_device *cd, int keyslot, char *volume_key, size_t *volume_key_size, const char *passphrase, size_t passphrase_size);
+#if HAVE_CRYPT_REENCRYPT_INIT_BY_PASSPHRASE
+int (*sym_crypt_reencrypt_init_by_passphrase)(struct crypt_device *cd, const char *name, const char *passphrase, size_t passphrase_size, int keyslot_old, int keyslot_new, const char *cipher, const char *cipher_mode, const struct crypt_params_reencrypt *params);
+#endif
+#if HAVE_CRYPT_REENCRYPT
+int (*sym_crypt_reencrypt)(struct crypt_device *cd, int (*progress)(uint64_t size, uint64_t offset, void *usrptr));
+#endif
+int (*sym_crypt_metadata_locking)(struct crypt_device *cd, int enable);
+#if HAVE_CRYPT_SET_DATA_OFFSET
+int (*sym_crypt_set_data_offset)(struct crypt_device *cd, uint64_t data_offset);
+#endif
+int (*sym_crypt_header_restore)(struct crypt_device *cd, const char *requested_type, const char *backup_file);
+int (*sym_crypt_volume_key_keyring)(struct crypt_device *cd, int enable);
+
+/* Unfortunately libcryptsetup provides neither an environment variable to redirect where to look for token
+ * modules, nor does it have an API to change the token lookup path at runtime. The maintainers suggest using
+ * ELF interposition instead (see https://gitlab.com/cryptsetup/cryptsetup/-/issues/846). Hence let's do
+ * that: let's interpose libcryptsetup's crypt_token_external_path() function with our own, that *does*
+ * honour an environment variable where to look for tokens. This is tremendously useful for debugging
+ * libcryptsetup tokens: set the environment variable to your build dir and you can easily test token modules
+ * without jumping through various hoops. */
+
+/* Do this only on new enough compilers that actually support the "symver" attribute. Given this is a debug
+ * feature, let's simply not bother on older compilers */
+#if BUILD_MODE_DEVELOPER && defined(__has_attribute) && __has_attribute(symver)
+const char *my_crypt_token_external_path(void); /* prototype for our own implementation */
+
+/* We use the "symver" attribute to mark this implementation as the default implementation, and drop the
+ * SD_SHARED namespace we by default attach to our symbols via a version script. */
+__attribute__((symver("crypt_token_external_path@@")))
+_public_ const char *my_crypt_token_external_path(void) {
+        typeof(crypt_token_external_path) *original;
+        const char *from_env;
+
+        /* An explicitly configured path always wins. */
+        from_env = secure_getenv("SYSTEMD_CRYPTSETUP_TOKEN_PATH");
+        if (from_env)
+                return from_env;
+
+        /* Otherwise forward to libcryptsetup's own implementation, which can only be looked up if the
+         * library has been loaded already. */
+        if (!cryptsetup_dl)
+                return NULL;
+
+        original = (typeof(crypt_token_external_path)*) dlsym(cryptsetup_dl, "crypt_token_external_path");
+        if (!original)
+                return NULL;
+
+        return original();
+}
+#endif
+
+static void cryptsetup_log_glue(int level, const char *msg, void *usrptr) {
+        int priority;
+
+        /* Log callback handed to libcryptsetup: maps the library's log level to a syslog priority and
+         * passes the message on to our own logging. 'usrptr' is not used. */
+
+        if (level == CRYPT_LOG_NORMAL)
+                priority = LOG_NOTICE;
+        else if (level == CRYPT_LOG_ERROR)
+                priority = LOG_ERR;
+        else if (level == CRYPT_LOG_VERBOSE)
+                priority = LOG_INFO;
+        else if (level == CRYPT_LOG_DEBUG)
+                priority = LOG_DEBUG;
+        else {
+                /* Complain about levels we don't know, and treat the message itself as an error. */
+                log_error("Unknown libcryptsetup log level: %d", level);
+                priority = LOG_ERR;
+        }
+
+        log_full(priority, "%s", msg);
+}
+
+void cryptsetup_enable_logging(struct crypt_device *cd) {
+        /* Routes libcryptsetup's log output for 'cd' through cryptsetup_log_glue(). Passing NULL is fine,
+         * libcryptsetup then sets the default log function.
+         *
+         * dlopen_cryptsetup() calls us and we call dlopen_cryptsetup(): this does not recurse endlessly,
+         * since dlopen_cryptsetup() checks 'cryptsetup_dl' early and returns before calling back into
+         * here.
+         *
+         * If loading the library fails we silently do nothing: this is only about debug logging, and a
+         * debug message about the failure has been generated already. */
+
+        if (dlopen_cryptsetup() >= 0) {
+                sym_crypt_set_log_callback(cd, cryptsetup_log_glue, NULL);
+                sym_crypt_set_debug_level(DEBUG_LOGGING ? CRYPT_DEBUG_ALL : CRYPT_DEBUG_NONE);
+        }
+}
+
+int cryptsetup_set_minimal_pbkdf(struct crypt_device *cd) {
+
+        /* Sets a minimal PBKDF in case we already have a high entropy key.
+         *
+         * Benchmarking is turned off via CRYPT_PBKDF_NO_BENCHMARK. With that flag the .time_ms member is
+         * ignored, while .iterations has to be set to at least the recommended minimum value. */
+
+        static const struct crypt_pbkdf_type cheap_pbkdf = {
+                .flags = CRYPT_PBKDF_NO_BENCHMARK,
+                .type = CRYPT_KDF_PBKDF2,
+                .hash = "sha512",
+                .iterations = 1000, /* recommended minimum count for pbkdf2
+                                     * according to NIST SP 800-132, ch. 5.2 */
+        };
+
+        int r;
+
+        r = dlopen_cryptsetup();
+        if (r < 0)
+                return r;
+
+        /* Errors are passed through, any non-negative result is reported as plain success. */
+        r = sym_crypt_set_pbkdf_type(cd, &cheap_pbkdf);
+        return r < 0 ? r : 0;
+}
+
+int cryptsetup_get_token_as_json(
+                struct crypt_device *cd,
+                int idx,
+                const char *verify_type,
+                JsonVariant **ret) {
+
+        _cleanup_(json_variant_unrefp) JsonVariant *parsed = NULL;
+        const char *raw;
+        int r;
+
+        assert(cd);
+
+        /* Reads the JSON text of LUKS2 token 'idx' and parses it. If 'verify_type' is given, the token's
+         * "type" field has to match it. 'ret' is optional; if set it receives the parsed object on
+         * success. Returns:
+         *
+         *      -EINVAL       → token index out of range or "type" field missing
+         *      -ENOENT       → token doesn't exist
+         *      -EMEDIUMTYPE  → "verify_type" specified and doesn't match token's type
+         */
+
+        r = dlopen_cryptsetup();
+        if (r < 0)
+                return r;
+
+        r = sym_crypt_token_json_get(cd, idx, &raw);
+        if (r < 0)
+                return r;
+
+        r = json_parse(raw, 0, &parsed, NULL, NULL);
+        if (r < 0)
+                return r;
+
+        if (verify_type) {
+                JsonVariant *type_field = json_variant_by_key(parsed, "type");
+
+                if (!type_field)
+                        return -EINVAL;
+
+                if (!streq_ptr(json_variant_string(type_field), verify_type))
+                        return -EMEDIUMTYPE;
+        }
+
+        if (ret)
+                *ret = TAKE_PTR(parsed);
+
+        return 0;
+}
+
+int cryptsetup_add_token_json(struct crypt_device *cd, JsonVariant *v) {
+        _cleanup_free_ char *formatted = NULL;
+        int r;
+
+        /* Serializes 'v' and stores it as a new token in the LUKS header of 'cd'. CRYPT_ANY_TOKEN is passed
+         * as the token index rather than a specific slot. Failures are logged at debug level only. */
+
+        r = dlopen_cryptsetup();
+        if (r < 0)
+                return r;
+
+        r = json_variant_format(v, 0, &formatted);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to format token data for LUKS: %m");
+
+        log_debug("Adding token text <%s>", formatted);
+
+        r = sym_crypt_token_json_set(cd, CRYPT_ANY_TOKEN, formatted);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to write token data to LUKS: %m");
+
+        return 0;
+}
+#endif
+
+/* Loads libcryptsetup.so.12 and resolves all sym_crypt_*() function pointers defined above.
+ *
+ * Without HAVE_LIBCRYPTSETUP this only logs and fails with the EOPNOTSUPP error. Otherwise the result of
+ * dlopen_many_sym_or_warn() is returned as is if it is zero or negative; if it is positive the global
+ * libcryptsetup log callback is installed and 1 is returned. NOTE(review): judging by the comment in
+ * cryptsetup_enable_logging() a zero result means the library was loaded by an earlier call - confirm
+ * against dlopen_many_sym_or_warn(). */
+int dlopen_cryptsetup(void) {
+#if HAVE_LIBCRYPTSETUP
+ int r;
+
+ /* libcryptsetup added crypt_reencrypt() in 2.2.0, and marked it obsolete in 2.4.0, replacing it with
+ * crypt_reencrypt_run(), which takes one extra argument but is otherwise identical. The old call is
+ * still available though, and given we want to support 2.2.0 for a while longer, we'll stick to the
+ * old symbol. However, the old symbol now has a GCC deprecation decorator, hence let's turn off
+ * warnings about this for now. */
+
+ DISABLE_WARNING_DEPRECATED_DECLARATIONS;
+
+ /* The argument list has to stay in sync with the sym_crypt_*() definitions above, including the
+ * HAVE_CRYPT_* conditionals around the optional symbols. */
+ r = dlopen_many_sym_or_warn(
+ &cryptsetup_dl, "libcryptsetup.so.12", LOG_DEBUG,
+ DLSYM_ARG(crypt_activate_by_passphrase),
+#if HAVE_CRYPT_ACTIVATE_BY_SIGNED_KEY
+ DLSYM_ARG(crypt_activate_by_signed_key),
+#endif
+ DLSYM_ARG(crypt_activate_by_volume_key),
+ DLSYM_ARG(crypt_deactivate_by_name),
+ DLSYM_ARG(crypt_format),
+ DLSYM_ARG(crypt_free),
+ DLSYM_ARG(crypt_get_cipher),
+ DLSYM_ARG(crypt_get_cipher_mode),
+ DLSYM_ARG(crypt_get_data_offset),
+ DLSYM_ARG(crypt_get_device_name),
+ DLSYM_ARG(crypt_get_dir),
+ DLSYM_ARG(crypt_get_type),
+ DLSYM_ARG(crypt_get_uuid),
+ DLSYM_ARG(crypt_get_verity_info),
+ DLSYM_ARG(crypt_get_volume_key_size),
+ DLSYM_ARG(crypt_init),
+ DLSYM_ARG(crypt_init_by_name),
+ DLSYM_ARG(crypt_keyslot_add_by_volume_key),
+ DLSYM_ARG(crypt_keyslot_destroy),
+ DLSYM_ARG(crypt_keyslot_max),
+ DLSYM_ARG(crypt_load),
+ DLSYM_ARG(crypt_resize),
+ DLSYM_ARG(crypt_resume_by_passphrase),
+ DLSYM_ARG(crypt_set_data_device),
+ DLSYM_ARG(crypt_set_debug_level),
+ DLSYM_ARG(crypt_set_log_callback),
+#if HAVE_CRYPT_SET_METADATA_SIZE
+ DLSYM_ARG(crypt_set_metadata_size),
+#endif
+ DLSYM_ARG(crypt_set_pbkdf_type),
+ DLSYM_ARG(crypt_suspend),
+ DLSYM_ARG(crypt_token_json_get),
+ DLSYM_ARG(crypt_token_json_set),
+#if HAVE_CRYPT_TOKEN_MAX
+ DLSYM_ARG(crypt_token_max),
+#endif
+ DLSYM_ARG(crypt_token_status),
+ DLSYM_ARG(crypt_volume_key_get),
+#if HAVE_CRYPT_REENCRYPT_INIT_BY_PASSPHRASE
+ DLSYM_ARG(crypt_reencrypt_init_by_passphrase),
+#endif
+#if HAVE_CRYPT_REENCRYPT
+ DLSYM_ARG(crypt_reencrypt),
+#endif
+ DLSYM_ARG(crypt_metadata_locking),
+#if HAVE_CRYPT_SET_DATA_OFFSET
+ DLSYM_ARG(crypt_set_data_offset),
+#endif
+ DLSYM_ARG(crypt_header_restore),
+ DLSYM_ARG(crypt_volume_key_keyring));
+ if (r <= 0)
+ return r;
+
+ /* NOTE(review): this sits after the early return above; presumably the DISABLE/REENABLE pair expands
+ * to compiler diagnostic pragmas, which apply by source position rather than by control flow, so the
+ * early return would not leave the warning disabled - confirm against the macro definitions. */
+ REENABLE_WARNING;
+
+ /* Redirect the default logging calls of libcryptsetup to our own logging infra. (Note that
+ * libcryptsetup also maintains per-"struct crypt_device" log functions, which we'll also set
+ * whenever allocating a "struct crypt_device" context. Why set both? To be defensive: maybe some
+ * other code loaded into this process also changes the global log functions of libcryptsetup, who
+ * knows? And if so, we still want our own objects to log via our own infra, at the very least.) */
+ cryptsetup_enable_logging(NULL);
+ return 1;
+#else
+ return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "cryptsetup support is not compiled in.");
+#endif
+}
+
+int cryptsetup_get_keyslot_from_token(JsonVariant *v) {
+        JsonVariant *slots, *entry;
+        int index, r;
+
+        /* Extracts the keyslot index from the "keyslots" field of a LUKS2 token object. The field may be an
+         * array in general, but we insist on exactly one element here, as that is the only form we ever
+         * generate ourselves.
+         *
+         * Returns the non-negative keyslot index on success, -ENOENT if the field or its element is
+         * missing, -EMEDIUMTYPE if the field is not a one-element array or the element is not a string,
+         * -EINVAL if the parsed index is negative, or the error of safe_atoi() if parsing fails. */
+
+        slots = json_variant_by_key(v, "keyslots");
+        if (!slots)
+                return -ENOENT;
+        if (!(json_variant_is_array(slots) && json_variant_elements(slots) == 1))
+                return -EMEDIUMTYPE;
+
+        entry = json_variant_by_index(slots, 0);
+        if (!entry)
+                return -ENOENT;
+        if (!json_variant_is_string(entry))
+                return -EMEDIUMTYPE;
+
+        r = safe_atoi(json_variant_string(entry), &index);
+        if (r < 0)
+                return r;
+
+        return index < 0 ? -EINVAL : index;
+}
diff --git a/src/shared/cryptsetup-util.h b/src/shared/cryptsetup-util.h
new file mode 100644
index 0000000..5ff439d
--- /dev/null
+++ b/src/shared/cryptsetup-util.h
@@ -0,0 +1,111 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "alloc-util.h"
+#include "json.h"
+#include "macro.h"
+
+#if HAVE_LIBCRYPTSETUP
+#include <libcryptsetup.h>
+
+/* These next two are defined in libcryptsetup.h from cryptsetup version 2.3.4 forwards. */
+#ifndef CRYPT_ACTIVATE_NO_READ_WORKQUEUE
+#define CRYPT_ACTIVATE_NO_READ_WORKQUEUE (1 << 24)
+#endif
+#ifndef CRYPT_ACTIVATE_NO_WRITE_WORKQUEUE
+#define CRYPT_ACTIVATE_NO_WRITE_WORKQUEUE (1 << 25)
+#endif
+
+extern int (*sym_crypt_activate_by_passphrase)(struct crypt_device *cd, const char *name, int keyslot, const char *passphrase, size_t passphrase_size, uint32_t flags);
+#if HAVE_CRYPT_ACTIVATE_BY_SIGNED_KEY
+extern int (*sym_crypt_activate_by_signed_key)(struct crypt_device *cd, const char *name, const char *volume_key, size_t volume_key_size, const char *signature, size_t signature_size, uint32_t flags);
+#endif
+extern int (*sym_crypt_activate_by_volume_key)(struct crypt_device *cd, const char *name, const char *volume_key, size_t volume_key_size, uint32_t flags);
+extern int (*sym_crypt_deactivate_by_name)(struct crypt_device *cd, const char *name, uint32_t flags);
+extern int (*sym_crypt_format)(struct crypt_device *cd, const char *type, const char *cipher, const char *cipher_mode, const char *uuid, const char *volume_key, size_t volume_key_size, void *params);
+extern void (*sym_crypt_free)(struct crypt_device *cd);
+extern const char *(*sym_crypt_get_cipher)(struct crypt_device *cd);
+extern const char *(*sym_crypt_get_cipher_mode)(struct crypt_device *cd);
+extern uint64_t (*sym_crypt_get_data_offset)(struct crypt_device *cd);
+extern const char *(*sym_crypt_get_device_name)(struct crypt_device *cd);
+extern const char *(*sym_crypt_get_dir)(void);
+extern const char *(*sym_crypt_get_type)(struct crypt_device *cd);
+extern const char *(*sym_crypt_get_uuid)(struct crypt_device *cd);
+extern int (*sym_crypt_get_verity_info)(struct crypt_device *cd, struct crypt_params_verity *vp);
+extern int (*sym_crypt_get_volume_key_size)(struct crypt_device *cd);
+extern int (*sym_crypt_init)(struct crypt_device **cd, const char *device);
+extern int (*sym_crypt_init_by_name)(struct crypt_device **cd, const char *name);
+extern int (*sym_crypt_keyslot_add_by_volume_key)(struct crypt_device *cd, int keyslot, const char *volume_key, size_t volume_key_size, const char *passphrase, size_t passphrase_size);
+extern int (*sym_crypt_keyslot_destroy)(struct crypt_device *cd, int keyslot);
+extern int (*sym_crypt_keyslot_max)(const char *type);
+extern int (*sym_crypt_load)(struct crypt_device *cd, const char *requested_type, void *params);
+extern int (*sym_crypt_resize)(struct crypt_device *cd, const char *name, uint64_t new_size);
+extern int (*sym_crypt_resume_by_passphrase)(struct crypt_device *cd, const char *name, int keyslot, const char *passphrase, size_t passphrase_size);
+extern int (*sym_crypt_set_data_device)(struct crypt_device *cd, const char *device);
+extern void (*sym_crypt_set_debug_level)(int level);
+extern void (*sym_crypt_set_log_callback)(struct crypt_device *cd, void (*log)(int level, const char *msg, void *usrptr), void *usrptr);
+#if HAVE_CRYPT_SET_METADATA_SIZE
+extern int (*sym_crypt_set_metadata_size)(struct crypt_device *cd, uint64_t metadata_size, uint64_t keyslots_size);
+#endif
+extern int (*sym_crypt_set_pbkdf_type)(struct crypt_device *cd, const struct crypt_pbkdf_type *pbkdf);
+extern int (*sym_crypt_suspend)(struct crypt_device *cd, const char *name);
+extern int (*sym_crypt_token_json_get)(struct crypt_device *cd, int token, const char **json);
+extern int (*sym_crypt_token_json_set)(struct crypt_device *cd, int token, const char *json);
+#if HAVE_CRYPT_TOKEN_MAX
+extern int (*sym_crypt_token_max)(const char *type);
+#else
+/* Stand-in for crypt_token_max() on libcryptsetup versions that lack it: reports the same hard-coded
+ * value libcryptsetup uses internally. Only the LUKS2 type is expected as argument. */
+static inline int crypt_token_max(_unused_ const char *type) {
+        const int fallback_max = 32;
+
+        assert(streq(type, CRYPT_LUKS2));
+        return fallback_max;
+}
+#define sym_crypt_token_max(type) crypt_token_max(type)
+#endif
+extern crypt_token_info (*sym_crypt_token_status)(struct crypt_device *cd, int token, const char **type);
+extern int (*sym_crypt_volume_key_get)(struct crypt_device *cd, int keyslot, char *volume_key, size_t *volume_key_size, const char *passphrase, size_t passphrase_size);
+#if HAVE_CRYPT_REENCRYPT_INIT_BY_PASSPHRASE
+extern int (*sym_crypt_reencrypt_init_by_passphrase)(struct crypt_device *cd, const char *name, const char *passphrase, size_t passphrase_size, int keyslot_old, int keyslot_new, const char *cipher, const char *cipher_mode, const struct crypt_params_reencrypt *params);
+#endif
+#if HAVE_CRYPT_REENCRYPT
+extern int (*sym_crypt_reencrypt)(struct crypt_device *cd, int (*progress)(uint64_t size, uint64_t offset, void *usrptr));
+#endif
+extern int (*sym_crypt_metadata_locking)(struct crypt_device *cd, int enable);
+#if HAVE_CRYPT_SET_DATA_OFFSET
+extern int (*sym_crypt_set_data_offset)(struct crypt_device *cd, uint64_t data_offset);
+#endif
+extern int (*sym_crypt_header_restore)(struct crypt_device *cd, const char *requested_type, const char *backup_file);
+extern int (*sym_crypt_volume_key_keyring)(struct crypt_device *cd, int enable);
+
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(struct crypt_device *, crypt_free, NULL);
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(struct crypt_device *, sym_crypt_free, NULL);
+
+/* Be careful, this works with dlopen_cryptsetup(), that is, it calls sym_crypt_free() instead of crypt_free(). */
+#define crypt_free_and_replace(a, b) \
+ free_and_replace_full(a, b, sym_crypt_free)
+
+void cryptsetup_enable_logging(struct crypt_device *cd);
+
+int cryptsetup_set_minimal_pbkdf(struct crypt_device *cd);
+
+int cryptsetup_get_token_as_json(struct crypt_device *cd, int idx, const char *verify_type, JsonVariant **ret);
+int cryptsetup_add_token_json(struct crypt_device *cd, JsonVariant *v);
+
+#else
+
+/* Without libcryptsetup we still declare the basic type and provide do-nothing destructors for it, so
+ * that main programs need a little less #ifdeferry. */
+struct crypt_device;
+static inline void sym_crypt_free(struct crypt_device* cd) {
+        /* Intentionally empty. */
+}
+static inline void sym_crypt_freep(struct crypt_device** cd) {
+        /* Intentionally empty. */
+}
+
+#endif
+
+int dlopen_cryptsetup(void);
+
+int cryptsetup_get_keyslot_from_token(JsonVariant *v);
+
+static inline const char *mangle_none(const char *s) {
+        /* Normalizes cryptsetup/integritysetup/veritysetup "options" strings: a string that is
+         * effectively empty (empty, "-" or "none") is turned into NULL, anything else is returned
+         * unchanged. */
+        if (isempty(s))
+                return NULL;
+        if (STR_IN_SET(s, "-", "none"))
+                return NULL;
+
+        return s;
+}
diff --git a/src/shared/daemon-util.c b/src/shared/daemon-util.c
new file mode 100644
index 0000000..32180a1
--- /dev/null
+++ b/src/shared/daemon-util.c
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "daemon-util.h"
+#include "fd-util.h"
+#include "log.h"
+#include "string-util.h"
+
+static int notify_remove_fd_warn(const char *name) {
+        int r;
+
+        /* Sends an FDSTOREREMOVE=1 notification for the fd store entry called 'name'. Returns 0 on
+         * success; on failure a warning is logged and the result of log_warning_errno() is returned. */
+
+        assert(name);
+
+        r = sd_notifyf(/* unset_environment = */ false,
+                       "FDSTOREREMOVE=1\n"
+                       "FDNAME=%s", name);
+        if (r >= 0)
+                return 0;
+
+        return log_warning_errno(r,
+                                 "Failed to remove file descriptor \"%s\" from the store, ignoring: %m",
+                                 name);
+}
+
+int notify_remove_fd_warnf(const char *format, ...) {
+        _cleanup_free_ char *name = NULL;
+        va_list args;
+        int n;
+
+        /* printf-style front end for notify_remove_fd_warn(): formats the fd name and asks for its
+         * removal from the fd store. If the name cannot be allocated, log_oom() is returned. */
+
+        assert(format);
+
+        va_start(args, format);
+        n = vasprintf(&name, format, args);
+        va_end(args);
+
+        return n < 0 ? log_oom() : notify_remove_fd_warn(name);
+}
+
+int close_and_notify_warn(int fd, const char *name) {
+        /* If a name is given, first asks for its removal from the fd store (a failure there is only
+         * logged, never propagated). Then closes 'fd' and returns what safe_close() returns. */
+        if (name != NULL)
+                (void) notify_remove_fd_warn(name);
+
+        return safe_close(fd);
+}
+
+static int notify_push_fd(int fd, const char *name) {
+        _cleanup_free_ char *message = NULL;
+
+        /* Submits 'fd' along with an FDSTORE=1 notification that names it 'name'. Returns -ENOMEM if
+         * the notification string cannot be allocated, otherwise the result of
+         * sd_pid_notify_with_fds(). */
+
+        assert(fd >= 0);
+        assert(name);
+
+        message = strjoin("FDSTORE=1\n"
+                          "FDNAME=", name);
+        if (!message)
+                return -ENOMEM;
+
+        return sd_pid_notify_with_fds(/* pid = */ 0,
+                                      /* unset_environment = */ false,
+                                      message,
+                                      &fd, /* n_fds = */ 1);
+}
+
+int notify_push_fdf(int fd, const char *format, ...) {
+        _cleanup_free_ char *fdname = NULL;
+        va_list args;
+        int n;
+
+        /* printf-style front end for notify_push_fd(): formats the fd name, then pushes 'fd' under that
+         * name. Returns -ENOMEM (without logging) if the name cannot be allocated. */
+
+        assert(fd >= 0);
+        assert(format);
+
+        va_start(args, format);
+        n = vasprintf(&fdname, format, args);
+        va_end(args);
+
+        return n < 0 ? -ENOMEM : notify_push_fd(fd, fdname);
+}
diff --git a/src/shared/daemon-util.h b/src/shared/daemon-util.h
new file mode 100644
index 0000000..711885b
--- /dev/null
+++ b/src/shared/daemon-util.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "sd-daemon.h"
+
+#include "macro.h"
+
+#define NOTIFY_READY "READY=1\n" "STATUS=Processing requests..."
+#define NOTIFY_STOPPING "STOPPING=1\n" "STATUS=Shutting down..."
+
+static inline const char *notify_start(const char *start, const char *stop) {
+        /* Sends the 'start' notification (if one is given, ignoring any failure) and hands back 'stop'
+         * unchanged, so that the caller can keep it around for later. */
+        if (!start)
+                return stop;
+
+        (void) sd_notify(false, start);
+        return stop;
+}
+
+/* Intended to be used with the _cleanup_ attribute: sends the notification string that *p points to, if
+ * there is one. A failure to send is ignored. */
+static inline void notify_on_cleanup(const char **p) {
+        const char *message = *p;
+
+        if (!message)
+                return;
+
+        (void) sd_notify(false, message);
+}
+
+int notify_remove_fd_warnf(const char *format, ...) _printf_(1, 2);
+int close_and_notify_warn(int fd, const char *name);
+int notify_push_fdf(int fd, const char *format, ...) _printf_(2, 3);
diff --git a/src/shared/data-fd-util.c b/src/shared/data-fd-util.c
new file mode 100644
index 0000000..b939206
--- /dev/null
+++ b/src/shared/data-fd-util.c
@@ -0,0 +1,391 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#if HAVE_LINUX_MEMFD_H
+#include <linux/memfd.h>
+#endif
+
+#include "alloc-util.h"
+#include "copy.h"
+#include "data-fd-util.h"
+#include "fd-util.h"
+#include "fs-util.h"
+#include "io-util.h"
+#include "memfd-util.h"
+#include "missing_mman.h"
+#include "missing_syscall.h"
+#include "tmpfile-util.h"
+
+/* When the data is smaller or equal to 64K, try to place the copy in a memfd/pipe */
+#define DATA_FD_MEMORY_LIMIT (64U*1024U)
+
+/* If memfd/pipe didn't work out, then let's use a file in /tmp up to a size of 1M. If it's larger than that use /var/tmp instead. */
+#define DATA_FD_TMP_LIMIT (1024U*1024U)
+
+int acquire_data_fd(const void *data, size_t size, unsigned flags) {
+ _cleanup_close_pair_ int pipefds[2] = EBADF_PAIR;
+ _cleanup_close_ int fd = -EBADF;
+ int isz = 0, r;
+ ssize_t n;
+
+ assert(data || size == 0);
+
+ /* Acquire a read-only file descriptor that when read from returns the specified data. This is much more
+ * complex than I wish it was. But here's why:
+ *
+ * a) First we try to use memfds. They are the best option, as we can seal them nicely to make them
+ * read-only. Unfortunately they require kernel 3.17, and – at the time of writing – we still support 3.14.
+ *
+ * b) Then, we try classic pipes. They are the second best options, as we can close the writing side, retaining
+ * a nicely read-only fd in the reading side. However, they are by default quite small, and unprivileged
+ * clients can only bump their size to a system-wide limit, which might be quite low.
+ *
+ * c) Then, we try an O_TMPFILE file in /dev/shm (that dir is the only suitable one known to exist from
+ * earliest boot on). To make it read-only we open the fd a second time with O_RDONLY via
+ * /proc/self/fd/<fd>. Unfortunately O_TMPFILE is not available on older kernels on tmpfs.
+ *
+ * d) Finally, we try creating a regular file in /dev/shm, which we then delete.
+ *
+ * It sucks a bit that depending on the situation we return very different objects here, but that's Linux I
+ * figure.
+ *
+ * Each of the ACQUIRE_NO_* bits in 'flags' disables one of the strategies above (or the /dev/null
+ * shortcut for empty data). Returns the new file descriptor on success and a negative errno-style
+ * value on failure; if no strategy is left to try, -EOPNOTSUPP is returned. */
+
+ if (size == 0 && ((flags & ACQUIRE_NO_DEV_NULL) == 0))
+ /* As a special case, return /dev/null if we have been called for an empty data block */
+ return RET_NERRNO(open("/dev/null", O_RDONLY|O_CLOEXEC|O_NOCTTY));
+
+ if ((flags & ACQUIRE_NO_MEMFD) == 0) {
+ fd = memfd_new_and_seal("data-fd", data, size);
+ if (fd < 0) {
+ /* Only a "not supported" failure makes us fall back to a pipe, any other error is returned as is */
+ if (ERRNO_IS_NOT_SUPPORTED(fd))
+ goto try_pipe;
+
+ return fd;
+ }
+
+ return TAKE_FD(fd);
+ }
+
+try_pipe:
+ if ((flags & ACQUIRE_NO_PIPE) == 0) {
+ if (pipe2(pipefds, O_CLOEXEC|O_NONBLOCK) < 0)
+ return -errno;
+
+ isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0);
+ if (isz < 0)
+ return -errno;
+
+ if ((size_t) isz < size) {
+ /* The pipe size is handled as an int here, hence refuse sizes that do not survive the conversion */
+ isz = (int) size;
+ if (isz < 0 || (size_t) isz != size)
+ return -E2BIG;
+
+ /* Try to bump the pipe size */
+ (void) fcntl(pipefds[1], F_SETPIPE_SZ, isz);
+
+ /* See if that worked */
+ isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0);
+ if (isz < 0)
+ return -errno;
+
+ if ((size_t) isz < size)
+ goto try_dev_shm;
+ }
+
+ n = write(pipefds[1], data, size);
+ if (n < 0)
+ return -errno;
+ if ((size_t) n != size)
+ return -EIO;
+
+ /* The pipe was created with O_NONBLOCK, turn that off again for the reading side we hand out */
+ (void) fd_nonblock(pipefds[0], false);
+
+ return TAKE_FD(pipefds[0]);
+ }
+
+try_dev_shm:
+ if ((flags & ACQUIRE_NO_TMPFILE) == 0) {
+ fd = open("/dev/shm", O_RDWR|O_TMPFILE|O_CLOEXEC, 0500);
+ if (fd < 0)
+ goto try_dev_shm_without_o_tmpfile;
+
+ n = write(fd, data, size);
+ if (n < 0)
+ return -errno;
+ if ((size_t) n != size)
+ return -EIO;
+
+ /* Let's reopen the thing, in order to get an O_RDONLY fd for the original O_RDWR one */
+ return fd_reopen(fd, O_RDONLY|O_CLOEXEC);
+ }
+
+try_dev_shm_without_o_tmpfile:
+ if ((flags & ACQUIRE_NO_REGULAR) == 0) {
+ char pattern[] = "/dev/shm/data-fd-XXXXXX";
+
+ fd = mkostemp_safe(pattern);
+ if (fd < 0)
+ return fd;
+
+ n = write(fd, data, size);
+ if (n < 0) {
+ r = -errno;
+ goto unlink_and_return;
+ }
+ if ((size_t) n != size) {
+ r = -EIO;
+ goto unlink_and_return;
+ }
+
+ /* Let's reopen the thing, in order to get an O_RDONLY fd for the original O_RDWR one */
+ r = fd_reopen(fd, O_RDONLY|O_CLOEXEC);
+
+ unlink_and_return:
+ /* The file is removed again on success and on failure alike */
+ (void) unlink(pattern);
+ return r;
+ }
+
+ return -EOPNOTSUPP;
+}
+
+int copy_data_fd(int fd) {
+ _cleanup_close_ int copy_fd = -EBADF, tmp_fd = -EBADF;
+ _cleanup_free_ void *remains = NULL;
+ size_t remains_size = 0;
+ const char *td;
+ struct stat st;
+ int r;
+
+ /* Creates a 'data' fd from the specified source fd, containing all the same data in a read-only fashion, but
+ * independent of it (i.e. the source fd can be closed and unmounted after this call succeeded). Tries to be
+ * somewhat smart about where to place the data. In the best case uses a memfd(). If memfd() are not supported
+ * uses a pipe instead. For larger data will use an unlinked file in /tmp, and for even larger data one in
+ * /var/tmp.
+ *
+ * Returns the new file descriptor on success, and a negative errno-style value on failure. */
+
+ if (fstat(fd, &st) < 0)
+ return -errno;
+
+ /* For now, let's only accept regular files, sockets, pipes and char devices */
+ if (S_ISDIR(st.st_mode))
+ return -EISDIR;
+ if (S_ISLNK(st.st_mode))
+ return -ELOOP;
+ if (!S_ISREG(st.st_mode) && !S_ISSOCK(st.st_mode) && !S_ISFIFO(st.st_mode) && !S_ISCHR(st.st_mode))
+ return -EBADFD;
+
+ /* If we have reason to believe the data is bounded in size, then let's use memfds or pipes as backing fd. Note
+ * that we use the reported regular file size only as a hint, given that there are plenty special files in
+ * /proc and /sys which report a zero file size but can be read from. */
+
+ if (!S_ISREG(st.st_mode) || st.st_size < DATA_FD_MEMORY_LIMIT) {
+
+ /* Try a memfd first */
+ copy_fd = memfd_new("data-fd");
+ if (copy_fd >= 0) {
+ off_t f;
+
+ r = copy_bytes(fd, copy_fd, DATA_FD_MEMORY_LIMIT, 0);
+ if (r < 0)
+ return r;
+
+ /* Rewind the memfd, so that it can be read from the start, either by the caller or by the fallback code below */
+ f = lseek(copy_fd, 0, SEEK_SET);
+ if (f != 0)
+ return -errno;
+
+ if (r == 0) {
+ /* Did it fit into the limit? If so, we are done. */
+ r = memfd_set_sealed(copy_fd);
+ if (r < 0)
+ return r;
+
+ return TAKE_FD(copy_fd);
+ }
+
+ /* Hmm, pity, this didn't fit. Let's fall back to /tmp then, see below */
+
+ } else {
+ _cleanup_close_pair_ int pipefds[2] = EBADF_PAIR;
+ int isz;
+
+ /* If memfds aren't available, use a pipe. Set O_NONBLOCK so that we will get EAGAIN rather
+ * than block indefinitely when we hit the pipe size limit */
+
+ if (pipe2(pipefds, O_CLOEXEC|O_NONBLOCK) < 0)
+ return -errno;
+
+ isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0);
+ if (isz < 0)
+ return -errno;
+
+ /* Try to enlarge the pipe size if necessary */
+ if ((size_t) isz < DATA_FD_MEMORY_LIMIT) {
+
+ (void) fcntl(pipefds[1], F_SETPIPE_SZ, DATA_FD_MEMORY_LIMIT);
+
+ isz = fcntl(pipefds[1], F_GETPIPE_SZ, 0);
+ if (isz < 0)
+ return -errno;
+ }
+
+ /* Only use the pipe if it is large enough, otherwise go directly for the temporary file fallbacks below */
+ if ((size_t) isz >= DATA_FD_MEMORY_LIMIT) {
+
+ r = copy_bytes_full(fd, pipefds[1], DATA_FD_MEMORY_LIMIT, 0, &remains, &remains_size, NULL, NULL);
+ if (r < 0 && r != -EAGAIN)
+ return r; /* If we get EAGAIN it could be because of the source or because of
+ * the destination fd, we can't know, as sendfile() and friends won't
+ * tell us. Hence, treat this as reason to fall back, just to be
+ * sure. */
+ if (r == 0) {
+ /* Everything fit in, yay! */
+ (void) fd_nonblock(pipefds[0], false);
+
+ return TAKE_FD(pipefds[0]);
+ }
+
+ /* Things didn't fit in. But we read data into the pipe, let's remember that, so that
+ * when writing the new file we incorporate this first. */
+ copy_fd = TAKE_FD(pipefds[0]);
+ }
+ }
+ }
+
+ /* If we have reason to believe this will fit fine in /tmp, then use that as first fallback. */
+ if ((!S_ISREG(st.st_mode) || st.st_size < DATA_FD_TMP_LIMIT) &&
+ (DATA_FD_MEMORY_LIMIT + remains_size) < DATA_FD_TMP_LIMIT) {
+ off_t f;
+
+ tmp_fd = open_tmpfile_unlinkable(NULL /* NULL as directory means /tmp */, O_RDWR|O_CLOEXEC);
+ if (tmp_fd < 0)
+ return tmp_fd;
+
+ if (copy_fd >= 0) {
+ /* If we tried a memfd/pipe first and it ended up being too large, then copy this into the
+ * temporary file first. */
+
+ r = copy_bytes(copy_fd, tmp_fd, UINT64_MAX, 0);
+ if (r < 0)
+ return r;
+
+ assert(r == 0);
+ }
+
+ if (remains_size > 0) {
+ /* If there were remaining bytes (i.e. read into memory, but not written out yet) from the
+ * failed copy operation, let's flush them out next. */
+
+ r = loop_write(tmp_fd, remains, remains_size);
+ if (r < 0)
+ return r;
+ }
+
+ /* Copy the rest of the source, but only up to what is left of the /tmp budget */
+ r = copy_bytes(fd, tmp_fd, DATA_FD_TMP_LIMIT - DATA_FD_MEMORY_LIMIT - remains_size, COPY_REFLINK);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ goto finish; /* Yay, it fit in */
+
+ /* It didn't fit in. Let's not forget to use what we already used */
+ f = lseek(tmp_fd, 0, SEEK_SET);
+ if (f != 0)
+ return -errno;
+
+ /* From here on the file in /tmp is the partial copy that gets copied into the /var/tmp file below */
+ close_and_replace(copy_fd, tmp_fd);
+
+ /* The remaining bytes were already written to the file in /tmp above, hence drop them */
+ remains = mfree(remains);
+ remains_size = 0;
+ }
+
+ /* As last fallback use /var/tmp */
+ r = var_tmp_dir(&td);
+ if (r < 0)
+ return r;
+
+ tmp_fd = open_tmpfile_unlinkable(td, O_RDWR|O_CLOEXEC);
+ if (tmp_fd < 0)
+ return tmp_fd;
+
+ if (copy_fd >= 0) {
+ /* If we tried a memfd/pipe first, or a file in /tmp, and it ended up being too large, then copy this
+ * into the temporary file first. */
+ r = copy_bytes(copy_fd, tmp_fd, UINT64_MAX, COPY_REFLINK);
+ if (r < 0)
+ return r;
+
+ assert(r == 0);
+ }
+
+ if (remains_size > 0) {
+ /* Then, copy in any read but not yet written bytes. */
+ r = loop_write(tmp_fd, remains, remains_size);
+ if (r < 0)
+ return r;
+ }
+
+ /* Copy in the rest */
+ r = copy_bytes(fd, tmp_fd, UINT64_MAX, COPY_REFLINK);
+ if (r < 0)
+ return r;
+
+ assert(r == 0);
+
+finish:
+ /* Now convert the O_RDWR file descriptor into an O_RDONLY one (and as side effect seek to the beginning of the
+ * file again) */
+
+ return fd_reopen(tmp_fd, O_RDONLY|O_CLOEXEC);
+}
+
+int memfd_clone_fd(int fd, const char *name, int mode) {
+        _cleanup_close_ int mfd = -EBADF;
+        bool read_only, executable;
+        unsigned create_flags;
+        struct stat st;
+        int r;
+
+        /* Clones the contents of 'fd' into a memfd called 'name'. In contrast to copy_data_fd() the
+         * result is strictly a memfd (if that is not possible the call fails), hence the returned fd is
+         * seekable and definitely reports as S_ISREG.
+         *
+         * 'mode' selects O_RDONLY or O_RDWR access, optionally combined with O_CLOEXEC. The memfd is
+         * created executable if the source has any of the 0111 mode bits set. Returns the new fd on
+         * success, a negative errno-style value on failure. */
+
+        assert(fd >= 0);
+        assert(name);
+        assert(IN_SET(mode & O_ACCMODE, O_RDONLY, O_RDWR));
+        assert((mode & ~(O_RDONLY|O_RDWR|O_CLOEXEC)) == 0);
+
+        if (fstat(fd, &st) < 0)
+                return -errno;
+
+        read_only = (mode & O_ACCMODE) == O_RDONLY;
+        executable = (st.st_mode & 0111) != 0;
+
+        create_flags = executable ? MFD_EXEC : MFD_NOEXEC_SEAL;
+        if (read_only)
+                /* This memfd is sealed and reopened below, and closed again when we return */
+                create_flags |= MFD_CLOEXEC | MFD_ALLOW_SEALING;
+        else if (FLAGS_SET(mode, O_CLOEXEC))
+                create_flags |= MFD_CLOEXEC;
+
+        mfd = memfd_create_wrapper(name, create_flags);
+        if (mfd < 0)
+                return mfd;
+
+        r = copy_bytes(fd, mfd, UINT64_MAX, COPY_REFLINK);
+        if (r < 0)
+                return r;
+
+        if (!read_only) {
+                /* Read-write case: rewind and hand out the memfd we wrote to */
+                if (lseek(mfd, 0, SEEK_SET) < 0)
+                        return -errno;
+
+                return TAKE_FD(mfd);
+        }
+
+        /* Read-only case: seal the memfd, then hand out a second fd opened with the requested mode */
+        r = memfd_set_sealed(mfd);
+        if (r < 0)
+                return r;
+
+        return fd_reopen(mfd, mode);
+}
diff --git a/src/shared/data-fd-util.h b/src/shared/data-fd-util.h
new file mode 100644
index 0000000..4f3d8b8
--- /dev/null
+++ b/src/shared/data-fd-util.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <inttypes.h>
+
+/* Flags for acquire_data_fd(). Each bit turns off one of the things that function would otherwise try. */
+enum {
+        ACQUIRE_NO_DEV_NULL = 1 << 0, /* don't return /dev/null for an empty data block */
+        ACQUIRE_NO_MEMFD    = 1 << 1, /* don't try a memfd */
+        ACQUIRE_NO_PIPE     = 1 << 2, /* don't try a pipe */
+        ACQUIRE_NO_TMPFILE  = 1 << 3, /* don't try an O_TMPFILE file in /dev/shm */
+        ACQUIRE_NO_REGULAR  = 1 << 4, /* don't try a regular (then unlinked) file in /dev/shm */
+};
+
+int acquire_data_fd(const void *data, size_t size, unsigned flags);
+int copy_data_fd(int fd);
+int memfd_clone_fd(int fd, const char *name, int mode);
diff --git a/src/shared/dev-setup.c b/src/shared/dev-setup.c
new file mode 100644
index 0000000..f7ed161
--- /dev/null
+++ b/src/shared/dev-setup.c
@@ -0,0 +1,137 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "dev-setup.h"
+#include "fd-util.h"
+#include "label-util.h"
+#include "lock-util.h"
+#include "log.h"
+#include "mkdir-label.h"
+#include "nulstr-util.h"
+#include "path-util.h"
+#include "terminal-util.h"
+#include "umask-util.h"
+#include "user-util.h"
+
+int lock_dev_console(void) {
+        _cleanup_close_ int console_fd = -EBADF;
+        int r;
+
+        /* Opens /dev/console and takes a LOCK_BSD/LOCK_EX lock on it. Returns the locked fd on success.
+         * If opening fails the error is returned as is; if locking fails the error is logged first. */
+
+        console_fd = open_terminal("/dev/console", O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW);
+        if (console_fd < 0)
+                return console_fd;
+
+        r = lock_generic(console_fd, LOCK_BSD, LOCK_EX);
+        if (r >= 0)
+                return TAKE_FD(console_fd);
+
+        return log_error_errno(r, "Failed to lock /dev/console: %m");
+}
+
+int dev_setup(const char *prefix, uid_t uid, gid_t gid) {
+        /* Pairs of symlink target and symlink path. A target prefixed with "-" is optional: its
+         * symlink is only created if the target is accessible. */
+        static const char symlinks[] =
+                "-/proc/kcore\0"
+                "/dev/core\0"
+                "/proc/self/fd\0"
+                "/dev/fd\0"
+                "/proc/self/fd/0\0"
+                "/dev/stdin\0"
+                "/proc/self/fd/1\0"
+                "/dev/stdout\0"
+                "/proc/self/fd/2\0"
+                "/dev/stderr\0";
+
+        /* Creates the symlinks listed above, below 'prefix' if one is given. If 'uid' or 'gid' is valid,
+         * ownership of each link is adjusted too. Failures to create or chown a link are only logged at
+         * debug level; the only error returned is -ENOMEM. */
+
+        NULSTR_FOREACH_PAIR(j, k, symlinks) {
+                _cleanup_free_ char *prefixed = NULL;
+                const char *target = j, *link_path = k;
+                int r;
+
+                if (target[0] == '-') {
+                        target++;
+
+                        if (access(target, F_OK) < 0)
+                                continue;
+                }
+
+                if (prefix) {
+                        prefixed = path_join(prefix, k);
+                        if (!prefixed)
+                                return -ENOMEM;
+
+                        link_path = prefixed;
+                }
+
+                r = symlink_label(target, link_path);
+                if (r < 0)
+                        log_debug_errno(r, "Failed to symlink %s to %s: %m", target, link_path);
+
+                if ((uid != UID_INVALID || gid != GID_INVALID) &&
+                    lchown(link_path, uid, gid) < 0)
+                        log_debug_errno(errno, "Failed to chown %s: %m", link_path);
+        }
+
+        return 0;
+}
+
+int make_inaccessible_nodes(
+                const char *parent_dir,
+                uid_t uid,
+                gid_t gid) {
+
+        static const struct {
+                const char *name;
+                mode_t mode;
+        } table[] = {
+                { "inaccessible",      S_IFDIR  | 0755 },
+                { "inaccessible/reg",  S_IFREG  | 0000 },
+                { "inaccessible/dir",  S_IFDIR  | 0000 },
+                { "inaccessible/fifo", S_IFIFO  | 0000 },
+                { "inaccessible/sock", S_IFSOCK | 0000 },
+
+                /* The following two are likely to fail if we lack the privs for it (for example in an userns
+                 * environment, if CAP_SYS_MKNOD is missing, or if a device node policy prohibits creation of
+                 * device nodes with a major/minor of 0). But that's entirely fine. Consumers of these files
+                 * should implement falling back to use a different node then, for example
+                 * <root>/inaccessible/sock, which is close enough in behaviour and semantics for most uses.
+                 */
+                { "inaccessible/chr",  S_IFCHR  | 0000 },
+                { "inaccessible/blk",  S_IFBLK  | 0000 },
+        };
+
+        const char *base = parent_dir ? parent_dir : "/run/systemd";
+
+        BLOCK_WITH_UMASK(0000);
+
+        /* Sets up inaccessible (and empty) file nodes of all types below 'parent_dir' (or /run/systemd if
+         * that is NULL). These are used as mount sources for over-mounting ("masking") file nodes that
+         * shall become inaccessible and empty for specific containers or services. We try to lock down
+         * these nodes as much as we can, but otherwise try to match them as closely as possible with the
+         * underlying file, i.e. in the best case we offer the same node type as the underlying node.
+         *
+         * A node that cannot be created or chown()ed is logged at debug level and skipped. */
+
+        for (size_t i = 0; i < ELEMENTSOF(table); i++) {
+                _cleanup_free_ char *path = NULL;
+                mode_t mode = table[i].mode;
+                int r;
+
+                path = path_join(base, table[i].name);
+                if (!path)
+                        return log_oom();
+
+                r = S_ISDIR(mode) ? mkdir_label(path, mode & 07777)
+                                  : mknod_label(path, mode, makedev(0, 0));
+                if (r < 0) {
+                        log_debug_errno(r, "Failed to create '%s', ignoring: %m", path);
+                        continue;
+                }
+
+                if (uid == UID_INVALID && gid == GID_INVALID)
+                        continue;
+
+                if (lchown(path, uid, gid) < 0)
+                        log_debug_errno(errno, "Failed to chown '%s': %m", path);
+        }
+
+        return 0;
+}
diff --git a/src/shared/dev-setup.h b/src/shared/dev-setup.h
new file mode 100644
index 0000000..5339bc4
--- /dev/null
+++ b/src/shared/dev-setup.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <sys/types.h>
+
+int lock_dev_console(void);
+
+int dev_setup(const char *prefix, uid_t uid, gid_t gid);
+
+int make_inaccessible_nodes(const char *parent_dir, uid_t uid, gid_t gid);
diff --git a/src/shared/device-nodes.c b/src/shared/device-nodes.c
new file mode 100644
index 0000000..d08c40f
--- /dev/null
+++ b/src/shared/device-nodes.c
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/stat.h>
+
+#include "device-nodes.h"
+#include "path-util.h"
+#include "string-util.h"
+#include "utf8.h"
+
+int allow_listed_char_for_devnode(char c, const char *additional) {
+        /* Returns 1 if 'c' is an ASCII digit or letter, one of "#+-.:=@_", or contained in the optional
+         * 'additional' string, and 0 otherwise.
+         *
+         * Note that strchr() also matches the terminating NUL of the string it searches, hence
+         * c == '\0' is reported as allowed. */
+        if (ascii_isdigit(c) || ascii_isalpha(c))
+                return 1;
+
+        if (strchr("#+-.:=@_", c))
+                return 1;
+
+        return additional && strchr(additional, c);
+}
+
+int encode_devnode_name(const char *str, char *str_enc, size_t len) {
+        static const char hex_digits[] = "0123456789abcdef";
+        size_t i, j;
+
+        /* Encodes 'str' into the buffer 'str_enc' of 'len' bytes, so that the result only consists of
+         * characters allowed in device node names:
+         *
+         *  - valid multi-byte UTF-8 sequences are copied verbatim,
+         *  - a backslash and any byte rejected by allow_listed_char_for_devnode() is written as "\xNN"
+         *    (two lowercase hex digits),
+         *  - everything else is copied as is.
+         *
+         * The result is always NUL terminated on success. Returns 0 on success, and -EINVAL if either
+         * pointer is NULL or if the buffer is too small for the encoded string plus its terminator. */
+
+        if (!str || !str_enc)
+                return -EINVAL;
+
+        for (i = 0, j = 0; str[i] != '\0'; i++) {
+                int seqlen;
+
+                seqlen = utf8_encoded_valid_unichar(str + i, SIZE_MAX);
+                if (seqlen > 1) {
+
+                        if (len-j < (size_t) seqlen)
+                                return -EINVAL;
+
+                        memcpy(&str_enc[j], &str[i], seqlen);
+                        j += seqlen;
+                        i += (seqlen-1);
+
+                } else if (str[i] == '\\' || !allow_listed_char_for_devnode(str[i], NULL)) {
+                        unsigned char c = (unsigned char) str[i];
+
+                        if (len-j < 4)
+                                return -EINVAL;
+
+                        /* Store exactly the four bytes we checked for. Formatting this with sprintf()
+                         * would store a terminating NUL as a fifth byte, which ends up one byte past the
+                         * end of the buffer if exactly four bytes are left. */
+                        str_enc[j++] = '\\';
+                        str_enc[j++] = 'x';
+                        str_enc[j++] = hex_digits[c >> 4];
+                        str_enc[j++] = hex_digits[c & 0x0f];
+
+                } else {
+                        if (len-j < 1)
+                                return -EINVAL;
+
+                        str_enc[j] = str[i];
+                        j++;
+                }
+        }
+
+        /* Room for the terminator is needed in any case */
+        if (len-j < 1)
+                return -EINVAL;
+
+        str_enc[j] = '\0';
+        return 0;
+}
+
+int devnode_same(const char *a, const char *b) {
+        struct stat st_a, st_b;
+
+        /* Checks whether the paths 'a' and 'b' refer to the same device node, i.e. whether both are
+         * device nodes of the same type (block or character) with the same st_rdev.
+         *
+         * Returns > 0 if so, 0 if not. Returns -EINVAL if either path fails valid_device_node_path(),
+         * -ENODEV if either path is neither a block nor a character device, and -errno if stat() fails. */
+
+        assert(a);
+        assert(b);
+
+        if (!(valid_device_node_path(a) && valid_device_node_path(b)))
+                return -EINVAL;
+
+        if (stat(a, &st_a) < 0)
+                return -errno;
+        if (stat(b, &st_b) < 0)
+                return -errno;
+
+        if (!(S_ISBLK(st_a.st_mode) || S_ISCHR(st_a.st_mode)))
+                return -ENODEV;
+        if (!(S_ISBLK(st_b.st_mode) || S_ISCHR(st_b.st_mode)))
+                return -ENODEV;
+
+        /* A block and a character device are never the same, even if their device numbers match */
+        if ((st_a.st_mode & S_IFMT) != (st_b.st_mode & S_IFMT))
+                return false;
+
+        return st_a.st_rdev == st_b.st_rdev;
+}
diff --git a/src/shared/device-nodes.h b/src/shared/device-nodes.h
new file mode 100644
index 0000000..8b17a8e
--- /dev/null
+++ b/src/shared/device-nodes.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stddef.h>
+
+int encode_devnode_name(const char *str, char *str_enc, size_t len);
+int allow_listed_char_for_devnode(char c, const char *additional);
+
+int devnode_same(const char *a, const char *b);
diff --git a/src/shared/devnode-acl.c b/src/shared/devnode-acl.c
new file mode 100644
index 0000000..b239699
--- /dev/null
+++ b/src/shared/devnode-acl.c
@@ -0,0 +1,226 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+
+#include "sd-device.h"
+
+#include "acl-util.h"
+#include "alloc-util.h"
+#include "device-util.h"
+#include "devnode-acl.h"
+#include "dirent-util.h"
+#include "fd-util.h"
+#include "format-util.h"
+#include "fs-util.h"
+#include "glyph-util.h"
+#include "set.h"
+#include "string-util.h"
+
+static int flush_acl(acl_t acl) {
+        acl_entry_t entry;
+        bool modified = false;
+        int found;
+
+        /* Deletes all ACL_USER entries from 'acl'. Returns > 0 if at least one entry was deleted, 0 if
+         * there was nothing to delete, and -errno if one of the ACL calls fails. */
+
+        assert(acl);
+
+        found = acl_get_entry(acl, ACL_FIRST_ENTRY, &entry);
+        while (found > 0) {
+                acl_tag_t tag;
+
+                if (acl_get_tag_type(entry, &tag) < 0)
+                        return -errno;
+
+                if (tag == ACL_USER) {
+                        if (acl_delete_entry(acl, entry) < 0)
+                                return -errno;
+
+                        modified = true;
+                }
+
+                found = acl_get_entry(acl, ACL_NEXT_ENTRY, &entry);
+        }
+
+        if (found < 0)
+                return -errno;
+
+        return modified;
+}
+
+int devnode_acl(const char *path,
+                bool flush,
+                bool del, uid_t old_uid,
+                bool add, uid_t new_uid) {
+
+        _cleanup_(acl_freep) acl_t acl = NULL;
+        bool dirty = false;
+        int r;
+
+        /* Adjusts the per-user entries of the access ACL of 'path':
+         *
+         *  - if 'flush' is set, all ACL_USER entries are dropped,
+         *  - otherwise, if 'del' is set and 'old_uid' is not 0, the entry of 'old_uid' is dropped,
+         *  - if 'add' is set and 'new_uid' is not 0, 'new_uid' gets an entry granting read and write.
+         *
+         * The ACL is only written back if something changed. Returns 0 on success, a negative
+         * errno-style value on failure. */
+
+        assert(path);
+
+        acl = acl_get_file(path, ACL_TYPE_ACCESS);
+        if (!acl)
+                return -errno;
+
+        if (flush) {
+                r = flush_acl(acl);
+                if (r < 0)
+                        return r;
+
+                dirty = r > 0;
+
+        } else if (del && old_uid > 0) {
+                acl_entry_t old_entry;
+
+                r = acl_find_uid(acl, old_uid, &old_entry);
+                if (r < 0)
+                        return r;
+                if (r > 0) {
+                        if (acl_delete_entry(acl, old_entry) < 0)
+                                return -errno;
+
+                        dirty = true;
+                }
+        }
+
+        if (add && new_uid > 0) {
+                acl_permset_t permset;
+                acl_entry_t new_entry;
+                int can_read, can_write;
+
+                r = acl_find_uid(acl, new_uid, &new_entry);
+                if (r < 0)
+                        return r;
+                if (r == 0) {
+                        /* No entry for this user yet, create one */
+                        if (acl_create_entry(&acl, &new_entry) < 0)
+                                return -errno;
+                        if (acl_set_tag_type(new_entry, ACL_USER) < 0)
+                                return -errno;
+                        if (acl_set_qualifier(new_entry, &new_uid) < 0)
+                                return -errno;
+                }
+
+                if (acl_get_permset(new_entry, &permset) < 0)
+                        return -errno;
+
+                can_read = acl_get_perm(permset, ACL_READ);
+                if (can_read < 0)
+                        return -errno;
+
+                can_write = acl_get_perm(permset, ACL_WRITE);
+                if (can_write < 0)
+                        return -errno;
+
+                if (!(can_read && can_write)) {
+                        if (acl_add_perm(permset, ACL_READ|ACL_WRITE) < 0)
+                                return -errno;
+
+                        dirty = true;
+                }
+        }
+
+        /* Nothing changed? Then leave the file alone. */
+        if (!dirty)
+                return 0;
+
+        if (acl_calc_mask(&acl) < 0)
+                return -errno;
+
+        if (acl_set_file(path, ACL_TYPE_ACCESS, acl) < 0)
+                return -errno;
+
+        return 0;
+}
+
+int devnode_acl_all(const char *seat,
+ bool flush,
+ bool del, uid_t old_uid,
+ bool add, uid_t new_uid) {
+
+ _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL;
+ _cleanup_set_free_ Set *nodes = NULL;
+ _cleanup_closedir_ DIR *dir = NULL;
+ char *n;
+ int r;
+
+ /* Calls devnode_acl() with the specified parameters on every device node that carries the "uaccess"
+ * tag and belongs to 'seat' ("seat0" if empty), plus on the static nodes listed in
+ * /run/udev/static_node-tags/uaccess. Nodes that vanished in the meantime (-ENOENT) are only logged;
+ * other errors of the individual devnode_acl() calls are collected via RET_GATHER() and returned. */
+
+ r = sd_device_enumerator_new(&e);
+ if (r < 0)
+ return r;
+
+ if (isempty(seat))
+ seat = "seat0";
+
+ /* We can only match by one tag in libudev. We choose
+ * "uaccess" for that. If we could match for two tags here we
+ * could add the seat name as second match tag, but this would
+ * be hardly optimizable in libudev, and hence checking the
+ * second tag manually in our loop is a good solution. */
+ r = sd_device_enumerator_add_match_tag(e, "uaccess");
+ if (r < 0)
+ return r;
+
+ FOREACH_DEVICE(e, d) {
+ const char *node, *sn;
+
+ /* Make sure the tag is still in place */
+ if (sd_device_has_current_tag(d, "uaccess") <= 0)
+ continue;
+
+ /* Devices without a (non-empty) ID_SEAT property are treated as belonging to seat0 */
+ if (sd_device_get_property_value(d, "ID_SEAT", &sn) < 0 || isempty(sn))
+ sn = "seat0";
+
+ if (!streq(seat, sn))
+ continue;
+
+ /* Skip devices for which no device node name can be obtained (e.g. in case people tagged
+ * devices that have no node) */
+ if (sd_device_get_devname(d, &node) < 0)
+ continue;
+
+ log_device_debug(d, "Found udev node %s for seat %s", node, seat);
+ r = set_put_strdup_full(&nodes, &path_hash_ops_free, node);
+ if (r < 0)
+ return r;
+ }
+
+ /* udev exports "dead" device nodes to allow module on-demand loading,
+ * these devices are not known to the kernel at this moment */
+ dir = opendir("/run/udev/static_node-tags/uaccess");
+ if (dir) {
+ FOREACH_DIRENT(de, dir, return -errno) {
+ r = readlinkat_malloc(dirfd(dir), de->d_name, &n);
+ if (r == -ENOENT)
+ continue;
+ if (r < 0) {
+ log_debug_errno(r,
+ "Unable to read symlink '/run/udev/static_node-tags/uaccess/%s', ignoring: %m",
+ de->d_name);
+ continue;
+ }
+
+ log_debug("Found static node %s for seat %s", n, seat);
+ /* NOTE(review): going by its name set_ensure_consume() takes ownership of n — confirm */
+ r = set_ensure_consume(&nodes, &path_hash_ops_free, n);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ /* Now apply the ACL change to every node collected above */
+ r = 0;
+ SET_FOREACH(n, nodes) {
+ int k;
+
+ log_debug("Changing ACLs at %s for seat %s (uid "UID_FMT"%s"UID_FMT"%s%s)",
+ n, seat, old_uid, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), new_uid,
+ del ? " del" : "", add ? " add" : "");
+
+ k = devnode_acl(n, flush, del, old_uid, add, new_uid);
+ if (k == -ENOENT)
+ log_debug("Device %s disappeared while setting ACLs", n);
+ else
+ RET_GATHER(r, k);
+ }
+
+ return r;
+}
diff --git a/src/shared/devnode-acl.h b/src/shared/devnode-acl.h
new file mode 100644
index 0000000..c88f3c0
--- /dev/null
+++ b/src/shared/devnode-acl.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+#include <sys/types.h>
+
+#if HAVE_ACL
+
+int devnode_acl(const char *path,
+ bool flush,
+ bool del, uid_t old_uid,
+ bool add, uid_t new_uid);
+
+int devnode_acl_all(const char *seat,
+ bool flush,
+ bool del, uid_t old_uid,
+ bool add, uid_t new_uid);
+#else
+
+static inline int devnode_acl(const char *path,
+ bool flush,
+ bool del, uid_t old_uid,
+ bool add, uid_t new_uid) {
+ return 0;
+}
+
+static inline int devnode_acl_all(const char *seat,
+ bool flush,
+ bool del, uid_t old_uid,
+ bool add, uid_t new_uid) {
+ return 0;
+}
+
+#endif
diff --git a/src/shared/discover-image.c b/src/shared/discover-image.c
new file mode 100644
index 0000000..e8f4dfb
--- /dev/null
+++ b/src/shared/discover-image.c
@@ -0,0 +1,1385 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/fs.h>
+#include <linux/loop.h>
+#include <linux/magic.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/file.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "btrfs-util.h"
+#include "chase.h"
+#include "chattr-util.h"
+#include "copy.h"
+#include "dirent-util.h"
+#include "discover-image.h"
+#include "dissect-image.h"
+#include "env-file.h"
+#include "env-util.h"
+#include "extension-util.h"
+#include "fd-util.h"
+#include "fs-util.h"
+#include "hashmap.h"
+#include "hostname-setup.h"
+#include "id128-util.h"
+#include "initrd-util.h"
+#include "lock-util.h"
+#include "log.h"
+#include "loop-util.h"
+#include "macro.h"
+#include "mkdir.h"
+#include "nulstr-util.h"
+#include "os-util.h"
+#include "path-util.h"
+#include "rm-rf.h"
+#include "stat-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "strv.h"
+#include "time-util.h"
+#include "utf8.h"
+#include "xattr-util.h"
+
+static const char* const image_search_path[_IMAGE_CLASS_MAX] = {
+ [IMAGE_MACHINE] = "/etc/machines\0" /* only place symlinks here */
+ "/run/machines\0" /* and here too */
+ "/var/lib/machines\0" /* the main place for images */
+ "/var/lib/container\0" /* legacy */
+ "/usr/local/lib/machines\0"
+ "/usr/lib/machines\0",
+
+ [IMAGE_PORTABLE] = "/etc/portables\0" /* only place symlinks here */
+ "/run/portables\0" /* and here too */
+ "/var/lib/portables\0" /* the main place for images */
+ "/usr/local/lib/portables\0"
+ "/usr/lib/portables\0",
+
+ /* Note that we don't allow storing extensions under /usr/, unlike with other image types. That's
+ * because extension images are supposed to extend /usr/, so you get into recursive races, especially
+ * with directory-based extensions, as the kernel's OverlayFS explicitly checks for this and errors
+ * out with -ELOOP if it finds that a lowerdir= is a child of another lowerdir=. */
+ [IMAGE_SYSEXT] = "/etc/extensions\0" /* only place symlinks here */
+ "/run/extensions\0" /* and here too */
+ "/var/lib/extensions\0", /* the main place for images */
+
+ [IMAGE_CONFEXT] = "/run/confexts\0" /* only place symlinks here */
+ "/var/lib/confexts\0" /* the main place for images */
+ "/usr/local/lib/confexts\0"
+ "/usr/lib/confexts\0",
+};
+
+/* Inside the initrd, use a slightly different set of search paths (i.e. include .extra/sysext in extension
+ * search dir) */
+static const char* const image_search_path_initrd[_IMAGE_CLASS_MAX] = {
+ /* (entries that aren't listed here will get the same search path as for the non initrd-case) */
+
+ [IMAGE_SYSEXT] = "/etc/extensions\0" /* only place symlinks here */
+ "/run/extensions\0" /* and here too */
+ "/var/lib/extensions\0" /* the main place for images */
+ "/.extra/sysext\0" /* put sysext picked up by systemd-stub last, since not trusted */
+};
+
+static const char* image_class_suffix_table[_IMAGE_CLASS_MAX] = {
+ [IMAGE_SYSEXT] = ".sysext",
+ [IMAGE_CONFEXT] = ".confext",
+};
+
+DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(image_class_suffix, ImageClass);
+
+static Image *image_free(Image *i) {
+ assert(i);
+
+ free(i->name);
+ free(i->path);
+
+ free(i->hostname);
+ strv_free(i->machine_info);
+ strv_free(i->os_release);
+ strv_free(i->sysext_release);
+ strv_free(i->confext_release);
+
+ return mfree(i);
+}
+
+DEFINE_TRIVIAL_REF_UNREF_FUNC(Image, image, image_free);
+DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(image_hash_ops, char, string_hash_func, string_compare_func,
+ Image, image_unref);
+
+static char **image_settings_path(Image *image) {
+ _cleanup_strv_free_ char **l = NULL;
+ _cleanup_free_ char *fn = NULL;
+ size_t i = 0;
+ int r;
+
+ assert(image);
+
+ l = new0(char*, 4);
+ if (!l)
+ return NULL;
+
+ fn = strjoin(image->name, ".nspawn");
+ if (!fn)
+ return NULL;
+
+ FOREACH_STRING(s, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
+ l[i] = path_join(s, fn);
+ if (!l[i])
+ return NULL;
+
+ i++;
+ }
+
+ r = file_in_same_dir(image->path, fn, l + i);
+ if (r == -ENOMEM)
+ return NULL;
+ if (r < 0)
+ log_debug_errno(r, "Failed to generate .nspawn settings path from image path, ignoring: %m");
+
+ strv_uniq(l);
+
+ return TAKE_PTR(l);
+}
+
+static int image_roothash_path(Image *image, char **ret) {
+ _cleanup_free_ char *fn = NULL;
+
+ assert(image);
+
+ fn = strjoin(image->name, ".roothash");
+ if (!fn)
+ return -ENOMEM;
+
+ return file_in_same_dir(image->path, fn, ret);
+}
+
+static int image_new(
+ ImageType t,
+ ImageClass c,
+ const char *pretty,
+ const char *path,
+ const char *filename,
+ bool read_only,
+ usec_t crtime,
+ usec_t mtime,
+ Image **ret) {
+
+ _cleanup_(image_unrefp) Image *i = NULL;
+
+ assert(t >= 0);
+ assert(t < _IMAGE_TYPE_MAX);
+ assert(pretty);
+ assert(filename);
+ assert(ret);
+
+ i = new(Image, 1);
+ if (!i)
+ return -ENOMEM;
+
+ *i = (Image) {
+ .n_ref = 1,
+ .type = t,
+ .class = c,
+ .read_only = read_only,
+ .crtime = crtime,
+ .mtime = mtime,
+ .usage = UINT64_MAX,
+ .usage_exclusive = UINT64_MAX,
+ .limit = UINT64_MAX,
+ .limit_exclusive = UINT64_MAX,
+ };
+
+ i->name = strdup(pretty);
+ if (!i->name)
+ return -ENOMEM;
+
+ i->path = path_join(path, filename);
+ if (!i->path)
+ return -ENOMEM;
+
+ path_simplify(i->path);
+
+ *ret = TAKE_PTR(i);
+
+ return 0;
+}
+
+static int extract_pretty(
+ const char *path,
+ const char *class_suffix,
+ const char *format_suffix,
+ char **ret) {
+
+ _cleanup_free_ char *name = NULL;
+ int r;
+
+ assert(path);
+ assert(ret);
+
+ r = path_extract_filename(path, &name);
+ if (r < 0)
+ return r;
+
+ if (format_suffix) {
+ char *e = endswith(name, format_suffix);
+ if (!e) /* Format suffix is required */
+ return -EINVAL;
+
+ *e = 0;
+ }
+
+ if (class_suffix) {
+ char *e = endswith(name, class_suffix);
+ if (e) /* Class suffix is optional */
+ *e = 0;
+ }
+
+ if (!image_name_is_valid(name))
+ return -EINVAL;
+
+ *ret = TAKE_PTR(name);
+ return 0;
+}
+
+static int image_make(
+ ImageClass c,
+ const char *pretty,
+ int dfd,
+ const char *path,
+ const char *filename,
+ const struct stat *st,
+ Image **ret) {
+
+ _cleanup_free_ char *pretty_buffer = NULL, *parent = NULL;
+ struct stat stbuf;
+ bool read_only;
+ int r;
+
+ assert(dfd >= 0 || dfd == AT_FDCWD);
+ assert(path || dfd == AT_FDCWD);
+ assert(filename);
+
+ /* We explicitly *do* follow symlinks here, since we want to allow symlinking trees, raw files and block
+ * devices into /var/lib/machines/, and treat them normally.
+ *
+ * This function returns -ENOENT if we can't find the image after all, and -EMEDIUMTYPE if it's not a file we
+ * recognize. */
+
+ if (!st) {
+ if (fstatat(dfd, filename, &stbuf, 0) < 0)
+ return -errno;
+
+ st = &stbuf;
+ }
+
+ if (!path) {
+ if (dfd == AT_FDCWD)
+ (void) safe_getcwd(&parent);
+ else
+ (void) fd_get_path(dfd, &parent);
+ }
+
+ read_only =
+ (path && path_startswith(path, "/usr")) ||
+ (faccessat(dfd, filename, W_OK, AT_EACCESS) < 0 && errno == EROFS);
+
+ if (S_ISDIR(st->st_mode)) {
+ _cleanup_close_ int fd = -EBADF;
+ unsigned file_attr = 0;
+ usec_t crtime = 0;
+
+ if (!ret)
+ return 0;
+
+ if (!pretty) {
+ r = extract_pretty(filename, image_class_suffix_to_string(c), NULL, &pretty_buffer);
+ if (r < 0)
+ return r;
+
+ pretty = pretty_buffer;
+ }
+
+ fd = openat(dfd, filename, O_CLOEXEC|O_NOCTTY|O_DIRECTORY);
+ if (fd < 0)
+ return -errno;
+
+ if (btrfs_might_be_subvol(st)) {
+
+ r = fd_is_fs_type(fd, BTRFS_SUPER_MAGIC);
+ if (r < 0)
+ return r;
+ if (r > 0) {
+ BtrfsSubvolInfo info;
+
+ /* It's a btrfs subvolume */
+
+ r = btrfs_subvol_get_info_fd(fd, 0, &info);
+ if (r < 0)
+ return r;
+
+ r = image_new(IMAGE_SUBVOLUME,
+ c,
+ pretty,
+ path,
+ filename,
+ info.read_only || read_only,
+ info.otime,
+ 0,
+ ret);
+ if (r < 0)
+ return r;
+
+ if (btrfs_quota_scan_ongoing(fd) == 0) {
+ BtrfsQuotaInfo quota;
+
+ r = btrfs_subvol_get_subtree_quota_fd(fd, 0, &quota);
+ if (r >= 0) {
+ (*ret)->usage = quota.referenced;
+ (*ret)->usage_exclusive = quota.exclusive;
+
+ (*ret)->limit = quota.referenced_max;
+ (*ret)->limit_exclusive = quota.exclusive_max;
+ }
+ }
+
+ return 0;
+ }
+ }
+
+ /* Get directory creation time (not available everywhere, but that's OK) */
+ (void) fd_getcrtime(fd, &crtime);
+
+ /* If the IMMUTABLE bit is set, we consider the directory read-only. Since the ioctl is not
+ * supported everywhere we ignore failures. */
+ (void) read_attr_fd(fd, &file_attr);
+
+ /* It's just a normal directory. */
+ r = image_new(IMAGE_DIRECTORY,
+ c,
+ pretty,
+ path,
+ filename,
+ read_only || (file_attr & FS_IMMUTABLE_FL),
+ crtime,
+ 0, /* we don't use mtime of stat() here, since it's not the time of last change of the tree, but only of the top-level dir */
+ ret);
+ if (r < 0)
+ return r;
+
+ return 0;
+
+ } else if (S_ISREG(st->st_mode) && endswith(filename, ".raw")) {
+ usec_t crtime = 0;
+
+ /* It's a RAW disk image */
+
+ if (!ret)
+ return 0;
+
+ (void) fd_getcrtime_at(dfd, filename, AT_SYMLINK_FOLLOW, &crtime);
+
+ if (!pretty) {
+ r = extract_pretty(filename, image_class_suffix_to_string(c), ".raw", &pretty_buffer);
+ if (r < 0)
+ return r;
+
+ pretty = pretty_buffer;
+ }
+
+ r = image_new(IMAGE_RAW,
+ c,
+ pretty,
+ path,
+ filename,
+ !(st->st_mode & 0222) || read_only,
+ crtime,
+ timespec_load(&st->st_mtim),
+ ret);
+ if (r < 0)
+ return r;
+
+ (*ret)->usage = (*ret)->usage_exclusive = st->st_blocks * 512;
+ (*ret)->limit = (*ret)->limit_exclusive = st->st_size;
+
+ return 0;
+
+ } else if (S_ISBLK(st->st_mode)) {
+ _cleanup_close_ int block_fd = -EBADF;
+ uint64_t size = UINT64_MAX;
+
+ /* A block device */
+
+ if (!ret)
+ return 0;
+
+ if (!pretty) {
+ r = extract_pretty(filename, NULL, NULL, &pretty_buffer);
+ if (r < 0)
+ return r;
+
+ pretty = pretty_buffer;
+ }
+
+ block_fd = openat(dfd, filename, O_RDONLY|O_NONBLOCK|O_CLOEXEC|O_NOCTTY);
+ if (block_fd < 0)
+ log_debug_errno(errno, "Failed to open block device %s/%s, ignoring: %m", path ?: strnull(parent), filename);
+ else {
+ /* Refresh stat data after opening the node */
+ if (fstat(block_fd, &stbuf) < 0)
+ return -errno;
+ st = &stbuf;
+
+ if (!S_ISBLK(st->st_mode)) /* Verify that what we opened is actually what we think it is */
+ return -ENOTTY;
+
+ if (!read_only) {
+ int state = 0;
+
+ if (ioctl(block_fd, BLKROGET, &state) < 0)
+ log_debug_errno(errno, "Failed to issue BLKROGET on device %s/%s, ignoring: %m", path ?: strnull(parent), filename);
+ else if (state)
+ read_only = true;
+ }
+
+ if (ioctl(block_fd, BLKGETSIZE64, &size) < 0)
+ log_debug_errno(errno, "Failed to issue BLKGETSIZE64 on device %s/%s, ignoring: %m", path ?: strnull(parent), filename);
+
+ block_fd = safe_close(block_fd);
+ }
+
+ r = image_new(IMAGE_BLOCK,
+ c,
+ pretty,
+ path,
+ filename,
+ !(st->st_mode & 0222) || read_only,
+ 0,
+ 0,
+ ret);
+ if (r < 0)
+ return r;
+
+ if (!IN_SET(size, 0, UINT64_MAX))
+ (*ret)->usage = (*ret)->usage_exclusive = (*ret)->limit = (*ret)->limit_exclusive = size;
+
+ return 0;
+ }
+
+ return -EMEDIUMTYPE;
+}
+
+static const char *pick_image_search_path(ImageClass class) {
+ if (class < 0 || class >= _IMAGE_CLASS_MAX)
+ return NULL;
+
+ /* Use the initrd search path if there is one, otherwise use the common one */
+ return in_initrd() && image_search_path_initrd[class] ? image_search_path_initrd[class] : image_search_path[class];
+}
+
+int image_find(ImageClass class,
+ const char *name,
+ const char *root,
+ Image **ret) {
+
+ int r;
+
+ assert(class >= 0);
+ assert(class < _IMAGE_CLASS_MAX);
+ assert(name);
+
+ /* There are no images with invalid names */
+ if (!image_name_is_valid(name))
+ return -ENOENT;
+
+ NULSTR_FOREACH(path, pick_image_search_path(class)) {
+ _cleanup_free_ char *resolved = NULL;
+ _cleanup_closedir_ DIR *d = NULL;
+ struct stat st;
+ int flags;
+
+ r = chase_and_opendir(path, root, CHASE_PREFIX_ROOT, &resolved, &d);
+ if (r == -ENOENT)
+ continue;
+ if (r < 0)
+ return r;
+
+ /* As mentioned above, we follow symlinks on this fstatat(), because we want to permit people
+ * to symlink block devices into the search path. (For now, we disable that when operating
+ * relative to some root directory.) */
+ flags = root ? AT_SYMLINK_NOFOLLOW : 0;
+ if (fstatat(dirfd(d), name, &st, flags) < 0) {
+ _cleanup_free_ char *raw = NULL;
+
+ if (errno != ENOENT)
+ return -errno;
+
+ raw = strjoin(name, ".raw");
+ if (!raw)
+ return -ENOMEM;
+
+ if (fstatat(dirfd(d), raw, &st, flags) < 0) {
+ if (errno == ENOENT)
+ continue;
+
+ return -errno;
+ }
+
+ if (!S_ISREG(st.st_mode))
+ continue;
+
+ r = image_make(class, name, dirfd(d), resolved, raw, &st, ret);
+
+ } else {
+ if (!S_ISDIR(st.st_mode) && !S_ISBLK(st.st_mode))
+ continue;
+
+ r = image_make(class, name, dirfd(d), resolved, name, &st, ret);
+ }
+ if (IN_SET(r, -ENOENT, -EMEDIUMTYPE))
+ continue;
+ if (r < 0)
+ return r;
+
+ if (ret)
+ (*ret)->discoverable = true;
+
+ return 1;
+ }
+
+ if (class == IMAGE_MACHINE && streq(name, ".host")) {
+ r = image_make(class, ".host", AT_FDCWD, NULL, empty_to_root(root), NULL, ret);
+ if (r < 0)
+ return r;
+
+ if (ret)
+ (*ret)->discoverable = true;
+
+ return r;
+ }
+
+ return -ENOENT;
+};
+
+int image_from_path(const char *path, Image **ret) {
+
+ /* Note that we don't set the 'discoverable' field of the returned object, because we don't check here whether
+ * the image is in the image search path. And if it is we don't know if the path we used is actually not
+ * overridden by another, different image earlier in the search path */
+
+ if (path_equal(path, "/"))
+ return image_make(IMAGE_MACHINE, ".host", AT_FDCWD, NULL, "/", NULL, ret);
+
+ return image_make(_IMAGE_CLASS_INVALID, NULL, AT_FDCWD, NULL, path, NULL, ret);
+}
+
+int image_find_harder(ImageClass class, const char *name_or_path, const char *root, Image **ret) {
+ if (image_name_is_valid(name_or_path))
+ return image_find(class, name_or_path, root, ret);
+
+ return image_from_path(name_or_path, ret);
+}
+
+int image_discover(
+ ImageClass class,
+ const char *root,
+ Hashmap *h) {
+
+ int r;
+
+ assert(class >= 0);
+ assert(class < _IMAGE_CLASS_MAX);
+ assert(h);
+
+ NULSTR_FOREACH(path, pick_image_search_path(class)) {
+ _cleanup_free_ char *resolved = NULL;
+ _cleanup_closedir_ DIR *d = NULL;
+
+ r = chase_and_opendir(path, root, CHASE_PREFIX_ROOT, &resolved, &d);
+ if (r == -ENOENT)
+ continue;
+ if (r < 0)
+ return r;
+
+ FOREACH_DIRENT_ALL(de, d, return -errno) {
+ _cleanup_(image_unrefp) Image *image = NULL;
+ _cleanup_free_ char *pretty = NULL;
+ struct stat st;
+ int flags;
+
+ if (dot_or_dot_dot(de->d_name))
+ continue;
+
+ /* As mentioned above, we follow symlinks on this fstatat(), because we want to
+ * permit people to symlink block devices into the search path. */
+ flags = root ? AT_SYMLINK_NOFOLLOW : 0;
+ if (fstatat(dirfd(d), de->d_name, &st, flags) < 0) {
+ if (errno == ENOENT)
+ continue;
+
+ return -errno;
+ }
+
+ if (S_ISREG(st.st_mode))
+ r = extract_pretty(de->d_name, image_class_suffix_to_string(class), ".raw", &pretty);
+ else if (S_ISDIR(st.st_mode))
+ r = extract_pretty(de->d_name, image_class_suffix_to_string(class), NULL, &pretty);
+ else if (S_ISBLK(st.st_mode))
+ r = extract_pretty(de->d_name, NULL, NULL, &pretty);
+ else {
+ log_debug("Skipping directory entry '%s', which is neither regular file, directory nor block device.", de->d_name);
+ continue;
+ }
+ if (r < 0) {
+ log_debug_errno(r, "Skipping directory entry '%s', which doesn't look like an image.", de->d_name);
+ continue;
+ }
+
+ if (hashmap_contains(h, pretty))
+ continue;
+
+ r = image_make(class, pretty, dirfd(d), resolved, de->d_name, &st, &image);
+ if (IN_SET(r, -ENOENT, -EMEDIUMTYPE))
+ continue;
+ if (r < 0)
+ return r;
+
+ image->discoverable = true;
+
+ r = hashmap_put(h, image->name, image);
+ if (r < 0)
+ return r;
+
+ TAKE_PTR(image);
+ }
+ }
+
+ if (class == IMAGE_MACHINE && !hashmap_contains(h, ".host")) {
+ _cleanup_(image_unrefp) Image *image = NULL;
+
+ r = image_make(IMAGE_MACHINE, ".host", AT_FDCWD, NULL, empty_to_root("/"), NULL, &image);
+ if (r < 0)
+ return r;
+
+ image->discoverable = true;
+
+ r = hashmap_put(h, image->name, image);
+ if (r < 0)
+ return r;
+
+ image = NULL;
+ }
+
+ return 0;
+}
+
+int image_remove(Image *i) {
+ _cleanup_(release_lock_file) LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT;
+ _cleanup_strv_free_ char **settings = NULL;
+ _cleanup_free_ char *roothash = NULL;
+ int r;
+
+ assert(i);
+
+ if (IMAGE_IS_VENDOR(i) || IMAGE_IS_HOST(i))
+ return -EROFS;
+
+ settings = image_settings_path(i);
+ if (!settings)
+ return -ENOMEM;
+
+ r = image_roothash_path(i, &roothash);
+ if (r < 0)
+ return r;
+
+ /* Make sure we don't interfere with a running nspawn */
+ r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock);
+ if (r < 0)
+ return r;
+
+ switch (i->type) {
+
+ case IMAGE_SUBVOLUME:
+
+ /* Let's unlink first, maybe it is a symlink? If that works we are happy. Otherwise, let's get out the
+ * big guns */
+ if (unlink(i->path) < 0) {
+ r = btrfs_subvol_remove(i->path, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA);
+ if (r < 0)
+ return r;
+ }
+
+ break;
+
+ case IMAGE_DIRECTORY:
+ /* Allow deletion of read-only directories */
+ (void) chattr_path(i->path, 0, FS_IMMUTABLE_FL, NULL);
+ r = rm_rf(i->path, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
+ if (r < 0)
+ return r;
+
+ break;
+
+ case IMAGE_BLOCK:
+
+ /* If this is inside of /dev, then it's a real block device, hence let's not touch the device node
+ * itself (but let's remove the stuff stored alongside it). If it's anywhere else, let's try to unlink
+ * the thing (it's most likely a symlink after all). */
+
+ if (path_startswith(i->path, "/dev"))
+ break;
+
+ _fallthrough_;
+ case IMAGE_RAW:
+ if (unlink(i->path) < 0)
+ return -errno;
+ break;
+
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ STRV_FOREACH(j, settings)
+ if (unlink(*j) < 0 && errno != ENOENT)
+ log_debug_errno(errno, "Failed to unlink %s, ignoring: %m", *j);
+
+ if (unlink(roothash) < 0 && errno != ENOENT)
+ log_debug_errno(errno, "Failed to unlink %s, ignoring: %m", roothash);
+
+ return 0;
+}
+
+static int rename_auxiliary_file(const char *path, const char *new_name, const char *suffix) {
+ _cleanup_free_ char *fn = NULL, *rs = NULL;
+ int r;
+
+ fn = strjoin(new_name, suffix);
+ if (!fn)
+ return -ENOMEM;
+
+ r = file_in_same_dir(path, fn, &rs);
+ if (r < 0)
+ return r;
+
+ return rename_noreplace(AT_FDCWD, path, AT_FDCWD, rs);
+}
+
+int image_rename(Image *i, const char *new_name) {
+ _cleanup_(release_lock_file) LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT, name_lock = LOCK_FILE_INIT;
+ _cleanup_free_ char *new_path = NULL, *nn = NULL, *roothash = NULL;
+ _cleanup_strv_free_ char **settings = NULL;
+ unsigned file_attr = 0;
+ int r;
+
+ assert(i);
+
+ if (!image_name_is_valid(new_name))
+ return -EINVAL;
+
+ if (IMAGE_IS_VENDOR(i) || IMAGE_IS_HOST(i))
+ return -EROFS;
+
+ settings = image_settings_path(i);
+ if (!settings)
+ return -ENOMEM;
+
+ r = image_roothash_path(i, &roothash);
+ if (r < 0)
+ return r;
+
+ /* Make sure we don't interfere with a running nspawn */
+ r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock);
+ if (r < 0)
+ return r;
+
+ /* Make sure nobody takes the new name, between the time we
+ * checked it is currently unused in all search paths, and the
+ * time we take possession of it */
+ r = image_name_lock(new_name, LOCK_EX|LOCK_NB, &name_lock);
+ if (r < 0)
+ return r;
+
+ r = image_find(IMAGE_MACHINE, new_name, NULL, NULL);
+ if (r >= 0)
+ return -EEXIST;
+ if (r != -ENOENT)
+ return r;
+
+ switch (i->type) {
+
+ case IMAGE_DIRECTORY:
+ /* Turn off the immutable bit while we rename the image, so that we can rename it */
+ (void) read_attr_path(i->path, &file_attr);
+
+ if (file_attr & FS_IMMUTABLE_FL)
+ (void) chattr_path(i->path, 0, FS_IMMUTABLE_FL, NULL);
+
+ _fallthrough_;
+ case IMAGE_SUBVOLUME:
+ r = file_in_same_dir(i->path, new_name, &new_path);
+ break;
+
+ case IMAGE_BLOCK:
+
+ /* Refuse renaming raw block devices in /dev, the names are picked by udev after all. */
+ if (path_startswith(i->path, "/dev"))
+ return -EROFS;
+
+ r = file_in_same_dir(i->path, new_name, &new_path);
+ break;
+
+ case IMAGE_RAW: {
+ const char *fn;
+
+ fn = strjoina(new_name, ".raw");
+
+ r = file_in_same_dir(i->path, fn, &new_path);
+ break;
+ }
+
+ default:
+ return -EOPNOTSUPP;
+ }
+ if (r < 0)
+ return r;
+
+ nn = strdup(new_name);
+ if (!nn)
+ return -ENOMEM;
+
+ r = rename_noreplace(AT_FDCWD, i->path, AT_FDCWD, new_path);
+ if (r < 0)
+ return r;
+
+ /* Restore the immutable bit, if it was set before */
+ if (file_attr & FS_IMMUTABLE_FL)
+ (void) chattr_path(new_path, FS_IMMUTABLE_FL, FS_IMMUTABLE_FL, NULL);
+
+ free_and_replace(i->path, new_path);
+ free_and_replace(i->name, nn);
+
+ STRV_FOREACH(j, settings) {
+ r = rename_auxiliary_file(*j, new_name, ".nspawn");
+ if (r < 0 && r != -ENOENT)
+ log_debug_errno(r, "Failed to rename settings file %s, ignoring: %m", *j);
+ }
+
+ r = rename_auxiliary_file(roothash, new_name, ".roothash");
+ if (r < 0 && r != -ENOENT)
+ log_debug_errno(r, "Failed to rename roothash file %s, ignoring: %m", roothash);
+
+ return 0;
+}
+
+static int clone_auxiliary_file(const char *path, const char *new_name, const char *suffix) {
+ _cleanup_free_ char *fn = NULL, *rs = NULL;
+ int r;
+
+ fn = strjoin(new_name, suffix);
+ if (!fn)
+ return -ENOMEM;
+
+ r = file_in_same_dir(path, fn, &rs);
+ if (r < 0)
+ return r;
+
+ return copy_file_atomic(path, rs, 0664, COPY_REFLINK);
+}
+
+int image_clone(Image *i, const char *new_name, bool read_only) {
+ _cleanup_(release_lock_file) LockFile name_lock = LOCK_FILE_INIT;
+ _cleanup_strv_free_ char **settings = NULL;
+ _cleanup_free_ char *roothash = NULL;
+ const char *new_path;
+ int r;
+
+ assert(i);
+
+ if (!image_name_is_valid(new_name))
+ return -EINVAL;
+
+ settings = image_settings_path(i);
+ if (!settings)
+ return -ENOMEM;
+
+ r = image_roothash_path(i, &roothash);
+ if (r < 0)
+ return r;
+
+ /* Make sure nobody takes the new name, between the time we
+ * checked it is currently unused in all search paths, and the
+ * time we take possession of it */
+ r = image_name_lock(new_name, LOCK_EX|LOCK_NB, &name_lock);
+ if (r < 0)
+ return r;
+
+ r = image_find(IMAGE_MACHINE, new_name, NULL, NULL);
+ if (r >= 0)
+ return -EEXIST;
+ if (r != -ENOENT)
+ return r;
+
+ switch (i->type) {
+
+ case IMAGE_SUBVOLUME:
+ case IMAGE_DIRECTORY:
+ /* If we can we'll always try to create a new btrfs subvolume here, even if the source is a plain
+ * directory. */
+
+ new_path = strjoina("/var/lib/machines/", new_name);
+
+ r = btrfs_subvol_snapshot_at(AT_FDCWD, i->path, AT_FDCWD, new_path,
+ (read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
+ BTRFS_SNAPSHOT_FALLBACK_COPY |
+ BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
+ BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
+ BTRFS_SNAPSHOT_RECURSIVE |
+ BTRFS_SNAPSHOT_QUOTA);
+ if (r >= 0)
+ /* Enable "subtree" quotas for the copy, if we didn't copy any quota from the source. */
+ (void) btrfs_subvol_auto_qgroup(new_path, 0, true);
+
+ break;
+
+ case IMAGE_RAW:
+ new_path = strjoina("/var/lib/machines/", new_name, ".raw");
+
+ r = copy_file_atomic_full(i->path, new_path, read_only ? 0444 : 0644, FS_NOCOW_FL, FS_NOCOW_FL,
+ COPY_REFLINK|COPY_CRTIME, NULL, NULL);
+ break;
+
+ case IMAGE_BLOCK:
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ if (r < 0)
+ return r;
+
+ STRV_FOREACH(j, settings) {
+ r = clone_auxiliary_file(*j, new_name, ".nspawn");
+ if (r < 0 && r != -ENOENT)
+ log_debug_errno(r, "Failed to clone settings %s, ignoring: %m", *j);
+ }
+
+ r = clone_auxiliary_file(roothash, new_name, ".roothash");
+ if (r < 0 && r != -ENOENT)
+ log_debug_errno(r, "Failed to clone root hash file %s, ignoring: %m", roothash);
+
+ return 0;
+}
+
+int image_read_only(Image *i, bool b) {
+ _cleanup_(release_lock_file) LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT;
+ int r;
+
+ assert(i);
+
+ if (IMAGE_IS_VENDOR(i) || IMAGE_IS_HOST(i))
+ return -EROFS;
+
+ /* Make sure we don't interfere with a running nspawn */
+ r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock);
+ if (r < 0)
+ return r;
+
+ switch (i->type) {
+
+ case IMAGE_SUBVOLUME:
+
+ /* Note that we set the flag only on the top-level
+ * subvolume of the image. */
+
+ r = btrfs_subvol_set_read_only(i->path, b);
+ if (r < 0)
+ return r;
+
+ break;
+
+ case IMAGE_DIRECTORY:
+ /* For simple directory trees we cannot use the access
+ mode of the top-level directory, since it has an
+ effect on the container itself. However, we can
+ use the "immutable" flag, to at least make the
+ top-level directory read-only. It's not as good as
+ a read-only subvolume, but at least something, and
+ we can read the value back. */
+
+ r = chattr_path(i->path, b ? FS_IMMUTABLE_FL : 0, FS_IMMUTABLE_FL, NULL);
+ if (r < 0)
+ return r;
+
+ break;
+
+ case IMAGE_RAW: {
+ struct stat st;
+
+ if (stat(i->path, &st) < 0)
+ return -errno;
+
+ if (chmod(i->path, (st.st_mode & 0444) | (b ? 0000 : 0200)) < 0)
+ return -errno;
+
+ /* If the image is now read-only, it's a good time to
+ * defrag it, given that no write patterns will
+ * fragment it again. */
+ if (b)
+ (void) btrfs_defrag(i->path);
+ break;
+ }
+
+ case IMAGE_BLOCK: {
+ _cleanup_close_ int fd = -EBADF;
+ struct stat st;
+ int state = b;
+
+ fd = open(i->path, O_CLOEXEC|O_RDONLY|O_NONBLOCK|O_NOCTTY);
+ if (fd < 0)
+ return -errno;
+
+ if (fstat(fd, &st) < 0)
+ return -errno;
+ if (!S_ISBLK(st.st_mode))
+ return -ENOTTY;
+
+ if (ioctl(fd, BLKROSET, &state) < 0)
+ return -errno;
+
+ break;
+ }
+
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static void make_lock_dir(void) {
+ (void) mkdir_p("/run/systemd/nspawn", 0755);
+ (void) mkdir("/run/systemd/nspawn/locks", 0700);
+}
+
+int image_path_lock(const char *path, int operation, LockFile *global, LockFile *local) {
+ _cleanup_free_ char *p = NULL;
+ LockFile t = LOCK_FILE_INIT;
+ struct stat st;
+ bool exclusive;
+ int r;
+
+ assert(path);
+ assert(global);
+ assert(local);
+
+ /* Locks an image path. This actually creates two locks: one "local" one, next to the image path
+ * itself, which might be shared via NFS. And another "global" one, in /run, that uses the
+ * device/inode number. This has the benefit that we can even lock a tree that is a mount point,
+ * correctly. */
+
+ if (!path_is_absolute(path))
+ return -EINVAL;
+
+ switch (operation & (LOCK_SH|LOCK_EX)) {
+ case LOCK_SH:
+ exclusive = false;
+ break;
+ case LOCK_EX:
+ exclusive = true;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (getenv_bool("SYSTEMD_NSPAWN_LOCK") == 0) {
+ *local = *global = (LockFile) LOCK_FILE_INIT;
+ return 0;
+ }
+
+ /* Prohibit taking exclusive locks on the host image. We can't allow this, since we ourselves are
+ * running off it after all, and we don't want any images to manipulate the host image. We make an
+ * exception for shared locks however: we allow those (and make them NOPs since there's no point in
+ * taking them if there can't be exclusive locks). Strictly speaking these are questionable as well,
+ * since it means changes made to the host might propagate to the container as they happen (and a
+ * shared lock kinda suggests that no changes happen at all while it is in place), but it's too
+ * useful not to allow read-only containers off the host root, hence let's support this, and trust
+ * the user to do the right thing with this. */
+ if (path_equal(path, "/")) {
+ if (exclusive)
+ return -EBUSY;
+
+ *local = *global = (LockFile) LOCK_FILE_INIT;
+ return 0;
+ }
+
+ if (stat(path, &st) >= 0) {
+ if (S_ISBLK(st.st_mode))
+ r = asprintf(&p, "/run/systemd/nspawn/locks/block-%u:%u", major(st.st_rdev), minor(st.st_rdev));
+ else if (S_ISDIR(st.st_mode) || S_ISREG(st.st_mode))
+ r = asprintf(&p, "/run/systemd/nspawn/locks/inode-%lu:%lu", (unsigned long) st.st_dev, (unsigned long) st.st_ino);
+ else
+ return -ENOTTY;
+ if (r < 0)
+ return -ENOMEM;
+ }
+
+ /* For block devices we don't need the "local" lock, as the major/minor lock above should be
+ * sufficient, since block devices are host local anyway. */
+ if (!path_startswith(path, "/dev/")) {
+ r = make_lock_file_for(path, operation, &t);
+ if (r < 0) {
+ if (!exclusive && r == -EROFS)
+ log_debug_errno(r, "Failed to create shared lock for '%s', ignoring: %m", path);
+ else
+ return r;
+ }
+ }
+
+ if (p) {
+ make_lock_dir();
+
+ r = make_lock_file(p, operation, global);
+ if (r < 0) {
+ release_lock_file(&t);
+ return r;
+ }
+ } else
+ *global = (LockFile) LOCK_FILE_INIT;
+
+ *local = t;
+ return 0;
+}
+
+int image_set_limit(Image *i, uint64_t referenced_max) {
+ assert(i);
+
+ if (IMAGE_IS_VENDOR(i) || IMAGE_IS_HOST(i))
+ return -EROFS;
+
+ if (i->type != IMAGE_SUBVOLUME)
+ return -EOPNOTSUPP;
+
+ /* We set the quota both for the subvolume as well as for the
+ * subtree. The latter is mostly for historical reasons, since
+ * we didn't use to have a concept of subtree quota, and hence
+ * only modified the subvolume quota. */
+
+ (void) btrfs_qgroup_set_limit(i->path, 0, referenced_max);
+ (void) btrfs_subvol_auto_qgroup(i->path, 0, true);
+ return btrfs_subvol_set_subtree_quota_limit(i->path, 0, referenced_max);
+}
+
+/* Reads hostname, machine ID, machine-info and os-/sysext-/confext-release data of the image into 'i',
+ * while holding a shared lock on the image path. For directory and subvolume images most read failures
+ * are only logged; for raw and block images the image is dissected via a loop device and failures are
+ * returned. Returns 0 and sets i->metadata_valid on success, a negative errno-style error otherwise. */
+int image_read_metadata(Image *i, const ImagePolicy *image_policy) {
+        _cleanup_(release_lock_file) LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT;
+        int r;
+
+        assert(i);
+
+        r = image_path_lock(i->path, LOCK_SH|LOCK_NB, &global_lock, &local_lock);
+        if (r < 0)
+                return r;
+
+        switch (i->type) {
+
+        case IMAGE_SUBVOLUME:
+        case IMAGE_DIRECTORY: {
+                _cleanup_strv_free_ char **machine_info = NULL, **os_release = NULL, **sysext_release = NULL, **confext_release = NULL;
+                _cleanup_free_ char *hostname = NULL, *path = NULL;
+                sd_id128_t machine_id = SD_ID128_NULL;
+
+                if (i->class == IMAGE_SYSEXT) {
+                        r = extension_has_forbidden_content(i->path);
+                        if (r < 0)
+                                return r;
+                        if (r > 0)
+                                return log_debug_errno(SYNTHETIC_ERRNO(ENOMEDIUM),
+                                                       "Conflicting content found in image %s, refusing.", i->name);
+                }
+
+                r = chase("/etc/hostname", i->path, CHASE_PREFIX_ROOT|CHASE_TRAIL_SLASH, &path, NULL);
+                if (r < 0 && r != -ENOENT)
+                        log_debug_errno(r, "Failed to chase /etc/hostname in image %s: %m", i->name);
+                else if (r >= 0) {
+                        r = read_etc_hostname(path, &hostname);
+                        if (r < 0) /* log the returned error rather than a possibly stale errno */
+                                log_debug_errno(r, "Failed to read /etc/hostname of image %s: %m", i->name);
+                }
+
+                path = mfree(path);
+
+                r = id128_get_machine(i->path, &machine_id);
+                if (r < 0)
+                        log_debug_errno(r, "Failed to read machine ID in image %s, ignoring: %m", i->name);
+
+                r = chase("/etc/machine-info", i->path, CHASE_PREFIX_ROOT|CHASE_TRAIL_SLASH, &path, NULL);
+                if (r < 0 && r != -ENOENT)
+                        log_debug_errno(r, "Failed to chase /etc/machine-info in image %s: %m", i->name);
+                else if (r >= 0) {
+                        r = load_env_file_pairs(NULL, path, &machine_info);
+                        if (r < 0)
+                                log_debug_errno(r, "Failed to parse machine-info data of %s: %m", i->name);
+                }
+
+                r = load_os_release_pairs(i->path, &os_release);
+                if (r < 0)
+                        log_debug_errno(r, "Failed to read os-release in image, ignoring: %m");
+
+                r = load_extension_release_pairs(i->path, IMAGE_SYSEXT, i->name, /* relax_extension_release_check= */ false, &sysext_release);
+                if (r < 0)
+                        log_debug_errno(r, "Failed to read sysext-release in image, ignoring: %m");
+
+                r = load_extension_release_pairs(i->path, IMAGE_CONFEXT, i->name, /* relax_extension_release_check= */ false, &confext_release);
+                if (r < 0)
+                        log_debug_errno(r, "Failed to read confext-release in image, ignoring: %m");
+
+                free_and_replace(i->hostname, hostname);
+                i->machine_id = machine_id;
+                strv_free_and_replace(i->machine_info, machine_info);
+                strv_free_and_replace(i->os_release, os_release);
+                strv_free_and_replace(i->sysext_release, sysext_release);
+                strv_free_and_replace(i->confext_release, confext_release);
+                break;
+        }
+
+        case IMAGE_RAW:
+        case IMAGE_BLOCK: {
+                _cleanup_(loop_device_unrefp) LoopDevice *d = NULL;
+                _cleanup_(dissected_image_unrefp) DissectedImage *m = NULL;
+
+                r = loop_device_make_by_path(i->path, O_RDONLY, /* sector_size= */ UINT32_MAX, LO_FLAGS_PARTSCAN, LOCK_SH, &d);
+                if (r < 0)
+                        return r;
+
+                r = dissect_loop_device(
+                                d,
+                                /* verity= */ NULL,
+                                /* mount_options= */ NULL,
+                                image_policy,
+                                DISSECT_IMAGE_GENERIC_ROOT |
+                                DISSECT_IMAGE_REQUIRE_ROOT |
+                                DISSECT_IMAGE_RELAX_VAR_CHECK |
+                                DISSECT_IMAGE_READ_ONLY |
+                                DISSECT_IMAGE_USR_NO_ROOT |
+                                DISSECT_IMAGE_ADD_PARTITION_DEVICES |
+                                DISSECT_IMAGE_PIN_PARTITION_DEVICES,
+                                &m);
+                if (r < 0)
+                        return r;
+
+                r = dissected_image_acquire_metadata(m, DISSECT_IMAGE_VALIDATE_OS | DISSECT_IMAGE_VALIDATE_OS_EXT);
+                if (r < 0)
+                        return r;
+
+                free_and_replace(i->hostname, m->hostname);
+                i->machine_id = m->machine_id;
+                strv_free_and_replace(i->machine_info, m->machine_info);
+                strv_free_and_replace(i->os_release, m->os_release);
+                strv_free_and_replace(i->sysext_release, m->sysext_release);
+                strv_free_and_replace(i->confext_release, m->confext_release);
+                break;
+        }
+
+        default:
+                return -EOPNOTSUPP;
+        }
+
+        i->metadata_valid = true;
+
+        return 0;
+}
+
+/* Takes a lock on an image name, independently of the path the image is stored at.
+ *
+ * Returns -EBUSY for the special name ".host", -EINVAL for names that are not valid image names, and
+ * otherwise whatever make_lock_file() returns. If the SYSTEMD_NSPAWN_LOCK environment variable parses as
+ * false no lock is taken: *ret is initialized to an empty lock and 0 is returned. */
+int image_name_lock(const char *name, int operation, LockFile *ret) {
+        const char *lock_path;
+        bool locking_disabled;
+
+        assert(name);
+        assert(ret);
+
+        if (streq(name, ".host"))
+                return -EBUSY;
+
+        if (!image_name_is_valid(name))
+                return -EINVAL;
+
+        /* Only an explicit "false" turns locking off; unset or unparsable values keep it enabled */
+        locking_disabled = getenv_bool("SYSTEMD_NSPAWN_LOCK") == 0;
+        if (locking_disabled) {
+                *ret = (LockFile) LOCK_FILE_INIT;
+                return 0;
+        }
+
+        make_lock_dir();
+
+        lock_path = strjoina("/run/systemd/nspawn/locks/name-", name);
+        return make_lock_file(lock_path, operation, ret);
+}
+
+/* Checks whether the image path 'image' names an entry located directly inside one of the image search
+ * directories of the given class, optionally below the root directory 'root'. Trailing slashes after the
+ * entry name are accepted; the search directory itself and anything nested deeper are not. Returns true on
+ * a match and false otherwise.
+ *
+ * NOTE(review): when 'root' is set, 'image' is assumed to carry the root prefix — confirm against callers. */
+bool image_in_search_path(
+                ImageClass class,
+                const char *root,
+                const char *image) {
+
+        assert(image);
+
+        NULSTR_FOREACH(path, pick_image_search_path(class)) {
+                const char *p, *q, *prefix;
+                size_t k;
+
+                if (!empty_or_root(root)) {
+                        /* Strip the root directory off the image path first */
+                        q = path_startswith(image, root);
+                        if (!q)
+                                continue;
+
+                        /* The remainder returned by path_startswith() carries no leading slash, i.e. is a
+                         * relative path, hence compare it against the search path without its leading
+                         * slashes too, so that both sides are relative. */
+                        prefix = path + strspn(path, "/");
+                } else {
+                        q = image;
+                        prefix = path;
+                }
+
+                /* Is the image path located below this search directory? */
+                p = path_startswith(q, prefix);
+                if (!p)
+                        continue;
+
+                /* Make sure there's a filename following */
+                k = strcspn(p, "/");
+                if (k == 0)
+                        continue;
+
+                p += k;
+
+                /* Accept trailing slashes */
+                if (p[strspn(p, "/")] == 0)
+                        return true;
+        }
+
+        return false;
+}
+
+/* Serializes the generic properties of an image into a JSON object stored in *ret, and returns whatever
+ * json_build() returns. "Path" is only emitted if a path is set, the two timestamps only if they are
+ * non-zero, and the usage/limit fields only if they differ from UINT64_MAX. */
+int image_to_json(const struct Image *img, JsonVariant **ret) {
+ assert(img);
+
+ return json_build(ret,
+ JSON_BUILD_OBJECT(
+ JSON_BUILD_PAIR_STRING("Type", image_type_to_string(img->type)),
+ JSON_BUILD_PAIR_STRING("Class", image_class_to_string(img->class)),
+ JSON_BUILD_PAIR_STRING("Name", img->name),
+ JSON_BUILD_PAIR_CONDITION(img->path, "Path", JSON_BUILD_STRING(img->path)),
+ JSON_BUILD_PAIR_BOOLEAN("ReadOnly", img->read_only),
+ JSON_BUILD_PAIR_CONDITION(img->crtime != 0, "CreationTimestamp", JSON_BUILD_UNSIGNED(img->crtime)),
+ JSON_BUILD_PAIR_CONDITION(img->mtime != 0, "ModificationTimestamp", JSON_BUILD_UNSIGNED(img->mtime)),
+ JSON_BUILD_PAIR_CONDITION(img->usage != UINT64_MAX, "Usage", JSON_BUILD_UNSIGNED(img->usage)),
+ JSON_BUILD_PAIR_CONDITION(img->usage_exclusive != UINT64_MAX, "UsageExclusive", JSON_BUILD_UNSIGNED(img->usage_exclusive)),
+ JSON_BUILD_PAIR_CONDITION(img->limit != UINT64_MAX, "Limit", JSON_BUILD_UNSIGNED(img->limit)),
+ JSON_BUILD_PAIR_CONDITION(img->limit_exclusive != UINT64_MAX, "LimitExclusive", JSON_BUILD_UNSIGNED(img->limit_exclusive))));
+}
+
+/* Maps each ImageType value to its string name. The lookup macro below presumably generates
+ * image_type_to_string() and image_type_from_string() from this table — confirm in string-table.h. */
+static const char* const image_type_table[_IMAGE_TYPE_MAX] = {
+ [IMAGE_DIRECTORY] = "directory",
+ [IMAGE_SUBVOLUME] = "subvolume",
+ [IMAGE_RAW] = "raw",
+ [IMAGE_BLOCK] = "block",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(image_type, ImageType);
diff --git a/src/shared/discover-image.h b/src/shared/discover-image.h
new file mode 100644
index 0000000..a30a3d9
--- /dev/null
+++ b/src/shared/discover-image.h
@@ -0,0 +1,122 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "sd-id128.h"
+
+#include "hashmap.h"
+#include "image-policy.h"
+#include "json.h"
+#include "lock-util.h"
+#include "macro.h"
+#include "os-util.h"
+#include "path-util.h"
+#include "string-util.h"
+#include "time-util.h"
+
+/* The kinds of images the discovery code distinguishes. image_read_metadata() reads IMAGE_DIRECTORY and
+ * IMAGE_SUBVOLUME images directly from their directory tree, and dissects IMAGE_RAW and IMAGE_BLOCK images
+ * via a loop device. */
+typedef enum ImageType {
+ IMAGE_DIRECTORY,
+ IMAGE_SUBVOLUME,
+ IMAGE_RAW,
+ IMAGE_BLOCK,
+ _IMAGE_TYPE_MAX, /* number of valid types, used to size the string table */
+ _IMAGE_TYPE_INVALID = -EINVAL, /* negative errno-style marker for "no valid type" */
+} ImageType;
+
+/* Describes one discovered image together with the metadata cached about it. */
+typedef struct Image {
+ unsigned n_ref; /* reference counter, presumably maintained by image_ref()/image_unref() — confirm */
+
+ ImageType type;
+ ImageClass class;
+ char *name;
+ char *path;
+ bool read_only;
+
+ /* Timestamps; image_to_json() treats 0 as "not set" and omits the field */
+ usec_t crtime;
+ usec_t mtime;
+
+ /* Disk usage and limits; image_to_json() treats UINT64_MAX as "not set" and omits the field */
+ uint64_t usage;
+ uint64_t usage_exclusive;
+ uint64_t limit;
+ uint64_t limit_exclusive;
+
+ /* Metadata read from inside the image, filled in by image_read_metadata() */
+ char *hostname;
+ sd_id128_t machine_id;
+ char **machine_info;
+ char **os_release;
+ char **sysext_release;
+ char **confext_release;
+
+ bool metadata_valid:1; /* set by image_read_metadata() once the fields above have been filled in */
+ bool discoverable:1; /* true if we know for sure that image_find() would find the image given just the short name */
+
+ void *userdata;
+} Image;
+
+Image *image_unref(Image *i);
+Image *image_ref(Image *i);
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(Image*, image_unref);
+
+int image_find(ImageClass class, const char *root, const char *name, Image **ret);
+int image_from_path(const char *path, Image **ret);
+int image_find_harder(ImageClass class, const char *root, const char *name_or_path, Image **ret);
+int image_discover(ImageClass class, const char *root, Hashmap *map);
+
+int image_remove(Image *i);
+int image_rename(Image *i, const char *new_name);
+int image_clone(Image *i, const char *new_name, bool read_only);
+int image_read_only(Image *i, bool b);
+
+const char* image_type_to_string(ImageType t) _const_;
+ImageType image_type_from_string(const char *s) _pure_;
+
+int image_path_lock(const char *path, int operation, LockFile *global, LockFile *local);
+int image_name_lock(const char *name, int operation, LockFile *ret);
+
+int image_set_limit(Image *i, uint64_t referenced_max);
+
+int image_read_metadata(Image *i, const ImagePolicy *image_policy);
+
+bool image_in_search_path(ImageClass class, const char *root, const char *image);
+
+/* Returns the extension-release key/value list cached on the image for the given extension class, or NULL
+ * for any class other than IMAGE_SYSEXT and IMAGE_CONFEXT. The field of the image is returned as is, no
+ * copy is made. */
+static inline char **image_extension_release(Image *image, ImageClass class) {
+        assert(image);
+
+        return class == IMAGE_SYSEXT ? image->sysext_release :
+               class == IMAGE_CONFEXT ? image->confext_release :
+               NULL;
+}
+
+/* An image is hidden if it has a name and that name begins with a dot. */
+static inline bool IMAGE_IS_HIDDEN(const struct Image *i) {
+        assert(i);
+
+        if (!i->name)
+                return false;
+
+        return i->name[0] == '.';
+}
+
+/* An image is a vendor image if it has a path and that path is located below /usr. */
+static inline bool IMAGE_IS_VENDOR(const struct Image *i) {
+        assert(i);
+
+        if (!i->path)
+                return false;
+
+        return !!path_startswith(i->path, "/usr");
+}
+
+/* An image refers to the host itself if it is named ".host" or if its path is the root directory. */
+static inline bool IMAGE_IS_HOST(const struct Image *i) {
+        assert(i);
+
+        return (i->name && streq(i->name, ".host")) ||
+               (i->path && path_equal(i->path, "/"));
+}
+
+int image_to_json(const struct Image *i, JsonVariant **ret);
+
+extern const struct hash_ops image_hash_ops;
diff --git a/src/shared/dissect-image.c b/src/shared/dissect-image.c
new file mode 100644
index 0000000..84cfbcd
--- /dev/null
+++ b/src/shared/dissect-image.c
@@ -0,0 +1,4069 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#if HAVE_VALGRIND_MEMCHECK_H
+#include <valgrind/memcheck.h>
+#endif
+
+#include <linux/dm-ioctl.h>
+#include <linux/loop.h>
+#include <sys/file.h>
+#include <sys/mount.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <sysexits.h>
+
+#if HAVE_OPENSSL
+#include <openssl/err.h>
+#include <openssl/pem.h>
+#include <openssl/x509.h>
+#endif
+
+#include "sd-device.h"
+#include "sd-id128.h"
+
+#include "architecture.h"
+#include "ask-password-api.h"
+#include "blkid-util.h"
+#include "blockdev-util.h"
+#include "btrfs-util.h"
+#include "chase.h"
+#include "conf-files.h"
+#include "constants.h"
+#include "copy.h"
+#include "cryptsetup-util.h"
+#include "device-nodes.h"
+#include "device-util.h"
+#include "devnum-util.h"
+#include "discover-image.h"
+#include "dissect-image.h"
+#include "dm-util.h"
+#include "env-file.h"
+#include "env-util.h"
+#include "extension-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "fs-util.h"
+#include "fsck-util.h"
+#include "gpt.h"
+#include "hexdecoct.h"
+#include "hostname-setup.h"
+#include "id128-util.h"
+#include "import-util.h"
+#include "io-util.h"
+#include "missing_mount.h"
+#include "missing_syscall.h"
+#include "mkdir-label.h"
+#include "mount-util.h"
+#include "mountpoint-util.h"
+#include "namespace-util.h"
+#include "nulstr-util.h"
+#include "openssl-util.h"
+#include "os-util.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "raw-clone.h"
+#include "resize-fs.h"
+#include "signal-util.h"
+#include "sparse-endian.h"
+#include "stat-util.h"
+#include "stdio-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "strv.h"
+#include "tmpfile-util.h"
+#include "udev-util.h"
+#include "user-util.h"
+#include "xattr-util.h"
+
+/* how many times to wait for the device nodes to appear */
+#define N_DEVICE_NODE_LIST_ATTEMPTS 10
+
+/* Tells whether a file system type may be mounted as a result of automatic dissection.
+ *
+ * The allow-list is the colon-separated list in $SYSTEMD_DISSECT_FILE_SYSTEMS if that variable is set, and
+ * a conservative built-in list otherwise, as an extra safety net against mounting with badly maintained
+ * legacy file system drivers.
+ *
+ * Returns true if allowed, false if not (this is logged at debug level), -ENOMEM on allocation failure. */
+int dissect_fstype_ok(const char *fstype) {
+        const char *allowlist;
+        bool permitted;
+
+        allowlist = secure_getenv("SYSTEMD_DISSECT_FILE_SYSTEMS");
+        if (!allowlist)
+                permitted = STR_IN_SET(fstype,
+                                       "btrfs",
+                                       "erofs",
+                                       "ext4",
+                                       "f2fs",
+                                       "squashfs",
+                                       "vfat",
+                                       "xfs");
+        else {
+                _cleanup_strv_free_ char **entries = NULL;
+
+                entries = strv_split(allowlist, ":");
+                if (!entries)
+                        return -ENOMEM;
+
+                permitted = strv_contains(entries, fstype);
+        }
+
+        if (!permitted) {
+                log_debug("File system type '%s' is not allowed to be mounted as result of automatic dissection.", fstype);
+                return false;
+        }
+
+        return true;
+}
+
+/* Guesses the sector size of a disk image by looking for a GPT header signature at the start of the second
+ * sector for each candidate sector size (512, 1024, 2048, 4096).
+ *
+ * Returns 1 and stores the detected size in *ret if exactly one candidate matches. Returns 0 and stores 512
+ * in *ret if no signature is found or fewer than 8K could be read. Returns -ENOTUNIQ if signatures are found
+ * for more than one sector size, and -errno if pread() fails. */
+int probe_sector_size(int fd, uint32_t *ret) {
+
+ /* Disk images might be for 512B or for 4096 sector sizes, let's try to auto-detect that by searching
+ * for the GPT headers at the relevant byte offsets */
+
+ assert_cc(sizeof(GptHeader) == 92);
+
+ /* We expect a sector size in the range 512…4096. The GPT header is located in the second
+ * sector. Hence it could be at byte 512 at the earliest, and at byte 4096 at the latest. And we must
+ * read with granularity of the largest sector size we care about. Which means 8K. */
+ uint8_t sectors[2 * 4096];
+ uint32_t found = 0;
+ ssize_t n;
+
+ assert(fd >= 0);
+ assert(ret);
+
+ n = pread(fd, sectors, sizeof(sectors), 0);
+ if (n < 0)
+ return -errno;
+ if (n != sizeof(sectors)) /* too short? */
+ goto not_found;
+
+ /* Let's see if we find the GPT partition header with various expected sector sizes */
+ for (uint32_t sz = 512; sz <= 4096; sz <<= 1) {
+ const GptHeader *p;
+
+ /* The header for sector size sz starts at byte offset sz, i.e. at the second sector */
+ assert(sizeof(sectors) >= sz * 2);
+ p = (const GptHeader*) (sectors + sz);
+
+ if (!gpt_header_has_signature(p))
+ continue;
+
+ /* A second match means we cannot tell which sector size is the right one */
+ if (found != 0)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTUNIQ),
+ "Detected valid partition table at offsets matching multiple sector sizes, refusing.");
+
+ found = sz;
+ }
+
+ if (found != 0) {
+ log_debug("Determined sector size %" PRIu32 " based on discovered partition table.", found);
+ *ret = found;
+ return 1; /* indicate we *did* find it */
+ }
+
+not_found:
+ log_debug("Couldn't find any partition table to derive sector size of.");
+ *ret = 512; /* pick the traditional default */
+ return 0; /* indicate we didn't find it */
+}
+
+/* Like probe_sector_size(), but for block devices the sector size already configured on the device is
+ * returned instead of probing by contents. Returns -errno if fstat() fails, and otherwise the result of
+ * whichever of the two helpers was used. */
+int probe_sector_size_prefer_ioctl(int fd, uint32_t *ret) {
+        struct stat info;
+
+        assert(fd >= 0);
+        assert(ret);
+
+        if (fstat(fd, &info) < 0)
+                return -errno;
+
+        /* Not a block device? Then we can only probe by contents. */
+        if (!S_ISBLK(info.st_mode))
+                return probe_sector_size(fd, ret);
+
+        return blockdev_get_sector_size(fd, ret);
+}
+
+/* Probes the content type (typically the file system type) of a device or file, or of a byte range of it.
+ *
+ * fd         — file descriptor to probe; if negative, 'path' is opened read-only instead
+ * path       — path used for opening (if fd < 0) and for log messages; if NULL it is derived from fd
+ * offset     — byte offset of the area to probe
+ * size       — size of the area to probe; UINT64_MAX means "everything", 0 means "nothing to probe"
+ * ret_fstype — receives a newly allocated string with the type (caller frees), or NULL if none was found
+ *
+ * Returns 1 if a type was found, 0 if none was found, -EUCLEAN if the probing results were ambiguous,
+ * -EOPNOTSUPP if built without libblkid, or another negative errno-style error. */
+int probe_filesystem_full(
+                int fd,
+                const char *path,
+                uint64_t offset,
+                uint64_t size,
+                char **ret_fstype) {
+
+        /* Try to find device content type and return it in *ret_fstype. If nothing is found,
+         * 0/NULL will be returned. -EUCLEAN will be returned for ambiguous results, and a
+         * different error otherwise. */
+
+#if HAVE_BLKID
+        _cleanup_(blkid_free_probep) blkid_probe b = NULL;
+        _cleanup_free_ char *path_by_fd = NULL;
+        _cleanup_close_ int fd_close = -EBADF;
+        const char *fstype = NULL; /* must be initialized: the lookup below may fail without touching it */
+        int r;
+
+        assert(fd >= 0 || path);
+        assert(ret_fstype);
+
+        if (fd < 0) {
+                fd_close = open(path, O_RDONLY|O_NONBLOCK|O_CLOEXEC|O_NOCTTY);
+                if (fd_close < 0)
+                        return -errno;
+
+                fd = fd_close;
+        }
+
+        if (!path) {
+                r = fd_get_path(fd, &path_by_fd);
+                if (r < 0)
+                        return r;
+
+                path = path_by_fd;
+        }
+
+        if (size == 0) /* empty size? nothing found! */
+                goto not_found;
+
+        b = blkid_new_probe();
+        if (!b)
+                return -ENOMEM;
+
+        /* The Linux kernel maintains separate block device caches for main ("whole") and partition block
+         * devices, which means making a change to one might not be reflected immediately when reading via
+         * the other. That's massively confusing when mixing accesses to such devices. Let's address this in
+         * a limited way: when probing a file system that is not at the beginning of the block device we
+         * apparently probe a partition via the main block device, and in that case let's first flush the
+         * main block device cache, so that we get the data that the per-partition block device last
+         * sync'ed on.
+         *
+         * This only works under the assumption that any tools that write to the partition block devices
+         * issue an syncfs()/fsync() on the device after making changes. Typically file system formatting
+         * tools that write a superblock onto a partition block device do that, however. */
+        if (offset != 0)
+                if (ioctl(fd, BLKFLSBUF, 0) < 0)
+                        log_debug_errno(errno, "Failed to flush block device cache, ignoring: %m");
+
+        errno = 0;
+        r = blkid_probe_set_device(
+                        b,
+                        fd,
+                        offset,
+                        size == UINT64_MAX ? 0 : size); /* when blkid sees size=0 it understands "everything". We prefer using UINT64_MAX for that */
+        if (r != 0)
+                return errno_or_else(ENOMEM);
+
+        blkid_probe_enable_superblocks(b, 1);
+        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);
+
+        errno = 0;
+        r = blkid_do_safeprobe(b);
+        if (r == _BLKID_SAFEPROBE_NOT_FOUND)
+                goto not_found;
+        if (r == _BLKID_SAFEPROBE_AMBIGUOUS)
+                return log_debug_errno(SYNTHETIC_ERRNO(EUCLEAN),
+                                       "Results ambiguous for partition %s", path);
+        if (r == _BLKID_SAFEPROBE_ERROR)
+                return log_debug_errno(errno_or_else(EIO), "Failed to probe partition %s: %m", path);
+
+        assert(r == _BLKID_SAFEPROBE_FOUND);
+
+        /* The result is deliberately ignored: if there's no TYPE value, fstype simply remains NULL and we
+         * report "nothing found" below. */
+        (void) blkid_probe_lookup_value(b, "TYPE", &fstype, NULL);
+
+        if (fstype) {
+                char *t;
+
+                log_debug("Probed fstype '%s' on partition %s.", fstype, path);
+
+                t = strdup(fstype);
+                if (!t)
+                        return -ENOMEM;
+
+                *ret_fstype = t;
+                return 1;
+        }
+
+not_found:
+        log_debug("No type detected on partition %s", path);
+        *ret_fstype = NULL;
+        return 0;
+#else
+        return -EOPNOTSUPP;
+#endif
+}
+
+#if HAVE_BLKID
+/* Decides whether a partition found for the given designator may be used under the image policy.
+ *
+ * Returns true if it shall be used, false if it shall be ignored, -ERFKILL if the policy says the partition
+ * must not exist at all, or the negative value returned by image_policy_get_exhaustively(). */
+static int image_policy_may_use(
+ const ImagePolicy *policy,
+ PartitionDesignator designator) {
+
+ PartitionPolicyFlags f;
+
+ /* For each partition we find in the partition table do a first check if it may exist at all given
+ * the policy, or if it shall be ignored. */
+
+ f = image_policy_get_exhaustively(policy, designator);
+ if (f < 0)
+ return f;
+
+ if ((f & _PARTITION_POLICY_USE_MASK) == PARTITION_POLICY_ABSENT)
+ /* only flag set in policy is "absent"? then this partition may not exist at all */
+ return log_debug_errno(
+ SYNTHETIC_ERRNO(ERFKILL),
+ "Partition of designator '%s' exists, but not allowed by policy, refusing.",
+ partition_designator_to_string(designator));
+ if ((f & _PARTITION_POLICY_USE_MASK & ~PARTITION_POLICY_ABSENT) == PARTITION_POLICY_UNUSED) {
+ /* only "unused" or "unused" + "absent" are set? then don't use it */
+ log_debug("Partition of designator '%s' exists, and policy dictates to ignore it, doing so.",
+ partition_designator_to_string(designator));
+ return false; /* ignore! */
+ }
+
+ return true; /* use! */
+}
+
+/* Verifies that the protection flags determined for a partition ('found_flags') share at least one bit
+ * with the flags the policy has for the designator.
+ *
+ * Returns 0 if they overlap, -ERFKILL if they do not, and passes through a negative 'found_flags' or a
+ * negative result of image_policy_get_exhaustively() unchanged. */
+static int image_policy_check_protection(
+ const ImagePolicy *policy,
+ PartitionDesignator designator,
+ PartitionPolicyFlags found_flags) {
+
+ PartitionPolicyFlags policy_flags;
+
+ /* Checks if the flags in the policy for the designated partition overlap the flags of what we found */
+
+ if (found_flags < 0)
+ return found_flags;
+
+ policy_flags = image_policy_get_exhaustively(policy, designator);
+ if (policy_flags < 0)
+ return policy_flags;
+
+ if ((found_flags & policy_flags) == 0) {
+ _cleanup_free_ char *found_flags_string = NULL, *policy_flags_string = NULL;
+
+ /* The strings are only used for the log message, hence failures to format them are ignored */
+ (void) partition_policy_flags_to_string(found_flags, /* simplify= */ true, &found_flags_string);
+ (void) partition_policy_flags_to_string(policy_flags, /* simplify= */ true, &policy_flags_string);
+
+ return log_debug_errno(SYNTHETIC_ERRNO(ERFKILL), "Partition %s discovered with policy '%s' but '%s' was required, refusing.",
+ partition_designator_to_string(designator),
+ strnull(found_flags_string), strnull(policy_flags_string));
+ }
+
+ return 0;
+}
+
+/* Verifies that the GPT "read-only" and "growfs" partition flags are compatible with the policy for the
+ * designator.
+ *
+ * Returns 0 if compatible, -ERFKILL if the policy requires the opposite setting of one of the two flags, or
+ * the negative value returned by image_policy_get_exhaustively(). */
+static int image_policy_check_partition_flags(
+ const ImagePolicy *policy,
+ PartitionDesignator designator,
+ uint64_t gpt_flags) {
+
+ PartitionPolicyFlags policy_flags;
+ bool b;
+
+ /* Checks if the partition flags in the policy match reality */
+
+ policy_flags = image_policy_get_exhaustively(policy, designator);
+ if (policy_flags < 0)
+ return policy_flags;
+
+ /* Refuse only if the policy permits exclusively the opposite of what the flag is set to */
+ b = FLAGS_SET(gpt_flags, SD_GPT_FLAG_READ_ONLY);
+ if ((policy_flags & _PARTITION_POLICY_READ_ONLY_MASK) == (b ? PARTITION_POLICY_READ_ONLY_OFF : PARTITION_POLICY_READ_ONLY_ON))
+ return log_debug_errno(SYNTHETIC_ERRNO(ERFKILL), "Partition %s has 'read-only' flag incorrectly set (must be %s, is %s), refusing.",
+ partition_designator_to_string(designator),
+ one_zero(!b), one_zero(b));
+
+ /* Same logic for the growfs flag */
+ b = FLAGS_SET(gpt_flags, SD_GPT_FLAG_GROWFS);
+ if ((policy_flags & _PARTITION_POLICY_GROWFS_MASK) == (b ? PARTITION_POLICY_GROWFS_OFF : PARTITION_POLICY_GROWFS_ON))
+ return log_debug_errno(SYNTHETIC_ERRNO(ERFKILL), "Partition %s has 'growfs' flag incorrectly set (must be %s, is %s), refusing.",
+ partition_designator_to_string(designator),
+ one_zero(!b), one_zero(b));
+
+ return 0;
+}
+
+/* Determines the file system type of every found partition whose type is not known yet, and re-validates
+ * each found partition against the image policy with the knowledge gained.
+ *
+ * As side effects m->encrypted is set if any partition is of type "crypto_LUKS", p->rw is cleared for
+ * file system types fstype_is_ro() reports as read-only, and p->growfs is cleared for partitions that are
+ * not writable.
+ *
+ * Returns 0 on success, or the first negative error returned by probe_filesystem_full() or
+ * image_policy_check_protection(). */
+static int dissected_image_probe_filesystems(
+ DissectedImage *m,
+ int fd,
+ const ImagePolicy *policy) {
+
+ int r;
+
+ assert(m);
+
+ /* Fill in file system types if we don't know them yet. */
+
+ for (PartitionDesignator i = 0; i < _PARTITION_DESIGNATOR_MAX; i++) {
+ DissectedPartition *p = m->partitions + i;
+ PartitionPolicyFlags found_flags;
+
+ if (!p->found)
+ continue;
+
+ if (!p->fstype) {
+ /* If we have an fd referring to the partition block device, use that. Otherwise go
+ * via the whole block device or backing regular file, and read via offset. */
+ if (p->mount_node_fd >= 0)
+ r = probe_filesystem_full(p->mount_node_fd, p->node, 0, UINT64_MAX, &p->fstype);
+ else
+ r = probe_filesystem_full(fd, p->node, p->offset, p->size, &p->fstype);
+ if (r < 0)
+ return r;
+ }
+
+ if (streq_ptr(p->fstype, "crypto_LUKS")) {
+ m->encrypted = true;
+ found_flags = PARTITION_POLICY_ENCRYPTED; /* found this one, and it's definitely encrypted */
+ } else
+ /* found it, but it's definitely not encrypted, hence mask the encrypted flag, but
+ * set all other ways that indicate "present". */
+ found_flags = PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_VERITY|PARTITION_POLICY_SIGNED;
+
+ if (p->fstype && fstype_is_ro(p->fstype))
+ p->rw = false;
+
+ /* Growing a file system requires write access */
+ if (!p->rw)
+ p->growfs = false;
+
+ /* We might have learnt more about the file system now (i.e. whether it is encrypted or not),
+ * hence we need to validate this against policy again, to see if the policy still matches
+ * with this new information. Note that image_policy_check_protection() will check for
+ * overlap between what's allowed in the policy and what we pass as 'found_flags' here. In
+ * the unencrypted case we thus might pass an overly unspecific mask here (i.e. unprotected
+ * OR verity OR signed), but that's fine since the earlier policy check already checked more
+ * specifically which of those three cases were OK. Keep in mind that this function here only
+ * looks at specific partitions (and thus can only deduce encryption or not) but not the
+ * overall partition table (and thus cannot deduce verity or not). The earlier dissection
+ * checks already did the relevant checks that look at the whole partition table, and
+ * enforced policy there as needed. */
+ r = image_policy_check_protection(policy, i, found_flags);
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+
+/* Logs, at debug level, every partition flag bit set on 'node' that is neither in 'supported' nor one of
+ * the three flags the UEFI spec defines generically. Purely informational, to make unexpected flags
+ * discoverable. */
+static void check_partition_flags(
+                const char *node,
+                unsigned long long pflags,
+                unsigned long long supported) {
+
+        unsigned long long unexpected;
+
+        assert(node);
+
+        /* Drop everything this partition type supports, plus the generic UEFI flags */
+        unexpected = pflags & ~(supported |
+                                SD_GPT_FLAG_REQUIRED_PARTITION |
+                                SD_GPT_FLAG_NO_BLOCK_IO_PROTOCOL |
+                                SD_GPT_FLAG_LEGACY_BIOS_BOOTABLE);
+        if (unexpected == 0)
+                return;
+
+        /* Whatever is left is unexpected, report it bit by bit */
+        for (unsigned i = 0; i < sizeof(unexpected) * 8; i++) {
+                unsigned long long bit = 1ULL << i;
+
+                if (FLAGS_SET(unexpected, bit))
+                        log_debug("Unexpected partition flag %llu set on %s!", bit, node);
+        }
+}
+
+/* Allocates a new DissectedImage object with all partitions initialized to DISSECTED_PARTITION_NULL and
+ * has_init_system set to -1.
+ *
+ * If 'path' is given, the image name is derived from its file name with suffixes removed by
+ * raw_strip_suffixes(); if the result is not a valid image name it is dropped and the image is left
+ * unnamed.
+ *
+ * Returns 0 and stores the object in *ret on success, -ENOMEM on allocation failure, or the negative error
+ * returned by path_extract_filename() or raw_strip_suffixes(). */
+static int dissected_image_new(const char *path, DissectedImage **ret) {
+ _cleanup_(dissected_image_unrefp) DissectedImage *m = NULL;
+ _cleanup_free_ char *name = NULL;
+ int r;
+
+ assert(ret);
+
+ if (path) {
+ _cleanup_free_ char *filename = NULL;
+
+ r = path_extract_filename(path, &filename);
+ if (r < 0)
+ return r;
+
+ r = raw_strip_suffixes(filename, &name);
+ if (r < 0)
+ return r;
+
+ /* An unusable name is not fatal, we just go on without one */
+ if (!image_name_is_valid(name)) {
+ log_debug("Image name %s is not valid, ignoring.", strna(name));
+ name = mfree(name);
+ }
+ }
+
+ m = new(DissectedImage, 1);
+ if (!m)
+ return -ENOMEM;
+
+ *m = (DissectedImage) {
+ .has_init_system = -1,
+ .image_name = TAKE_PTR(name), /* ownership of the name moves into the object */
+ };
+
+ for (PartitionDesignator i = 0; i < _PARTITION_DESIGNATOR_MAX; i++)
+ m->partitions[i] = DISSECTED_PARTITION_NULL;
+
+ *ret = TAKE_PTR(m);
+ return 0;
+}
+#endif
+
+/* Releases everything a DissectedPartition entry holds: frees its strings, closes its two file
+ * descriptors, and resets the entry to DISSECTED_PARTITION_NULL. */
+static void dissected_partition_done(DissectedPartition *p) {
+ assert(p);
+
+ free(p->fstype);
+ free(p->node);
+ free(p->label);
+ free(p->decrypted_fstype);
+ free(p->decrypted_node);
+ free(p->mount_options);
+ safe_close(p->mount_node_fd);
+ safe_close(p->fsmount_fd);
+
+ /* Reset, so that no stale pointers or fds remain in the entry */
+ *p = DISSECTED_PARTITION_NULL;
+}
+
+#if HAVE_BLKID
+/* Generates the device node path for partition 'nr' of a whole block device; a negative 'nr' refers to
+ * the whole disk.
+ *
+ * If DISSECT_IMAGE_DISKSEQ_DEVNODE is set and 'diskseq' is non-zero, a /dev/disk/by-diskseq/ path is
+ * generated. Otherwise the name is derived from 'whole_devname' following the kernel's naming rule.
+ *
+ * Returns 0 and stores a newly allocated string in *ret on success, -ENOMEM on allocation failure, and
+ * -EINVAL if 'whole_devname' is empty and a partition name was requested. */
+static int make_partition_devname(
+                const char *whole_devname,
+                uint64_t diskseq,
+                int nr,
+                DissectImageFlags flags,
+                char **ret) {
+
+        _cleanup_free_ char *s = NULL;
+        bool by_diskseq;
+        int r;
+
+        assert(whole_devname);
+        assert(nr != 0); /* zero is not a valid partition nr */
+        assert(ret);
+
+        by_diskseq = FLAGS_SET(flags, DISSECT_IMAGE_DISKSEQ_DEVNODE) && diskseq != 0;
+
+        if (by_diskseq) {
+                if (nr < 0) /* whole disk? */
+                        r = asprintf(&s, "/dev/disk/by-diskseq/%" PRIu64, diskseq);
+                else
+                        r = asprintf(&s, "/dev/disk/by-diskseq/%" PRIu64 "-part%i", diskseq, nr);
+                if (r < 0)
+                        return -ENOMEM;
+
+        } else if (nr < 0) { /* whole disk? */
+                s = strdup(whole_devname);
+                if (!s)
+                        return -ENOMEM;
+
+        } else {
+                /* Given a whole block device node name (e.g. /dev/sda or /dev/loop7) generate a partition
+                 * device name (e.g. /dev/sda7 or /dev/loop7p5). The rule the kernel uses is simple: if whole
+                 * block device node name ends in a digit, then suffix a 'p', followed by the partition
+                 * number. Otherwise, just suffix the partition number without any 'p'. */
+
+                size_t len = strlen(whole_devname);
+                if (len < 1) /* underflow check for the subtraction below */
+                        return -EINVAL;
+
+                const char *separator = ascii_isdigit(whole_devname[len-1]) ? "p" : ""; /* Last char a digit? */
+
+                if (asprintf(&s, "%s%s%i", whole_devname, separator, nr) < 0)
+                        return -ENOMEM;
+        }
+
+        *ret = TAKE_PTR(s);
+        return 0;
+}
+
+/* Opens the block device node 'node' read-only and verifies that it belongs to the given loop device.
+ *
+ * The device number looked up for the opened node (the whole-disk device, if 'is_partition' is true) must
+ * equal loop->devno, and if loop->diskseq is non-zero the disk sequence number of the opened fd must match
+ * as well.
+ *
+ * Returns the open file descriptor on success (ownership passes to the caller), -ENXIO if the device does
+ * not match, -errno if open() fails, or the negative error returned by one of the lookup helpers. */
+static int open_partition(
+ const char *node,
+ bool is_partition,
+ const LoopDevice *loop) {
+
+ _cleanup_(sd_device_unrefp) sd_device *dev = NULL;
+ _cleanup_close_ int fd = -EBADF;
+ dev_t devnum;
+ int r;
+
+ assert(node);
+ assert(loop);
+
+ fd = open(node, O_RDONLY|O_NONBLOCK|O_CLOEXEC|O_NOCTTY);
+ if (fd < 0)
+ return -errno;
+
+ /* Check if the block device is a child of (or equivalent to) the originally provided one. */
+ r = block_device_new_from_fd(fd, is_partition ? BLOCK_DEVICE_LOOKUP_WHOLE_DISK : 0, &dev);
+ if (r < 0)
+ return r;
+
+ r = sd_device_get_devnum(dev, &devnum);
+ if (r < 0)
+ return r;
+
+ if (loop->devno != devnum)
+ return -ENXIO;
+
+ /* Also check diskseq. */
+ if (loop->diskseq != 0) {
+ uint64_t diskseq;
+
+ r = fd_get_diskseq(fd, &diskseq);
+ if (r < 0)
+ return r;
+
+ if (loop->diskseq != diskseq)
+ return -ENXIO;
+ }
+
+ log_debug("Opened %s (fd=%i, whole_block_devnum=" DEVNUM_FORMAT_STR ", diskseq=%" PRIu64 ").",
+ node, fd, DEVNUM_FORMAT_VAL(loop->devno), loop->diskseq);
+ return TAKE_FD(fd); /* hand the fd over, so that the cleanup handler doesn't close it */
+}
+
+/* Ranks two architectures by preference: the native architecture ranks above everything else, followed by
+ * the secondary architecture (if one is defined at build time). Returns 1 if 'a' is preferred over 'b', -1
+ * if 'b' is preferred over 'a', and 0 if they are equal or neither is preferred. */
+static int compare_arch(Architecture a, Architecture b) {
+        if (a != b) {
+                if (a == native_architecture())
+                        return 1;
+
+                if (b == native_architecture())
+                        return -1;
+
+#ifdef ARCHITECTURE_SECONDARY
+                if (a == ARCHITECTURE_SECONDARY)
+                        return 1;
+
+                if (b == ARCHITECTURE_SECONDARY)
+                        return -1;
+#endif
+        }
+
+        return 0;
+}
+
+static int dissect_image(
+ DissectedImage *m,
+ int fd,
+ const char *devname,
+ const VeritySettings *verity,
+ const MountOptions *mount_options,
+ const ImagePolicy *policy,
+ DissectImageFlags flags) {
+
+ sd_id128_t root_uuid = SD_ID128_NULL, root_verity_uuid = SD_ID128_NULL;
+ sd_id128_t usr_uuid = SD_ID128_NULL, usr_verity_uuid = SD_ID128_NULL;
+ bool is_gpt, is_mbr, multiple_generic = false,
+ generic_rw = false, /* initialize to appease gcc */
+ generic_growfs = false;
+ _cleanup_(blkid_free_probep) blkid_probe b = NULL;
+ _cleanup_free_ char *generic_node = NULL;
+ sd_id128_t generic_uuid = SD_ID128_NULL;
+ const char *pttype = NULL, *sptuuid = NULL;
+ blkid_partlist pl;
+ int r, generic_nr = -1, n_partitions;
+
+ assert(m);
+ assert(fd >= 0);
+ assert(devname);
+ assert(!verity || verity->designator < 0 || IN_SET(verity->designator, PARTITION_ROOT, PARTITION_USR));
+ assert(!verity || verity->root_hash || verity->root_hash_size == 0);
+ assert(!verity || verity->root_hash_sig || verity->root_hash_sig_size == 0);
+ assert(!verity || (verity->root_hash || !verity->root_hash_sig));
+ assert(!((flags & DISSECT_IMAGE_GPT_ONLY) && (flags & DISSECT_IMAGE_NO_PARTITION_TABLE)));
+ assert(m->sector_size > 0);
+
+ /* Probes a disk image, and returns information about what it found in *ret.
+ *
+ * Returns -ENOPKG if no suitable partition table or file system could be found.
+ * Returns -EADDRNOTAVAIL if a root hash was specified but no matching root/verity partitions found.
+ * Returns -ENXIO if we couldn't find any partition suitable as root or /usr partition
+ * Returns -ENOTUNIQ if we only found multiple generic partitions and thus don't know what to do with that
+ * Returns -ERFKILL if image doesn't match image policy
+ * Returns -EBADR if verity data was provided externally for an image that has a GPT partition table (i.e. is not just a naked fs)
+ * Returns -EPROTONOSUPPORT if DISSECT_IMAGE_ADD_PARTITION_DEVICES is set but the block device does not have partition logic enabled
+ * Returns -ENOMSG if we didn't find a single usable partition (and DISSECT_IMAGE_REFUSE_EMPTY is set) */
+
+ uint64_t diskseq = m->loop ? m->loop->diskseq : 0;
+
+ if (verity && verity->root_hash) {
+ sd_id128_t fsuuid, vuuid;
+
+ /* If a root hash is supplied, then we use the root partition that has a UUID that match the
+ * first 128-bit of the root hash. And we use the verity partition that has a UUID that match
+ * the final 128-bit. */
+
+ if (verity->root_hash_size < sizeof(sd_id128_t))
+ return -EINVAL;
+
+ memcpy(&fsuuid, verity->root_hash, sizeof(sd_id128_t));
+ memcpy(&vuuid, (const uint8_t*) verity->root_hash + verity->root_hash_size - sizeof(sd_id128_t), sizeof(sd_id128_t));
+
+ if (sd_id128_is_null(fsuuid))
+ return -EINVAL;
+ if (sd_id128_is_null(vuuid))
+ return -EINVAL;
+
+ /* If the verity data declares it's for the /usr partition, then search for that, in all
+ * other cases assume it's for the root partition. */
+ if (verity->designator == PARTITION_USR) {
+ usr_uuid = fsuuid;
+ usr_verity_uuid = vuuid;
+ } else {
+ root_uuid = fsuuid;
+ root_verity_uuid = vuuid;
+ }
+ }
+
+ b = blkid_new_probe();
+ if (!b)
+ return -ENOMEM;
+
+ errno = 0;
+ r = blkid_probe_set_device(b, fd, 0, 0);
+ if (r != 0)
+ return errno_or_else(ENOMEM);
+
+ errno = 0;
+ r = blkid_probe_set_sectorsize(b, m->sector_size);
+ if (r != 0)
+ return errno_or_else(EIO);
+
+ if ((flags & DISSECT_IMAGE_GPT_ONLY) == 0) {
+ /* Look for file system superblocks, unless we only shall look for GPT partition tables */
+ blkid_probe_enable_superblocks(b, 1);
+ blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE|BLKID_SUBLKS_USAGE|BLKID_SUBLKS_UUID);
+ }
+
+ blkid_probe_enable_partitions(b, 1);
+ blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
+
+ errno = 0;
+ r = blkid_do_safeprobe(b);
+ if (r == _BLKID_SAFEPROBE_ERROR)
+ return errno_or_else(EIO);
+ if (IN_SET(r, _BLKID_SAFEPROBE_AMBIGUOUS, _BLKID_SAFEPROBE_NOT_FOUND))
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOPKG), "Failed to identify any partition table.");
+
+ assert(r == _BLKID_SAFEPROBE_FOUND);
+
+ if ((!(flags & DISSECT_IMAGE_GPT_ONLY) &&
+ (flags & DISSECT_IMAGE_GENERIC_ROOT)) ||
+ (flags & DISSECT_IMAGE_NO_PARTITION_TABLE)) {
+ const char *usage = NULL;
+
+ /* If flags permit this, also allow using non-partitioned single-filesystem images */
+
+ (void) blkid_probe_lookup_value(b, "USAGE", &usage, NULL);
+ if (STRPTR_IN_SET(usage, "filesystem", "crypto")) {
+ _cleanup_free_ char *t = NULL, *n = NULL, *o = NULL;
+ const char *fstype = NULL, *options = NULL, *suuid = NULL;
+ _cleanup_close_ int mount_node_fd = -EBADF;
+ sd_id128_t uuid = SD_ID128_NULL;
+ PartitionPolicyFlags found_flags;
+ bool encrypted;
+
+ /* OK, we have found a file system, that's our root partition then. */
+
+ r = image_policy_may_use(policy, PARTITION_ROOT);
+ if (r < 0)
+ return r;
+ if (r == 0) /* policy says ignore this, so we ignore it */
+ return -ENOPKG;
+
+ (void) blkid_probe_lookup_value(b, "TYPE", &fstype, NULL);
+ (void) blkid_probe_lookup_value(b, "UUID", &suuid, NULL);
+
+ encrypted = streq_ptr(fstype, "crypto_LUKS");
+
+ if (verity_settings_data_covers(verity, PARTITION_ROOT))
+ found_flags = verity->root_hash_sig ? PARTITION_POLICY_SIGNED : PARTITION_POLICY_VERITY;
+ else
+ found_flags = encrypted ? PARTITION_POLICY_ENCRYPTED : PARTITION_POLICY_UNPROTECTED;
+
+ r = image_policy_check_protection(policy, PARTITION_ROOT, found_flags);
+ if (r < 0)
+ return r;
+
+ r = image_policy_check_partition_flags(policy, PARTITION_ROOT, 0); /* we have no gpt partition flags, hence check against all bits off */
+ if (r < 0)
+ return r;
+
+ if (FLAGS_SET(flags, DISSECT_IMAGE_PIN_PARTITION_DEVICES)) {
+ mount_node_fd = open_partition(devname, /* is_partition = */ false, m->loop);
+ if (mount_node_fd < 0)
+ return mount_node_fd;
+ }
+
+ if (fstype) {
+ t = strdup(fstype);
+ if (!t)
+ return -ENOMEM;
+ }
+
+ if (suuid) {
+ /* blkid will return FAT's serial number as UUID, hence it is quite possible
+ * that parsing this will fail. We'll ignore the ID, since it's just too
+ * short to be useful as tru identifier. */
+ r = sd_id128_from_string(suuid, &uuid);
+ if (r < 0)
+ log_debug_errno(r, "Failed to parse file system UUID '%s', ignoring: %m", suuid);
+ }
+
+ r = make_partition_devname(devname, diskseq, -1, flags, &n);
+ if (r < 0)
+ return r;
+
+ m->single_file_system = true;
+ m->encrypted = encrypted;
+
+ m->has_verity = verity && verity->data_path;
+ m->verity_ready = verity_settings_data_covers(verity, PARTITION_ROOT);
+
+ m->has_verity_sig = false; /* signature not embedded, must be specified */
+ m->verity_sig_ready = m->verity_ready && verity->root_hash_sig;
+
+ m->image_uuid = uuid;
+
+ options = mount_options_from_designator(mount_options, PARTITION_ROOT);
+ if (options) {
+ o = strdup(options);
+ if (!o)
+ return -ENOMEM;
+ }
+
+ m->partitions[PARTITION_ROOT] = (DissectedPartition) {
+ .found = true,
+ .rw = !m->verity_ready && !fstype_is_ro(fstype),
+ .partno = -1,
+ .architecture = _ARCHITECTURE_INVALID,
+ .fstype = TAKE_PTR(t),
+ .node = TAKE_PTR(n),
+ .mount_options = TAKE_PTR(o),
+ .mount_node_fd = TAKE_FD(mount_node_fd),
+ .offset = 0,
+ .size = UINT64_MAX,
+ .fsmount_fd = -EBADF,
+ };
+
+ return 0;
+ }
+ }
+
+ (void) blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
+ if (!pttype)
+ return -ENOPKG;
+
+ is_gpt = streq_ptr(pttype, "gpt");
+ is_mbr = streq_ptr(pttype, "dos");
+
+ if (!is_gpt && ((flags & DISSECT_IMAGE_GPT_ONLY) || !is_mbr))
+ return -ENOPKG;
+
+ /* We support external verity data partitions only if the image has no partition table */
+ if (verity && verity->data_path)
+ return -EBADR;
+
+ if (FLAGS_SET(flags, DISSECT_IMAGE_ADD_PARTITION_DEVICES)) {
+ /* Safety check: refuse block devices that carry a partition table but for which the kernel doesn't
+ * do partition scanning. */
+ r = blockdev_partscan_enabled(fd);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return -EPROTONOSUPPORT;
+ }
+
+ (void) blkid_probe_lookup_value(b, "PTUUID", &sptuuid, NULL);
+ if (sptuuid) {
+ r = sd_id128_from_string(sptuuid, &m->image_uuid);
+ if (r < 0)
+ log_debug_errno(r, "Failed to parse partition table UUID '%s', ignoring: %m", sptuuid);
+ }
+
+ errno = 0;
+ pl = blkid_probe_get_partitions(b);
+ if (!pl)
+ return errno_or_else(ENOMEM);
+
+ errno = 0;
+ n_partitions = blkid_partlist_numof_partitions(pl);
+ if (n_partitions < 0)
+ return errno_or_else(EIO);
+
+ for (int i = 0; i < n_partitions; i++) {
+ _cleanup_free_ char *node = NULL;
+ unsigned long long pflags;
+ blkid_loff_t start, size;
+ blkid_partition pp;
+ int nr;
+
+ errno = 0;
+ pp = blkid_partlist_get_partition(pl, i);
+ if (!pp)
+ return errno_or_else(EIO);
+
+ pflags = blkid_partition_get_flags(pp);
+
+ errno = 0;
+ nr = blkid_partition_get_partno(pp);
+ if (nr < 0)
+ return errno_or_else(EIO);
+
+ errno = 0;
+ start = blkid_partition_get_start(pp);
+ if (start < 0)
+ return errno_or_else(EIO);
+
+ assert((uint64_t) start < UINT64_MAX/512);
+
+ errno = 0;
+ size = blkid_partition_get_size(pp);
+ if (size < 0)
+ return errno_or_else(EIO);
+
+ assert((uint64_t) size < UINT64_MAX/512);
+
+ /* While probing we need the non-diskseq device node name to access the thing, hence mask off
+ * DISSECT_IMAGE_DISKSEQ_DEVNODE. */
+ r = make_partition_devname(devname, diskseq, nr, flags & ~DISSECT_IMAGE_DISKSEQ_DEVNODE, &node);
+ if (r < 0)
+ return r;
+
+ /* So here's the thing: after the main ("whole") block device popped up it might take a while
+ * before the kernel fully probed the partition table. Waiting for that to finish is icky in
+ * userspace. So here's what we do instead. We issue the BLKPG_ADD_PARTITION ioctl to add the
+ * partition ourselves, racing against the kernel. Good thing is: if this call fails with
+ * EBUSY then the kernel was quicker than us, and that's totally OK, the outcome is good for
+ * us: the device node will exist. If OTOH our call was successful we won the race. Which is
+ * also good as the outcome is the same: the partition block device exists, and we can use
+ * it.
+ *
+ * Kernel returns EBUSY if there's already a partition by that number or an overlapping
+ * partition already existent. */
+
+ if (FLAGS_SET(flags, DISSECT_IMAGE_ADD_PARTITION_DEVICES)) {
+ r = block_device_add_partition(fd, node, nr, (uint64_t) start * 512, (uint64_t) size * 512);
+ if (r < 0) {
+ if (r != -EBUSY)
+ return log_debug_errno(r, "BLKPG_ADD_PARTITION failed: %m");
+
+ log_debug_errno(r, "Kernel was quicker than us in adding partition %i.", nr);
+ } else
+ log_debug("We were quicker than kernel in adding partition %i.", nr);
+ }
+
+ if (is_gpt) {
+ const char *fstype = NULL, *label;
+ sd_id128_t type_id, id;
+ GptPartitionType type;
+ bool rw = true, growfs = false;
+
+ r = blkid_partition_get_uuid_id128(pp, &id);
+ if (r < 0) {
+ log_debug_errno(r, "Failed to read partition UUID, ignoring: %m");
+ continue;
+ }
+
+ r = blkid_partition_get_type_id128(pp, &type_id);
+ if (r < 0) {
+ log_debug_errno(r, "Failed to read partition type UUID, ignoring: %m");
+ continue;
+ }
+
+ type = gpt_partition_type_from_uuid(type_id);
+
+ label = blkid_partition_get_name(pp); /* libblkid returns NULL here if empty */
+
+ if (IN_SET(type.designator,
+ PARTITION_HOME,
+ PARTITION_SRV,
+ PARTITION_XBOOTLDR,
+ PARTITION_TMP)) {
+
+ check_partition_flags(node, pflags,
+ SD_GPT_FLAG_NO_AUTO | SD_GPT_FLAG_READ_ONLY | SD_GPT_FLAG_GROWFS);
+
+ if (pflags & SD_GPT_FLAG_NO_AUTO)
+ continue;
+
+ rw = !(pflags & SD_GPT_FLAG_READ_ONLY);
+ growfs = FLAGS_SET(pflags, SD_GPT_FLAG_GROWFS);
+
+ } else if (type.designator == PARTITION_ESP) {
+
+ /* Note that we don't check the SD_GPT_FLAG_NO_AUTO flag for the ESP, as it is
+ * not defined there. We instead check the SD_GPT_FLAG_NO_BLOCK_IO_PROTOCOL, as
+ * recommended by the UEFI spec (See "12.3.3 Number and Location of System
+ * Partitions"). */
+
+ if (pflags & SD_GPT_FLAG_NO_BLOCK_IO_PROTOCOL)
+ continue;
+
+ fstype = "vfat";
+
+ } else if (type.designator == PARTITION_ROOT) {
+
+ check_partition_flags(node, pflags,
+ SD_GPT_FLAG_NO_AUTO | SD_GPT_FLAG_READ_ONLY | SD_GPT_FLAG_GROWFS);
+
+ if (pflags & SD_GPT_FLAG_NO_AUTO)
+ continue;
+
+ /* If a root ID is specified, ignore everything but the root id */
+ if (!sd_id128_is_null(root_uuid) && !sd_id128_equal(root_uuid, id))
+ continue;
+
+ rw = !(pflags & SD_GPT_FLAG_READ_ONLY);
+ growfs = FLAGS_SET(pflags, SD_GPT_FLAG_GROWFS);
+
+ } else if (type.designator == PARTITION_ROOT_VERITY) {
+
+ check_partition_flags(node, pflags,
+ SD_GPT_FLAG_NO_AUTO | SD_GPT_FLAG_READ_ONLY);
+
+ if (pflags & SD_GPT_FLAG_NO_AUTO)
+ continue;
+
+ m->has_verity = true;
+
+ /* If no verity configuration is specified, then don't do verity */
+ if (!verity)
+ continue;
+ if (verity->designator >= 0 && verity->designator != PARTITION_ROOT)
+ continue;
+
+ /* If root hash is specified, then ignore everything but the root id */
+ if (!sd_id128_is_null(root_verity_uuid) && !sd_id128_equal(root_verity_uuid, id))
+ continue;
+
+ fstype = "DM_verity_hash";
+ rw = false;
+
+ } else if (type.designator == PARTITION_ROOT_VERITY_SIG) {
+
+ check_partition_flags(node, pflags,
+ SD_GPT_FLAG_NO_AUTO | SD_GPT_FLAG_READ_ONLY);
+
+ if (pflags & SD_GPT_FLAG_NO_AUTO)
+ continue;
+
+ m->has_verity_sig = true;
+
+ if (!verity)
+ continue;
+ if (verity->designator >= 0 && verity->designator != PARTITION_ROOT)
+ continue;
+
+ fstype = "verity_hash_signature";
+ rw = false;
+
+ } else if (type.designator == PARTITION_USR) {
+
+ check_partition_flags(node, pflags,
+ SD_GPT_FLAG_NO_AUTO | SD_GPT_FLAG_READ_ONLY | SD_GPT_FLAG_GROWFS);
+
+ if (pflags & SD_GPT_FLAG_NO_AUTO)
+ continue;
+
+ /* If a usr ID is specified, ignore everything but the usr id */
+ if (!sd_id128_is_null(usr_uuid) && !sd_id128_equal(usr_uuid, id))
+ continue;
+
+ rw = !(pflags & SD_GPT_FLAG_READ_ONLY);
+ growfs = FLAGS_SET(pflags, SD_GPT_FLAG_GROWFS);
+
+ } else if (type.designator == PARTITION_USR_VERITY) {
+
+ check_partition_flags(node, pflags,
+ SD_GPT_FLAG_NO_AUTO | SD_GPT_FLAG_READ_ONLY);
+
+ if (pflags & SD_GPT_FLAG_NO_AUTO)
+ continue;
+
+ m->has_verity = true;
+
+ if (!verity)
+ continue;
+ if (verity->designator >= 0 && verity->designator != PARTITION_USR)
+ continue;
+
+ /* If usr hash is specified, then ignore everything but the usr id */
+ if (!sd_id128_is_null(usr_verity_uuid) && !sd_id128_equal(usr_verity_uuid, id))
+ continue;
+
+ fstype = "DM_verity_hash";
+ rw = false;
+
+ } else if (type.designator == PARTITION_USR_VERITY_SIG) {
+
+ check_partition_flags(node, pflags,
+ SD_GPT_FLAG_NO_AUTO | SD_GPT_FLAG_READ_ONLY);
+
+ if (pflags & SD_GPT_FLAG_NO_AUTO)
+ continue;
+
+ m->has_verity_sig = true;
+
+ if (!verity)
+ continue;
+ if (verity->designator >= 0 && verity->designator != PARTITION_USR)
+ continue;
+
+ fstype = "verity_hash_signature";
+ rw = false;
+
+ } else if (type.designator == PARTITION_SWAP) {
+
+ check_partition_flags(node, pflags, SD_GPT_FLAG_NO_AUTO);
+
+ if (pflags & SD_GPT_FLAG_NO_AUTO)
+ continue;
+
+ /* Note: we don't set fstype = "swap" here, because we still need to probe if
+ * it might be encrypted (i.e. fstype "crypto_LUKS") or unencrypted
+ * (i.e. fstype "swap"), and the only way to figure that out is via fstype
+ * probing. */
+
+ /* We don't have a designator for SD_GPT_LINUX_GENERIC so check the UUID instead. */
+ } else if (sd_id128_equal(type.uuid, SD_GPT_LINUX_GENERIC)) {
+
+ check_partition_flags(node, pflags,
+ SD_GPT_FLAG_NO_AUTO | SD_GPT_FLAG_READ_ONLY | SD_GPT_FLAG_GROWFS);
+
+ if (pflags & SD_GPT_FLAG_NO_AUTO)
+ continue;
+
+ if (generic_node)
+ multiple_generic = true;
+ else {
+ generic_nr = nr;
+ generic_rw = !(pflags & SD_GPT_FLAG_READ_ONLY);
+ generic_growfs = FLAGS_SET(pflags, SD_GPT_FLAG_GROWFS);
+ generic_uuid = id;
+ generic_node = TAKE_PTR(node);
+ }
+
+ } else if (type.designator == PARTITION_VAR) {
+
+ check_partition_flags(node, pflags,
+ SD_GPT_FLAG_NO_AUTO | SD_GPT_FLAG_READ_ONLY | SD_GPT_FLAG_GROWFS);
+
+ if (pflags & SD_GPT_FLAG_NO_AUTO)
+ continue;
+
+ if (!FLAGS_SET(flags, DISSECT_IMAGE_RELAX_VAR_CHECK)) {
+ sd_id128_t var_uuid;
+
+ /* For /var we insist that the uuid of the partition matches the
+ * HMAC-SHA256 of the /var GPT partition type uuid, keyed by machine
+ * ID. Why? Unlike the other partitions /var is inherently
+ * installation specific, hence we need to be careful not to mount it
+ * in the wrong installation. By hashing the partition UUID from
+ * /etc/machine-id we can securely bind the partition to the
+ * installation. */
+
+ r = sd_id128_get_machine_app_specific(SD_GPT_VAR, &var_uuid);
+ if (r < 0)
+ return r;
+
+ if (!sd_id128_equal(var_uuid, id)) {
+ log_debug("Found a /var/ partition, but its UUID didn't match our expectations "
+ "(found: " SD_ID128_UUID_FORMAT_STR ", expected: " SD_ID128_UUID_FORMAT_STR "), ignoring.",
+ SD_ID128_FORMAT_VAL(id), SD_ID128_FORMAT_VAL(var_uuid));
+ continue;
+ }
+ }
+
+ rw = !(pflags & SD_GPT_FLAG_READ_ONLY);
+ growfs = FLAGS_SET(pflags, SD_GPT_FLAG_GROWFS);
+ }
+
+ if (type.designator != _PARTITION_DESIGNATOR_INVALID) {
+ _cleanup_free_ char *t = NULL, *o = NULL, *l = NULL, *n = NULL;
+ _cleanup_close_ int mount_node_fd = -EBADF;
+ const char *options = NULL;
+
+ r = image_policy_may_use(policy, type.designator);
+ if (r < 0)
+ return r;
+ if (r == 0) {
+ /* Policy says: ignore; Remember this fact, so that we later can distinguish between "found but ignored" and "not found at all" */
+
+ if (!m->partitions[type.designator].found)
+ m->partitions[type.designator].ignored = true;
+
+ continue;
+ }
+
+ if (m->partitions[type.designator].found) {
+ int c;
+
+ /* For most partition types the first one we see wins. Except for the
+ * rootfs and /usr, where we do a version compare of the label, and
+ * let the newest version win. This permits a simple A/B versioning
+ * scheme in OS images. */
+
+ c = compare_arch(type.arch, m->partitions[type.designator].architecture);
+ if (c < 0) /* the arch we already found is better than the one we found now */
+ continue;
+ if (c == 0 && /* same arch? then go by version in label */
+ (!partition_designator_is_versioned(type.designator) ||
+ strverscmp_improved(label, m->partitions[type.designator].label) <= 0))
+ continue;
+
+ dissected_partition_done(m->partitions + type.designator);
+ }
+
+ if (FLAGS_SET(flags, DISSECT_IMAGE_PIN_PARTITION_DEVICES) &&
+ type.designator != PARTITION_SWAP) {
+ mount_node_fd = open_partition(node, /* is_partition = */ true, m->loop);
+ if (mount_node_fd < 0)
+ return mount_node_fd;
+ }
+
+ r = make_partition_devname(devname, diskseq, nr, flags, &n);
+ if (r < 0)
+ return r;
+
+ if (fstype) {
+ t = strdup(fstype);
+ if (!t)
+ return -ENOMEM;
+ }
+
+ if (label) {
+ l = strdup(label);
+ if (!l)
+ return -ENOMEM;
+ }
+
+ options = mount_options_from_designator(mount_options, type.designator);
+ if (options) {
+ o = strdup(options);
+ if (!o)
+ return -ENOMEM;
+ }
+
+ m->partitions[type.designator] = (DissectedPartition) {
+ .found = true,
+ .partno = nr,
+ .rw = rw,
+ .growfs = growfs,
+ .architecture = type.arch,
+ .node = TAKE_PTR(n),
+ .fstype = TAKE_PTR(t),
+ .label = TAKE_PTR(l),
+ .uuid = id,
+ .mount_options = TAKE_PTR(o),
+ .mount_node_fd = TAKE_FD(mount_node_fd),
+ .offset = (uint64_t) start * 512,
+ .size = (uint64_t) size * 512,
+ .gpt_flags = pflags,
+ .fsmount_fd = -EBADF,
+ };
+ }
+
+ } else if (is_mbr) {
+
+ switch (blkid_partition_get_type(pp)) {
+
+ case 0x83: /* Linux partition */
+
+ if (pflags != 0x80) /* Bootable flag */
+ continue;
+
+ if (generic_node)
+ multiple_generic = true;
+ else {
+ generic_nr = nr;
+ generic_rw = true;
+ generic_growfs = false;
+ generic_node = TAKE_PTR(node);
+ }
+
+ break;
+
+ case 0xEA: { /* Boot Loader Spec extended $BOOT partition */
+ _cleanup_close_ int mount_node_fd = -EBADF;
+ _cleanup_free_ char *o = NULL, *n = NULL;
+ sd_id128_t id = SD_ID128_NULL;
+ const char *options = NULL;
+
+ r = image_policy_may_use(policy, PARTITION_XBOOTLDR);
+ if (r < 0)
+ return r;
+ if (r == 0) { /* policy says: ignore */
+ if (!m->partitions[PARTITION_XBOOTLDR].found)
+ m->partitions[PARTITION_XBOOTLDR].ignored = true;
+
+ continue;
+ }
+
+ /* First one wins */
+ if (m->partitions[PARTITION_XBOOTLDR].found)
+ continue;
+
+ if (FLAGS_SET(flags, DISSECT_IMAGE_PIN_PARTITION_DEVICES)) {
+ mount_node_fd = open_partition(node, /* is_partition = */ true, m->loop);
+ if (mount_node_fd < 0)
+ return mount_node_fd;
+ }
+
+ (void) blkid_partition_get_uuid_id128(pp, &id);
+
+ r = make_partition_devname(devname, diskseq, nr, flags, &n);
+ if (r < 0)
+ return r;
+
+ options = mount_options_from_designator(mount_options, PARTITION_XBOOTLDR);
+ if (options) {
+ o = strdup(options);
+ if (!o)
+ return -ENOMEM;
+ }
+
+ m->partitions[PARTITION_XBOOTLDR] = (DissectedPartition) {
+ .found = true,
+ .partno = nr,
+ .rw = true,
+ .growfs = false,
+ .architecture = _ARCHITECTURE_INVALID,
+ .node = TAKE_PTR(n),
+ .uuid = id,
+ .mount_options = TAKE_PTR(o),
+ .mount_node_fd = TAKE_FD(mount_node_fd),
+ .offset = (uint64_t) start * 512,
+ .size = (uint64_t) size * 512,
+ .fsmount_fd = -EBADF,
+ };
+
+ break;
+ }}
+ }
+ }
+
+ if (!m->partitions[PARTITION_ROOT].found &&
+ (m->partitions[PARTITION_ROOT_VERITY].found ||
+ m->partitions[PARTITION_ROOT_VERITY_SIG].found))
+ return -EADDRNOTAVAIL; /* Verity found but no matching rootfs? Something is off, refuse. */
+
+ /* Hmm, we found a signature partition but no Verity data? Something is off. */
+ if (m->partitions[PARTITION_ROOT_VERITY_SIG].found && !m->partitions[PARTITION_ROOT_VERITY].found)
+ return -EADDRNOTAVAIL;
+
+ if (!m->partitions[PARTITION_USR].found &&
+ (m->partitions[PARTITION_USR_VERITY].found ||
+ m->partitions[PARTITION_USR_VERITY_SIG].found))
+ return -EADDRNOTAVAIL; /* as above */
+
+ /* as above */
+ if (m->partitions[PARTITION_USR_VERITY_SIG].found && !m->partitions[PARTITION_USR_VERITY].found)
+ return -EADDRNOTAVAIL;
+
+ /* If root and /usr are combined then insist that the architecture matches */
+ if (m->partitions[PARTITION_ROOT].found &&
+ m->partitions[PARTITION_USR].found &&
+ (m->partitions[PARTITION_ROOT].architecture >= 0 &&
+ m->partitions[PARTITION_USR].architecture >= 0 &&
+ m->partitions[PARTITION_ROOT].architecture != m->partitions[PARTITION_USR].architecture))
+ return -EADDRNOTAVAIL;
+
+ if (!m->partitions[PARTITION_ROOT].found &&
+ !m->partitions[PARTITION_USR].found &&
+ (flags & DISSECT_IMAGE_GENERIC_ROOT) &&
+ (!verity || !verity->root_hash || verity->designator != PARTITION_USR)) {
+
+ /* OK, we found nothing usable, then check if there's a single generic partition, and use
+ * that. If the root hash was set however, then we won't fall back to a generic node, because
+ * the root hash decides. */
+
+ /* If we didn't find a properly marked root partition, but we did find a single suitable
+ * generic Linux partition, then use this as root partition, if the caller asked for it. */
+ if (multiple_generic)
+ return -ENOTUNIQ;
+
+ /* If we didn't find a generic node, then we can't fix this up either */
+ if (generic_node) {
+ r = image_policy_may_use(policy, PARTITION_ROOT);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ /* Policy says: ignore; remember that we did */
+ m->partitions[PARTITION_ROOT].ignored = true;
+ else {
+ _cleanup_close_ int mount_node_fd = -EBADF;
+ _cleanup_free_ char *o = NULL, *n = NULL;
+ const char *options;
+
+ if (FLAGS_SET(flags, DISSECT_IMAGE_PIN_PARTITION_DEVICES)) {
+ mount_node_fd = open_partition(generic_node, /* is_partition = */ true, m->loop);
+ if (mount_node_fd < 0)
+ return mount_node_fd;
+ }
+
+ r = make_partition_devname(devname, diskseq, generic_nr, flags, &n);
+ if (r < 0)
+ return r;
+
+ options = mount_options_from_designator(mount_options, PARTITION_ROOT);
+ if (options) {
+ o = strdup(options);
+ if (!o)
+ return -ENOMEM;
+ }
+
+ assert(generic_nr >= 0);
+ m->partitions[PARTITION_ROOT] = (DissectedPartition) {
+ .found = true,
+ .rw = generic_rw,
+ .growfs = generic_growfs,
+ .partno = generic_nr,
+ .architecture = _ARCHITECTURE_INVALID,
+ .node = TAKE_PTR(n),
+ .uuid = generic_uuid,
+ .mount_options = TAKE_PTR(o),
+ .mount_node_fd = TAKE_FD(mount_node_fd),
+ .offset = UINT64_MAX,
+ .size = UINT64_MAX,
+ .fsmount_fd = -EBADF,
+ };
+ }
+ }
+ }
+
+ /* Check if we have a root fs if we are told to do check. /usr alone is fine too, but only if appropriate flag for that is set too */
+ if (FLAGS_SET(flags, DISSECT_IMAGE_REQUIRE_ROOT) &&
+ !(m->partitions[PARTITION_ROOT].found || (m->partitions[PARTITION_USR].found && FLAGS_SET(flags, DISSECT_IMAGE_USR_NO_ROOT))))
+ return -ENXIO;
+
+ if (m->partitions[PARTITION_ROOT_VERITY].found) {
+ /* We only support one verity partition per image, i.e. can't do for both /usr and root fs */
+ if (m->partitions[PARTITION_USR_VERITY].found)
+ return -ENOTUNIQ;
+
+ /* We don't support verity enabled root with a split out /usr. Neither with nor without
+ * verity there. (Note that we do support verity-less root with verity-full /usr, though.) */
+ if (m->partitions[PARTITION_USR].found)
+ return -EADDRNOTAVAIL;
+ }
+
+ if (verity) {
+ /* If a verity designator is specified, then insist that the matching partition exists */
+ if (verity->designator >= 0 && !m->partitions[verity->designator].found)
+ return -EADDRNOTAVAIL;
+
+ bool have_verity_sig_partition;
+ if (verity->designator >= 0)
+ have_verity_sig_partition = m->partitions[verity->designator == PARTITION_USR ? PARTITION_USR_VERITY_SIG : PARTITION_ROOT_VERITY_SIG].found;
+ else
+ have_verity_sig_partition = m->partitions[PARTITION_USR_VERITY_SIG].found || m->partitions[PARTITION_ROOT_VERITY_SIG].found;
+
+ if (verity->root_hash) {
+ /* If we have an explicit root hash and found the partitions for it, then we are ready to use
+ * Verity, set things up for it */
+
+ if (verity->designator < 0 || verity->designator == PARTITION_ROOT) {
+ if (!m->partitions[PARTITION_ROOT_VERITY].found || !m->partitions[PARTITION_ROOT].found)
+ return -EADDRNOTAVAIL;
+
+ /* If we found a verity setup, then the root partition is necessarily read-only. */
+ m->partitions[PARTITION_ROOT].rw = false;
+ m->verity_ready = true;
+
+ } else {
+ assert(verity->designator == PARTITION_USR);
+
+ if (!m->partitions[PARTITION_USR_VERITY].found || !m->partitions[PARTITION_USR].found)
+ return -EADDRNOTAVAIL;
+
+ m->partitions[PARTITION_USR].rw = false;
+ m->verity_ready = true;
+ }
+
+ if (m->verity_ready)
+ m->verity_sig_ready = verity->root_hash_sig || have_verity_sig_partition;
+
+ } else if (have_verity_sig_partition) {
+
+ /* If we found an embedded signature partition, we are ready, too. */
+
+ m->verity_ready = m->verity_sig_ready = true;
+ if (verity->designator >= 0)
+ m->partitions[verity->designator == PARTITION_USR ? PARTITION_USR : PARTITION_ROOT].rw = false;
+ else if (m->partitions[PARTITION_USR_VERITY_SIG].found)
+ m->partitions[PARTITION_USR].rw = false;
+ else if (m->partitions[PARTITION_ROOT_VERITY_SIG].found)
+ m->partitions[PARTITION_ROOT].rw = false;
+ }
+ }
+
+ bool any = false;
+
+ /* After we discovered all partitions let's see if the verity requirements match the policy. (Note:
+ * we don't check encryption requirements here, because we haven't probed the file system yet, hence
+ * don't know if this is encrypted or not) */
+ for (PartitionDesignator di = 0; di < _PARTITION_DESIGNATOR_MAX; di++) {
+ PartitionDesignator vi, si;
+ PartitionPolicyFlags found_flags;
+
+ any = any || m->partitions[di].found;
+
+ vi = partition_verity_of(di);
+ si = partition_verity_sig_of(di);
+
+ /* Determine the verity protection level for this partition. */
+ found_flags = m->partitions[di].found ?
+ (vi >= 0 && m->partitions[vi].found ?
+ (si >= 0 && m->partitions[si].found ? PARTITION_POLICY_SIGNED : PARTITION_POLICY_VERITY) :
+ PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED) :
+ (m->partitions[di].ignored ? PARTITION_POLICY_UNUSED : PARTITION_POLICY_ABSENT);
+
+ r = image_policy_check_protection(policy, di, found_flags);
+ if (r < 0)
+ return r;
+
+ if (m->partitions[di].found) {
+ r = image_policy_check_partition_flags(policy, di, m->partitions[di].gpt_flags);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ if (!any && !FLAGS_SET(flags, DISSECT_IMAGE_ALLOW_EMPTY))
+ return -ENOMSG;
+
+ r = dissected_image_probe_filesystems(m, fd, policy);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+#endif
+
+ /* Dissects the disk image stored in the file at 'path'.
+  *
+  * The file is opened read-only, checked with fd_verify_regular(), its sector size is probed, and it is
+  * then handed to dissect_image() together with the caller's Verity settings, mount options, image
+  * policy and flags. On success the resulting DissectedImage is handed to the caller through 'ret' if
+  * that is non-NULL.
+  *
+  * Returns 0 on success, a negative errno-style value on failure, and -EOPNOTSUPP when built without
+  * blkid support. */
+ int dissect_image_file(
+                 const char *path,
+                 const VeritySettings *verity,
+                 const MountOptions *mount_options,
+                 const ImagePolicy *image_policy,
+                 DissectImageFlags flags,
+                 DissectedImage **ret) {
+
+ #if HAVE_BLKID
+         _cleanup_(dissected_image_unrefp) DissectedImage *image = NULL;
+         _cleanup_close_ int image_fd = -EBADF;
+         int r;
+
+         assert(path);
+
+         image_fd = open(path, O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
+         if (image_fd < 0)
+                 return -errno;
+
+         /* Refuse anything fd_verify_regular() does not accept before doing any further work. */
+         r = fd_verify_regular(image_fd);
+         if (r < 0)
+                 return r;
+
+         r = dissected_image_new(path, &image);
+         if (r < 0)
+                 return r;
+
+         r = probe_sector_size(image_fd, &image->sector_size);
+         if (r < 0)
+                 return r;
+
+         r = dissect_image(image, image_fd, path, verity, mount_options, image_policy, flags);
+         if (r < 0)
+                 return r;
+
+         /* Pass the object on only if the caller actually wants it. */
+         if (ret)
+                 *ret = TAKE_PTR(image);
+
+         return 0;
+ #else
+         return -EOPNOTSUPP;
+ #endif
+ }
+
+ /* Turns the result 'r' of an image dissection attempt into a log message at 'log_level'.
+  *
+  * Non-negative values of 'r' are treated as success and returned unchanged without logging. For negative
+  * values a message specific to the error code is logged via log_full_errno(), whose result is returned.
+  * 'name' identifies the image in the message; 'verity' may be NULL and is only consulted for the -EBADR
+  * message. */
+ int dissect_log_error(int log_level, int r, const char *name, const VeritySettings *verity) {
+         assert(log_level >= 0 && log_level <= LOG_DEBUG);
+         assert(name);
+
+         /* Success: nothing to report, pass the value through. */
+         if (r >= 0)
+                 return r;
+
+         switch (r) {
+
+         case -EOPNOTSUPP:
+                 return log_full_errno(log_level, r, "Dissecting images is not supported, compiled without blkid support.");
+
+         case -ENOPKG:
+                 return log_full_errno(log_level, r, "%s: Couldn't identify a suitable partition table or file system.", name);
+
+         case -ENOMEDIUM:
+                 return log_full_errno(log_level, r, "%s: The image does not pass os-release/extension-release validation.", name);
+
+         case -EADDRNOTAVAIL:
+                 return log_full_errno(log_level, r, "%s: No root partition for specified root hash found.", name);
+
+         case -ENOTUNIQ:
+                 return log_full_errno(log_level, r, "%s: Multiple suitable root partitions found in image.", name);
+
+         case -ENXIO:
+                 return log_full_errno(log_level, r, "%s: No suitable root partition found in image.", name);
+
+         case -EPROTONOSUPPORT:
+                 return log_full_errno(log_level, r, "Device '%s' is a loopback block device with partition scanning turned off, please turn it on.", name);
+
+         case -ENOTBLK:
+                 return log_full_errno(log_level, r, "%s: Image is not a block device.", name);
+
+         case -EBADR:
+                 /* 'verity' is optional, hence guard the data path lookup. */
+                 return log_full_errno(log_level, r,
+                                       "Combining partitioned images (such as '%s') with external Verity data (such as '%s') not supported. "
+                                       "(Consider setting $SYSTEMD_DISSECT_VERITY_SIDECAR=0 to disable automatic discovery of external Verity data.)",
+                                       name, strna(verity ? verity->data_path : NULL));
+
+         case -ERFKILL:
+                 return log_full_errno(log_level, r, "%s: image does not match image policy.", name);
+
+         case -ENOMSG:
+                 return log_full_errno(log_level, r, "%s: no suitable partitions found.", name);
+
+         default:
+                 /* Anything else gets the generic message with the errno description appended. */
+                 return log_full_errno(log_level, r, "%s: cannot dissect image: %m", name);
+         }
+ }
+
+ /* Same as dissect_image_file(), but additionally reports failures at LOG_ERR level through
+  * dissect_log_error(), using 'path' as the image name in the message. Returns whatever
+  * dissect_log_error() returns for the dissection result. */
+ int dissect_image_file_and_warn(
+                 const char *path,
+                 const VeritySettings *verity,
+                 const MountOptions *mount_options,
+                 const ImagePolicy *image_policy,
+                 DissectImageFlags flags,
+                 DissectedImage **ret) {
+
+         int r;
+
+         r = dissect_image_file(path, verity, mount_options, image_policy, flags, ret);
+
+         return dissect_log_error(LOG_ERR, r, path, verity);
+ }
+
+ /* Releases a DissectedImage and everything it owns. Accepts NULL. Always returns NULL, so that callers
+  * can reset their pointer in the same statement. */
+ DissectedImage* dissected_image_unref(DissectedImage *m) {
+         if (!m)
+                 return NULL;
+
+         /* The order of the next three steps matters.
+          *
+          * Step 1: release the per-partition state. */
+         for (PartitionDesignator d = 0; d < _PARTITION_DESIGNATOR_MAX; d++)
+                 dissected_partition_done(&m->partitions[d]);
+
+         /* Step 2: drop the decrypted images. This has to come after step 1, since freeing a
+          * DecryptedImage may try to deactivate partitions. */
+         decrypted_image_unref(m->decrypted_image);
+
+         /* Step 3: drop the LoopDevice. This has to come after both steps above, since freeing a
+          * LoopDevice may try to remove existing partitions on the loopback block device. */
+         loop_device_unref(m->loop);
+
+         /* The remaining members are plain heap data, independent of each other. */
+         strv_free(m->machine_info);
+         strv_free(m->os_release);
+         strv_free(m->initrd_release);
+         strv_free(m->confext_release);
+         strv_free(m->sysext_release);
+         free(m->hostname);
+         free(m->image_name);
+
+         return mfree(m);
+ }
+
+ /* Checks whether the block device node at 'path' is a loopback block device, or a partition of one.
+  *
+  * Returns > 0 if it is, 0 if it is not, -ENOTBLK if 'path' does not refer to a block device node, and
+  * another negative errno-style value if stat() or access() fail unexpectedly. */
+ static int is_loop_device(const char *path) {
+         char s[SYS_BLOCK_PATH_MAX("/../loop/")];
+         struct stat st;
+
+         assert(path);
+
+         if (stat(path, &st) < 0)
+                 return -errno;
+
+         if (!S_ISBLK(st.st_mode))
+                 return -ENOTBLK;
+
+         /* Note that we must use st_rdev here, not st_dev: for a device node st_dev identifies the file
+          * system the node is stored on, while st_rdev identifies the block device the node refers to,
+          * and it is the latter we want to look up. */
+         xsprintf_sys_block_path(s, "/loop/", st.st_rdev);
+         if (access(s, F_OK) < 0) {
+                 if (errno != ENOENT)
+                         return -errno;
+
+                 /* The device itself isn't a loop device, but maybe it's a partition and its parent is? */
+                 xsprintf_sys_block_path(s, "/../loop/", st.st_rdev);
+                 if (access(s, F_OK) < 0)
+                         return errno == ENOENT ? false : -errno;
+         }
+
+         return true;
+ }
+
+ /* Runs "fsck -aT" on the block device referenced by 'node_fd', whose file system type is 'fstype'.
+  *
+  * Returns 0 if the check succeeded, if it was skipped (no fsck helper for the file system type, or its
+  * existence could not be determined), or if fsck reported only problems we choose to ignore. Returns
+  * -EUCLEAN if fsck's exit status says that errors were left uncorrected or that a reboot is needed, and
+  * another negative errno-style value if forking or waiting for the child fails. */
+ static int run_fsck(int node_fd, const char *fstype) {
+         int r, exit_status;
+         pid_t pid;
+
+         assert(node_fd >= 0);
+         assert(fstype);
+
+         r = fsck_exists_for_fstype(fstype);
+         if (r < 0) {
+                 log_debug_errno(r, "Couldn't determine whether fsck for %s exists, proceeding anyway.", fstype);
+                 return 0;
+         }
+         if (r == 0) {
+                 log_debug("Not checking partition %s, as fsck for %s does not exist.", FORMAT_PROC_FD_PATH(node_fd), fstype);
+                 return 0;
+         }
+
+         r = safe_fork_full(
+                         "(fsck)",
+                         NULL,
+                         &node_fd, 1, /* Leave the node fd open */
+                         FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_RLIMIT_NOFILE_SAFE|FORK_DEATHSIG_SIGTERM|FORK_REARRANGE_STDIO|FORK_CLOEXEC_OFF,
+                         &pid);
+         if (r < 0)
+                 return log_debug_errno(r, "Failed to fork off fsck: %m");
+         if (r == 0) {
+                 /* Child: the device is passed to fsck by its /proc/ fd path, since we only have an fd. */
+                 execlp("fsck", "fsck", "-aT", FORMAT_PROC_FD_PATH(node_fd), NULL);
+                 log_open();
+                 log_debug_errno(errno, "Failed to execl() fsck: %m");
+                 _exit(FSCK_OPERATIONAL_ERROR);
+         }
+
+         /* Parent: collect the exit status. A negative value means waiting itself failed, which is
+          * different from a failure to fork, hence report it as such. */
+         exit_status = wait_for_terminate_and_check("fsck", pid, 0);
+         if (exit_status < 0)
+                 return log_debug_errno(exit_status, "Failed to wait for fsck: %m");
+
+         /* "Errors corrected" counts as success; mask that bit before comparing. */
+         if ((exit_status & ~FSCK_ERROR_CORRECTED) != FSCK_SUCCESS) {
+                 log_debug("fsck failed with exit status %i.", exit_status);
+
+                 if ((exit_status & (FSCK_SYSTEM_SHOULD_REBOOT|FSCK_ERRORS_LEFT_UNCORRECTED)) != 0)
+                         return log_debug_errno(SYNTHETIC_ERRNO(EUCLEAN), "File system is corrupted, refusing.");
+
+                 log_debug("Ignoring fsck error.");
+         }
+
+         return 0;
+ }
+
+ /* Grows the mounted file system to the size of the block device it is backed by.
+  *
+  * 'node_path' is the block device node whose size (BLKGETSIZE64) is the target size. The mounted file
+  * system is referenced either by 'mount_fd' (if >= 0) or by 'mount_path'; at least one of the two must
+  * be given. Progress and errors are logged at debug level.
+  *
+  * Returns 0 on success and a negative errno-style value on failure. */
+ static int fs_grow(const char *node_path, int mount_fd, const char *mount_path) {
+         _cleanup_close_ int _mount_fd = -EBADF, node_fd = -EBADF;
+         uint64_t size, newsize;
+         const char *id;
+         int r;
+
+         assert(node_path);
+         assert(mount_fd >= 0 || mount_path);
+
+         node_fd = open(node_path, O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
+         if (node_fd < 0)
+                 return log_debug_errno(errno, "Failed to open node device %s: %m", node_path);
+
+         if (ioctl(node_fd, BLKGETSIZE64, &size) != 0)
+                 return log_debug_errno(errno, "Failed to get block device size of %s: %m", node_path);
+
+         if (mount_fd < 0) {
+                 assert(mount_path);
+
+                 _mount_fd = open(mount_path, O_RDONLY|O_DIRECTORY|O_CLOEXEC);
+                 if (_mount_fd < 0)
+                         return log_debug_errno(errno, "Failed to open mounted file system %s: %m", mount_path);
+
+                 mount_fd = _mount_fd;
+         } else {
+                 /* fd_reopen_condition() reports failure through its return value rather than through
+                  * errno, hence log the returned error and only overwrite mount_fd on success. */
+                 r = fd_reopen_condition(mount_fd, O_RDONLY|O_DIRECTORY|O_CLOEXEC, O_RDONLY|O_DIRECTORY|O_CLOEXEC, &_mount_fd);
+                 if (r < 0)
+                         return log_debug_errno(r, "Failed to reopen mount node: %m");
+
+                 mount_fd = r;
+         }
+
+         /* Prefer the mount path for log messages, fall back to the device node. */
+         id = mount_path ?: node_path;
+
+         log_debug("Resizing \"%s\" to %"PRIu64" bytes...", id, size);
+         r = resize_fs(mount_fd, size, &newsize);
+         if (r < 0)
+                 return log_debug_errno(r, "Failed to resize \"%s\" to %"PRIu64" bytes: %m", id, size);
+
+         if (newsize == size)
+                 log_debug("Successfully resized \"%s\" to %s bytes.",
+                           id, FORMAT_BYTES(newsize));
+         else {
+                 /* The file system may end up slightly smaller than the device, never larger. */
+                 assert(newsize < size);
+                 log_debug("Successfully resized \"%s\" to %s bytes (%"PRIu64" bytes lost due to blocksize).",
+                           id, FORMAT_BYTES(newsize), size - newsize);
+         }
+
+         return 0;
+ }
+
+ /* Picks the baseline mount flags and mount option string for a partition with designator 'd'.
+  *
+  * 'fstype' may be NULL if unknown. 'rw' selects a writable mount, 'discard' requests the "discard"
+  * option where the file system supports it. The option string (possibly NULL if there are no options)
+  * is returned in 'ret_options'. If 'ret_ms_flags' is non-NULL the MS_* flags are returned there;
+  * otherwise they are folded into the option string in textual form.
+  *
+  * Returns 0 on success, -ENOMEM on allocation failure. */
+ int partition_pick_mount_options(
+                 PartitionDesignator d,
+                 const char *fstype,
+                 bool rw,
+                 bool discard,
+                 char **ret_options,
+                 unsigned long *ret_ms_flags) {
+
+         _cleanup_free_ char *opts = NULL;
+         unsigned long ms;
+
+         assert(ret_options);
+
+         /* Baseline that always applies: MS_NODEV goes on every mount, since we don't want to allow
+          * device nodes outside of /dev/. Read-only mounts additionally get MS_RDONLY. */
+         ms = MS_NODEV;
+         if (!rw)
+                 ms |= MS_RDONLY;
+
+         if (d == PARTITION_ESP || d == PARTITION_XBOOTLDR) {
+                 /* ESP and XBOOTLDR are generally untrusted (neither encrypted nor authenticated) and
+                  * typically VFAT, hence be as restrictive as possible: no setuid, no execution, and no
+                  * symlinks where supported. That functionality isn't available there anyway. */
+                 ms |= MS_NOSUID|MS_NOEXEC|ms_nosymfollow_supported();
+
+                 /* The ESP might contain a pre-boot random seed, which regular userspace shall not be
+                  * able to access. If the file system type is unknown, assume VFAT, i.e. that umask= is
+                  * supported. */
+                 if ((!fstype || fstype_can_umask(fstype)) &&
+                     !strextend_with_separator(&opts, ",", "umask=0077"))
+                         return -ENOMEM;
+
+         } else if (d == PARTITION_TMP)
+                 /* /var/tmp/ gets MS_NOSUID, same as /tmp/ on the host. */
+                 ms |= MS_NOSUID;
+
+         /* MS_RDONLY alone does not stop ext4/xfs/btrfs from writing to the backing device (e.g. for
+          * journal replay), and neither BLKRO[GS]ET nor LO_FLAGS_READ_ONLY prevent writes issued by the
+          * file system itself. Only "norecovery" in addition to MS_RDONLY does, which is why we consult a
+          * per-file-system table here.
+          *
+          * The price is that some corrupted file systems can no longer be mounted read-only. For
+          * automatically picked options on loopback devices that is the right trade-off: using the same
+          * immutable image twice, even read-only, would otherwise fail badly sooner or later. Whoever
+          * wants their file system fixed up should open it in writable mode. */
+         if (!rw && fstype && fstype_can_norecovery(fstype) &&
+             !strextend_with_separator(&opts, ",", "norecovery"))
+                 return -ENOMEM;
+
+         if (discard && fstype && fstype_can_discard(fstype) &&
+             !strextend_with_separator(&opts, ",", "discard"))
+                 return -ENOMEM;
+
+         if (ret_ms_flags)
+                 *ret_ms_flags = ms;
+         else if (!strextend_with_separator(&opts, ",",
+                                            FLAGS_SET(ms, MS_RDONLY) ? "ro" : "rw",
+                                            FLAGS_SET(ms, MS_NODEV) ? "nodev" : "dev",
+                                            FLAGS_SET(ms, MS_NOSUID) ? "nosuid" : "suid",
+                                            FLAGS_SET(ms, MS_NOEXEC) ? "noexec" : "exec",
+                                            FLAGS_SET(ms, MS_NOSYMFOLLOW) ? "nosymfollow" : NULL))
+                 /* The caller did not ask for the flags separately, hence they are folded into the option
+                  * string. NB: 'symfollow' is suppressed, since it's the default, and old /bin/mount might
+                  * not know it. */
+                 return -ENOMEM;
+
+         *ret_options = TAKE_PTR(opts);
+         return 0;
+ }
+
+static bool need_user_mapping(uid_t uid_shift, uid_t uid_range) {
+        /* A mapping is needed only if a valid shift was passed and the requested mapping is not the
+         * trivial one, i.e. anything but a shift of 0 combined with a range of UINT32_MAX. */
+        bool trivial = uid_shift == 0 && uid_range == UINT32_MAX;
+
+        return uid_is_valid(uid_shift) && !trivial;
+}
+
+static int mount_partition(
+ PartitionDesignator d,
+ DissectedPartition *m,
+ const char *where,
+ const char *directory,
+ uid_t uid_shift,
+ uid_t uid_range,
+ int userns_fd,
+ DissectImageFlags flags) {
+
+ /* Mounts the single dissected partition 'm', or prepares an fsmount fd for it.
+ *
+ * where != NULL, directory != NULL → mount onto 'directory' resolved below the root 'where'
+ * where != NULL, directory == NULL → mount onto 'where' itself
+ * where == NULL → only create an fsmount fd and store it in m->fsmount_fd
+ *
+ * Returns 0 if there was nothing to do, 1 if a mount or fsmount fd was established, and a
+ * negative errno-style value on failure (among them -EAFNOSUPPORT, -EUNATCH and -EIDRM, see
+ * below). */
+
+ _cleanup_free_ char *chased = NULL, *options = NULL;
+ const char *p = NULL, *node, *fstype = NULL;
+ bool rw, discard, grow;
+ unsigned long ms_flags;
+ int r;
+
+ assert(m);
+
+ if (!m->found)
+ return 0;
+
+ /* Check the various combinations when we can't do anything anymore */
+ if (m->fsmount_fd < 0 && m->mount_node_fd < 0)
+ return 0;
+ if (m->fsmount_fd >= 0 && !where)
+ return 0;
+ if (!where && m->mount_node_fd < 0)
+ return 0;
+
+ /* The file system type only matters if we still have to create the superblock ourselves */
+ if (m->fsmount_fd < 0) {
+ fstype = dissected_partition_fstype(m);
+ if (!fstype)
+ return -EAFNOSUPPORT;
+
+ /* We are looking at an encrypted partition? This either means stacked encryption, or the
+ * caller didn't call dissected_image_decrypt() beforehand. Let's return a recognizable error
+ * for this case. */
+ if (streq(fstype, "crypto_LUKS"))
+ return -EUNATCH;
+
+ r = dissect_fstype_ok(fstype);
+ if (r < 0)
+ return r;
+ if (!r)
+ return -EIDRM; /* Recognizable error */
+ }
+
+ /* Reference the block device via its already open fd rather than by path */
+ node = m->mount_node_fd < 0 ? NULL : FORMAT_PROC_FD_PATH(m->mount_node_fd);
+ rw = m->rw && !(flags & DISSECT_IMAGE_MOUNT_READ_ONLY);
+
+ discard = ((flags & DISSECT_IMAGE_DISCARD) ||
+ ((flags & DISSECT_IMAGE_DISCARD_ON_LOOP) && (m->node && is_loop_device(m->node) > 0)));
+
+ grow = rw && m->growfs && FLAGS_SET(flags, DISSECT_IMAGE_GROWFS);
+
+ /* fsck is only run for writable mounts for which we create the superblock ourselves */
+ if (FLAGS_SET(flags, DISSECT_IMAGE_FSCK) && rw && m->mount_node_fd >= 0 && m->fsmount_fd < 0) {
+ r = run_fsck(m->mount_node_fd, fstype);
+ if (r < 0)
+ return r;
+ }
+
+ if (where) {
+ if (directory) {
+ /* Automatically create missing mount points inside the image, if necessary. */
+ r = mkdir_p_root(where, directory, uid_shift, (gid_t) uid_shift, 0755, NULL);
+ if (r < 0 && r != -EROFS)
+ return r;
+
+ r = chase(directory, where, CHASE_PREFIX_ROOT, &chased, NULL);
+ if (r < 0)
+ return r;
+
+ p = chased;
+ } else {
+ /* Create top-level mount if missing – but only if this is asked for. This won't modify the
+ * image (as the branch above does) but the host hierarchy, and the created directory might
+ * survive our mount in the host hierarchy hence. */
+ if (FLAGS_SET(flags, DISSECT_IMAGE_MKDIR)) {
+ r = mkdir_p(where, 0755);
+ if (r < 0)
+ return r;
+ }
+
+ p = where;
+ }
+ }
+
+ /* Mount flags and options are only needed if no superblock was prepared via fsmount yet. Note that
+ * 'ms_flags' stays uninitialized otherwise, and is not read on that path. */
+ if (m->fsmount_fd < 0) {
+ r = partition_pick_mount_options(d, fstype, rw, discard, &options, &ms_flags);
+ if (r < 0)
+ return r;
+
+ /* File systems that take uid=/gid= options get the shift applied that way, so no idmapped
+ * mount is set up for them */
+ if (need_user_mapping(uid_shift, uid_range) && fstype_can_uid_gid(fstype)) {
+ _cleanup_free_ char *uid_option = NULL;
+
+ if (asprintf(&uid_option, "uid=" UID_FMT ",gid=" GID_FMT, uid_shift, (gid_t) uid_shift) < 0)
+ return -ENOMEM;
+
+ if (!strextend_with_separator(&options, ",", uid_option))
+ return -ENOMEM;
+
+ userns_fd = -EBADF; /* Not needed */
+ }
+
+ /* Per-partition mount options are appended last */
+ if (!isempty(m->mount_options))
+ if (!strextend_with_separator(&options, ",", m->mount_options))
+ return -ENOMEM;
+ }
+
+ if (p) {
+ if (m->fsmount_fd >= 0) {
+ /* Case #1: Attach existing fsmount fd to the file system */
+
+ r = mount_exchange_graceful(
+ m->fsmount_fd,
+ p,
+ FLAGS_SET(flags, DISSECT_IMAGE_TRY_ATOMIC_MOUNT_EXCHANGE));
+ if (r < 0)
+ return log_debug_errno(r, "Failed to mount image on '%s': %m", p);
+
+ } else {
+ assert(node);
+
+ /* Case #2: Mount directly into place */
+ r = mount_nofollow_verbose(LOG_DEBUG, node, p, fstype, ms_flags, options);
+ if (r < 0)
+ return r;
+
+ /* Growing the file system is best-effort, failures are ignored */
+ if (grow)
+ (void) fs_grow(node, -EBADF, p);
+
+ if (userns_fd >= 0) {
+ r = remount_idmap_fd(STRV_MAKE(p), userns_fd);
+ if (r < 0)
+ return r;
+ }
+ }
+ } else {
+ assert(node);
+
+ /* Case #3: Create fsmount fd */
+
+ m->fsmount_fd = make_fsmount(LOG_DEBUG, node, fstype, ms_flags, options, userns_fd);
+ if (m->fsmount_fd < 0)
+ return m->fsmount_fd;
+
+ /* Growing the file system is best-effort, failures are ignored */
+ if (grow)
+ (void) fs_grow(node, m->fsmount_fd, NULL);
+ }
+
+ return 1;
+}
+
+static int mount_root_tmpfs(const char *where, uid_t uid_shift, uid_t uid_range, DissectImageFlags flags) {
+        _cleanup_free_ char *tmpfs_options = NULL;
+        int r;
+
+        assert(where);
+
+        /* Images that carry /usr/ but no root file system get a tmpfs as root. Returns 1 once the tmpfs
+         * is mounted, or a negative errno-style value on failure. */
+
+        /* The mount point on the host is only created if explicitly requested */
+        r = FLAGS_SET(flags, DISSECT_IMAGE_MKDIR) ? mkdir_p(where, 0755) : 0;
+        if (r < 0)
+                return r;
+
+        if (need_user_mapping(uid_shift, uid_range) &&
+            asprintf(&tmpfs_options, "uid=" UID_FMT ",gid=" GID_FMT, uid_shift, (gid_t) uid_shift) < 0)
+                return -ENOMEM;
+
+        r = mount_nofollow_verbose(LOG_DEBUG, "rootfs", where, "tmpfs", MS_NODEV, tmpfs_options);
+        return r < 0 ? r : 1;
+}
+
+static int mount_point_is_available(const char *where, const char *path, bool missing_ok) {
+        _cleanup_free_ char *resolved = NULL;
+        int r;
+
+        /* Tells whether <path>, resolved below the root <where>, can be used as a mount point. Returns
+         * > 0 if it is an empty directory, or if it does not exist and missing_ok is set; 0 if it is
+         * not usable; a negative errno-style value on failure. */
+
+        r = chase(path, where, CHASE_PREFIX_ROOT, &resolved, NULL);
+        if (r < 0) {
+                if (r == -ENOENT)
+                        return missing_ok;
+
+                return log_debug_errno(r, "Failed to chase \"%s\": %m", path);
+        }
+
+        r = dir_is_empty(resolved, /* ignore_hidden_or_backup= */ false);
+        if (r < 0) {
+                /* Something that is not a directory cannot be mounted onto */
+                if (r == -ENOTDIR)
+                        return false;
+
+                return log_debug_errno(r, "Failed to check directory \"%s\": %m", resolved);
+        }
+
+        return r > 0;
+}
+
+int dissected_image_mount(
+                DissectedImage *m,
+                const char *where,
+                uid_t uid_shift,
+                uid_t uid_range,
+                int userns_fd,
+                DissectImageFlags flags) {
+
+        _cleanup_close_ int my_userns_fd = -EBADF;
+        int r;
+
+        assert(m);
+
+        /* If 'where' is NULL then we'll use the new mount API to create fsmount() fds for the mounts and
+         * store them in DissectedPartition.fsmount_fd.
+         *
+         * If 'where' is not NULL then we'll either mount the partitions to the right places ourselves,
+         * or use DissectedPartition.fsmount_fd and bind it to the right places.
+         *
+         * This allows splitting the setting up of the superblocks and the binding to file system paths into
+         * two distinct and differently privileged components: one that gets the fsmount fds, and the other
+         * that then applies them.
+         *
+         * Returns 0 on success, or:
+         *
+         *  -ENXIO        → No root partition found
+         *  -EOPNOTSUPP   → 'where' is NULL but one of the DISSECT_IMAGE_VALIDATE_OS* flags is set
+         *  -ENOMEDIUM    → DISSECT_IMAGE_VALIDATE_OS/DISSECT_IMAGE_VALIDATE_OS_EXT set but no suitable
+         *                  os-release/extension-release file found
+         *  -EUNATCH      → Encrypted partition found for which no dm-crypt was set up yet
+         *  -EUCLEAN      → fsck for file system failed
+         *  -EBUSY        → File system already mounted/used elsewhere (kernel)
+         *  -EAFNOSUPPORT → File system type not supported or not known
+         *  -EIDRM        → File system is not among allowlisted "common" file systems
+         */
+
+        if (!where && (flags & (DISSECT_IMAGE_VALIDATE_OS|DISSECT_IMAGE_VALIDATE_OS_EXT)) != 0)
+                return -EOPNOTSUPP; /* for now, not supported */
+
+        if (!(m->partitions[PARTITION_ROOT].found ||
+              (m->partitions[PARTITION_USR].found && FLAGS_SET(flags, DISSECT_IMAGE_USR_NO_ROOT))))
+                return -ENXIO; /* Require a root fs or at least a /usr/ fs (the latter is subject to a flag of its own) */
+
+        /* If idmapped mounts are requested but the caller passed no user namespace, allocate one that
+         * lives only as long as this call. */
+        if (userns_fd < 0 && need_user_mapping(uid_shift, uid_range) && FLAGS_SET(flags, DISSECT_IMAGE_MOUNT_IDMAPPED)) {
+
+                my_userns_fd = make_userns(uid_shift, uid_range, UID_INVALID, REMOUNT_IDMAPPING_HOST_ROOT);
+                if (my_userns_fd < 0)
+                        return my_userns_fd;
+
+                userns_fd = my_userns_fd;
+        }
+
+        if ((flags & DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY) == 0) {
+
+                /* First mount the root fs. If there's none we use a tmpfs. */
+                if (m->partitions[PARTITION_ROOT].found) {
+                        r = mount_partition(PARTITION_ROOT, m->partitions + PARTITION_ROOT, where, NULL, uid_shift, uid_range, userns_fd, flags);
+                        if (r < 0)
+                                return r;
+
+                } else if (where) {
+                        r = mount_root_tmpfs(where, uid_shift, uid_range, flags);
+                        if (r < 0)
+                                return r;
+                }
+
+                /* For us mounting root always means mounting /usr as well */
+                r = mount_partition(PARTITION_USR, m->partitions + PARTITION_USR, where, "/usr", uid_shift, uid_range, userns_fd, flags);
+                if (r < 0)
+                        return r;
+        }
+
+        if ((flags & DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY) == 0 &&
+            (flags & (DISSECT_IMAGE_VALIDATE_OS|DISSECT_IMAGE_VALIDATE_OS_EXT)) != 0) {
+                /* If either one of the validation flags are set, ensure that the image qualifies as
+                 * one or the other (or both). */
+                bool ok = false;
+
+                assert(where);
+
+                if (FLAGS_SET(flags, DISSECT_IMAGE_VALIDATE_OS)) {
+                        r = path_is_os_tree(where);
+                        if (r < 0)
+                                return r;
+                        if (r > 0)
+                                ok = true;
+                }
+                if (!ok && FLAGS_SET(flags, DISSECT_IMAGE_VALIDATE_OS_EXT) && m->image_name) {
+                        r = extension_has_forbidden_content(where);
+                        if (r < 0)
+                                return r;
+                        if (r == 0) {
+                                /* Accept the image if it qualifies as either a sysext or a confext */
+                                r = path_is_extension_tree(IMAGE_SYSEXT, where, m->image_name, FLAGS_SET(flags, DISSECT_IMAGE_RELAX_EXTENSION_CHECK));
+                                if (r == 0)
+                                        r = path_is_extension_tree(IMAGE_CONFEXT, where, m->image_name, FLAGS_SET(flags, DISSECT_IMAGE_RELAX_EXTENSION_CHECK));
+                                if (r < 0)
+                                        return r;
+                                if (r > 0)
+                                        ok = true;
+                        }
+                }
+
+                if (!ok)
+                        return -ENOMEDIUM;
+        }
+
+        if (flags & DISSECT_IMAGE_MOUNT_ROOT_ONLY)
+                return 0;
+
+        r = mount_partition(PARTITION_HOME, m->partitions + PARTITION_HOME, where, "/home", uid_shift, uid_range, userns_fd, flags);
+        if (r < 0)
+                return r;
+
+        r = mount_partition(PARTITION_SRV, m->partitions + PARTITION_SRV, where, "/srv", uid_shift, uid_range, userns_fd, flags);
+        if (r < 0)
+                return r;
+
+        r = mount_partition(PARTITION_VAR, m->partitions + PARTITION_VAR, where, "/var", uid_shift, uid_range, userns_fd, flags);
+        if (r < 0)
+                return r;
+
+        r = mount_partition(PARTITION_TMP, m->partitions + PARTITION_TMP, where, "/var/tmp", uid_shift, uid_range, userns_fd, flags);
+        if (r < 0)
+                return r;
+
+        int slash_boot_is_available = 0;
+        if (where) {
+                r = slash_boot_is_available = mount_point_is_available(where, "/boot", /* missing_ok = */ true);
+                if (r < 0)
+                        return r;
+        }
+        if (!where || slash_boot_is_available) {
+                r = mount_partition(PARTITION_XBOOTLDR, m->partitions + PARTITION_XBOOTLDR, where, "/boot", uid_shift, uid_range, userns_fd, flags);
+                if (r < 0)
+                        return r;
+                /* mount_partition() returns > 0 if it mounted something, in which case /boot is taken now */
+                slash_boot_is_available = !r;
+        }
+
+        if (m->partitions[PARTITION_ESP].found) {
+                const char *esp_path = NULL;
+
+                if (where) {
+                        /* Mount the ESP to /boot/ if it exists and is empty and we didn't already mount the
+                         * XBOOTLDR partition into it. Otherwise, use /efi instead, but only if it exists
+                         * and is empty. */
+
+                        if (slash_boot_is_available) {
+                                r = mount_point_is_available(where, "/boot", /* missing_ok = */ false);
+                                if (r < 0)
+                                        return r;
+                                if (r > 0)
+                                        esp_path = "/boot";
+                        }
+
+                        if (!esp_path) {
+                                r = mount_point_is_available(where, "/efi", /* missing_ok = */ true);
+                                if (r < 0)
+                                        return r;
+                                if (r > 0)
+                                        esp_path = "/efi";
+                        }
+                }
+
+                /* OK, let's mount the ESP now (possibly creating the dir if missing). If we are mounting
+                 * into a hierarchy but found no usable mount point, skip the ESP: passing a NULL directory
+                 * together with a non-NULL 'where' makes mount_partition() mount onto 'where' itself, which
+                 * would place the ESP on top of the root file system. Without 'where' a NULL directory is
+                 * fine, since then only an fsmount fd is created. */
+                if (!where || esp_path) {
+                        r = mount_partition(PARTITION_ESP, m->partitions + PARTITION_ESP, where, esp_path, uid_shift, uid_range, userns_fd, flags);
+                        if (r < 0)
+                                return r;
+                }
+        }
+
+        return 0;
+}
+
+int dissected_image_mount_and_warn(
+                DissectedImage *m,
+                const char *where,
+                uid_t uid_shift,
+                uid_t uid_range,
+                int userns_fd,
+                DissectImageFlags flags) {
+
+        int r;
+
+        assert(m);
+
+        /* Same as dissected_image_mount(), but logs an explanatory message at error level for each of
+         * the recognizable errors, and a generic one for everything else. The return value is passed
+         * through unmodified. */
+
+        r = dissected_image_mount(m, where, uid_shift, uid_range, userns_fd, flags);
+        if (r == -ENXIO)
+                return log_error_errno(r, "No root file system found in image.");
+        /* dissected_image_mount() reports a failed OS/extension validation as -ENOMEDIUM. Keep
+         * recognizing -EMEDIUMTYPE too, which is what used to be checked for here. */
+        if (IN_SET(r, -ENOMEDIUM, -EMEDIUMTYPE))
+                return log_error_errno(r, "No suitable os-release/extension-release file in image found.");
+        if (r == -EUNATCH)
+                return log_error_errno(r, "Encrypted file system discovered, but decryption not requested.");
+        if (r == -EUCLEAN)
+                return log_error_errno(r, "File system check on image failed.");
+        if (r == -EBUSY)
+                return log_error_errno(r, "File system already mounted elsewhere.");
+        if (r == -EAFNOSUPPORT)
+                return log_error_errno(r, "File system type not supported or not known.");
+        if (r == -EIDRM)
+                return log_error_errno(r, "File system is too uncommon, refused.");
+        if (r < 0)
+                return log_error_errno(r, "Failed to mount image: %m");
+
+        return r;
+}
+
+#if HAVE_LIBCRYPTSETUP
+/* One device-mapper volume (LUKS or verity) that was set up on behalf of a dissected image */
+struct DecryptedPartition {
+ struct crypt_device *device; /* libcryptsetup handle, released with sym_crypt_free() */
+ char *name; /* name of the DM volume, i.e. the file name below /dev/mapper */
+ bool relinquished; /* if true, decrypted_image_free() won't deactivate the volume */
+};
+#endif
+
+/* Without libcryptsetup the struct above is never defined and this remains an incomplete type */
+typedef struct DecryptedPartition DecryptedPartition;
+
+/* Reference counted collection of all DM volumes set up for one image */
+struct DecryptedImage {
+ unsigned n_ref; /* reference counter, starts out at 1 */
+ DecryptedPartition *decrypted; /* array of volumes, grown with GREEDY_REALLOC0() */
+ size_t n_decrypted; /* number of valid entries in 'decrypted' */
+};
+
+static DecryptedImage* decrypted_image_free(DecryptedImage *d) {
+#if HAVE_LIBCRYPTSETUP
+        int r;
+
+        /* Releases all volumes tracked in 'd' and the object itself. Volumes that were not relinquished
+         * are deactivated (deferred) first. Always returns NULL. */
+
+        if (!d)
+                return NULL;
+
+        for (size_t idx = 0; idx < d->n_decrypted; idx++) {
+                DecryptedPartition *entry = &d->decrypted[idx];
+                bool still_ours = entry->device && entry->name && !entry->relinquished;
+
+                if (still_ours) {
+                        _cleanup_free_ char *dm_path = path_join("/dev/mapper", entry->name);
+
+                        if (!dm_path)
+                                log_oom_debug();
+                        else {
+                                r = btrfs_forget_device(dm_path);
+                                if (r < 0 && r != -ENOENT)
+                                        log_debug_errno(r, "Failed to forget btrfs device %s, ignoring: %m", dm_path);
+                        }
+
+                        /* Let's deactivate lazily, as the dm volume may be already/still used by other processes. */
+                        r = sym_crypt_deactivate_by_name(entry->device, entry->name, CRYPT_DEACTIVATE_DEFERRED);
+                        if (r < 0)
+                                log_debug_errno(r, "Failed to deactivate encrypted partition %s", entry->name);
+                }
+
+                if (entry->device)
+                        sym_crypt_free(entry->device);
+                free(entry->name);
+        }
+
+        free(d->decrypted);
+        free(d);
+#endif
+        return NULL;
+}
+
+DEFINE_TRIVIAL_REF_UNREF_FUNC(DecryptedImage, decrypted_image, decrypted_image_free);
+
+#if HAVE_LIBCRYPTSETUP
+static int decrypted_image_new(DecryptedImage **ret) {
+        DecryptedImage *image;
+
+        assert(ret);
+
+        /* Allocates an empty DecryptedImage. Returns 0 on success, -ENOMEM if allocation fails. */
+
+        image = new(DecryptedImage, 1);
+        if (!image)
+                return -ENOMEM;
+
+        /* Start out with a single reference and no volumes registered */
+        *image = (DecryptedImage) {
+                .n_ref = 1,
+        };
+
+        *ret = image;
+        return 0;
+}
+
+static int make_dm_name_and_node(const void *original_node, const char *suffix, char **ret_name, char **ret_node) {
+        _cleanup_free_ char *dm_name = NULL, *dm_node = NULL;
+        const char *base, *slash;
+
+        assert(original_node);
+        assert(suffix);
+        assert(ret_name);
+        assert(ret_node);
+
+        /* Builds a device-mapper volume name from the last path component of 'original_node' plus
+         * 'suffix', along with the path of the matching node below the directory reported by
+         * crypt_get_dir(). Returns -EINVAL if no valid file name can be formed that way. */
+
+        /* Everything after the final slash, or the whole string if there is none */
+        base = original_node;
+        slash = strrchr(base, '/');
+        if (slash)
+                base = slash + 1;
+        if (isempty(base))
+                return -EINVAL;
+
+        dm_name = strjoin(base, suffix);
+        if (!dm_name)
+                return -ENOMEM;
+        if (!filename_is_valid(dm_name))
+                return -EINVAL;
+
+        dm_node = path_join(sym_crypt_get_dir(), dm_name);
+        if (!dm_node)
+                return -ENOMEM;
+
+        *ret_name = TAKE_PTR(dm_name);
+        *ret_node = TAKE_PTR(dm_node);
+
+        return 0;
+}
+
+static int decrypt_partition(
+                DissectedPartition *m,
+                const char *passphrase,
+                DissectImageFlags flags,
+                DecryptedImage *d) {
+
+        _cleanup_free_ char *node = NULL, *name = NULL;
+        _cleanup_(sym_crypt_freep) struct crypt_device *cd = NULL;
+        _cleanup_close_ int fd = -EBADF;
+        int r;
+
+        assert(m);
+        assert(d);
+
+        /* Unlocks partition 'm' with 'passphrase' if it is a LUKS volume, registers the resulting DM
+         * volume in 'd', and makes the partition's mount node fd refer to the decrypted device.
+         *
+         * Returns 0 on success and also if the partition is not a LUKS volume, -ENOKEY if there's
+         * something to unlock but no passphrase was passed, -EKEYREJECTED if activation failed with
+         * -EPERM, or another negative errno-style value. */
+
+        if (!m->found || !m->node || !m->fstype)
+                return 0;
+
+        if (!streq(m->fstype, "crypto_LUKS"))
+                return 0;
+
+        if (!passphrase)
+                return -ENOKEY;
+
+        r = dlopen_cryptsetup();
+        if (r < 0)
+                return r;
+
+        r = make_dm_name_and_node(m->node, "-decrypted", &name, &node);
+        if (r < 0)
+                return r;
+
+        /* Make room for the new entry up front, so that registering it below cannot fail anymore */
+        if (!GREEDY_REALLOC0(d->decrypted, d->n_decrypted + 1))
+                return -ENOMEM;
+
+        r = sym_crypt_init(&cd, m->node);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to initialize dm-crypt: %m");
+
+        cryptsetup_enable_logging(cd);
+
+        r = sym_crypt_load(cd, CRYPT_LUKS, NULL);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to load LUKS metadata: %m");
+
+        r = sym_crypt_activate_by_passphrase(cd, name, CRYPT_ANY_SLOT, passphrase, strlen(passphrase),
+                                             ((flags & DISSECT_IMAGE_DEVICE_READ_ONLY) ? CRYPT_ACTIVATE_READONLY : 0) |
+                                             ((flags & DISSECT_IMAGE_DISCARD_ON_CRYPTO) ? CRYPT_ACTIVATE_ALLOW_DISCARDS : 0));
+        if (r < 0) {
+                log_debug_errno(r, "Failed to activate LUKS device: %m");
+                return r == -EPERM ? -EKEYREJECTED : r;
+        }
+
+        fd = open(node, O_RDONLY|O_NONBLOCK|O_CLOEXEC|O_NOCTTY);
+        if (fd < 0) {
+                r = log_debug_errno(errno, "Failed to open %s: %m", node);
+
+                /* The volume is active by now, but not registered in 'd' yet, hence nobody else would
+                 * ever deactivate it. Take it down again instead of leaving it behind. */
+                (void) sym_crypt_deactivate_by_name(cd, name, CRYPT_DEACTIVATE_DEFERRED);
+                return r;
+        }
+
+        /* From here on 'd' owns the volume and deactivates it when it is freed */
+        d->decrypted[d->n_decrypted++] = (DecryptedPartition) {
+                .name = TAKE_PTR(name),
+                .device = TAKE_PTR(cd),
+        };
+
+        m->decrypted_node = TAKE_PTR(node);
+        close_and_replace(m->mount_node_fd, fd);
+
+        return 0;
+}
+
+static int verity_can_reuse(
+                const VeritySettings *verity,
+                const char *name,
+                struct crypt_device **ret_cd) {
+
+        /* Checks whether the already existing verity volume 'name' was set up with the root hash
+         * requested in 'verity' (and, where supported, with the same choice regarding signatures). If
+         * so, returns 0 and passes the handle of the existing volume to the caller via 'ret_cd'. */
+        _cleanup_free_ char *active_hash = NULL;
+        _cleanup_(sym_crypt_freep) struct crypt_device *active_cd = NULL;
+        struct crypt_params_verity active_params = {};
+        size_t active_hash_size;
+        bool same_hash;
+        int r;
+
+        assert(verity);
+        assert(name);
+        assert(ret_cd);
+
+        r = sym_crypt_init_by_name(&active_cd, name);
+        if (r < 0)
+                return log_debug_errno(r, "Error opening verity device, crypt_init_by_name failed: %m");
+
+        cryptsetup_enable_logging(active_cd);
+
+        r = sym_crypt_get_verity_info(active_cd, &active_params);
+        if (r < 0)
+                return log_debug_errno(r, "Error opening verity device, crypt_get_verity_info failed: %m");
+
+        /* The buffer is sized after the hash we expect; the call below updates the size variable */
+        active_hash_size = verity->root_hash_size;
+        active_hash = malloc0(active_hash_size);
+        if (!active_hash)
+                return -ENOMEM;
+
+        r = sym_crypt_volume_key_get(active_cd, CRYPT_ANY_SLOT, active_hash, &active_hash_size, NULL, 0);
+        if (r < 0)
+                return log_debug_errno(r, "Error opening verity device, crypt_volume_key_get failed: %m");
+
+        same_hash = active_hash_size == verity->root_hash_size &&
+                memcmp(active_hash, verity->root_hash, verity->root_hash_size) == 0;
+        if (!same_hash)
+                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Error opening verity device, it already exists but root hashes are different.");
+
+#if HAVE_CRYPT_ACTIVATE_BY_SIGNED_KEY
+        /* If signatures are supported, only reuse the volume when the earlier activation made the same
+         * choice as the current request: an unsigned volume must not satisfy a request for a signed
+         * one, nor the other way around. */
+        bool want_signed = verity->root_hash_sig;
+        bool is_signed = (active_params.flags & CRYPT_VERITY_ROOT_HASH_SIGNATURE) != 0;
+
+        if (want_signed != is_signed)
+                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Error opening verity device, it already exists but signature settings are not the same.");
+#endif
+
+        *ret_cd = TAKE_PTR(active_cd);
+        return 0;
+}
+
+static char* dm_deferred_remove_clean(char *name) {
+        /* Cleanup helper: requests deferred deactivation of the DM volume 'name' (ignoring failures)
+         * and frees the name string. Always returns NULL. */
+        if (name)
+                (void) sym_crypt_deactivate_by_name(NULL, name, CRYPT_DEACTIVATE_DEFERRED);
+
+        return mfree(name);
+}
+DEFINE_TRIVIAL_CLEANUP_FUNC(char *, dm_deferred_remove_clean);
+
+static int validate_signature_userspace(const VeritySettings *verity) {
+ /* Validates the root hash signature in 'verity' against the certificates found in the "verity.d"
+ * configuration directories. Returns the result of PKCS7_verify() (non-zero if the signature
+ * validated, 0 if not), 0 if no certificates are installed or OpenSSL support is compiled out, and
+ * a negative errno-style value on failure. */
+#if HAVE_OPENSSL
+ _cleanup_(sk_X509_free_allp) STACK_OF(X509) *sk = NULL;
+ _cleanup_strv_free_ char **certs = NULL;
+ _cleanup_(PKCS7_freep) PKCS7 *p7 = NULL;
+ _cleanup_free_ char *s = NULL;
+ _cleanup_(BIO_freep) BIO *bio = NULL; /* 'bio' must be freed first, 's' second, hence keep this order
+ * of declaration in place, please */
+ const unsigned char *d;
+ int r;
+
+ assert(verity);
+ assert(verity->root_hash);
+ assert(verity->root_hash_sig);
+
+ /* Because installing a signature certificate into the kernel chain is so messy, let's optionally do
+ * userspace validation. */
+
+ r = conf_files_list_nulstr(&certs, ".crt", NULL, CONF_FILES_REGULAR|CONF_FILES_FILTER_MASKED, CONF_PATHS_NULSTR("verity.d"));
+ if (r < 0)
+ return log_debug_errno(r, "Failed to enumerate certificates: %m");
+ if (strv_isempty(certs)) {
+ log_debug("No userspace dm-verity certificates found.");
+ return 0;
+ }
+
+ /* d2i_PKCS7() gets a pointer to a copy of the data pointer, so that the one stored in 'verity'
+ * is left untouched */
+ d = verity->root_hash_sig;
+ p7 = d2i_PKCS7(NULL, &d, (long) verity->root_hash_sig_size);
+ if (!p7)
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to parse PKCS7 DER signature data.");
+
+ /* The signed payload is the root hash formatted as a hexadecimal string, not the binary hash */
+ s = hexmem(verity->root_hash, verity->root_hash_size);
+ if (!s)
+ return log_oom_debug();
+
+ bio = BIO_new_mem_buf(s, strlen(s));
+ if (!bio)
+ return log_oom_debug();
+
+ sk = sk_X509_new_null();
+ if (!sk)
+ return log_oom_debug();
+
+ /* Collect all certificates that can be loaded; files that cannot be opened or parsed are skipped */
+ STRV_FOREACH(i, certs) {
+ _cleanup_(X509_freep) X509 *c = NULL;
+ _cleanup_fclose_ FILE *f = NULL;
+
+ f = fopen(*i, "re");
+ if (!f) {
+ log_debug_errno(errno, "Failed to open '%s', ignoring: %m", *i);
+ continue;
+ }
+
+ c = PEM_read_X509(f, NULL, NULL, NULL);
+ if (!c) {
+ log_debug("Failed to load X509 certificate '%s', ignoring.", *i);
+ continue;
+ }
+
+ if (sk_X509_push(sk, c) == 0)
+ return log_oom_debug();
+
+ /* The stack owns the certificate now, don't free it when leaving the scope */
+ TAKE_PTR(c);
+ }
+
+ /* NOTE(review): per the PKCS7_verify() documentation PKCS7_NOINTERN restricts the signer lookup
+ * to the certificates passed in 'sk', and PKCS7_NOVERIFY skips chain validation of the signer
+ * certificate – confirm against the OpenSSL version in use. */
+ r = PKCS7_verify(p7, sk, NULL, bio, NULL, PKCS7_NOINTERN|PKCS7_NOVERIFY);
+ if (r)
+ log_debug("Userspace PKCS#7 validation succeeded.");
+ else
+ log_debug("Userspace PKCS#7 validation failed: %s", ERR_error_string(ERR_get_error(), NULL));
+
+ return r;
+#else
+ log_debug("Not doing client-side validation of dm-verity root hash signatures, OpenSSL support disabled.");
+ return 0;
+#endif
+}
+
+static int do_crypt_activate_verity(
+ struct crypt_device *cd,
+ const char *name,
+ const VeritySettings *verity) {
+
+ /* Activates the verity volume 'name' via 'cd', read-only. If a root hash signature is available
+ * and checking is not turned off via $SYSTEMD_DISSECT_VERITY_SIGNATURE, the signature is validated
+ * first: by the kernel where supported, otherwise (or if that fails) in userspace. Returns the
+ * result of the libcryptsetup activation call, or a negative errno-style value if the signature
+ * could not be validated. */
+
+ bool check_signature;
+ int r, k;
+
+ assert(cd);
+ assert(name);
+ assert(verity);
+
+ if (verity->root_hash_sig) {
+ r = getenv_bool_secure("SYSTEMD_DISSECT_VERITY_SIGNATURE");
+ if (r < 0 && r != -ENXIO)
+ log_debug_errno(r, "Failed to parse $SYSTEMD_DISSECT_VERITY_SIGNATURE");
+
+ /* Any error, including an unparsable value, leaves checking enabled; only an explicit
+ * false turns it off */
+ check_signature = r != 0;
+ } else
+ check_signature = false;
+
+ if (check_signature) {
+
+#if HAVE_CRYPT_ACTIVATE_BY_SIGNED_KEY
+ /* First, if we have support for signed keys in the kernel, then try that first. */
+ r = sym_crypt_activate_by_signed_key(
+ cd,
+ name,
+ verity->root_hash,
+ verity->root_hash_size,
+ verity->root_hash_sig,
+ verity->root_hash_sig_size,
+ CRYPT_ACTIVATE_READONLY);
+ if (r >= 0)
+ return r;
+
+ log_debug_errno(r, "Validation of dm-verity signature failed via the kernel, trying userspace validation instead: %m");
+#else
+ log_debug("Activation of verity device with signature requested, but not supported via the kernel by %s due to missing crypt_activate_by_signed_key(), trying userspace validation instead.",
+ program_invocation_short_name);
+ r = 0; /* Set for the propagation below */
+#endif
+
+ /* So this didn't work via the kernel, then let's try userspace validation instead. If that
+ * works we'll try to activate without telling the kernel the signature. */
+
+ /* Preferably propagate the original kernel error, so that the fallback logic can work,
+ * as the device-mapper is finicky around concurrent activations of the same volume */
+ k = validate_signature_userspace(verity);
+ if (k < 0)
+ return r < 0 ? r : k;
+ if (k == 0)
+ return log_debug_errno(r < 0 ? r : SYNTHETIC_ERRNO(ENOKEY),
+ "Activation of signed Verity volume worked neither via the kernel nor in userspace, can't activate.");
+ }
+
+ /* Either no signature check was requested, or userspace validation succeeded: activate by root
+ * hash only */
+ return sym_crypt_activate_by_volume_key(
+ cd,
+ name,
+ verity->root_hash,
+ verity->root_hash_size,
+ CRYPT_ACTIVATE_READONLY);
+}
+
+static usec_t verity_timeout(void) {
+        usec_t timeout = 100 * USEC_PER_MSEC;
+        const char *env;
+        int r;
+
+        /* Setting up the device may take a long time on slow machines, such as VMs without KVM, hence
+         * the timeout can be overridden via an environment variable. If the variable is unset or cannot
+         * be parsed the default is used. */
+
+        env = getenv("SYSTEMD_DISSECT_VERITY_TIMEOUT_SEC");
+        if (env) {
+                r = parse_sec(env, &timeout);
+                if (r < 0)
+                        log_debug_errno(r,
+                                        "Failed to parse timeout specified in $SYSTEMD_DISSECT_VERITY_TIMEOUT_SEC, "
+                                        "using the default timeout (%s).",
+                                        FORMAT_TIMESPAN(timeout, USEC_PER_MSEC));
+        }
+
+        return timeout;
+}
+
+static int verity_partition(
+ PartitionDesignator designator,
+ DissectedPartition *m,
+ DissectedPartition *v,
+ const VeritySettings *verity,
+ DissectImageFlags flags,
+ DecryptedImage *d) {
+
+ /* Sets up a dm-verity volume for data partition 'm', taking the hash data from 'verity->data_path'
+ * if that is set and from hash partition 'v' otherwise. On success the volume is registered in 'd'
+ * and the partition's mount node fd refers to the verity device.
+ *
+ * With DISSECT_IMAGE_VERITY_SHARE the volume is named after the root hash, and an already existing
+ * volume of that name is reused if it matches. If sharing doesn't work out, the function calls
+ * itself once more with the flag removed, so that a name derived from the partition's device node
+ * is used instead.
+ *
+ * Returns 0 on success or if there is nothing to do, negative errno-style value on failure. */
+
+ _cleanup_(sym_crypt_freep) struct crypt_device *cd = NULL;
+ _cleanup_free_ char *node = NULL, *name = NULL;
+ _cleanup_close_ int mount_node_fd = -EBADF;
+ int r;
+
+ assert(m);
+ assert(v || (verity && verity->data_path));
+
+ if (!verity || !verity->root_hash)
+ return 0;
+ /* Only act on the partition the verity settings are for; without an explicit designator they
+ * apply to the root partition */
+ if (!((verity->designator < 0 && designator == PARTITION_ROOT) ||
+ (verity->designator == designator)))
+ return 0;
+
+ if (!m->found || !m->node || !m->fstype)
+ return 0;
+ if (!verity->data_path) {
+ if (!v->found || !v->node || !v->fstype)
+ return 0;
+
+ if (!streq(v->fstype, "DM_verity_hash"))
+ return 0;
+ }
+
+ r = dlopen_cryptsetup();
+ if (r < 0)
+ return r;
+
+ if (FLAGS_SET(flags, DISSECT_IMAGE_VERITY_SHARE)) {
+ /* Use the roothash, which is unique per volume, as the device node name, so that it can be reused */
+ _cleanup_free_ char *root_hash_encoded = NULL;
+
+ root_hash_encoded = hexmem(verity->root_hash, verity->root_hash_size);
+ if (!root_hash_encoded)
+ return -ENOMEM;
+
+ r = make_dm_name_and_node(root_hash_encoded, "-verity", &name, &node);
+ } else
+ r = make_dm_name_and_node(m->node, "-verity", &name, &node);
+ if (r < 0)
+ return r;
+
+ r = sym_crypt_init(&cd, verity->data_path ?: v->node);
+ if (r < 0)
+ return r;
+
+ cryptsetup_enable_logging(cd);
+
+ r = sym_crypt_load(cd, CRYPT_VERITY, NULL);
+ if (r < 0)
+ return r;
+
+ r = sym_crypt_set_data_device(cd, m->node);
+ if (r < 0)
+ return r;
+
+ /* Make room for the new entry up front, so that registering it at 'success' cannot fail anymore */
+ if (!GREEDY_REALLOC0(d->decrypted, d->n_decrypted + 1))
+ return -ENOMEM;
+
+ /* If activating fails because the device already exists, check the metadata and reuse it if it matches.
+ * In case of ENODEV/ENOENT, which can happen if another process is activating at the exact same time,
+ * retry a few times before giving up. */
+ for (unsigned i = 0; i < N_DEVICE_NODE_LIST_ATTEMPTS; i++) {
+ /* If set when leaving an iteration, deferred removal is requested again for the named volume */
+ _cleanup_(dm_deferred_remove_cleanp) char *restore_deferred_remove = NULL;
+ _cleanup_(sym_crypt_freep) struct crypt_device *existing_cd = NULL;
+ _cleanup_close_ int fd = -EBADF;
+
+ /* First, check if the device already exists. */
+ fd = open(node, O_RDONLY|O_NONBLOCK|O_CLOEXEC|O_NOCTTY);
+ if (fd < 0 && !ERRNO_IS_DEVICE_ABSENT(errno))
+ return log_debug_errno(errno, "Failed to open verity device %s: %m", node);
+ if (fd >= 0)
+ goto check; /* The device already exists. Let's check it. */
+
+ /* The symlink to the device node does not exist yet. Assume not activated, and let's activate it. */
+ r = do_crypt_activate_verity(cd, name, verity);
+ if (r >= 0)
+ goto try_open; /* The device is activated. Let's open it. */
+ /* libdevmapper can return EINVAL when the device is already in the activation stage.
+ * There's no way to distinguish this situation from a genuine error due to invalid
+ * parameters, so immediately fall back to activating the device with a unique name.
+ * Improvements in libcryptsetup can ensure this never happens:
+ * https://gitlab.com/cryptsetup/cryptsetup/-/merge_requests/96 */
+ if (r == -EINVAL && FLAGS_SET(flags, DISSECT_IMAGE_VERITY_SHARE))
+ break;
+ if (r == -ENODEV) /* Volume is being opened but not ready, crypt_init_by_name would fail, try to open again */
+ goto try_again;
+ if (!IN_SET(r,
+ -EEXIST, /* Volume has already been opened and ready to be used. */
+ -EBUSY /* Volume is being opened but not ready, crypt_init_by_name() can fetch details. */))
+ return log_debug_errno(r, "Failed to activate verity device %s: %m", node);
+
+ check:
+ /* To avoid races, disable automatic removal on umount while setting up the new device. Restore it on failure. */
+ r = dm_deferred_remove_cancel(name);
+ /* -EBUSY and -ENXIO: the device has already been removed or being removed. We cannot
+ * use the device, try to open again. See target_message() in drivers/md/dm-ioctl.c
+ * and dm_cancel_deferred_remove() in drivers/md/dm.c */
+ if (IN_SET(r, -EBUSY, -ENXIO))
+ goto try_again;
+ if (r < 0)
+ return log_debug_errno(r, "Failed to disable automated deferred removal for verity device %s: %m", node);
+
+ restore_deferred_remove = strdup(name);
+ if (!restore_deferred_remove)
+ return log_oom_debug();
+
+ r = verity_can_reuse(verity, name, &existing_cd);
+ /* Same as above, -EINVAL can randomly happen when it actually means -EEXIST */
+ if (r == -EINVAL && FLAGS_SET(flags, DISSECT_IMAGE_VERITY_SHARE))
+ break;
+ if (IN_SET(r,
+ -ENOENT, /* Removed?? */
+ -EBUSY, /* Volume is being opened but not ready, crypt_init_by_name() can fetch details. */
+ -ENODEV /* Volume is being opened but not ready, crypt_init_by_name() would fail, try to open again. */ ))
+ goto try_again;
+ if (r < 0)
+ return log_debug_errno(r, "Failed to check if existing verity device %s can be reused: %m", node);
+
+ if (fd < 0) {
+ /* devmapper might say that the device exists, but the devlink might not yet have been
+ * created. Check and wait for the udev event in that case. */
+ r = device_wait_for_devlink(node, "block", verity_timeout(), NULL);
+ /* Fallback to activation with a unique device if it's taking too long */
+ if (r == -ETIMEDOUT && FLAGS_SET(flags, DISSECT_IMAGE_VERITY_SHARE))
+ break;
+ if (r < 0)
+ return log_debug_errno(r, "Failed to wait device node symlink %s: %m", node);
+ }
+
+ try_open:
+ if (fd < 0) {
+ /* Now, the device is activated and devlink is created. Let's open it. */
+ fd = open(node, O_RDONLY|O_NONBLOCK|O_CLOEXEC|O_NOCTTY);
+ if (fd < 0) {
+ if (!ERRNO_IS_DEVICE_ABSENT(errno))
+ return log_debug_errno(errno, "Failed to open verity device %s: %m", node);
+
+ /* The device has already been removed?? */
+ goto try_again;
+ }
+ }
+
+ /* Everything looks good and we'll be able to mount the device, so deferred remove will be re-enabled at that point. */
+ restore_deferred_remove = mfree(restore_deferred_remove);
+
+ mount_node_fd = TAKE_FD(fd);
+ /* When reusing an existing volume, track its handle rather than the one we prepared ourselves */
+ if (existing_cd)
+ crypt_free_and_replace(cd, existing_cd);
+
+ goto success;
+
+ try_again:
+ /* Device is being removed by another process. Let's wait for a while. */
+ (void) usleep_safe(2 * USEC_PER_MSEC);
+ }
+
+ /* All trials failed or a conflicting verity device exists. Let's try to activate with a unique name. */
+ if (FLAGS_SET(flags, DISSECT_IMAGE_VERITY_SHARE)) {
+ /* Before trying to activate with unique name, we need to free crypt_device object.
+ * Otherwise, we get error from libcryptsetup like the following:
+ * ------
+ * systemd[1234]: Cannot use device /dev/loop5 which is in use (already mapped or mounted).
+ * ------
+ */
+ sym_crypt_free(cd);
+ cd = NULL;
+ return verity_partition(designator, m, v, verity, flags & ~DISSECT_IMAGE_VERITY_SHARE, d);
+ }
+
+ return log_debug_errno(SYNTHETIC_ERRNO(EBUSY), "All attempts to activate verity device %s failed.", name);
+
+success:
+ /* From here on 'd' owns the volume */
+ d->decrypted[d->n_decrypted++] = (DecryptedPartition) {
+ .name = TAKE_PTR(name),
+ .device = TAKE_PTR(cd),
+ };
+
+ m->decrypted_node = TAKE_PTR(node);
+ close_and_replace(m->mount_node_fd, mount_node_fd);
+
+ return 0;
+}
+#endif
+
+int dissected_image_decrypt(
+                DissectedImage *m,
+                const char *passphrase,
+                const VeritySettings *verity,
+                DissectImageFlags flags) {
+
+#if HAVE_LIBCRYPTSETUP
+        _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted = NULL;
+        int r;
+#endif
+
+        assert(m);
+        assert(!verity || verity->root_hash || verity->root_hash_size == 0);
+
+        /* Sets up LUKS and verity volumes for all partitions of the image that need it.
+         *
+         * Returns:
+         *
+         *      = 0           → There was nothing to decrypt
+         *      > 0           → Decrypted successfully
+         *      -ENOKEY       → There's something to decrypt but no key was supplied
+         *      -EKEYREJECTED → Passed key was not correct
+         *      -EINVAL       → The root hash passed is shorter than an sd_id128_t
+         *      -EOPNOTSUPP   → Something needs setting up, but libcryptsetup support is compiled out
+         */
+
+        if (verity && verity->root_hash && verity->root_hash_size < sizeof(sd_id128_t))
+                return -EINVAL;
+
+        if (!(m->encrypted || m->verity_ready))
+                return 0;
+
+#if HAVE_LIBCRYPTSETUP
+        r = decrypted_image_new(&decrypted);
+        if (r < 0)
+                return r;
+
+        for (PartitionDesignator designator = 0; designator < _PARTITION_DESIGNATOR_MAX; designator++) {
+                DissectedPartition *part = &m->partitions[designator];
+                PartitionDesignator hash_designator;
+
+                if (!part->found)
+                        continue;
+
+                r = decrypt_partition(part, passphrase, flags, decrypted);
+                if (r < 0)
+                        return r;
+
+                /* If this kind of partition has a verity sibling, try to set up verity, preferring a
+                 * shared volume */
+                hash_designator = partition_verity_of(designator);
+                if (hash_designator >= 0) {
+                        r = verity_partition(designator, part, &m->partitions[hash_designator], verity, flags | DISSECT_IMAGE_VERITY_SHARE, decrypted);
+                        if (r < 0)
+                                return r;
+                }
+
+                /* Probe the file system on the freshly set up device, unless we know it already */
+                if (part->decrypted_fstype || part->mount_node_fd < 0 || !part->decrypted_node)
+                        continue;
+
+                r = probe_filesystem_full(part->mount_node_fd, part->decrypted_node, 0, UINT64_MAX, &part->decrypted_fstype);
+                if (r < 0 && r != -EUCLEAN)
+                        return r;
+        }
+
+        m->decrypted_image = TAKE_PTR(decrypted);
+
+        return 1;
+#else
+        return -EOPNOTSUPP;
+#endif
+}
+
+int dissected_image_decrypt_interactively(
+ DissectedImage *m,
+ const char *passphrase,
+ const VeritySettings *verity,
+ DissectImageFlags flags) {
+
+ _cleanup_strv_free_erase_ char **z = NULL;
+ int n = 3, r;
+
+ if (passphrase)
+ n--;
+
+ for (;;) {
+ r = dissected_image_decrypt(m, passphrase, verity, flags);
+ if (r >= 0)
+ return r;
+ if (r == -EKEYREJECTED)
+ log_error_errno(r, "Incorrect passphrase, try again!");
+ else if (r != -ENOKEY)
+ return log_error_errno(r, "Failed to decrypt image: %m");
+
+ if (--n < 0)
+ return log_error_errno(SYNTHETIC_ERRNO(EKEYREJECTED),
+ "Too many retries.");
+
+ z = strv_free(z);
+
+ r = ask_password_auto("Please enter image passphrase:", NULL, "dissect", "dissect", "dissect.passphrase", USEC_INFINITY, 0, &z);
+ if (r < 0)
+ return log_error_errno(r, "Failed to query for passphrase: %m");
+
+ passphrase = z[0];
+ }
+}
+
+static int decrypted_image_relinquish(DecryptedImage *d) {
+ assert(d);
+
+ /* Turns on automatic removal after the last use ended for all DM devices of this image, and sets a
+ * boolean so that we don't clean it up ourselves either anymore */
+
+#if HAVE_LIBCRYPTSETUP
+ int r;
+
+ for (size_t i = 0; i < d->n_decrypted; i++) {
+ DecryptedPartition *p = d->decrypted + i;
+
+ if (p->relinquished)
+ continue;
+
+ r = sym_crypt_deactivate_by_name(NULL, p->name, CRYPT_DEACTIVATE_DEFERRED);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to mark %s for auto-removal: %m", p->name);
+
+ p->relinquished = true;
+ }
+#endif
+
+ return 0;
+}
+
+int dissected_image_relinquish(DissectedImage *m) {
+ int r;
+
+ assert(m);
+
+ if (m->decrypted_image) {
+ r = decrypted_image_relinquish(m->decrypted_image);
+ if (r < 0)
+ return r;
+ }
+
+ if (m->loop)
+ loop_device_relinquish(m->loop);
+
+ return 0;
+}
+
+static char *build_auxiliary_path(const char *image, const char *suffix) {
+ const char *e;
+ char *n;
+
+ assert(image);
+ assert(suffix);
+
+ e = endswith(image, ".raw");
+ if (!e)
+ return strjoin(e, suffix);
+
+ n = new(char, e - image + strlen(suffix) + 1);
+ if (!n)
+ return NULL;
+
+ strcpy(mempcpy(n, image, e - image), suffix);
+ return n;
+}
+
+void verity_settings_done(VeritySettings *v) {
+ assert(v);
+
+ v->root_hash = mfree(v->root_hash);
+ v->root_hash_size = 0;
+
+ v->root_hash_sig = mfree(v->root_hash_sig);
+ v->root_hash_sig_size = 0;
+
+ v->data_path = mfree(v->data_path);
+}
+
+int verity_settings_load(
+ VeritySettings *verity,
+ const char *image,
+ const char *root_hash_path,
+ const char *root_hash_sig_path) {
+ /* Fills in whatever is still unset in 'verity' (root hash, root hash signature, data path, designator) from the explicitly passed paths, or else from xattrs and sidecar files of 'image'. */
+ _cleanup_free_ void *root_hash = NULL, *root_hash_sig = NULL;
+ size_t root_hash_size = 0, root_hash_sig_size = 0;
+ _cleanup_free_ char *verity_data_path = NULL;
+ PartitionDesignator designator;
+ int r;
+ /* Returns 0 if loading was skipped (device node, or disabled via environment), 1 otherwise, negative errno-style error on failure. */
+ assert(verity);
+ assert(image);
+ assert(verity->designator < 0 || IN_SET(verity->designator, PARTITION_ROOT, PARTITION_USR));
+
+ /* If we are asked to load the root hash for a device node, exit early */
+ if (is_device_path(image))
+ return 0;
+
+ r = getenv_bool_secure("SYSTEMD_DISSECT_VERITY_SIDECAR");
+ if (r < 0 && r != -ENXIO)
+ log_debug_errno(r, "Failed to parse $SYSTEMD_DISSECT_VERITY_SIDECAR, ignoring: %m");
+ if (r == 0) /* variable is set and parses as false: sidecar loading is turned off */
+ return 0;
+
+ designator = verity->designator; /* work on a local copy, it is only written back at the very end, and only if the caller left it unset */
+
+ /* We only fill in what isn't already filled in */
+
+ if (!verity->root_hash) {
+ _cleanup_free_ char *text = NULL;
+
+ if (root_hash_path) {
+ /* If explicitly specified it takes precedence */
+ r = read_one_line_file(root_hash_path, &text);
+ if (r < 0)
+ return r;
+
+ if (designator < 0)
+ designator = PARTITION_ROOT;
+ } else {
+ /* Otherwise look for xattr and separate file, and first for the data for root and if
+ * that doesn't exist for /usr */
+
+ if (designator < 0 || designator == PARTITION_ROOT) {
+ r = getxattr_malloc(image, "user.verity.roothash", &text);
+ if (r < 0) {
+ _cleanup_free_ char *p = NULL;
+
+ if (r != -ENOENT && !ERRNO_IS_XATTR_ABSENT(r))
+ return r;
+ /* No xattr: fall back to the sidecar file next to the image */
+ p = build_auxiliary_path(image, ".roothash");
+ if (!p)
+ return -ENOMEM;
+
+ r = read_one_line_file(p, &text);
+ if (r < 0 && r != -ENOENT) /* a missing sidecar file is not an error */
+ return r;
+ }
+
+ if (text)
+ designator = PARTITION_ROOT;
+ }
+
+ if (!text && (designator < 0 || designator == PARTITION_USR)) {
+ /* So in the "roothash" xattr/file name above the "root" of course primarily
+ * refers to the root of the Verity Merkle tree. But coincidentally it also
+ * is the hash for the *root* file system, i.e. the "root" neatly refers to
+ * two distinct concepts called "root". Taking benefit of this happy
+ * coincidence we call the file with the root hash for the /usr/ file system
+ * `usrhash`, because `usrroothash` or `rootusrhash` would just be too
+ * confusing. We thus drop the reference to the root of the Merkle tree, and
+ * just indicate which file system it's about. */
+ r = getxattr_malloc(image, "user.verity.usrhash", &text);
+ if (r < 0) {
+ _cleanup_free_ char *p = NULL;
+
+ if (r != -ENOENT && !ERRNO_IS_XATTR_ABSENT(r))
+ return r;
+ /* Same fallback as for the root file system above */
+ p = build_auxiliary_path(image, ".usrhash");
+ if (!p)
+ return -ENOMEM;
+
+ r = read_one_line_file(p, &text);
+ if (r < 0 && r != -ENOENT)
+ return r;
+ }
+
+ if (text)
+ designator = PARTITION_USR;
+ }
+ }
+
+ if (text) {
+ r = unhexmem(text, strlen(text), &root_hash, &root_hash_size); /* the hash is stored hex encoded */
+ if (r < 0)
+ return r;
+ if (root_hash_size < sizeof(sd_id128_t)) /* refuse hashes shorter than an sd_id128_t */
+ return -EINVAL;
+ }
+ }
+ /* Only look for a signature if there is a root hash (freshly loaded or pre-existing) and no signature was supplied yet */
+ if ((root_hash || verity->root_hash) && !verity->root_hash_sig) {
+ if (root_hash_sig_path) {
+ r = read_full_file(root_hash_sig_path, (char**) &root_hash_sig, &root_hash_sig_size);
+ if (r < 0 && r != -ENOENT)
+ return r;
+
+ if (designator < 0)
+ designator = PARTITION_ROOT;
+ } else {
+ if (designator < 0 || designator == PARTITION_ROOT) {
+ _cleanup_free_ char *p = NULL;
+
+ /* Follow naming convention recommended by the relevant RFC:
+ * https://tools.ietf.org/html/rfc5751#section-3.2.1 */
+ p = build_auxiliary_path(image, ".roothash.p7s");
+ if (!p)
+ return -ENOMEM;
+
+ r = read_full_file(p, (char**) &root_hash_sig, &root_hash_sig_size);
+ if (r < 0 && r != -ENOENT)
+ return r;
+ if (r >= 0)
+ designator = PARTITION_ROOT;
+ }
+
+ if (!root_hash_sig && (designator < 0 || designator == PARTITION_USR)) {
+ _cleanup_free_ char *p = NULL;
+
+ p = build_auxiliary_path(image, ".usrhash.p7s");
+ if (!p)
+ return -ENOMEM;
+
+ r = read_full_file(p, (char**) &root_hash_sig, &root_hash_sig_size);
+ if (r < 0 && r != -ENOENT)
+ return r;
+ if (r >= 0)
+ designator = PARTITION_USR;
+ }
+ }
+
+ if (root_hash_sig && root_hash_sig_size == 0) /* refuse empty size signatures */
+ return -EINVAL;
+ }
+
+ if (!verity->data_path) {
+ _cleanup_free_ char *p = NULL;
+
+ p = build_auxiliary_path(image, ".verity");
+ if (!p)
+ return -ENOMEM;
+
+ if (access(p, F_OK) < 0) { /* the data path is only recorded if the file exists */
+ if (errno != ENOENT)
+ return -errno;
+ } else
+ verity_data_path = TAKE_PTR(p);
+ }
+ /* Nothing can fail anymore from here on: move the newly loaded items into the caller's structure */
+ if (root_hash) {
+ verity->root_hash = TAKE_PTR(root_hash);
+ verity->root_hash_size = root_hash_size;
+ }
+
+ if (root_hash_sig) {
+ verity->root_hash_sig = TAKE_PTR(root_hash_sig);
+ verity->root_hash_sig_size = root_hash_sig_size;
+ }
+
+ if (verity_data_path)
+ verity->data_path = TAKE_PTR(verity_data_path);
+
+ if (verity->designator < 0)
+ verity->designator = designator;
+
+ return 1;
+}
+
+int dissected_image_load_verity_sig_partition(
+ DissectedImage *m,
+ int fd,
+ VeritySettings *verity) {
+
+ _cleanup_free_ void *root_hash = NULL, *root_hash_sig = NULL;
+ _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+ size_t root_hash_size, root_hash_sig_size;
+ _cleanup_free_ char *buf = NULL;
+ PartitionDesignator d;
+ DissectedPartition *p;
+ JsonVariant *rh, *sig;
+ ssize_t n;
+ char *e;
+ int r;
+
+ assert(m);
+ assert(fd >= 0);
+ assert(verity);
+
+ if (verity->root_hash && verity->root_hash_sig) /* Already loaded? */
+ return 0;
+
+ r = getenv_bool_secure("SYSTEMD_DISSECT_VERITY_EMBEDDED");
+ if (r < 0 && r != -ENXIO)
+ log_debug_errno(r, "Failed to parse $SYSTEMD_DISSECT_VERITY_EMBEDDED, ignoring: %m");
+ if (r == 0)
+ return 0;
+
+ d = partition_verity_sig_of(verity->designator < 0 ? PARTITION_ROOT : verity->designator);
+ assert(d >= 0);
+
+ p = m->partitions + d;
+ if (!p->found)
+ return 0;
+ if (p->offset == UINT64_MAX || p->size == UINT64_MAX)
+ return -EINVAL;
+
+ if (p->size > 4*1024*1024) /* Signature data cannot possible be larger than 4M, refuse that */
+ return log_debug_errno(SYNTHETIC_ERRNO(EFBIG), "Verity signature partition is larger than 4M, refusing.");
+
+ buf = new(char, p->size+1);
+ if (!buf)
+ return -ENOMEM;
+
+ n = pread(fd, buf, p->size, p->offset);
+ if (n < 0)
+ return -ENOMEM;
+ if ((uint64_t) n != p->size)
+ return -EIO;
+
+ e = memchr(buf, 0, p->size);
+ if (e) {
+ /* If we found a NUL byte then the rest of the data must be NUL too */
+ if (!memeqzero(e, p->size - (e - buf)))
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Signature data contains embedded NUL byte.");
+ } else
+ buf[p->size] = 0;
+
+ r = json_parse(buf, 0, &v, NULL, NULL);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to parse signature JSON data: %m");
+
+ rh = json_variant_by_key(v, "rootHash");
+ if (!rh)
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Signature JSON object lacks 'rootHash' field.");
+ if (!json_variant_is_string(rh))
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "'rootHash' field of signature JSON object is not a string.");
+
+ r = unhexmem(json_variant_string(rh), SIZE_MAX, &root_hash, &root_hash_size);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to parse root hash field: %m");
+
+ /* Check if specified root hash matches if it is specified */
+ if (verity->root_hash &&
+ memcmp_nn(verity->root_hash, verity->root_hash_size, root_hash, root_hash_size) != 0) {
+ _cleanup_free_ char *a = NULL, *b = NULL;
+
+ a = hexmem(root_hash, root_hash_size);
+ b = hexmem(verity->root_hash, verity->root_hash_size);
+
+ return log_debug_errno(r, "Root hash in signature JSON data (%s) doesn't match configured hash (%s).", strna(a), strna(b));
+ }
+
+ sig = json_variant_by_key(v, "signature");
+ if (!sig)
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Signature JSON object lacks 'signature' field.");
+ if (!json_variant_is_string(sig))
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "'signature' field of signature JSON object is not a string.");
+
+ r = unbase64mem(json_variant_string(sig), SIZE_MAX, &root_hash_sig, &root_hash_sig_size);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to parse signature field: %m");
+
+ free_and_replace(verity->root_hash, root_hash);
+ verity->root_hash_size = root_hash_size;
+
+ free_and_replace(verity->root_hash_sig, root_hash_sig);
+ verity->root_hash_sig_size = root_hash_sig_size;
+
+ return 1;
+}
+
+int dissected_image_acquire_metadata(DissectedImage *m, DissectImageFlags extra_flags) {
+ /* Mounts the image read-only in a forked-off child running in a new mount namespace, streams a fixed set of metadata items back to the parent (one pipe per item), parses them there and on success stores the results in 'm'. */
+ enum {
+ META_HOSTNAME,
+ META_MACHINE_ID,
+ META_MACHINE_INFO,
+ META_OS_RELEASE,
+ META_INITRD_RELEASE,
+ META_SYSEXT_RELEASE,
+ META_CONFEXT_RELEASE,
+ META_HAS_INIT_SYSTEM,
+ _META_MAX,
+ };
+ /* Per item: NUL-separated list of candidate paths inside the image, tried in order by the child */
+ static const char *const paths[_META_MAX] = {
+ [META_HOSTNAME] = "/etc/hostname\0",
+ [META_MACHINE_ID] = "/etc/machine-id\0",
+ [META_MACHINE_INFO] = "/etc/machine-info\0",
+ [META_OS_RELEASE] = "/etc/os-release\0"
+ "/usr/lib/os-release\0",
+ [META_INITRD_RELEASE] = "/etc/initrd-release\0"
+ "/usr/lib/initrd-release\0",
+ [META_SYSEXT_RELEASE] = "sysext-release\0", /* String used only for logging. */
+ [META_CONFEXT_RELEASE] = "confext-release\0", /* ditto */
+ [META_HAS_INIT_SYSTEM] = "has-init-system\0", /* ditto */
+ };
+
+ _cleanup_strv_free_ char **machine_info = NULL, **os_release = NULL, **initrd_release = NULL, **sysext_release = NULL, **confext_release = NULL;
+ _cleanup_close_pair_ int error_pipe[2] = EBADF_PAIR;
+ _cleanup_(rmdir_and_freep) char *t = NULL;
+ _cleanup_(sigkill_waitp) pid_t child = 0;
+ sd_id128_t machine_id = SD_ID128_NULL;
+ _cleanup_free_ char *hostname = NULL;
+ unsigned n_meta_initialized = 0;
+ int fds[2 * _META_MAX], r, v;
+ int has_init_system = -1; /* stays -1 if the child's answer could not be read */
+ ssize_t n;
+
+ BLOCK_SIGNALS(SIGCHLD);
+
+ assert(m);
+ /* Set up one pipe per item: fds[2*k] is the read end (parent), fds[2*k+1] the write end (child) */
+ for (; n_meta_initialized < _META_MAX; n_meta_initialized ++) {
+ if (!paths[n_meta_initialized]) {
+ fds[2*n_meta_initialized] = fds[2*n_meta_initialized+1] = -EBADF;
+ continue;
+ }
+
+ if (pipe2(fds + 2*n_meta_initialized, O_CLOEXEC) < 0) {
+ r = -errno;
+ goto finish;
+ }
+ }
+
+ r = mkdtemp_malloc("/tmp/dissect-XXXXXX", &t); /* temporary mount point for the child */
+ if (r < 0)
+ goto finish;
+
+ if (pipe2(error_pipe, O_CLOEXEC) < 0) { /* used by the child to report a negative errno-style error */
+ r = -errno;
+ goto finish;
+ }
+
+ r = safe_fork("(sd-dissect)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE, &child);
+ if (r < 0)
+ goto finish;
+ if (r == 0) {
+ /* Child in a new mount namespace */
+ error_pipe[0] = safe_close(error_pipe[0]);
+
+ r = dissected_image_mount(
+ m,
+ t,
+ /* uid_shift= */ UID_INVALID,
+ /* uid_range= */ UID_INVALID,
+ /* userns_fd= */ -EBADF,
+ extra_flags |
+ DISSECT_IMAGE_READ_ONLY |
+ DISSECT_IMAGE_MOUNT_ROOT_ONLY |
+ DISSECT_IMAGE_USR_NO_ROOT);
+ if (r < 0) {
+ log_debug_errno(r, "Failed to mount dissected image: %m");
+ goto inner_fail;
+ }
+
+ for (unsigned k = 0; k < _META_MAX; k++) {
+ _cleanup_close_ int fd = -ENOENT; /* doubles as error code for the log message below if nothing could be opened */
+
+ if (!paths[k])
+ continue;
+
+ fds[2*k] = safe_close(fds[2*k]); /* the child only writes */
+
+ switch (k) {
+
+ case META_SYSEXT_RELEASE:
+ if (!m->image_name)
+ goto next;
+
+ /* As per the os-release spec, if the image is an extension it will have a
+ * file named after the image name in extension-release.d/ - we use the image
+ * name and try to resolve it with the extension-release helpers, as
+ * sometimes the image names are mangled on deployment and do not match
+ * anymore. Unlike other paths this is not fixed, and the image name can be
+ * mangled on deployment, so by calling into the helper we allow a fallback
+ * that matches on the first extension-release file found in the directory,
+ * if one named after the image cannot be found first. */
+ r = open_extension_release(
+ t,
+ IMAGE_SYSEXT,
+ m->image_name,
+ /* relax_extension_release_check= */ false,
+ /* ret_path= */ NULL,
+ &fd);
+ if (r < 0)
+ fd = r;
+ break;
+
+ case META_CONFEXT_RELEASE:
+ if (!m->image_name)
+ goto next;
+
+ /* As above */
+ r = open_extension_release(
+ t,
+ IMAGE_CONFEXT,
+ m->image_name,
+ /* relax_extension_release_check= */ false,
+ /* ret_path= */ NULL,
+ &fd);
+ if (r < 0)
+ fd = r;
+
+ break;
+
+ case META_HAS_INIT_SYSTEM: {
+ bool found = false;
+
+ FOREACH_STRING(init,
+ "/usr/lib/systemd/systemd", /* systemd on /usr/ merged system */
+ "/lib/systemd/systemd", /* systemd on /usr/ non-merged systems */
+ "/sbin/init") { /* traditional path the Linux kernel invokes */
+
+ r = chase(init, t, CHASE_PREFIX_ROOT, NULL, NULL);
+ if (r < 0) {
+ if (r != -ENOENT)
+ log_debug_errno(r, "Failed to resolve %s, ignoring: %m", init);
+ } else {
+ found = true;
+ break;
+ }
+ }
+ /* For this item the payload is the raw boolean rather than the contents of a file */
+ r = loop_write(fds[2*k+1], &found, sizeof(found));
+ if (r < 0)
+ goto inner_fail;
+
+ goto next;
+ }
+
+ default:
+ NULSTR_FOREACH(p, paths[k]) {
+ fd = chase_and_open(p, t, CHASE_PREFIX_ROOT, O_RDONLY|O_CLOEXEC|O_NOCTTY, NULL);
+ if (fd >= 0)
+ break;
+ }
+ }
+
+ if (fd < 0) {
+ log_debug_errno(fd, "Failed to read %s file of image, ignoring: %m", paths[k]);
+ goto next;
+ }
+
+ r = copy_bytes(fd, fds[2*k+1], UINT64_MAX, 0);
+ if (r < 0)
+ goto inner_fail;
+
+ next:
+ fds[2*k+1] = safe_close(fds[2*k+1]);
+ }
+
+ _exit(EXIT_SUCCESS);
+
+ inner_fail:
+ /* Let parent know the error */
+ (void) write(error_pipe[1], &r, sizeof(r));
+ _exit(EXIT_FAILURE);
+ }
+ /* Parent from here on */
+ error_pipe[1] = safe_close(error_pipe[1]);
+
+ for (unsigned k = 0; k < _META_MAX; k++) {
+ _cleanup_fclose_ FILE *f = NULL;
+
+ if (!paths[k])
+ continue;
+
+ fds[2*k+1] = safe_close(fds[2*k+1]); /* the parent only reads */
+
+ f = take_fdopen(&fds[2*k], "r");
+ if (!f) {
+ r = -errno;
+ goto finish;
+ }
+ /* Failures to parse an individual item are only logged, they do not fail the whole operation */
+ switch (k) {
+
+ case META_HOSTNAME:
+ r = read_etc_hostname_stream(f, &hostname);
+ if (r < 0)
+ log_debug_errno(r, "Failed to read /etc/hostname of image: %m");
+
+ break;
+
+ case META_MACHINE_ID: {
+ _cleanup_free_ char *line = NULL;
+
+ r = read_line(f, LONG_LINE_MAX, &line);
+ if (r < 0)
+ log_debug_errno(r, "Failed to read /etc/machine-id of image: %m");
+ else if (r == 33) { /* NOTE(review): presumably 32 hex characters plus the line break counted by read_line() — confirm */
+ r = sd_id128_from_string(line, &machine_id);
+ if (r < 0)
+ log_debug_errno(r, "Image contains invalid /etc/machine-id: %s", line);
+ } else if (r == 0)
+ log_debug("/etc/machine-id file of image is empty.");
+ else if (streq(line, "uninitialized"))
+ log_debug("/etc/machine-id file of image is uninitialized (likely aborted first boot).");
+ else
+ log_debug("/etc/machine-id file of image has unexpected length %i.", r);
+
+ break;
+ }
+
+ case META_MACHINE_INFO:
+ r = load_env_file_pairs(f, "machine-info", &machine_info);
+ if (r < 0)
+ log_debug_errno(r, "Failed to read /etc/machine-info of image: %m");
+
+ break;
+
+ case META_OS_RELEASE:
+ r = load_env_file_pairs(f, "os-release", &os_release);
+ if (r < 0)
+ log_debug_errno(r, "Failed to read OS release file of image: %m");
+
+ break;
+
+ case META_INITRD_RELEASE:
+ r = load_env_file_pairs(f, "initrd-release", &initrd_release);
+ if (r < 0)
+ log_debug_errno(r, "Failed to read initrd release file of image: %m");
+
+ break;
+
+ case META_SYSEXT_RELEASE:
+ r = load_env_file_pairs(f, "sysext-release", &sysext_release);
+ if (r < 0)
+ log_debug_errno(r, "Failed to read sysext release file of image: %m");
+
+ break;
+
+ case META_CONFEXT_RELEASE:
+ r = load_env_file_pairs(f, "confext-release", &confext_release);
+ if (r < 0)
+ log_debug_errno(r, "Failed to read confext release file of image: %m");
+
+ break;
+
+ case META_HAS_INIT_SYSTEM: {
+ bool b = false;
+ size_t nr;
+
+ errno = 0;
+ nr = fread(&b, 1, sizeof(b), f);
+ if (nr != sizeof(b))
+ log_debug_errno(errno_or_else(EIO), "Failed to read has-init-system boolean: %m");
+ else
+ has_init_system = b;
+
+ break;
+ }}
+ }
+
+ r = wait_for_terminate_and_check("(sd-dissect)", child, 0);
+ child = 0; /* the child was waited for (or waiting failed), disarm the kill-and-wait cleanup handler */
+ if (r < 0)
+ goto finish;
+
+ n = read(error_pipe[0], &v, sizeof(v));
+ if (n < 0) {
+ r = -errno;
+ goto finish;
+ }
+ if (n == sizeof(v)) {
+ r = v; /* propagate error sent to us from child */
+ goto finish;
+ }
+ if (n != 0) { /* short read: neither EOF nor a complete error code */
+ r = -EIO;
+ goto finish;
+ }
+ if (r != EXIT_SUCCESS) { /* 'r' still holds the result of wait_for_terminate_and_check() here */
+ r = -EPROTO;
+ goto finish;
+ }
+ /* Full success: only now replace what was stored in 'm' before */
+ free_and_replace(m->hostname, hostname);
+ m->machine_id = machine_id;
+ strv_free_and_replace(m->machine_info, machine_info);
+ strv_free_and_replace(m->os_release, os_release);
+ strv_free_and_replace(m->initrd_release, initrd_release);
+ strv_free_and_replace(m->sysext_release, sysext_release);
+ strv_free_and_replace(m->confext_release, confext_release);
+ m->has_init_system = has_init_system;
+
+finish:
+ for (unsigned k = 0; k < n_meta_initialized; k++) /* only touch the pipe pairs that were actually initialized */
+ safe_close_pair(fds + 2*k);
+
+ return r;
+}
+
+Architecture dissected_image_architecture(DissectedImage *img) {
+ assert(img);
+
+ if (img->partitions[PARTITION_ROOT].found &&
+ img->partitions[PARTITION_ROOT].architecture >= 0)
+ return img->partitions[PARTITION_ROOT].architecture;
+
+ if (img->partitions[PARTITION_USR].found &&
+ img->partitions[PARTITION_USR].architecture >= 0)
+ return img->partitions[PARTITION_USR].architecture;
+
+ return _ARCHITECTURE_INVALID;
+}
+
+int dissect_loop_device(
+ LoopDevice *loop,
+ const VeritySettings *verity,
+ const MountOptions *mount_options,
+ const ImagePolicy *image_policy,
+ DissectImageFlags flags,
+ DissectedImage **ret) {
+
+#if HAVE_BLKID
+ _cleanup_(dissected_image_unrefp) DissectedImage *m = NULL;
+ int r;
+
+ assert(loop);
+
+ r = dissected_image_new(loop->backing_file ?: loop->node, &m);
+ if (r < 0)
+ return r;
+
+ m->loop = loop_device_ref(loop);
+ m->sector_size = m->loop->sector_size;
+
+ r = dissect_image(m, loop->fd, loop->node, verity, mount_options, image_policy, flags);
+ if (r < 0)
+ return r;
+
+ if (ret)
+ *ret = TAKE_PTR(m);
+
+ return 0;
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
+int dissect_loop_device_and_warn(
+ LoopDevice *loop,
+ const VeritySettings *verity,
+ const MountOptions *mount_options,
+ const ImagePolicy *image_policy,
+ DissectImageFlags flags,
+ DissectedImage **ret) {
+
+ assert(loop);
+
+ return dissect_log_error(
+ LOG_ERR,
+ dissect_loop_device(loop, verity, mount_options, image_policy, flags, ret),
+ loop->backing_file ?: loop->node,
+ verity);
+
+}
+
+bool dissected_image_verity_candidate(const DissectedImage *image, PartitionDesignator partition_designator) {
+ assert(image);
+
+ /* Checks if this partition could theoretically do Verity. For non-partitioned images this only works
+ * if there's an external verity file supplied, for which we can consult .has_verity. For partitioned
+ * images we only check the partition type.
+ *
+ * This call is used to decide whether to suppress or show a verity column in tabular output of the
+ * image. */
+
+ if (image->single_file_system)
+ return partition_designator == PARTITION_ROOT && image->has_verity;
+
+ return partition_verity_of(partition_designator) >= 0;
+}
+
+bool dissected_image_verity_ready(const DissectedImage *image, PartitionDesignator partition_designator) {
+ PartitionDesignator k;
+
+ assert(image);
+
+ /* Checks if this partition has verity data available that we can activate. For non-partitioned this
+ * works for the root partition, for others only if the associated verity partition was found. */
+
+ if (!image->verity_ready)
+ return false;
+
+ if (image->single_file_system)
+ return partition_designator == PARTITION_ROOT;
+
+ k = partition_verity_of(partition_designator);
+ return k >= 0 && image->partitions[k].found;
+}
+
+bool dissected_image_verity_sig_ready(const DissectedImage *image, PartitionDesignator partition_designator) {
+ PartitionDesignator k;
+
+ assert(image);
+
+ /* Checks if this partition has verity signature data available that we can use. */
+
+ if (!image->verity_sig_ready)
+ return false;
+
+ if (image->single_file_system)
+ return partition_designator == PARTITION_ROOT;
+
+ k = partition_verity_sig_of(partition_designator);
+ return k >= 0 && image->partitions[k].found;
+}
+
+MountOptions* mount_options_free_all(MountOptions *options) {
+ MountOptions *m;
+
+ while ((m = LIST_POP(mount_options, options))) {
+ free(m->options);
+ free(m);
+ }
+
+ return NULL;
+}
+
+const char* mount_options_from_designator(const MountOptions *options, PartitionDesignator designator) {
+ LIST_FOREACH(mount_options, m, options)
+ if (designator == m->partition_designator && !isempty(m->options))
+ return m->options;
+
+ return NULL;
+}
+
+int mount_image_privately_interactively(
+ const char *image,
+ const ImagePolicy *image_policy,
+ DissectImageFlags flags,
+ char **ret_directory,
+ int *ret_dir_fd,
+ LoopDevice **ret_loop_device) {
+
+ _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
+ _cleanup_(loop_device_unrefp) LoopDevice *d = NULL;
+ _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
+ _cleanup_free_ char *dir = NULL;
+ int r;
+
+ /* Mounts an OS image at a temporary place, inside a newly created mount namespace of our own. This
+ * is used by tools such as systemd-tmpfiles or systemd-firstboot to operate on some disk image
+ * easily. */
+
+ assert(image);
+ assert(ret_loop_device);
+
+ /* We intend to mount this right-away, hence add the partitions if needed and pin them. */
+ flags |= DISSECT_IMAGE_ADD_PARTITION_DEVICES |
+ DISSECT_IMAGE_PIN_PARTITION_DEVICES;
+
+ r = verity_settings_load(&verity, image, NULL, NULL);
+ if (r < 0)
+ return log_error_errno(r, "Failed to load root hash data: %m");
+
+ r = loop_device_make_by_path(
+ image,
+ FLAGS_SET(flags, DISSECT_IMAGE_DEVICE_READ_ONLY) ? O_RDONLY : O_RDWR,
+ /* sector_size= */ UINT32_MAX,
+ FLAGS_SET(flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
+ LOCK_SH,
+ &d);
+ if (r < 0)
+ return log_error_errno(r, "Failed to set up loopback device for %s: %m", image);
+
+ r = dissect_loop_device_and_warn(
+ d,
+ &verity,
+ /* mount_options= */ NULL,
+ image_policy,
+ flags,
+ &dissected_image);
+ if (r < 0)
+ return r;
+
+ r = dissected_image_load_verity_sig_partition(dissected_image, d->fd, &verity);
+ if (r < 0)
+ return r;
+
+ r = dissected_image_decrypt_interactively(dissected_image, NULL, &verity, flags);
+ if (r < 0)
+ return r;
+
+ r = detach_mount_namespace();
+ if (r < 0)
+ return log_error_errno(r, "Failed to detach mount namespace: %m");
+
+ r = mkdir_p("/run/systemd/mount-rootfs", 0555);
+ if (r < 0)
+ return log_error_errno(r, "Failed to create mount point: %m");
+
+ r = dissected_image_mount_and_warn(
+ dissected_image,
+ "/run/systemd/mount-rootfs",
+ /* uid_shift= */ UID_INVALID,
+ /* uid_range= */ UID_INVALID,
+ /* userns_fd= */ -EBADF,
+ flags);
+ if (r < 0)
+ return r;
+
+ r = loop_device_flock(d, LOCK_UN);
+ if (r < 0)
+ return r;
+
+ r = dissected_image_relinquish(dissected_image);
+ if (r < 0)
+ return log_error_errno(r, "Failed to relinquish DM and loopback block devices: %m");
+
+ if (ret_directory) {
+ dir = strdup("/run/systemd/mount-rootfs");
+ if (!dir)
+ return log_oom();
+ }
+
+ if (ret_dir_fd) {
+ _cleanup_close_ int dir_fd = -EBADF;
+
+ dir_fd = open("/run/systemd/mount-rootfs", O_CLOEXEC|O_DIRECTORY);
+ if (dir_fd < 0)
+ return log_error_errno(errno, "Failed to open mount point directory: %m");
+
+ *ret_dir_fd = TAKE_FD(dir_fd);
+ }
+
+ if (ret_directory)
+ *ret_directory = TAKE_PTR(dir);
+
+ *ret_loop_device = TAKE_PTR(d);
+ return 0;
+}
+
+static bool mount_options_relax_extension_release_checks(const MountOptions *options) {
+ if (!options)
+ return false;
+
+ return string_contains_word(mount_options_from_designator(options, PARTITION_ROOT), ",", "x-systemd.relax-extension-release-check") ||
+ string_contains_word(mount_options_from_designator(options, PARTITION_USR), ",", "x-systemd.relax-extension-release-check") ||
+ string_contains_word(options->options, ",", "x-systemd.relax-extension-release-check");
+}
+
+int verity_dissect_and_mount(
+ int src_fd,
+ const char *src,
+ const char *dest,
+ const MountOptions *options,
+ const ImagePolicy *image_policy,
+ const char *required_host_os_release_id,
+ const char *required_host_os_release_version_id,
+ const char *required_host_os_release_sysext_level,
+ const char *required_host_os_release_confext_level,
+ const char *required_sysext_scope,
+ DissectedImage **ret_image) {
+
+ _cleanup_(loop_device_unrefp) LoopDevice *loop_device = NULL;
+ _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
+ _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
+ DissectImageFlags dissect_image_flags;
+ bool relax_extension_release_check;
+ int r;
+
+ assert(src);
+ /* Verifying release metadata requires mounted image for now, so ensure the check is skipped when
+ * opening an image without mounting it immediately (i.e.: 'dest' is NULL). */
+ assert(!required_host_os_release_id || dest);
+
+ relax_extension_release_check = mount_options_relax_extension_release_checks(options);
+
+ /* We might get an FD for the image, but we use the original path to look for the dm-verity files */
+ r = verity_settings_load(&verity, src, NULL, NULL);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to load root hash: %m");
+
+ dissect_image_flags = (verity.data_path ? DISSECT_IMAGE_NO_PARTITION_TABLE : 0) |
+ (relax_extension_release_check ? DISSECT_IMAGE_RELAX_EXTENSION_CHECK : 0) |
+ DISSECT_IMAGE_ADD_PARTITION_DEVICES |
+ DISSECT_IMAGE_PIN_PARTITION_DEVICES;
+
+ /* Note that we don't use loop_device_make here, as the FD is most likely O_PATH which would not be
+ * accepted by LOOP_CONFIGURE, so just let loop_device_make_by_path reopen it as a regular FD. */
+ r = loop_device_make_by_path(
+ src_fd >= 0 ? FORMAT_PROC_FD_PATH(src_fd) : src,
+ /* open_flags= */ -1,
+ /* sector_size= */ UINT32_MAX,
+ verity.data_path ? 0 : LO_FLAGS_PARTSCAN,
+ LOCK_SH,
+ &loop_device);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to create loop device for image: %m");
+
+ r = dissect_loop_device(
+ loop_device,
+ &verity,
+ options,
+ image_policy,
+ dissect_image_flags,
+ &dissected_image);
+ /* No partition table? Might be a single-filesystem image, try again */
+ if (!verity.data_path && r == -ENOPKG)
+ r = dissect_loop_device(
+ loop_device,
+ &verity,
+ options,
+ image_policy,
+ dissect_image_flags | DISSECT_IMAGE_NO_PARTITION_TABLE,
+ &dissected_image);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to dissect image: %m");
+
+ r = dissected_image_load_verity_sig_partition(dissected_image, loop_device->fd, &verity);
+ if (r < 0)
+ return r;
+
+ r = dissected_image_decrypt(
+ dissected_image,
+ NULL,
+ &verity,
+ dissect_image_flags);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to decrypt dissected image: %m");
+
+ if (dest) {
+ r = mkdir_p_label(dest, 0755);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to create destination directory %s: %m", dest);
+ r = umount_recursive(dest, 0);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to umount under destination directory %s: %m", dest);
+ }
+
+ r = dissected_image_mount(
+ dissected_image,
+ dest,
+ /* uid_shift= */ UID_INVALID,
+ /* uid_range= */ UID_INVALID,
+ /* userns_fd= */ -EBADF,
+ dissect_image_flags);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to mount image: %m");
+
+ r = loop_device_flock(loop_device, LOCK_UN);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to unlock loopback device: %m");
+
+ /* If we got os-release values from the caller, then we need to match them with the image's
+ * extension-release.d/ content. Return -EINVAL if there's any mismatch.
+ * First, check the distro ID. If that matches, then check the new SYSEXT_LEVEL value if
+ * available, or else fallback to VERSION_ID. If neither is present (eg: rolling release),
+ * then a simple match on the ID will be performed. */
+ if (required_host_os_release_id) {
+ _cleanup_strv_free_ char **extension_release = NULL;
+ ImageClass class = IMAGE_SYSEXT;
+
+ assert(!isempty(required_host_os_release_id));
+
+ r = load_extension_release_pairs(dest, IMAGE_SYSEXT, dissected_image->image_name, relax_extension_release_check, &extension_release);
+ if (r == -ENOENT) {
+ r = load_extension_release_pairs(dest, IMAGE_CONFEXT, dissected_image->image_name, relax_extension_release_check, &extension_release);
+ if (r >= 0)
+ class = IMAGE_CONFEXT;
+ }
+ if (r < 0)
+ return log_debug_errno(r, "Failed to parse image %s extension-release metadata: %m", dissected_image->image_name);
+
+ r = extension_release_validate(
+ dissected_image->image_name,
+ required_host_os_release_id,
+ required_host_os_release_version_id,
+ class == IMAGE_SYSEXT ? required_host_os_release_sysext_level : required_host_os_release_confext_level,
+ required_sysext_scope,
+ extension_release,
+ class);
+ if (r == 0)
+ return log_debug_errno(SYNTHETIC_ERRNO(ESTALE), "Image %s extension-release metadata does not match the root's", dissected_image->image_name);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to compare image %s extension-release metadata with the root's os-release: %m", dissected_image->image_name);
+ }
+
+ r = dissected_image_relinquish(dissected_image);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to relinquish dissected image: %m");
+
+ if (ret_image)
+ *ret_image = TAKE_PTR(dissected_image);
+
+ return 0;
+}
diff --git a/src/shared/dissect-image.h b/src/shared/dissect-image.h
new file mode 100644
index 0000000..15c0bf7
--- /dev/null
+++ b/src/shared/dissect-image.h
@@ -0,0 +1,230 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "sd-id128.h"
+
+#include "architecture.h"
+#include "env-util.h"
+#include "gpt.h"
+#include "list.h"
+#include "loop-util.h"
+#include "macro.h"
+#include "os-util.h"
+#include "strv.h"
+
+typedef struct DissectedImage DissectedImage;
+typedef struct DissectedPartition DissectedPartition;
+typedef struct DecryptedImage DecryptedImage;
+typedef struct MountOptions MountOptions;
+typedef struct VeritySettings VeritySettings;
+
+struct DissectedPartition { /* Per-partition record; DissectedImage keeps one of these per partition designator in its partitions[] array */
+ bool found:1;
+ bool ignored:1;
+ bool rw:1;
+ bool growfs:1;
+ int partno; /* -1 if there was no partition and the image contains a file system directly */
+ Architecture architecture; /* Intended architecture: either native, secondary or unset (_ARCHITECTURE_INVALID). */
+ sd_id128_t uuid; /* Partition entry UUID as reported by the GPT */
+ char *fstype;
+ char *node;
+ char *label;
+ char *decrypted_node; /* if set, dissected_partition_fstype() reports decrypted_fstype instead of fstype */
+ char *decrypted_fstype;
+ char *mount_options;
+ int mount_node_fd; /* -EBADF in DISSECTED_PARTITION_NULL, i.e. when no fd is held */
+ uint64_t size;
+ uint64_t offset;
+ uint64_t gpt_flags;
+ int fsmount_fd; /* -EBADF in DISSECTED_PARTITION_NULL, i.e. when no fd is held */
+};
+
+#define DISSECTED_PARTITION_NULL /* The "empty" DissectedPartition value: no partition number, no architecture, no fds; all other fields zero */ \
+ ((DissectedPartition) { \
+ .partno = -1, \
+ .architecture = _ARCHITECTURE_INVALID, \
+ .mount_node_fd = -EBADF, \
+ .fsmount_fd = -EBADF, \
+ })
+#define TAKE_PARTITION(p) /* Evaluates to the current value of 'p' and resets 'p' itself to DISSECTED_PARTITION_NULL */ \
+ ({ \
+ DissectedPartition *_pp = &(p), _p = *_pp; \
+ *_pp = DISSECTED_PARTITION_NULL; \
+ _p; \
+ })
+
+typedef enum DissectImageFlags { /* Bit mask: each flag is a distinct bit, and the *_ANY / READ_ONLY entries are ORed combinations of other flags */
+ DISSECT_IMAGE_DEVICE_READ_ONLY = 1 << 0, /* Make device read-only */
+ DISSECT_IMAGE_DISCARD_ON_LOOP = 1 << 1, /* Turn on "discard" if on a loop device and file system supports it */
+ DISSECT_IMAGE_DISCARD = 1 << 2, /* Turn on "discard" if file system supports it, on all block devices */
+ DISSECT_IMAGE_DISCARD_ON_CRYPTO = 1 << 3, /* Turn on "discard" also on crypto devices */
+ DISSECT_IMAGE_DISCARD_ANY = DISSECT_IMAGE_DISCARD_ON_LOOP |
+ DISSECT_IMAGE_DISCARD |
+ DISSECT_IMAGE_DISCARD_ON_CRYPTO,
+ DISSECT_IMAGE_GPT_ONLY = 1 << 4, /* Only recognize images with GPT partition tables */
+ DISSECT_IMAGE_GENERIC_ROOT = 1 << 5, /* If no partition table or only single generic partition, assume it's the root fs */
+ DISSECT_IMAGE_MOUNT_ROOT_ONLY = 1 << 6, /* Mount only the root and /usr partitions */
+ DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY = 1 << 7, /* Mount only the non-root and non-/usr partitions */
+ DISSECT_IMAGE_VALIDATE_OS = 1 << 8, /* Refuse mounting images that aren't identifiable as OS images */
+ DISSECT_IMAGE_VALIDATE_OS_EXT = 1 << 9, /* Refuse mounting images that aren't identifiable as OS extension images */
+ DISSECT_IMAGE_RELAX_VAR_CHECK = 1 << 10, /* Don't insist that the UUID of /var is hashed from /etc/machine-id */
+ DISSECT_IMAGE_FSCK = 1 << 11, /* File system check the partition before mounting (no effect when combined with DISSECT_IMAGE_READ_ONLY) */
+ DISSECT_IMAGE_NO_PARTITION_TABLE = 1 << 12, /* Only recognize single file system images */
+ DISSECT_IMAGE_VERITY_SHARE = 1 << 13, /* When activating a verity device, reuse existing one if already open */
+ DISSECT_IMAGE_MKDIR = 1 << 14, /* Make top-level directory to mount right before mounting, if missing */
+ DISSECT_IMAGE_USR_NO_ROOT = 1 << 15, /* If no root fs is in the image, but /usr is, then allow this (so that we can mount the rootfs as tmpfs or so) */
+ DISSECT_IMAGE_REQUIRE_ROOT = 1 << 16, /* Don't accept disks without root partition (or at least /usr partition if DISSECT_IMAGE_USR_NO_ROOT is set) */
+ DISSECT_IMAGE_MOUNT_READ_ONLY = 1 << 17, /* Make mounts read-only */
+ DISSECT_IMAGE_READ_ONLY = DISSECT_IMAGE_DEVICE_READ_ONLY |
+ DISSECT_IMAGE_MOUNT_READ_ONLY,
+ DISSECT_IMAGE_GROWFS = 1 << 18, /* Grow file systems in partitions marked for that to the size of the partitions after mount */
+ DISSECT_IMAGE_MOUNT_IDMAPPED = 1 << 19, /* Mount mounts with kernel 5.12-style userns ID mapping, if file system type doesn't support uid=/gid= */
+ DISSECT_IMAGE_ADD_PARTITION_DEVICES = 1 << 20, /* Create partition devices via BLKPG_ADD_PARTITION */
+ DISSECT_IMAGE_PIN_PARTITION_DEVICES = 1 << 21, /* Open dissected partitions and decrypted partitions and pin them by fd */
+ DISSECT_IMAGE_RELAX_EXTENSION_CHECK = 1 << 22, /* Don't insist that the extension-release file name matches the image name */
+ DISSECT_IMAGE_DISKSEQ_DEVNODE = 1 << 23, /* Prefer /dev/disk/by-diskseq/… device nodes */
+ DISSECT_IMAGE_ALLOW_EMPTY = 1 << 24, /* Allow that no usable partitions are present */
+ DISSECT_IMAGE_TRY_ATOMIC_MOUNT_EXCHANGE = 1 << 25, /* Try to mount the image beneath the specified mountpoint, rather than on top of it, and then umount the top */
+} DissectImageFlags;
+
+struct DissectedImage { /* Result of dissecting a disk image: per-designator partition table plus metadata read from the image */
+ bool encrypted:1;
+ bool has_verity:1; /* verity available in image, but not necessarily used */
+ bool has_verity_sig:1; /* pkcs#7 signature embedded in image */
+ bool verity_ready:1; /* verity available, fully specified and usable */
+ bool verity_sig_ready:1; /* verity signature logic, fully specified and usable */
+ bool single_file_system:1; /* MBR/GPT or single file system */
+
+ LoopDevice *loop;
+ DissectedPartition partitions[_PARTITION_DESIGNATOR_MAX]; /* indexed by partition designator, e.g. partitions[PARTITION_ESP] */
+ DecryptedImage *decrypted_image;
+
+ uint32_t sector_size;
+
+ char *image_name;
+ sd_id128_t image_uuid;
+
+ /* Meta information extracted from /etc/os-release and similar */
+ char *hostname;
+ sd_id128_t machine_id;
+ char **machine_info;
+ char **os_release; /* queried via strv_env_pairs_get(), e.g. for "PORTABLE_PREFIXES" */
+ char **initrd_release; /* non-empty means the image is considered an initrd, see dissected_image_is_initrd() */
+ char **confext_release;
+ char **sysext_release;
+ int has_init_system; /* > 0 means "bootable OS", see dissected_image_is_bootable_os(); presumably <= 0 covers both "no" and "unknown/error" — TODO confirm */
+};
+
+struct MountOptions { /* Associates a mount option string with one partition designator; entries are chained through the 'mount_options' list fields */
+ PartitionDesignator partition_designator;
+ char *options;
+ LIST_FIELDS(MountOptions, mount_options);
+};
+
+struct VeritySettings { /* Caller-supplied Verity parameters for one partition of an image */
+ /* Binary root hash for the Verity Merkle tree */
+ void *root_hash;
+ size_t root_hash_size;
+
+ /* PKCS#7 signature of the above */
+ void *root_hash_sig;
+ size_t root_hash_sig_size;
+
+ /* Path to the verity data file, if stored externally */
+ char *data_path;
+
+ /* PARTITION_ROOT or PARTITION_USR, depending on what these Verity settings are for. Note that
+ * verity_settings_data_covers() treats a negative designator as applying to PARTITION_ROOT. */
+ PartitionDesignator designator;
+};
+
+#define VERITY_SETTINGS_DEFAULT { /* initializer: all pointers/sizes zero, designator not yet decided */ \
+ .designator = _PARTITION_DESIGNATOR_INVALID \
+ }
+
+/* We include image-policy.h down here, since ImagePolicy wants a complete definition of PartitionDesignator first. */
+#include "image-policy.h"
+
+MountOptions* mount_options_free_all(MountOptions *options);
+DEFINE_TRIVIAL_CLEANUP_FUNC(MountOptions*, mount_options_free_all);
+const char* mount_options_from_designator(const MountOptions *options, PartitionDesignator designator);
+
+int probe_filesystem_full(int fd, const char *path, uint64_t offset, uint64_t size, char **ret_fstype);
+static inline int probe_filesystem(const char *path, char **ret_fstype) { /* Convenience wrapper around probe_filesystem_full(): no fd (-1), offset 0, size UINT64_MAX (presumably "the whole device" — TODO confirm) */
+ return probe_filesystem_full(-1, path, 0, UINT64_MAX, ret_fstype);
+}
+
+int dissect_log_error(int log_level, int r, const char *name, const VeritySettings *verity);
+int dissect_image_file(const char *path, const VeritySettings *verity, const MountOptions *mount_options, const ImagePolicy *image_policy, DissectImageFlags flags, DissectedImage **ret);
+int dissect_image_file_and_warn(const char *path, const VeritySettings *verity, const MountOptions *mount_options, const ImagePolicy *image_policy, DissectImageFlags flags, DissectedImage **ret);
+int dissect_loop_device(LoopDevice *loop, const VeritySettings *verity, const MountOptions *mount_options, const ImagePolicy *image_policy, DissectImageFlags flags, DissectedImage **ret);
+int dissect_loop_device_and_warn(LoopDevice *loop, const VeritySettings *verity, const MountOptions *mount_options, const ImagePolicy *image_policy, DissectImageFlags flags, DissectedImage **ret);
+
+DissectedImage* dissected_image_unref(DissectedImage *m);
+DEFINE_TRIVIAL_CLEANUP_FUNC(DissectedImage*, dissected_image_unref);
+
+int dissected_image_decrypt(DissectedImage *m, const char *passphrase, const VeritySettings *verity, DissectImageFlags flags);
+int dissected_image_decrypt_interactively(DissectedImage *m, const char *passphrase, const VeritySettings *verity, DissectImageFlags flags);
+int dissected_image_mount(DissectedImage *m, const char *dest, uid_t uid_shift, uid_t uid_range, int userns_fd, DissectImageFlags flags);
+int dissected_image_mount_and_warn(DissectedImage *m, const char *where, uid_t uid_shift, uid_t uid_range, int userns_fd, DissectImageFlags flags);
+
+int dissected_image_acquire_metadata(DissectedImage *m, DissectImageFlags extra_flags);
+
+Architecture dissected_image_architecture(DissectedImage *m);
+
+static inline bool dissected_image_is_bootable_os(DissectedImage *m) {
+        /* An image only counts as a bootable OS if an init system was positively detected in it.
+         * A NULL image is never bootable. */
+        if (!m)
+                return false;
+
+        return m->has_init_system > 0;
+}
+
+static inline bool dissected_image_is_bootable_uefi(DissectedImage *m) {
+        /* UEFI-bootable means: an ESP was found in the image, and the image also qualifies as a
+         * bootable OS. */
+        if (!m || !m->partitions[PARTITION_ESP].found)
+                return false;
+
+        return dissected_image_is_bootable_os(m);
+}
+
+static inline bool dissected_image_is_portable(DissectedImage *m) {
+        /* A portable service image is recognized by a PORTABLE_PREFIXES= entry in its os-release data. */
+        if (!m)
+                return false;
+
+        return strv_env_pairs_get(m->os_release, "PORTABLE_PREFIXES") != NULL;
+}
+
+static inline bool dissected_image_is_initrd(DissectedImage *m) {
+        /* Any initrd-release data at all marks the image as an initrd. */
+        if (!m)
+                return false;
+
+        return !strv_isempty(m->initrd_release);
+}
+
+DecryptedImage* decrypted_image_ref(DecryptedImage *p);
+DecryptedImage* decrypted_image_unref(DecryptedImage *p);
+DEFINE_TRIVIAL_CLEANUP_FUNC(DecryptedImage*, decrypted_image_unref);
+
+int dissected_image_relinquish(DissectedImage *m);
+
+int verity_settings_load(VeritySettings *verity, const char *image, const char *root_hash_path, const char *root_hash_sig_path);
+void verity_settings_done(VeritySettings *verity);
+
+static inline bool verity_settings_data_covers(const VeritySettings *verity, PartitionDesignator d) {
+        /* Returns true if the verity settings contain sufficient information to cover the specified
+         * partition: the designator has to match (an unset, i.e. negative, designator is taken to mean
+         * the root partition), and both a root hash and an external data path must be present. */
+        if (!verity)
+                return false;
+
+        bool designator_matches =
+                (d >= 0 && verity->designator == d) ||
+                (d == PARTITION_ROOT && verity->designator < 0);
+        if (!designator_matches)
+                return false;
+
+        return verity->root_hash && verity->data_path;
+}
+
+int dissected_image_load_verity_sig_partition(DissectedImage *m, int fd, VeritySettings *verity);
+
+bool dissected_image_verity_candidate(const DissectedImage *image, PartitionDesignator d);
+bool dissected_image_verity_ready(const DissectedImage *image, PartitionDesignator d);
+bool dissected_image_verity_sig_ready(const DissectedImage *image, PartitionDesignator d);
+
+int mount_image_privately_interactively(const char *path, const ImagePolicy *image_policy, DissectImageFlags flags, char **ret_directory, int *ret_dir_fd, LoopDevice **ret_loop_device);
+
+int verity_dissect_and_mount(int src_fd, const char *src, const char *dest, const MountOptions *options, const ImagePolicy *image_policy, const char *required_host_os_release_id, const char *required_host_os_release_version_id, const char *required_host_os_release_sysext_level, const char *required_host_os_release_confext_level, const char *required_sysext_scope, DissectedImage **ret_image);
+
+int dissect_fstype_ok(const char *fstype);
+
+int probe_sector_size(int fd, uint32_t *ret);
+int probe_sector_size_prefer_ioctl(int fd, uint32_t *ret);
+
+int partition_pick_mount_options(PartitionDesignator d, const char *fstype, bool rw, bool discard, char **ret_options, unsigned long *ret_ms_flags);
+
+static inline const char *dissected_partition_fstype(const DissectedPartition *m) {
+        assert(m);
+
+        /* Once a decrypted node exists, the file system type of the decrypted payload is the one that
+         * matters; otherwise report the type of the partition itself. */
+        if (m->decrypted_node)
+                return m->decrypted_fstype;
+
+        return m->fstype;
+}
diff --git a/src/shared/dlfcn-util.c b/src/shared/dlfcn-util.c
new file mode 100644
index 0000000..a321df3
--- /dev/null
+++ b/src/shared/dlfcn-util.c
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "dlfcn-util.h"
+
+static int dlsym_many_or_warnv(void *dl, int log_level, va_list ap) {
+ void (**fn)(void);
+
+ /* Tries to resolve a bunch of function symbols, and logs an error if it cannot resolve one of
+ * them. Note that this function possibly modifies the supplied function pointers if the whole
+ * operation fails.
+ *
+ * 'ap' carries pairs of (address of a function pointer variable, symbol name string), terminated
+ * by a NULL in the pointer position. Returns 0 if every symbol was resolved; on the first symbol
+ * that cannot be found it logs at 'log_level' and returns the log_full_errno() result for a
+ * synthetic ELIBBAD. */
+
+ while ((fn = va_arg(ap, typeof(fn)))) {
+ void (*tfn)(void);
+ const char *symbol;
+
+ symbol = va_arg(ap, typeof(symbol));
+
+ tfn = (typeof(tfn)) dlsym(dl, symbol);
+ if (!tfn)
+ return log_full_errno(log_level,
+ SYNTHETIC_ERRNO(ELIBBAD),
+ "Can't find symbol %s: %s", symbol, dlerror());
+ *fn = tfn; /* stored right away, hence pointers resolved earlier stay overwritten if a later lookup fails */
+ }
+
+ return 0;
+}
+
+int dlsym_many_or_warn_sentinel(void *dl, int log_level, ...) {
+        va_list args;
+        int ret;
+
+        /* Variadic front-end: packs the NULL-terminated list of (pointer, symbol name) pairs into a
+         * va_list and hands it to dlsym_many_or_warnv(), passing its result through unchanged. */
+        va_start(args, log_level);
+        ret = dlsym_many_or_warnv(dl, log_level, args);
+        va_end(args);
+
+        return ret;
+}
+
+int dlopen_many_sym_or_warn_sentinel(void **dlp, const char *filename, int log_level, ...) { /* Loads 'filename' and resolves the
+ * NULL-terminated list of (pointer, symbol name) pairs from it. Returns 0 if *dlp was already set, 1 if the library was
+ * newly loaded and stored in *dlp, and a negative value if loading or symbol resolution failed. */
+ _cleanup_(dlclosep) void *dl = NULL; /* handle is closed automatically on every early return below */
+ int r;
+
+ if (*dlp)
+ return 0; /* Already loaded */
+
+ dl = dlopen(filename, RTLD_LAZY);
+ if (!dl)
+ return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "%s is not installed: %s", filename, dlerror());
+
+ va_list ap;
+ va_start(ap, log_level);
+ r = dlsym_many_or_warnv(dl, log_level, ap);
+ va_end(ap);
+
+ if (r < 0)
+ return r;
+
+ /* Note that we never release the reference here, because there's no real reason to. After all this
+ * was traditionally a regular shared library dependency which lives forever too. */
+ *dlp = TAKE_PTR(dl); /* ownership moves to the caller's slot, so the cleanup handler no longer closes it */
+ return 1;
+}
diff --git a/src/shared/dlfcn-util.h b/src/shared/dlfcn-util.h
new file mode 100644
index 0000000..7d8cb4c
--- /dev/null
+++ b/src/shared/dlfcn-util.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <dlfcn.h>
+
+#include "macro.h"
+
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(void*, dlclose, NULL);
+
+int dlsym_many_or_warn_sentinel(void *dl, int log_level, ...) _sentinel_;
+int dlopen_many_sym_or_warn_sentinel(void **dlp, const char *filename, int log_level, ...) _sentinel_;
+
+#define dlsym_many_or_warn(dl, log_level, ...) \
+ dlsym_many_or_warn_sentinel(dl, log_level, __VA_ARGS__, NULL) /* appends the terminating NULL sentinel on behalf of the caller */
+#define dlopen_many_sym_or_warn(dlp, filename, log_level, ...) \
+ dlopen_many_sym_or_warn_sentinel(dlp, filename, log_level, __VA_ARGS__, NULL) /* likewise, for the dlopen()+dlsym() variant */
+
+#define DLSYM_PROTOTYPE(symbol) \
+ extern typeof(symbol)* sym_##symbol /* declares the "sym_<symbol>" function pointer variable */
+#define DLSYM_FUNCTION(symbol) \
+ typeof(symbol)* sym_##symbol = NULL /* defines the "sym_<symbol>" function pointer variable, initially NULL */
+
+/* Macro useful for putting together variable/symbol name pairs when calling dlsym_many_or_warn(). Assumes
+ * that each library symbol to resolve will be placed in a variable with the "sym_" prefix, i.e. a symbol
+ * "foobar" is loaded into a variable "sym_foobar". */
+#define DLSYM_ARG(arg) \
+ ({ assert_cc(__builtin_types_compatible_p(typeof(sym_##arg), typeof(&arg))); &sym_##arg; }), STRINGIFY(arg)
+
+/* libbpf is a bit confused about type-safety and API compatibility. Provide a macro that can tape over that mess. Sad. */
+#define DLSYM_ARG_FORCE(arg) \
+ &sym_##arg, STRINGIFY(arg)
+
+static inline void *safe_dlclose(void *p) {
+        /* Closes the library handle if there is one, insisting that dlclose() succeeds. Always returns
+         * NULL, so that callers can write "p = safe_dlclose(p);". */
+        if (p) {
+                assert_se(dlclose(p) == 0);
+        }
+
+        return NULL;
+}
diff --git a/src/shared/dm-util.c b/src/shared/dm-util.c
new file mode 100644
index 0000000..66c1e13
--- /dev/null
+++ b/src/shared/dm-util.c
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <fcntl.h>
+#include <linux/dm-ioctl.h>
+#include <sys/ioctl.h>
+
+#include "dm-util.h"
+#include "fd-util.h"
+#include "string-util.h"
+
+int dm_deferred_remove_cancel(const char *name) { /* Sends the "@cancel_deferred_remove" target message for the device-mapper
+ * device 'name' through /dev/mapper/control. Returns 0 on success, -ENODEV if the name is too long to be a valid
+ * device name, or -errno if opening the control node or the ioctl fails. */
+ _cleanup_close_ int fd = -EBADF;
+ struct message { /* ioctl header, target message header and message text, laid out back to back */
+ struct dm_ioctl dm_ioctl;
+ struct dm_target_msg dm_target_msg;
+ char msg_text[STRLEN("@cancel_deferred_remove") + 1];
+ } _packed_ message = {
+ .dm_ioctl = {
+ .version = {
+ DM_VERSION_MAJOR,
+ DM_VERSION_MINOR,
+ DM_VERSION_PATCHLEVEL
+ },
+ .data_size = sizeof(struct message), /* size of the whole buffer, header included */
+ .data_start = sizeof(struct dm_ioctl), /* payload begins right after the ioctl header */
+ },
+ .msg_text = "@cancel_deferred_remove",
+ };
+
+ assert(name);
+
+ if (strlen(name) >= sizeof(message.dm_ioctl.name))
+ return -ENODEV; /* A device with a name longer than this cannot possibly exist */
+
+ strncpy_exact(message.dm_ioctl.name, name, sizeof(message.dm_ioctl.name)); /* length was validated above, so the name fits including its NUL */
+
+ fd = open("/dev/mapper/control", O_RDWR|O_CLOEXEC);
+ if (fd < 0)
+ return -errno;
+
+ if (ioctl(fd, DM_TARGET_MSG, &message))
+ return -errno;
+
+ return 0;
+}
diff --git a/src/shared/dm-util.h b/src/shared/dm-util.h
new file mode 100644
index 0000000..e6e3d7d
--- /dev/null
+++ b/src/shared/dm-util.h
@@ -0,0 +1,4 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+int dm_deferred_remove_cancel(const char *name);
diff --git a/src/shared/dns-domain.c b/src/shared/dns-domain.c
new file mode 100644
index 0000000..b41c9b0
--- /dev/null
+++ b/src/shared/dns-domain.c
@@ -0,0 +1,1421 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <endian.h>
+#include <netinet/in.h>
+#include <stdio.h>
+#include <sys/socket.h>
+
+#include "alloc-util.h"
+#include "dns-domain.h"
+#include "glyph-util.h"
+#include "hashmap.h"
+#include "hexdecoct.h"
+#include "hostname-util.h"
+#include "idn-util.h"
+#include "in-addr-util.h"
+#include "macro.h"
+#include "parse-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "utf8.h"
+
+int dns_label_unescape(const char **name, char *dest, size_t sz, DNSLabelFlags flags) { /* Parses and unescapes the first label
+ * of *name into 'dest' (capacity 'sz'; writes are skipped if dest is NULL, but 'sz' is still enforced). On success returns
+ * the length of the unescaped label (0 if the end of the name was reached) and advances *name past the label and the
+ * dot separating it from the next one. Returns -EINVAL on malformed input or if the label would exceed DNS_LABEL_MAX
+ * bytes, and -ENOBUFS if 'sz' is exhausted. The output is NUL-terminated only if there is room left for it. */
+ const char *n;
+ char *d, last_char = 0;
+ int r = 0; /* number of unescaped bytes produced so far */
+
+ assert(name);
+ assert(*name);
+
+ n = *name;
+ d = dest;
+
+ for (;;) {
+ if (IN_SET(*n, 0, '.')) {
+ if (FLAGS_SET(flags, DNS_LABEL_LDH) && last_char == '-')
+ /* Trailing dash */
+ return -EINVAL;
+
+ if (n[0] == '.' && (n[1] != 0 || !FLAGS_SET(flags, DNS_LABEL_LEAVE_TRAILING_DOT))) /* skip the separator, except for a final dot the caller asked us to leave */
+ n++;
+
+ break;
+ }
+
+ if (r >= DNS_LABEL_MAX)
+ return -EINVAL;
+
+ if (sz <= 0)
+ return -ENOBUFS;
+
+ if (*n == '\\') {
+ /* Escaped character */
+ if (FLAGS_SET(flags, DNS_LABEL_NO_ESCAPES))
+ return -EINVAL;
+
+ n++;
+
+ if (*n == 0)
+ /* Ending NUL */
+ return -EINVAL;
+
+ else if (IN_SET(*n, '\\', '.')) {
+ /* Escaped backslash or dot */
+
+ if (FLAGS_SET(flags, DNS_LABEL_LDH))
+ return -EINVAL;
+
+ last_char = *n;
+ if (d)
+ *(d++) = *n;
+ sz--;
+ r++;
+ n++;
+
+ } else if (n[0] >= '0' && n[0] <= '9') {
+ unsigned k;
+
+ /* Escaped literal ASCII character */
+
+ if (!(n[1] >= '0' && n[1] <= '9') ||
+ !(n[2] >= '0' && n[2] <= '9'))
+ return -EINVAL;
+
+ k = ((unsigned) (n[0] - '0') * 100) +
+ ((unsigned) (n[1] - '0') * 10) +
+ ((unsigned) (n[2] - '0'));
+
+ /* Don't allow anything that doesn't fit in 8 bits. Note that we do allow
+ * control characters, as some servers (e.g. cloudflare) are happy to
+ * generate labels with them inside. */
+ if (k > 255)
+ return -EINVAL;
+
+ if (FLAGS_SET(flags, DNS_LABEL_LDH) &&
+ !valid_ldh_char((char) k))
+ return -EINVAL;
+
+ last_char = (char) k;
+ if (d)
+ *(d++) = (char) k;
+ sz--;
+ r++;
+
+ n += 3; /* exactly three decimal digits were consumed */
+ } else
+ return -EINVAL;
+
+ } else if ((uint8_t) *n >= (uint8_t) ' ' && *n != 127) {
+
+ /* Normal character */
+
+ if (FLAGS_SET(flags, DNS_LABEL_LDH)) {
+ if (!valid_ldh_char(*n))
+ return -EINVAL;
+ if (r == 0 && *n == '-')
+ /* Leading dash */
+ return -EINVAL;
+ }
+
+ last_char = *n;
+ if (d)
+ *(d++) = *n;
+ sz--;
+ r++;
+ n++;
+ } else
+ return -EINVAL; /* unescaped control character or DEL */
+ }
+
+ /* Empty label that is not at the end? */
+ if (r == 0 && *n)
+ return -EINVAL;
+
+ /* More than one trailing dot? */
+ if (n[0] == '.' && !FLAGS_SET(flags, DNS_LABEL_LEAVE_TRAILING_DOT))
+ return -EINVAL;
+
+ if (sz >= 1 && d)
+ *d = 0;
+
+ *name = n;
+ return r;
+}
+
+/* @label_terminal: terminal character of a label, updated to point to the terminal character of
+ * the previous label (always skipping one dot) or to NULL if there are no more
+ * labels.
+ *
+ * Extracts labels right-to-left: unescapes the label that ends at *label_terminal into 'dest' and
+ * returns its length, 0 once *label_terminal is NULL, or a negative error from
+ * dns_label_unescape(). */
+int dns_label_unescape_suffix(const char *name, const char **label_terminal, char *dest, size_t sz) {
+ const char *terminal;
+ int r;
+
+ assert(name);
+ assert(label_terminal);
+ assert(dest);
+
+ /* no more labels */
+ if (!*label_terminal) {
+ if (sz >= 1)
+ *dest = 0;
+
+ return 0;
+ }
+
+ terminal = *label_terminal;
+ assert(IN_SET(*terminal, 0, '.'));
+
+ /* Skip current terminal character (and accept domain names ending in ".") */
+ if (*terminal == 0)
+ terminal = PTR_SUB1(terminal, name);
+ if (terminal >= name && *terminal == '.')
+ terminal = PTR_SUB1(terminal, name);
+
+ /* Point name to the last label, and terminal to the preceding terminal symbol (or make it a NULL pointer) */
+ while (terminal) {
+ /* Find the start of the last label */
+ if (*terminal == '.') {
+ const char *y;
+ unsigned slashes = 0;
+
+ for (y = PTR_SUB1(terminal, name); y && *y == '\\'; y = PTR_SUB1(y, name)) /* count the backslashes directly preceding the dot */
+ slashes++;
+
+ if (slashes % 2 == 0) {
+ /* The '.' was not escaped */
+ name = terminal + 1;
+ break;
+ } else {
+ terminal = y;
+ continue;
+ }
+ }
+
+ terminal = PTR_SUB1(terminal, name);
+ }
+
+ r = dns_label_unescape(&name, dest, sz, 0);
+ if (r < 0)
+ return r;
+
+ *label_terminal = terminal; /* only updated on success */
+
+ return r;
+}
+
+int dns_label_escape(const char *p, size_t l, char *dest, size_t sz) { /* Escapes the 'l' bytes at 'p' into 'dest' (capacity 'sz')
+ * and NUL-terminates the result. Returns the escaped length (without the NUL), -EINVAL for an invalid label length, or
+ * -ENOBUFS if 'dest' is too small. */
+ char *q;
+
+ /* DNS labels must be between 1 and 63 characters long. A
+ * zero-length label does not exist. See RFC 2181, Section
+ * 11. */
+
+ if (l <= 0 || l > DNS_LABEL_MAX)
+ return -EINVAL;
+ if (sz < 1)
+ return -ENOBUFS;
+
+ assert(p);
+ assert(dest);
+
+ q = dest;
+ while (l > 0) {
+
+ if (IN_SET(*p, '.', '\\')) {
+
+ /* Dot or backslash */
+
+ if (sz < 3) /* two output bytes, plus room for the trailing NUL */
+ return -ENOBUFS;
+
+ *(q++) = '\\';
+ *(q++) = *p;
+
+ sz -= 2;
+
+ } else if (IN_SET(*p, '_', '-') ||
+ ascii_isdigit(*p) ||
+ ascii_isalpha(*p)) {
+
+ /* Proper character */
+
+ if (sz < 2) /* one output byte, plus room for the trailing NUL */
+ return -ENOBUFS;
+
+ *(q++) = *p;
+ sz -= 1;
+
+ } else {
+
+ /* Everything else */
+
+ if (sz < 5) /* backslash and three decimal digits, plus room for the trailing NUL */
+ return -ENOBUFS;
+
+ *(q++) = '\\';
+ *(q++) = '0' + (char) ((uint8_t) *p / 100);
+ *(q++) = '0' + (char) (((uint8_t) *p / 10) % 10);
+ *(q++) = '0' + (char) ((uint8_t) *p % 10);
+
+ sz -= 4;
+ }
+
+ p++;
+ l--;
+ }
+
+ *q = 0;
+ return (int) (q - dest);
+}
+
+int dns_label_escape_new(const char *p, size_t l, char **ret) {
+        _cleanup_free_ char *escaped = NULL;
+        int n;
+
+        /* Allocating variant of dns_label_escape(): returns the escaped label in a freshly allocated
+         * buffer in *ret, and its length as return value. */
+
+        assert(p);
+        assert(ret);
+
+        if (l == 0 || l > DNS_LABEL_MAX)
+                return -EINVAL;
+
+        /* A buffer of the maximum escaped size is always sufficient */
+        escaped = new(char, DNS_LABEL_ESCAPED_MAX);
+        if (!escaped)
+                return -ENOMEM;
+
+        n = dns_label_escape(p, l, escaped, DNS_LABEL_ESCAPED_MAX);
+        if (n < 0)
+                return n;
+
+        *ret = TAKE_PTR(escaped);
+        return n;
+}
+
+#if HAVE_LIBIDN
+int dns_label_apply_idna(const char *encoded, size_t encoded_size, char *decoded, size_t decoded_max) { /* Returns 0 (leaving
+ * 'decoded' untouched) if the label is pure 7-bit and needs no conversion, the length of the converted label written to
+ * 'decoded' otherwise, or a negative errno-style error. */
+ _cleanup_free_ uint32_t *input = NULL;
+ size_t input_size, l;
+ bool contains_8_bit = false;
+ char buffer[DNS_LABEL_MAX+1];
+ int r;
+
+ assert(encoded);
+ assert(decoded);
+
+ /* Converts a U-label into an A-label */
+
+ r = dlopen_idn();
+ if (r < 0)
+ return r;
+
+ if (encoded_size <= 0)
+ return -EINVAL;
+
+ for (const char *p = encoded; p < encoded + encoded_size; p++)
+ if ((uint8_t) *p > 127)
+ contains_8_bit = true;
+
+ if (!contains_8_bit) {
+ if (encoded_size > DNS_LABEL_MAX)
+ return -EINVAL;
+
+ return 0; /* nothing to convert */
+ }
+
+ input = sym_stringprep_utf8_to_ucs4(encoded, encoded_size, &input_size);
+ if (!input)
+ return -ENOMEM;
+
+ if (sym_idna_to_ascii_4i(input, input_size, buffer, 0) != 0)
+ return -EINVAL;
+
+ l = strlen(buffer);
+
+ /* Verify that the result is not longer than one DNS label. */
+ if (l <= 0 || l > DNS_LABEL_MAX)
+ return -EINVAL;
+ if (l > decoded_max)
+ return -ENOBUFS;
+
+ memcpy(decoded, buffer, l);
+
+ /* If there's room, append a trailing NUL byte, but only then */
+ if (decoded_max > l)
+ decoded[l] = 0;
+
+ return (int) l;
+}
+
+int dns_label_undo_idna(const char *encoded, size_t encoded_size, char *decoded, size_t decoded_max) { /* Returns 0 (leaving
+ * 'decoded' untouched) if the label does not carry the IDNA ACE prefix, the length of the UTF-8 label written to
+ * 'decoded' otherwise, or a negative errno-style error. */
+ size_t input_size, output_size;
+ _cleanup_free_ uint32_t *input = NULL;
+ _cleanup_free_ char *result = NULL;
+ uint32_t *output = NULL;
+ size_t w;
+ int r;
+
+ /* To be invoked after unescaping. Converts an A-label into a U-label. */
+
+ assert(encoded);
+ assert(decoded);
+
+ r = dlopen_idn();
+ if (r < 0)
+ return r;
+
+ if (encoded_size <= 0 || encoded_size > DNS_LABEL_MAX)
+ return -EINVAL;
+
+ if (!memory_startswith(encoded, encoded_size, IDNA_ACE_PREFIX))
+ return 0; /* not an A-label, nothing to do */
+
+ input = sym_stringprep_utf8_to_ucs4(encoded, encoded_size, &input_size);
+ if (!input)
+ return -ENOMEM;
+
+ output_size = input_size;
+ output = newa(uint32_t, output_size);
+
+ sym_idna_to_unicode_44i(input, input_size, output, &output_size, 0); /* NOTE(review): return value is not checked; presumably relies on libidn always leaving usable data in 'output' — confirm against the libidn documentation */
+
+ result = sym_stringprep_ucs4_to_utf8(output, output_size, NULL, &w);
+ if (!result)
+ return -ENOMEM;
+ if (w <= 0)
+ return -EINVAL;
+ if (w > decoded_max)
+ return -ENOBUFS;
+
+ memcpy(decoded, result, w);
+
+ /* Append trailing NUL byte if there's space, but only then. */
+ if (decoded_max > w)
+ decoded[w] = 0;
+
+ return w;
+}
+#endif
+
+int dns_name_concat(const char *a, const char *b, DNSLabelFlags flags, char **_ret) { /* Parses 'a' followed by 'b' (either may be
+ * NULL) label by label and re-escapes them into one normalized name. If '_ret' is NULL the input is only validated.
+ * Returns 0 on success, -EINVAL on malformed input or if the result exceeds DNS_HOSTNAME_MAX, -ENOMEM on allocation
+ * failure. If no label was seen at all, the result is the single dot of the root domain. */
+ _cleanup_free_ char *ret = NULL;
+ size_t n = 0; /* length of the output generated so far */
+ const char *p;
+ bool first = true;
+ int r;
+
+ if (a)
+ p = a;
+ else if (b)
+ p = TAKE_PTR(b);
+ else
+ goto finish;
+
+ for (;;) {
+ char label[DNS_LABEL_MAX];
+
+ r = dns_label_unescape(&p, label, sizeof label, flags);
+ if (r < 0)
+ return r;
+ if (r == 0) {
+ if (*p != 0)
+ return -EINVAL;
+
+ if (b) {
+ /* Now continue with the second string, if there is one */
+ p = TAKE_PTR(b);
+ continue;
+ }
+
+ break;
+ }
+
+ if (_ret) {
+ if (!GREEDY_REALLOC(ret, n + !first + DNS_LABEL_ESCAPED_MAX))
+ return -ENOMEM;
+
+ r = dns_label_escape(label, r, ret + n + !first, DNS_LABEL_ESCAPED_MAX); /* leaves one byte free for the separating dot, unless this is the first label */
+ if (r < 0)
+ return r;
+
+ if (!first)
+ ret[n] = '.';
+ } else {
+ char escaped[DNS_LABEL_ESCAPED_MAX];
+
+ r = dns_label_escape(label, r, escaped, sizeof(escaped)); /* only to learn the escaped length */
+ if (r < 0)
+ return r;
+ }
+
+ n += r + !first;
+ first = false;
+ }
+
+finish:
+ if (n > DNS_HOSTNAME_MAX)
+ return -EINVAL;
+
+ if (_ret) {
+ if (n == 0) {
+ /* Nothing appended? If so, generate at least a single dot, to indicate the DNS root domain */
+ if (!GREEDY_REALLOC(ret, 2))
+ return -ENOMEM;
+
+ ret[n++] = '.';
+ } else {
+ if (!GREEDY_REALLOC(ret, n + 1))
+ return -ENOMEM;
+ }
+
+ ret[n] = 0;
+ *_ret = TAKE_PTR(ret);
+ }
+
+ return 0;
+}
+
+void dns_name_hash_func(const char *p, struct siphash *state) {
+        assert(p);
+
+        /* Feeds every label of the name, lowercased and followed by a NUL separator, into the hash
+         * state. Stops at the end of the name, or at the first label that cannot be unescaped. */
+        for (;;) {
+                char label[DNS_LABEL_MAX+1];
+                int k;
+
+                k = dns_label_unescape(&p, label, sizeof label, 0);
+                if (k <= 0)
+                        break;
+
+                ascii_strlower_n(label, k);
+                siphash24_compress(label, k, state);
+                siphash24_compress_byte(0, state); /* make sure foobar and foo.bar result in different hashes */
+        }
+
+        /* enforce that all names are terminated by the empty label */
+        string_hash_func("", state);
+}
+
+int dns_name_compare_func(const char *a, const char *b) { /* Orders two names by comparing their labels case-insensitively from
+ * the rightmost label towards the left. Returns 0 if both names run out of labels at the same time. */
+ const char *x, *y;
+ int r, q;
+
+ assert(a);
+ assert(b);
+
+ x = a + strlen(a); /* start at the terminating NUL of each name */
+ y = b + strlen(b);
+
+ for (;;) {
+ char la[DNS_LABEL_MAX], lb[DNS_LABEL_MAX];
+
+ if (x == NULL && y == NULL)
+ return 0;
+
+ r = dns_label_unescape_suffix(a, &x, la, sizeof(la));
+ q = dns_label_unescape_suffix(b, &y, lb, sizeof(lb));
+ if (r < 0 || q < 0)
+ return CMP(r, q); /* parse failure: order by the return codes themselves */
+
+ r = ascii_strcasecmp_nn(la, r, lb, q);
+ if (r != 0)
+ return r;
+ }
+}
+
+DEFINE_HASH_OPS(
+ dns_name_hash_ops,
+ char,
+ dns_name_hash_func,
+ dns_name_compare_func); /* hash ops for tables keyed by DNS names, built from the two functions above */
+
+DEFINE_HASH_OPS_WITH_KEY_DESTRUCTOR(
+ dns_name_hash_ops_free,
+ char,
+ dns_name_hash_func,
+ dns_name_compare_func,
+ free); /* same, but additionally passes free() as the key destructor */
+
+int dns_name_equal(const char *x, const char *y) {
+        assert(x);
+        assert(y);
+
+        /* Walks both names in lockstep, one label at a time. Returns true if all labels match
+         * (ignoring ASCII case) and both names end together, false on the first difference, and a
+         * negative error if either name fails to parse. */
+        for (;;) {
+                char label_x[DNS_LABEL_MAX], label_y[DNS_LABEL_MAX];
+                int len_x, len_y;
+
+                len_x = dns_label_unescape(&x, label_x, sizeof label_x, 0);
+                if (len_x < 0)
+                        return len_x;
+
+                len_y = dns_label_unescape(&y, label_y, sizeof label_y, 0);
+                if (len_y < 0)
+                        return len_y;
+
+                if (len_x != len_y)
+                        return false;
+
+                /* Both names reached their end at the same time */
+                if (len_x == 0)
+                        return true;
+
+                if (ascii_strcasecmp_n(label_x, label_y, len_x) != 0)
+                        return false;
+        }
+}
+
+int dns_name_endswith(const char *name, const char *suffix) { /* Returns true if 'name' ends in the labels of 'suffix' (compared
+ * case-insensitively), false if not, or a negative error if either name fails to parse. */
+ const char *n, *s, *saved_n = NULL;
+ int r, q;
+
+ assert(name);
+ assert(suffix);
+
+ n = name;
+ s = suffix;
+
+ for (;;) {
+ char ln[DNS_LABEL_MAX], ls[DNS_LABEL_MAX];
+
+ r = dns_label_unescape(&n, ln, sizeof ln, 0);
+ if (r < 0)
+ return r;
+
+ if (!saved_n)
+ saved_n = n; /* restart point: right after the first label of the current attempt */
+
+ q = dns_label_unescape(&s, ls, sizeof ls, 0);
+ if (q < 0)
+ return q;
+
+ if (r == 0 && q == 0)
+ return true;
+ if (r == 0 && saved_n == n) /* 'name' is exhausted and there is nothing left to retry from */
+ return false;
+
+ if (r != q || ascii_strcasecmp_n(ln, ls, r) != 0) {
+
+ /* Not the same, let's jump back, and try with the next label again */
+ s = suffix;
+ n = TAKE_PTR(saved_n); /* presumably also resets saved_n to NULL, so that a new restart point is recorded next round */
+ }
+ }
+}
+
+int dns_name_startswith(const char *name, const char *prefix) {
+        const char *name_pos, *prefix_pos;
+
+        assert(name);
+        assert(prefix);
+
+        name_pos = name;
+        prefix_pos = prefix;
+
+        /* Consume the prefix label by label; each one has to match the corresponding label of the
+         * name (ignoring ASCII case). Running out of prefix labels means success. */
+        for (;;) {
+                char name_label[DNS_LABEL_MAX], prefix_label[DNS_LABEL_MAX];
+                int prefix_len, name_len;
+
+                prefix_len = dns_label_unescape(&prefix_pos, prefix_label, sizeof prefix_label, 0);
+                if (prefix_len < 0)
+                        return prefix_len;
+                if (prefix_len == 0)
+                        return true;
+
+                name_len = dns_label_unescape(&name_pos, name_label, sizeof name_label, 0);
+                if (name_len < 0)
+                        return name_len;
+
+                if (prefix_len != name_len ||
+                    ascii_strcasecmp_n(name_label, prefix_label, prefix_len) != 0)
+                        return false;
+        }
+}
+
+int dns_name_change_suffix(const char *name, const char *old_suffix, const char *new_suffix, char **ret) { /* If 'name' ends in
+ * 'old_suffix', returns 1 and stores a new name with that suffix replaced by 'new_suffix' in *ret. If it does not,
+ * returns 0 and sets *ret to NULL. Returns a negative error on parse or allocation failure. */
+ const char *n, *s, *saved_before = NULL, *saved_after = NULL, *prefix;
+ int r, q;
+
+ assert(name);
+ assert(old_suffix);
+ assert(new_suffix);
+ assert(ret);
+
+ n = name;
+ s = old_suffix;
+
+ for (;;) {
+ char ln[DNS_LABEL_MAX], ls[DNS_LABEL_MAX];
+
+ if (!saved_before)
+ saved_before = n; /* where the current match attempt starts, i.e. where the prefix would end */
+
+ r = dns_label_unescape(&n, ln, sizeof ln, 0);
+ if (r < 0)
+ return r;
+
+ if (!saved_after)
+ saved_after = n; /* restart point: right after the first label of the current attempt */
+
+ q = dns_label_unescape(&s, ls, sizeof ls, 0);
+ if (q < 0)
+ return q;
+
+ if (r == 0 && q == 0)
+ break;
+ if (r == 0 && saved_after == n) {
+ *ret = NULL; /* doesn't match */
+ return 0;
+ }
+
+ if (r != q || ascii_strcasecmp_n(ln, ls, r) != 0) {
+
+ /* Not the same, let's jump back, and try with the next label again */
+ s = old_suffix;
+ n = TAKE_PTR(saved_after);
+ saved_before = NULL;
+ }
+ }
+
+ /* Found it! Now generate the new name */
+ prefix = strndupa_safe(name, saved_before - name); /* everything in front of the matched suffix */
+
+ r = dns_name_concat(prefix, new_suffix, 0, ret);
+ if (r < 0)
+ return r;
+
+ return 1;
+}
+
+int dns_name_between(const char *a, const char *b, const char *c) {
+        /* Determine if b is strictly greater than a and strictly smaller than c.
+           We consider the order of names to be circular, so that if a is
+           strictly greater than c, we consider b to be between them if it is
+           either greater than a or smaller than c. This is how the canonical
+           DNS name order used in NSEC records works. */
+
+        if (dns_name_compare_func(a, c) >= 0)
+                /*
+                   a and c are equal or 'reversed', the interval wraps around:
+                   <--b--c         a----->
+                   or:
+                   <-----c         a--b-->
+                */
+                return dns_name_compare_func(b, c) < 0 ||
+                        dns_name_compare_func(a, b) < 0;
+
+        /*
+           a and c are properly ordered:
+           a<---b--->c
+        */
+        if (dns_name_compare_func(a, b) >= 0)
+                return false;
+
+        return dns_name_compare_func(b, c) < 0;
+}
+
+int dns_name_reverse(int family, const union in_addr_union *a, char **ret) { /* Formats the reverse-lookup name for an address
+ * into a newly allocated string in *ret. Returns 0 on success, -EAFNOSUPPORT for families other than AF_INET and
+ * AF_INET6, and -ENOMEM if asprintf() fails. */
+ const uint8_t *p;
+ int r;
+
+ assert(a);
+ assert(ret);
+
+ p = (const uint8_t*) a; /* access the address as raw bytes */
+
+ if (family == AF_INET)
+ r = asprintf(ret, "%u.%u.%u.%u.in-addr.arpa", p[3], p[2], p[1], p[0]); /* bytes in reverse order */
+ else if (family == AF_INET6)
+ r = asprintf(ret, "%c.%c.%c.%c.%c.%c.%c.%c.%c.%c.%c.%c.%c.%c.%c.%c.%c.%c.%c.%c.%c.%c.%c.%c.%c.%c.%c.%c.%c.%c.%c.%c.ip6.arpa", /* one label per nibble: last byte first, low nibble before high nibble */
+ hexchar(p[15] & 0xF), hexchar(p[15] >> 4), hexchar(p[14] & 0xF), hexchar(p[14] >> 4),
+ hexchar(p[13] & 0xF), hexchar(p[13] >> 4), hexchar(p[12] & 0xF), hexchar(p[12] >> 4),
+ hexchar(p[11] & 0xF), hexchar(p[11] >> 4), hexchar(p[10] & 0xF), hexchar(p[10] >> 4),
+ hexchar(p[ 9] & 0xF), hexchar(p[ 9] >> 4), hexchar(p[ 8] & 0xF), hexchar(p[ 8] >> 4),
+ hexchar(p[ 7] & 0xF), hexchar(p[ 7] >> 4), hexchar(p[ 6] & 0xF), hexchar(p[ 6] >> 4),
+ hexchar(p[ 5] & 0xF), hexchar(p[ 5] >> 4), hexchar(p[ 4] & 0xF), hexchar(p[ 4] >> 4),
+ hexchar(p[ 3] & 0xF), hexchar(p[ 3] >> 4), hexchar(p[ 2] & 0xF), hexchar(p[ 2] >> 4),
+ hexchar(p[ 1] & 0xF), hexchar(p[ 1] >> 4), hexchar(p[ 0] & 0xF), hexchar(p[ 0] >> 4));
+ else
+ return -EAFNOSUPPORT;
+ if (r < 0)
+ return -ENOMEM;
+
+ return 0;
+}
+
+int dns_name_address(const char *p, int *ret_family, union in_addr_union *ret_address) { /* Inverse of dns_name_reverse(): parses
+ * an "in-addr.arpa" or "ip6.arpa" name back into an address. Returns 1 and fills in both outputs on success, 0 if the name
+ * is not a complete reverse-lookup name, or a negative error.
+ * NOTE(review): only the final fall-through sets the outputs to AF_UNSPEC/IN_ADDR_NULL; the earlier "return r" paths
+ * may return 0 with both outputs untouched — callers should only look at them if the return value is > 0. */
+ int r;
+
+ assert(p);
+ assert(ret_family);
+ assert(ret_address);
+
+ r = dns_name_endswith(p, "in-addr.arpa");
+ if (r < 0)
+ return r;
+ if (r > 0) {
+ uint8_t a[4];
+
+ for (size_t i = 0; i < ELEMENTSOF(a); i++) {
+ char label[DNS_LABEL_MAX+1];
+
+ r = dns_label_unescape(&p, label, sizeof label, 0);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return -EINVAL;
+ if (r > 3) /* a decimal octet has at most three digits */
+ return -EINVAL;
+
+ r = safe_atou8(label, &a[i]);
+ if (r < 0)
+ return r;
+ }
+
+ r = dns_name_equal(p, "in-addr.arpa"); /* nothing but the suffix may follow the four octets */
+ if (r <= 0)
+ return r;
+
+ *ret_family = AF_INET;
+ ret_address->in.s_addr = htobe32(((uint32_t) a[3] << 24) | /* the first label is the least significant octet */
+ ((uint32_t) a[2] << 16) |
+ ((uint32_t) a[1] << 8) |
+ (uint32_t) a[0]);
+
+ return 1;
+ }
+
+ r = dns_name_endswith(p, "ip6.arpa");
+ if (r < 0)
+ return r;
+ if (r > 0) {
+ struct in6_addr a;
+
+ for (size_t i = 0; i < ELEMENTSOF(a.s6_addr); i++) { /* two single-character hex labels per byte: low nibble first, then high nibble */
+ char label[DNS_LABEL_MAX+1];
+ int x, y;
+
+ r = dns_label_unescape(&p, label, sizeof label, 0);
+ if (r <= 0)
+ return r;
+ if (r != 1)
+ return -EINVAL;
+ x = unhexchar(label[0]);
+ if (x < 0)
+ return -EINVAL;
+
+ r = dns_label_unescape(&p, label, sizeof label, 0);
+ if (r <= 0)
+ return r;
+ if (r != 1)
+ return -EINVAL;
+ y = unhexchar(label[0]);
+ if (y < 0)
+ return -EINVAL;
+
+ a.s6_addr[ELEMENTSOF(a.s6_addr) - i - 1] = (uint8_t) y << 4 | (uint8_t) x; /* filled from the last byte towards the first */
+ }
+
+ r = dns_name_equal(p, "ip6.arpa"); /* nothing but the suffix may follow the 32 nibbles */
+ if (r <= 0)
+ return r;
+
+ *ret_family = AF_INET6;
+ ret_address->in6 = a;
+ return 1;
+ }
+
+ *ret_family = AF_UNSPEC;
+ *ret_address = IN_ADDR_NULL;
+
+ return 0;
+}
+
+bool dns_name_is_root(const char *name) {
+ assert(name);
+
+ /* There are exactly two ways to encode the root domain name:
+ * as empty string, or with a single dot. */
+
+ return STR_IN_SET(name, "", ".");
+}
+
+bool dns_name_is_single_label(const char *name) {
+ int r;
+
+ assert(name);
+ /* True if stripping the first label of 'name' succeeds and leaves only the root domain behind. */
+ r = dns_name_parent(&name);
+ if (r <= 0) /* parse error, or there was no label to strip */
+ return false;
+
+ return dns_name_is_root(name);
+}
+
+/* Encode a domain name according to RFC 1035 Section 3.1, without compression. Returns the number of bytes written to 'buffer', -ENOBUFS if no room is left for a length byte, -EINVAL if the encoded name is too long, or the error returned by dns_label_unescape(). */
+int dns_name_to_wire_format(const char *domain, uint8_t *buffer, size_t len, bool canonical) {
+ uint8_t *label_length, *out;
+ int r;
+
+ assert(domain);
+ assert(buffer);
+
+ out = buffer;
+
+ do {
+ /* Reserve a byte for label length */
+ if (len <= 0)
+ return -ENOBUFS;
+ len--;
+ label_length = out;
+ out++;
+
+ /* Convert and copy a single label. Note that
+ * dns_label_unescape() returns 0 when it hits the end
+ * of the domain name, which we rely on here to encode
+ * the trailing NUL byte. */
+ r = dns_label_unescape(&domain, (char *) out, len, 0);
+ if (r < 0)
+ return r;
+
+ /* Optionally, output the name in DNSSEC canonical
+ * format, as described in RFC 4034, section 6.2. Or
+ * in other words: in lower-case. */
+ if (canonical)
+ ascii_strlower_n((char*) out, (size_t) r);
+
+ /* Fill label length, move forward */
+ *label_length = r;
+ out += r;
+ len -= r;
+
+ } while (r != 0);
+
+ /* Verify the maximum size of the encoded name. The trailing
+ * dot and the NUL byte are accounted for here, hence we
+ * compare against DNS_HOSTNAME_MAX + 2 (which is 255) rather
+ * than against DNS_HOSTNAME_MAX. */
+ if (out - buffer > DNS_HOSTNAME_MAX + 2)
+ return -EINVAL;
+
+ return out - buffer;
+}
+
+static bool srv_type_label_is_valid(const char *label, size_t n) {
+ assert(label);
+
+ if (n < 2) /* Label needs to be at least 2 chars long */
+ return false;
+
+ if (label[0] != '_') /* First label char needs to be underscore */
+ return false;
+
+ /* Second char must be a letter */
+ if (!ascii_isalpha(label[1]))
+ return false;
+
+ /* Third and further chars must be alphanumeric or a hyphen */
+ for (size_t k = 2; k < n; k++)
+ if (!ascii_isalpha(label[k]) &&
+ !ascii_isdigit(label[k]) &&
+ label[k] != '-')
+ return false;
+
+ return true;
+}
+
+bool dns_srv_type_is_valid(const char *name) {
+ unsigned c = 0;
+ int r;
+
+ if (!name)
+ return false;
+
+ for (;;) {
+ char label[DNS_LABEL_MAX];
+
+ /* This more or less implements RFC 6335, Section 5.1 */
+
+ r = dns_label_unescape(&name, label, sizeof label, 0);
+ if (r < 0)
+ return false;
+ if (r == 0)
+ break;
+
+ if (c >= 2)
+ return false;
+
+ if (!srv_type_label_is_valid(label, r))
+ return false;
+
+ c++;
+ }
+
+ return c == 2; /* exactly two labels */
+}
+
+bool dnssd_srv_type_is_valid(const char *name) {
+ return dns_srv_type_is_valid(name) &&
+ ((dns_name_endswith(name, "_tcp") > 0) ||
+ (dns_name_endswith(name, "_udp") > 0)); /* Specific to DNS-SD. RFC 6763, Section 7 */
+}
+
+bool dns_service_name_is_valid(const char *name) {
+ size_t l;
+
+ /* This more or less implements RFC 6763, Section 4.1.1 */
+
+ if (!name)
+ return false;
+
+ if (!utf8_is_valid(name))
+ return false;
+
+ if (string_has_cc(name, NULL))
+ return false;
+
+ l = strlen(name);
+ if (l <= 0)
+ return false;
+ if (l > DNS_LABEL_MAX)
+ return false;
+
+ return true;
+}
+
+int dns_service_join(const char *name, const char *type, const char *domain, char **ret) {
+ char escaped[DNS_LABEL_ESCAPED_MAX];
+ _cleanup_free_ char *n = NULL;
+ int r;
+
+ assert(type);
+ assert(domain);
+ assert(ret);
+
+ if (!dns_srv_type_is_valid(type))
+ return -EINVAL;
+
+ if (!name)
+ return dns_name_concat(type, domain, 0, ret);
+
+ if (!dns_service_name_is_valid(name))
+ return -EINVAL;
+
+ r = dns_label_escape(name, strlen(name), escaped, sizeof(escaped));
+ if (r < 0)
+ return r;
+
+ r = dns_name_concat(type, domain, 0, &n);
+ if (r < 0)
+ return r;
+
+ return dns_name_concat(escaped, n, 0, ret);
+}
+
+static bool dns_service_name_label_is_valid(const char *label, size_t n) {
+ char *s;
+
+ assert(label);
+
+ if (memchr(label, 0, n))
+ return false;
+
+ s = strndupa_safe(label, n);
+ return dns_service_name_is_valid(s);
+}
+
+int dns_service_split(const char *joined, char **ret_name, char **ret_type, char **ret_domain) {
+ _cleanup_free_ char *name = NULL, *type = NULL, *domain = NULL;
+ const char *p = joined, *q = NULL, *d = joined;
+ char a[DNS_LABEL_MAX+1], b[DNS_LABEL_MAX+1], c[DNS_LABEL_MAX+1];
+ int an, bn, cn, r;
+ unsigned x = 0;
+
+ assert(joined);
+
+ /* Get first label from the full name */
+ an = dns_label_unescape(&p, a, sizeof(a), 0);
+ if (an < 0)
+ return an;
+
+ if (an > 0) {
+ x++;
+
+ /* If there was a first label, try to get the second one */
+ bn = dns_label_unescape(&p, b, sizeof(b), 0);
+ if (bn < 0)
+ return bn;
+
+ if (bn > 0) {
+ if (!srv_type_label_is_valid(b, bn))
+ goto finish;
+
+ x++;
+
+ /* If there was a second label, try to get the third one */
+ q = p;
+ cn = dns_label_unescape(&p, c, sizeof(c), 0);
+ if (cn < 0)
+ return cn;
+
+ if (cn > 0 && srv_type_label_is_valid(c, cn))
+ x++;
+ }
+ }
+
+ switch (x) {
+ case 2:
+ if (!srv_type_label_is_valid(a, an))
+ break;
+
+ /* OK, got <type> . <type2> . <domain> */
+
+ name = NULL;
+
+ type = strjoin(a, ".", b);
+ if (!type)
+ return -ENOMEM;
+
+ d = q;
+ break;
+
+ case 3:
+ if (!dns_service_name_label_is_valid(a, an))
+ break;
+
+ /* OK, got <name> . <type> . <type2> . <domain> */
+
+ name = strndup(a, an);
+ if (!name)
+ return -ENOMEM;
+
+ type = strjoin(b, ".", c);
+ if (!type)
+ return -ENOMEM;
+
+ d = p;
+ break;
+ }
+
+finish:
+ r = dns_name_normalize(d, 0, &domain);
+ if (r < 0)
+ return r;
+
+ if (ret_domain)
+ *ret_domain = TAKE_PTR(domain);
+
+ if (ret_type)
+ *ret_type = TAKE_PTR(type);
+
+ if (ret_name)
+ *ret_name = TAKE_PTR(name);
+
+ return 0;
+}
+
+static int dns_name_build_suffix_table(const char *name, const char *table[]) {
+ const char *p = ASSERT_PTR(name);
+ unsigned n = 0;
+ int r;
+
+ assert(table);
+ /* Stores in table[i] the position within 'name' reached after stripping i leading labels; returns the number of labels stripped in total, or a negative error. */
+ for (;;) {
+ if (n > DNS_N_LABELS_MAX) /* the callers in this file pass tables of DNS_N_LABELS_MAX+1 entries */
+ return -EINVAL;
+
+ table[n] = p;
+ r = dns_name_parent(&p);
+ if (r < 0)
+ return r;
+ if (r == 0) /* no further label: table[n] is the last entry written */
+ break;
+
+ n++;
+ }
+
+ return (int) n;
+}
+
+int dns_name_suffix(const char *name, unsigned n_labels, const char **ret) {
+ const char* labels[DNS_N_LABELS_MAX+1];
+ int n;
+
+ assert(name);
+ assert(ret);
+ /* Points *ret at the suffix of 'name' made up of its last n_labels labels (a pointer into 'name', not a copy). Returns the number of leading labels skipped, -EINVAL if 'name' has fewer than n_labels labels. */
+ n = dns_name_build_suffix_table(name, labels);
+ if (n < 0)
+ return n;
+
+ if ((unsigned) n < n_labels)
+ return -EINVAL;
+
+ *ret = labels[n - n_labels];
+ return (int) (n - n_labels);
+}
+
+int dns_name_skip(const char *a, unsigned n_labels, const char **ret) {
+ int r;
+
+ assert(a);
+ assert(ret);
+ /* Strips the first n_labels labels from 'a'. Returns 1 with *ret pointing at the remainder, 0 with *ret set to "" if 'a' runs out of labels first, or a negative error. */
+ for (; n_labels > 0; n_labels--) {
+ r = dns_name_parent(&a);
+ if (r < 0)
+ return r;
+ if (r == 0) { /* fewer labels than requested */
+ *ret = "";
+ return 0;
+ }
+ }
+
+ *ret = a;
+ return 1;
+}
+
+int dns_name_count_labels(const char *name) {
+ unsigned n = 0;
+ int r;
+
+ assert(name);
+ /* Returns the number of labels in 'name', -EINVAL if there are more than DNS_N_LABELS_MAX of them, or the error from dns_name_parent(). */
+ for (const char *p = name;;) {
+ r = dns_name_parent(&p);
+ if (r < 0)
+ return r;
+ if (r == 0) /* end of the name reached */
+ break;
+
+ if (n >= DNS_N_LABELS_MAX)
+ return -EINVAL;
+
+ n++;
+ }
+
+ return n;
+}
+
+int dns_name_equal_skip(const char *a, unsigned n_labels, const char *b) {
+ int r;
+
+ assert(a);
+ assert(b);
+ /* Strips the first n_labels labels from 'a' and compares what is left with 'b' via dns_name_equal(). */
+ r = dns_name_skip(a, n_labels, &a);
+ if (r <= 0) /* error, or 'a' has fewer than n_labels labels (0) */
+ return r;
+
+ return dns_name_equal(a, b);
+}
+
+int dns_name_common_suffix(const char *a, const char *b, const char **ret) {
+ const char *a_labels[DNS_N_LABELS_MAX+1], *b_labels[DNS_N_LABELS_MAX+1];
+ int n = 0, m = 0, k = 0, r, q;
+
+ assert(a);
+ assert(b);
+ assert(ret);
+
+ /* Determines the common suffix of domain names a and b. Returns 0 with *ret pointing into a, at the start of that suffix; labels are compared with ascii_strcasecmp_n(). Negative on parse errors. */
+
+ n = dns_name_build_suffix_table(a, a_labels);
+ if (n < 0)
+ return n;
+
+ m = dns_name_build_suffix_table(b, b_labels);
+ if (m < 0)
+ return m;
+
+ for (;;) { /* k = number of trailing labels found equal so far */
+ char la[DNS_LABEL_MAX], lb[DNS_LABEL_MAX];
+ const char *x, *y;
+
+ if (k >= n || k >= m) { /* one of the two names has no labels left to compare */
+ *ret = a_labels[n - k];
+ return 0;
+ }
+
+ x = a_labels[n - 1 - k];
+ r = dns_label_unescape(&x, la, sizeof la, 0);
+ if (r < 0)
+ return r;
+
+ y = b_labels[m - 1 - k];
+ q = dns_label_unescape(&y, lb, sizeof lb, 0);
+ if (q < 0)
+ return q;
+
+ if (r != q || ascii_strcasecmp_n(la, lb, r) != 0) { /* labels differ in length or content: the common suffix ends here */
+ *ret = a_labels[n - k];
+ return 0;
+ }
+
+ k++;
+ }
+}
+
+int dns_name_apply_idna(const char *name, char **ret) {
+ /* Applies IDNA conversion to 'name', using libidn2 or libidn, whichever is compiled in.
+ * Return negative on error, 0 if not implemented, positive on success. */
+
+#if HAVE_LIBIDN || HAVE_LIBIDN2
+ int r;
+
+ r = dlopen_idn();
+ if (r == -EOPNOTSUPP) { /* report "not implemented" instead of failing */
+ *ret = NULL;
+ return 0;
+ }
+ if (r < 0)
+ return r;
+#endif
+
+#if HAVE_LIBIDN2
+ _cleanup_free_ char *t = NULL;
+
+ assert(name);
+ assert(ret);
+
+ /* First, try non-transitional mode (i.e. IDN2008 rules) */
+ r = sym_idn2_lookup_u8((uint8_t*) name, (uint8_t**) &t,
+ IDN2_NFC_INPUT | IDN2_NONTRANSITIONAL);
+ if (r == IDN2_DISALLOWED) /* If that failed, because of disallowed characters, try transitional mode.
+ * (i.e. IDN2003 rules which supports some unicode chars IDN2008 doesn't allow). */
+ r = sym_idn2_lookup_u8((uint8_t*) name, (uint8_t**) &t,
+ IDN2_NFC_INPUT | IDN2_TRANSITIONAL);
+
+ log_debug("idn2_lookup_u8: %s %s %s", name, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), t);
+ if (r == IDN2_OK) {
+ if (!startswith(name, "xn--")) {
+ _cleanup_free_ char *s = NULL;
+
+ r = sym_idn2_to_unicode_8z8z(t, &s, 0);
+ if (r != IDN2_OK) {
+ log_debug("idn2_to_unicode_8z8z(\"%s\") failed: %d/%s",
+ t, r, sym_idn2_strerror(r));
+ *ret = NULL;
+ return 0;
+ }
+
+ if (!streq_ptr(name, s)) {
+ log_debug("idn2 roundtrip failed: \"%s\" %s \"%s\" %s \"%s\", ignoring.",
+ name, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), t,
+ special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), s);
+ *ret = NULL;
+ return 0;
+ }
+ }
+
+ *ret = TAKE_PTR(t);
+ return 1; /* *ret has been written */
+ }
+
+ log_debug("idn2_lookup_u8(\"%s\") failed: %d/%s", name, r, sym_idn2_strerror(r));
+ if (r == IDN2_2HYPHEN)
+ /* The name has two hyphens — forbidden by IDNA2008 in some cases */
+ return 0;
+ if (IN_SET(r, IDN2_TOO_BIG_DOMAIN, IDN2_TOO_BIG_LABEL))
+ return -ENOSPC;
+
+ return -EINVAL;
+#elif HAVE_LIBIDN
+ _cleanup_free_ char *buf = NULL;
+ size_t n = 0;
+ bool first = true;
+ int q; /* 'r' is already declared above, next to the dlopen_idn() call */
+
+ assert(name);
+ assert(ret);
+
+ for (;;) {
+ char label[DNS_LABEL_MAX];
+
+ r = dns_label_unescape(&name, label, sizeof label, 0);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ break;
+
+ q = dns_label_apply_idna(label, r, label, sizeof label);
+ if (q < 0)
+ return q;
+ if (q > 0)
+ r = q;
+
+ if (!GREEDY_REALLOC(buf, n + !first + DNS_LABEL_ESCAPED_MAX))
+ return -ENOMEM;
+
+ r = dns_label_escape(label, r, buf + n + !first, DNS_LABEL_ESCAPED_MAX);
+ if (r < 0)
+ return r;
+
+ if (first)
+ first = false;
+ else
+ buf[n++] = '.';
+
+ n += r;
+ }
+
+ if (n > DNS_HOSTNAME_MAX)
+ return -EINVAL;
+
+ if (!GREEDY_REALLOC(buf, n + 1))
+ return -ENOMEM;
+
+ buf[n] = 0;
+ *ret = TAKE_PTR(buf);
+
+ return 1;
+#else
+ *ret = NULL;
+ return 0;
+#endif
+}
+
+int dns_name_is_valid_or_address(const char *name) {
+ /* Returns > 0 if the specified name is either a valid IP address formatted as string or a valid DNS name */
+
+ if (isempty(name))
+ return 0;
+
+ if (in_addr_from_string_auto(name, NULL, NULL) >= 0)
+ return 1;
+
+ return dns_name_is_valid(name);
+}
+
+int dns_name_dot_suffixed(const char *name) {
+ const char *p = name;
+ int r;
+ /* Walks 'name' label by label, leaving a trailing dot in place: returns true once only "." remains, false if the name ends without one, negative if a label fails to parse. */
+ for (;;) {
+ if (streq(p, "."))
+ return true;
+
+ r = dns_label_unescape(&p, NULL, DNS_LABEL_MAX, DNS_LABEL_LEAVE_TRAILING_DOT);
+ if (r < 0)
+ return r;
+ if (r == 0) /* end of the name reached without seeing a lone "." */
+ return false;
+ }
+}
+
+bool dns_name_dont_resolve(const char *name) {
+
+ /* Never respond to some of the domains listed in RFC6303 */
+ if (dns_name_endswith(name, "0.in-addr.arpa") > 0 ||
+ dns_name_equal(name, "255.255.255.255.in-addr.arpa") > 0 ||
+ dns_name_equal(name, "0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.ip6.arpa") > 0)
+ return true;
+
+ /* Never respond to some of the domains listed in RFC6761 */
+ if (dns_name_endswith(name, "invalid") > 0)
+ return true;
+
+ /* Never respond to some of the domains listed in RFC9476 */
+ if (dns_name_endswith(name, "alt") > 0)
+ return true;
+
+ return false;
+}
diff --git a/src/shared/dns-domain.h b/src/shared/dns-domain.h
new file mode 100644
index 0000000..331fb89
--- /dev/null
+++ b/src/shared/dns-domain.h
@@ -0,0 +1,104 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "dns-def.h"
+#include "hashmap.h"
+#include "in-addr-util.h"
+
+typedef enum DNSLabelFlags {
+ DNS_LABEL_LDH = 1 << 0, /* Follow the "LDH" rule — only letters, digits, and internal hyphens. */
+ DNS_LABEL_NO_ESCAPES = 1 << 1, /* Do not treat backslashes specially */
+ DNS_LABEL_LEAVE_TRAILING_DOT = 1 << 2, /* Leave trailing dot in place */
+} DNSLabelFlags;
+
+int dns_label_unescape(const char **name, char *dest, size_t sz, DNSLabelFlags flags);
+int dns_label_unescape_suffix(const char *name, const char **label_end, char *dest, size_t sz);
+int dns_label_escape(const char *p, size_t l, char *dest, size_t sz);
+int dns_label_escape_new(const char *p, size_t l, char **ret);
+
+static inline int dns_name_parent(const char **name) {
+ return dns_label_unescape(name, NULL, DNS_LABEL_MAX, 0);
+}
+
+#if HAVE_LIBIDN
+int dns_label_apply_idna(const char *encoded, size_t encoded_size, char *decoded, size_t decoded_max);
+int dns_label_undo_idna(const char *encoded, size_t encoded_size, char *decoded, size_t decoded_max);
+#endif
+
+int dns_name_concat(const char *a, const char *b, DNSLabelFlags flags, char **ret);
+
+static inline int dns_name_normalize(const char *s, DNSLabelFlags flags, char **ret) {
+ /* dns_name_concat() normalizes as a side-effect */
+ return dns_name_concat(s, NULL, flags, ret);
+}
+
+static inline int dns_name_is_valid(const char *s) {
+ int r;
+
+ /* dns_name_concat() verifies as a side effect */
+ r = dns_name_concat(s, NULL, 0, NULL);
+ if (r == -EINVAL)
+ return 0;
+ if (r < 0)
+ return r;
+ return 1;
+}
+
+static inline int dns_name_is_valid_ldh(const char *s) {
+ int r;
+
+ r = dns_name_concat(s, NULL, DNS_LABEL_LDH|DNS_LABEL_NO_ESCAPES, NULL);
+ if (r == -EINVAL)
+ return 0;
+ if (r < 0)
+ return r;
+ return 1;
+}
+
+void dns_name_hash_func(const char *s, struct siphash *state);
+int dns_name_compare_func(const char *a, const char *b);
+extern const struct hash_ops dns_name_hash_ops;
+extern const struct hash_ops dns_name_hash_ops_free;
+
+int dns_name_between(const char *a, const char *b, const char *c);
+int dns_name_equal(const char *x, const char *y);
+int dns_name_endswith(const char *name, const char *suffix);
+int dns_name_startswith(const char *name, const char *prefix);
+
+int dns_name_change_suffix(const char *name, const char *old_suffix, const char *new_suffix, char **ret);
+
+int dns_name_reverse(int family, const union in_addr_union *a, char **ret);
+int dns_name_address(const char *p, int *family, union in_addr_union *a);
+
+bool dns_name_is_root(const char *name);
+bool dns_name_is_single_label(const char *name);
+
+int dns_name_to_wire_format(const char *domain, uint8_t *buffer, size_t len, bool canonical);
+
+bool dns_srv_type_is_valid(const char *name);
+bool dnssd_srv_type_is_valid(const char *name);
+bool dns_service_name_is_valid(const char *name);
+
+int dns_service_join(const char *name, const char *type, const char *domain, char **ret);
+int dns_service_split(const char *joined, char **ret_name, char **ret_type, char **ret_domain);
+
+int dns_name_suffix(const char *name, unsigned n_labels, const char **ret);
+int dns_name_count_labels(const char *name);
+
+int dns_name_skip(const char *a, unsigned n_labels, const char **ret);
+int dns_name_equal_skip(const char *a, unsigned n_labels, const char *b);
+
+int dns_name_common_suffix(const char *a, const char *b, const char **ret);
+
+int dns_name_apply_idna(const char *name, char **ret);
+
+int dns_name_is_valid_or_address(const char *name);
+
+int dns_name_dot_suffixed(const char *name);
+
+bool dns_name_dont_resolve(const char *name);
diff --git a/src/shared/dropin.c b/src/shared/dropin.c
new file mode 100644
index 0000000..d46e838
--- /dev/null
+++ b/src/shared/dropin.c
@@ -0,0 +1,278 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "alloc-util.h"
+#include "chase.h"
+#include "conf-files.h"
+#include "dirent-util.h"
+#include "dropin.h"
+#include "escape.h"
+#include "fd-util.h"
+#include "fileio-label.h"
+#include "hashmap.h"
+#include "log.h"
+#include "macro.h"
+#include "mkdir.h"
+#include "path-util.h"
+#include "set.h"
+#include "string-util.h"
+#include "strv.h"
+#include "unit-name.h"
+
+int drop_in_file(const char *dir, const char *unit, unsigned level,
+ const char *name, char **ret_p, char **ret_q) {
+ /* Builds "<dir>/<unit>.d" (returned in *ret_p) and "<dir>/<unit>.d/<level>-<escaped name>.conf" (returned in *ret_q); ownership of both strings passes to the caller. Returns 0, -ENOMEM or -EINVAL. */
+ char prefix[DECIMAL_STR_MAX(unsigned)];
+ _cleanup_free_ char *b = NULL, *p = NULL, *q = NULL;
+
+ assert(unit);
+ assert(name);
+ assert(ret_p);
+ assert(ret_q);
+
+ sprintf(prefix, "%u", level);
+
+ b = xescape(name, "/.");
+ if (!b)
+ return -ENOMEM;
+
+ if (!filename_is_valid(b)) /* the escaped name must still be usable as a file name */
+ return -EINVAL;
+
+ p = strjoin(dir, "/", unit, ".d");
+ q = strjoin(p, "/", prefix, "-", b, ".conf");
+ if (!p || !q) /* both allocations are checked together */
+ return -ENOMEM;
+
+ *ret_p = TAKE_PTR(p);
+ *ret_q = TAKE_PTR(q);
+ return 0;
+}
+
+int write_drop_in(const char *dir, const char *unit, unsigned level,
+ const char *name, const char *data) {
+
+ _cleanup_free_ char *p = NULL, *q = NULL;
+ int r;
+
+ assert(dir);
+ assert(unit);
+ assert(name);
+ assert(data);
+
+ r = drop_in_file(dir, unit, level, name, &p, &q);
+ if (r < 0)
+ return r;
+
+ (void) mkdir_p(p, 0755);
+ return write_string_file_atomic_label(q, data);
+}
+
+int write_drop_in_format(const char *dir, const char *unit, unsigned level,
+ const char *name, const char *format, ...) {
+ _cleanup_free_ char *p = NULL;
+ va_list ap;
+ int r;
+
+ assert(dir);
+ assert(unit);
+ assert(name);
+ assert(format);
+
+ va_start(ap, format);
+ r = vasprintf(&p, format, ap);
+ va_end(ap);
+
+ if (r < 0)
+ return -ENOMEM;
+
+ return write_drop_in(dir, unit, level, name, p);
+}
+
+static int unit_file_add_dir(
+ const char *original_root,
+ const char *path,
+ char ***dirs) {
+
+ _cleanup_free_ char *chased = NULL;
+ int r;
+
+ assert(path);
+
+ /* This adds [original_root]/path to dirs, if it exists. */
+
+ r = chase(path, original_root, 0, &chased, NULL);
+ if (r == -ENOENT) /* Ignore -ENOENT, after all most units won't have a drop-in dir. */
+ return 0;
+ if (r == -ENAMETOOLONG) {
+ /* Also, ignore -ENAMETOOLONG but log about it. After all, users are not even able to create the
+ * drop-in dir in such case. This mostly happens for device units with an overly long /sys path. */
+ log_debug_errno(r, "Path '%s' too long, couldn't canonicalize, ignoring.", path);
+ return 0;
+ }
+ if (r < 0)
+ return log_warning_errno(r, "Failed to canonicalize path '%s': %m", path);
+
+ if (strv_consume(dirs, TAKE_PTR(chased)) < 0)
+ return log_oom();
+
+ return 0;
+}
+
+static int unit_file_find_dirs(
+ const char *original_root,
+ Set *unit_path_cache,
+ const char *unit_path,
+ const char *name,
+ const char *suffix,
+ char ***dirs) {
+
+ _cleanup_free_ char *prefix = NULL, *instance = NULL, *built = NULL;
+ bool is_instance, chopped;
+ const char *dash;
+ UnitType type;
+ char *path;
+ size_t n;
+ int r;
+
+ assert(unit_path);
+ assert(name);
+ assert(suffix);
+
+ path = strjoina(unit_path, "/", name, suffix);
+ if (!unit_path_cache || set_get(unit_path_cache, path)) {
+ r = unit_file_add_dir(original_root, path, dirs);
+ if (r < 0)
+ return r;
+ }
+
+ is_instance = unit_name_is_valid(name, UNIT_NAME_INSTANCE);
+ if (is_instance) { /* Also try the template dir */
+ _cleanup_free_ char *template = NULL;
+
+ r = unit_name_template(name, &template);
+ if (r < 0)
+ return log_error_errno(r, "Failed to generate template from unit name: %m");
+
+ r = unit_file_find_dirs(original_root, unit_path_cache, unit_path, template, suffix, dirs);
+ if (r < 0)
+ return r;
+ }
+
+ /* Return early for top level drop-ins. */
+ if (unit_type_from_string(name) >= 0)
+ return 0;
+
+ /* Let's see if there's a "-" prefix for this unit name. If so, let's invoke ourselves for it. This will then
+ * recursively do the same for all our prefixes. i.e. this means given "foo-bar-waldo.service" we'll also
+ * search "foo-bar-.service" and "foo-.service".
+ *
+ * Note the order in which we do it: we traverse up adding drop-ins on each step. This means the more specific
+ * drop-ins may override the more generic drop-ins, which is the intended behaviour. */
+
+ r = unit_name_to_prefix(name, &prefix);
+ if (r < 0)
+ return log_error_errno(r, "Failed to derive unit name prefix from unit name: %m");
+
+ chopped = false;
+ for (;;) {
+ dash = strrchr(prefix, '-');
+ if (!dash) /* No dash? if so we are done */
+ return 0;
+
+ n = (size_t) (dash - prefix);
+ if (n == 0) /* Leading dash? If so, we are done */
+ return 0;
+
+ if (prefix[n+1] != 0 || chopped) {
+ prefix[n+1] = 0;
+ break;
+ }
+
+ /* Trailing dash? If so, chop it off and try again, but not more than once. */
+ prefix[n] = 0;
+ chopped = true;
+ }
+
+ if (!unit_prefix_is_valid(prefix))
+ return 0;
+
+ type = unit_name_to_type(name);
+ if (type < 0)
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+ "Failed to derive unit type from unit name: %s",
+ name);
+
+ if (is_instance) {
+ r = unit_name_to_instance(name, &instance);
+ if (r < 0)
+ return log_error_errno(r, "Failed to derive unit name instance from unit name: %m");
+ }
+
+ r = unit_name_build_from_type(prefix, instance, type, &built);
+ if (r < 0)
+ return log_error_errno(r, "Failed to build prefix unit name: %m");
+
+ return unit_file_find_dirs(original_root, unit_path_cache, unit_path, built, suffix, dirs);
+}
+
+int unit_file_find_dropin_paths(
+ const char *original_root,
+ char **lookup_path,
+ Set *unit_path_cache,
+ const char *dir_suffix,
+ const char *file_suffix,
+ const char *name,
+ const Set *aliases,
+ char ***ret) {
+
+ _cleanup_strv_free_ char **dirs = NULL;
+ const char *n;
+ int r;
+
+ assert(ret);
+
+ if (name)
+ STRV_FOREACH(p, lookup_path)
+ (void) unit_file_find_dirs(original_root, unit_path_cache, *p, name, dir_suffix, &dirs);
+
+ SET_FOREACH(n, aliases)
+ STRV_FOREACH(p, lookup_path)
+ (void) unit_file_find_dirs(original_root, unit_path_cache, *p, n, dir_suffix, &dirs);
+
+ /* All the names in the unit are of the same type so just grab one. */
+ n = name ?: (const char*) set_first(aliases);
+ if (n) {
+ UnitType type = _UNIT_TYPE_INVALID;
+
+ type = unit_name_to_type(n);
+ if (type < 0)
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+ "Failed to derive unit type from unit name: %s", n);
+
+ /* Special top level drop in for "<unit type>.<suffix>". Add this last as it's the most generic
+ * and should be able to be overridden by more specific drop-ins. */
+ STRV_FOREACH(p, lookup_path)
+ (void) unit_file_find_dirs(original_root,
+ unit_path_cache,
+ *p,
+ unit_type_to_string(type),
+ dir_suffix,
+ &dirs);
+ }
+
+ if (strv_isempty(dirs)) {
+ *ret = NULL;
+ return 0;
+ }
+
+ r = conf_files_list_strv(ret, file_suffix, NULL, 0, (const char**) dirs);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to create the list of configuration files: %m");
+
+ return 1;
+}
diff --git a/src/shared/dropin.h b/src/shared/dropin.h
new file mode 100644
index 0000000..54cceaf
--- /dev/null
+++ b/src/shared/dropin.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "hashmap.h"
+#include "macro.h"
+#include "set.h"
+#include "unit-name.h"
+
+int drop_in_file(const char *dir, const char *unit, unsigned level,
+ const char *name, char **_p, char **_q);
+
+int write_drop_in(const char *dir, const char *unit, unsigned level,
+ const char *name, const char *data);
+
+int write_drop_in_format(const char *dir, const char *unit, unsigned level,
+ const char *name, const char *format, ...) _printf_(5, 6);
+
+int unit_file_find_dropin_paths(
+ const char *original_root,
+ char **lookup_path,
+ Set *unit_path_cache,
+ const char *dir_suffix,
+ const char *file_suffix,
+ const char *name,
+ const Set *aliases,
+ char ***paths);
diff --git a/src/shared/edit-util.c b/src/shared/edit-util.c
new file mode 100644
index 0000000..045839b
--- /dev/null
+++ b/src/shared/edit-util.c
@@ -0,0 +1,370 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <stdio.h>
+
+#include "alloc-util.h"
+#include "copy.h"
+#include "edit-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "fs-util.h"
+#include "mkdir-label.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "tmpfile-util-label.h"
+
+void edit_file_context_done(EditFileContext *context) {
+ int r;
+
+ assert(context);
+
+ FOREACH_ARRAY(i, context->files, context->n_files) {
+ unlink_and_free(i->temp);
+
+ if (context->remove_parent) {
+ _cleanup_free_ char *parent = NULL;
+
+ r = path_extract_directory(i->path, &parent);
+ if (r < 0)
+ log_debug_errno(r, "Failed to extract directory from '%s', ignoring: %m", i->path);
+ else if (rmdir(parent) < 0 && !IN_SET(errno, ENOENT, ENOTEMPTY))
+ log_debug_errno(errno, "Failed to remove parent directory '%s', ignoring: %m", parent);
+ }
+
+ free(i->path);
+ free(i->original_path);
+ strv_free(i->comment_paths);
+ }
+
+ context->files = mfree(context->files);
+ context->n_files = 0;
+}
+
+bool edit_files_contains(const EditFileContext *context, const char *path) {
+ assert(context);
+ assert(path);
+
+ FOREACH_ARRAY(i, context->files, context->n_files)
+ if (path_equal(i->path, path))
+ return true;
+
+ return false;
+}
+
+int edit_files_add(
+ EditFileContext *context,
+ const char *path,
+ const char *original_path,
+ char * const *comment_paths) {
+ /* Appends an entry for 'path' to context->files, duplicating all strings passed in. Returns 1 if added, 0 if 'path' is already listed, negative (via log_oom()) if an allocation fails. */
+ _cleanup_free_ char *new_path = NULL, *new_original_path = NULL;
+ _cleanup_strv_free_ char **new_comment_paths = NULL;
+
+ assert(context);
+ assert(path);
+
+ if (edit_files_contains(context, path))
+ return 0;
+
+ if (!GREEDY_REALLOC(context->files, context->n_files + 1)) /* grow the array first; n_files is only bumped once everything else succeeded */
+ return log_oom();
+
+ new_path = strdup(path);
+ if (!new_path)
+ return log_oom();
+
+ if (original_path) { /* optional */
+ new_original_path = strdup(original_path);
+ if (!new_original_path)
+ return log_oom();
+ }
+
+ if (comment_paths) { /* optional */
+ new_comment_paths = strv_copy(comment_paths);
+ if (!new_comment_paths)
+ return log_oom();
+ }
+
+ context->files[context->n_files] = (EditFile) {
+ .context = context,
+ .path = TAKE_PTR(new_path),
+ .original_path = TAKE_PTR(new_original_path),
+ .comment_paths = TAKE_PTR(new_comment_paths),
+ };
+ context->n_files++;
+
+ return 1;
+}
+
+static int create_edit_temp_file(EditFile *e) {
+ _cleanup_(unlink_and_freep) char *temp = NULL;
+ _cleanup_fclose_ FILE *f = NULL;
+ const char *source;
+ bool has_original, has_target;
+ unsigned line = 1;
+ int r;
+
+ assert(e);
+ assert(e->context);
+ assert(e->path);
+ assert(!e->comment_paths || (e->context->marker_start && e->context->marker_end));
+
+ if (e->temp)
+ return 0;
+
+ r = mkdir_parents_label(e->path, 0755);
+ if (r < 0)
+ return log_error_errno(r, "Failed to create parent directories for '%s': %m", e->path);
+
+ r = fopen_temporary_label(e->path, e->path, &f, &temp);
+ if (r < 0)
+ return log_error_errno(r, "Failed to create temporary file for '%s': %m", e->path);
+
+ if (fchmod(fileno(f), 0644) < 0)
+ return log_error_errno(errno, "Failed to change mode of temporary file '%s': %m", temp);
+
+ has_original = e->original_path && access(e->original_path, F_OK) >= 0;
+ has_target = access(e->path, F_OK) >= 0;
+
+ if (has_original && (!has_target || e->context->overwrite_with_origin))
+ /* We are asked to overwrite target with original_path or target doesn't exist. */
+ source = e->original_path;
+ else if (has_target)
+ /* Target exists and shouldn't be overwritten. */
+ source = e->path;
+ else
+ source = NULL;
+
+ if (e->comment_paths) {
+ _cleanup_free_ char *source_contents = NULL;
+
+ if (source) {
+ r = read_full_file(source, &source_contents, NULL);
+ if (r < 0)
+ return log_error_errno(r, "Failed to read source file '%s': %m", source);
+ }
+
+ fprintf(f,
+ "### Editing %s\n"
+ "%s\n"
+ "\n"
+ "%s%s"
+ "\n"
+ "%s\n",
+ e->path,
+ e->context->marker_start,
+ strempty(source_contents),
+ source_contents && endswith(source_contents, "\n") ? "" : "\n",
+ e->context->marker_end);
+
+ line = 4; /* Start editing at the contents area */
+
+ STRV_FOREACH(path, e->comment_paths) {
+ _cleanup_free_ char *comment = NULL;
+
+ /* Skip the file which is being edited and the source file (can be the same) */
+ if (PATH_IN_SET(*path, e->path, source))
+ continue;
+
+ r = read_full_file(*path, &comment, NULL);
+ if (r < 0)
+ return log_error_errno(r, "Failed to read comment file '%s': %m", *path);
+
+ fprintf(f, "\n\n### %s", *path);
+
+ if (!isempty(comment)) {
+ _cleanup_free_ char *c = NULL;
+
+ c = strreplace(strstrip(comment), "\n", "\n# ");
+ if (!c)
+ return log_oom();
+
+ fprintf(f, "\n# %s", c);
+ }
+ }
+ } else if (source) {
+ r = copy_file_fd(source, fileno(f), COPY_REFLINK);
+ if (r < 0) {
+ assert(r != -ENOENT);
+ return log_error_errno(r, "Failed to copy file '%s' to temporary file '%s': %m", source, temp);
+ }
+ }
+
+ r = fflush_and_check(f);
+ if (r < 0)
+ return log_error_errno(r, "Failed to write to temporary file '%s': %m", temp);
+
+ e->temp = TAKE_PTR(temp);
+ e->line = line;
+
+ return 0;
+}
+
+static int run_editor_child(const EditFileContext *context) {
+ _cleanup_strv_free_ char **args = NULL;
+ const char *editor;
+ int r;
+
+ /* SYSTEMD_EDITOR takes precedence over EDITOR which takes precedence over VISUAL.
+ * If neither SYSTEMD_EDITOR nor EDITOR nor VISUAL are present, we try to execute
+ * well known editors. */
+ editor = getenv("SYSTEMD_EDITOR");
+ if (!editor)
+ editor = getenv("EDITOR");
+ if (!editor)
+ editor = getenv("VISUAL");
+
+ if (!isempty(editor)) {
+ _cleanup_strv_free_ char **editor_args = NULL;
+
+ editor_args = strv_split(editor, WHITESPACE);
+ if (!editor_args)
+ return log_oom();
+
+ args = TAKE_PTR(editor_args);
+ }
+
+ if (context->n_files == 1 && context->files[0].line > 1) {
+ /* If editing a single file only, use the +LINE syntax to put cursor on the right line */
+ r = strv_extendf(&args, "+%u", context->files[0].line);
+ if (r < 0)
+ return log_oom();
+ }
+
+ FOREACH_ARRAY(i, context->files, context->n_files) {
+ r = strv_extend(&args, i->temp);
+ if (r < 0)
+ return log_oom();
+ }
+
+ if (!isempty(editor))
+ execvp(args[0], (char* const*) args);
+
+ bool prepended = false;
+ FOREACH_STRING(name, "editor", "nano", "vim", "vi") {
+ if (!prepended) {
+ r = strv_prepend(&args, name);
+ prepended = true;
+ } else
+ r = free_and_strdup(&args[0], name);
+ if (r < 0)
+ return log_oom();
+
+ execvp(args[0], (char* const*) args);
+
+ /* We do not fail if the editor doesn't exist because we want to try each one of them
+ * before failing. */
+ if (errno != ENOENT)
+ return log_error_errno(errno, "Failed to execute '%s': %m", name);
+ }
+
+ return log_error_errno(SYNTHETIC_ERRNO(ENOENT),
+ "Cannot edit files, no editor available. Please set either $SYSTEMD_EDITOR, $EDITOR or $VISUAL.");
+}
+
+static int run_editor(const EditFileContext *context) {
+ int r;
+
+ assert(context);
+ /* Runs run_editor_child() in a forked child. NOTE(review): FORK_WAIT suggests the parent blocks until the editor exits — confirm against safe_fork(). */
+ r = safe_fork("(editor)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|FORK_RLIMIT_NOFILE_SAFE|FORK_LOG|FORK_WAIT, NULL);
+ if (r < 0)
+ return r;
+ if (r == 0) { /* Child */
+ r = run_editor_child(context);
+ _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS); /* only reached if run_editor_child() returned */
+ }
+
+ return 0;
+}
+
+static int strip_edit_temp_file(EditFile *e) {
+ _cleanup_free_ char *old_contents = NULL, *new_contents = NULL;
+ const char *stripped;
+ int r;
+
+ assert(e);
+ assert(e->context);
+ assert(e->temp);
+
+ r = read_full_file(e->temp, &old_contents, NULL);
+ if (r < 0)
+ return log_error_errno(r, "Failed to read temporary file '%s': %m", e->temp);
+
+ if (e->context->marker_start) {
+ /* Trim out the lines between the two markers */
+ char *contents_start, *contents_end;
+
+ assert(e->context->marker_end);
+
+ contents_start = strstrafter(old_contents, e->context->marker_start);
+ if (!contents_start)
+ contents_start = old_contents;
+
+ contents_end = strstr(contents_start, e->context->marker_end);
+ if (contents_end)
+ *contents_end = '\0';
+
+ stripped = strstrip(contents_start);
+ } else
+ stripped = strstrip(old_contents);
+ if (isempty(stripped))
+ return 0; /* File is empty (has no real changes) */
+
+ /* Trim prefix and suffix, but ensure suffixed by single newline */
+ new_contents = strjoin(stripped, "\n");
+ if (!new_contents)
+ return log_oom();
+
+ if (streq(old_contents, new_contents)) /* Don't touch the file if the above didn't change a thing */
+ return 1; /* Contents unchanged after stripping but has changes */
+
+ r = write_string_file(e->temp, new_contents, WRITE_STRING_FILE_CREATE | WRITE_STRING_FILE_TRUNCATE | WRITE_STRING_FILE_AVOID_NEWLINE);
+ if (r < 0)
+ return log_error_errno(r, "Failed to strip temporary file '%s': %m", e->temp);
+
+ return 1; /* Contents have real changes and are changed after stripping */
+}
+
+int do_edit_files_and_install(EditFileContext *context) {
+ int r;
+
+ assert(context);
+ /* Creates a temporary file for every entry of the context, runs the editor on them, then renames each temporary file that carries actual changes onto its target path. */
+ if (context->n_files == 0)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOENT), "Got no files to edit.");
+
+ FOREACH_ARRAY(i, context->files, context->n_files) {
+ r = create_edit_temp_file(i);
+ if (r < 0)
+ return r;
+ }
+
+ r = run_editor(context);
+ if (r < 0)
+ return r;
+
+ FOREACH_ARRAY(i, context->files, context->n_files) {
+ /* Always call strip_edit_temp_file which will tell if the temp file has actual changes */
+ r = strip_edit_temp_file(i);
+ if (r < 0)
+ return r;
+ if (r == 0) /* temp file doesn't carry actual changes, ignoring */
+ continue;
+
+ r = RET_NERRNO(rename(i->temp, i->path));
+ if (r < 0)
+ return log_error_errno(r,
+ "Failed to rename temporary file '%s' to target file '%s': %m",
+ i->temp,
+ i->path);
+ i->temp = mfree(i->temp); /* the temporary file has been renamed onto the target, so forget its old name */
+
+ log_info("Successfully installed edited file '%s'.", i->path);
+ }
+
+ return 0;
+}
diff --git a/src/shared/edit-util.h b/src/shared/edit-util.h
new file mode 100644
index 0000000..83b3df8
--- /dev/null
+++ b/src/shared/edit-util.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#define DROPIN_MARKER_START "### Anything between here and the comment below will become the contents of the drop-in file"
+#define DROPIN_MARKER_END "### Edits below this comment will be discarded"
+
+typedef struct EditFile EditFile;
+typedef struct EditFileContext EditFileContext;
+
+/* A single file taking part in an edit session. */
+struct EditFile {
+ EditFileContext *context; /* session this file belongs to; gives access to the shared markers */
+ char *path; /* target path the edited temporary file is renamed to */
+ char *original_path; /* NOTE(review): presumably the file the initial contents are taken from — confirm */
+ char **comment_paths; /* NOTE(review): presumably files shown as comments to the user — confirm */
+ char *temp; /* path of the temporary file; reset to NULL once it was renamed to 'path' */
+ unsigned line; /* NOTE(review): presumably the line the editor is positioned at — confirm */
+};
+
+/* State shared by all files of one edit session. */
+struct EditFileContext {
+ EditFile *files; /* array of files to edit */
+ size_t n_files; /* number of entries in 'files' */
+ const char *marker_start; /* NOTE(review): presumably marks where the relevant contents begin — confirm */
+ const char *marker_end; /* everything from this marker on is cut off when a temporary file is stripped */
+ bool remove_parent; /* NOTE(review): semantics not visible in this header — confirm against edit-util.c */
+ bool overwrite_with_origin; /* whether to always overwrite target with original file */
+};
+
+void edit_file_context_done(EditFileContext *context);
+
+bool edit_files_contains(const EditFileContext *context, const char *path);
+
+int edit_files_add(
+ EditFileContext *context,
+ const char *path,
+ const char *original_path,
+ char * const *comment_paths);
+
+int do_edit_files_and_install(EditFileContext *context);
diff --git a/src/shared/efi-api.c b/src/shared/efi-api.c
new file mode 100644
index 0000000..4cd1091
--- /dev/null
+++ b/src/shared/efi-api.c
@@ -0,0 +1,556 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "dirent-util.h"
+#include "efi-api.h"
+#include "efivars.h"
+#include "fd-util.h"
+#include "sort-util.h"
+#include "stat-util.h"
+#include "stdio-util.h"
+#include "utf8.h"
+
+#if ENABLE_EFI
+
+#define LOAD_OPTION_ACTIVE 0x00000001
+#define MEDIA_DEVICE_PATH 0x04
+#define MEDIA_HARDDRIVE_DP 0x01
+#define MEDIA_FILEPATH_DP 0x04
+#define SIGNATURE_TYPE_GUID 0x02
+#define MBR_TYPE_EFI_PARTITION_TABLE_HEADER 0x02
+#define END_DEVICE_PATH_TYPE 0x7f
+#define END_ENTIRE_DEVICE_PATH_SUBTYPE 0xff
+
+#define EFI_OS_INDICATIONS_BOOT_TO_FW_UI UINT64_C(0x0000000000000001)
+
+/* Layout of the data stored in a Boot#### variable, as parsed by efi_get_boot_option() and generated by
+ * efi_add_boot_option(): attribute flags, the size in bytes of the device path list, and the
+ * NUL-terminated UTF-16 title. The device path list follows directly after the title.
+ *
+ * The member list is kept in a macro so that it can be instantiated twice, once naturally aligned and
+ * once packed; the assert_cc() below verifies that both variants agree on the offset of 'title'. */
+#define boot_option__contents \
+ { \
+ uint32_t attr; \
+ uint16_t path_len; \
+ uint16_t title[]; \
+ }
+
+struct boot_option boot_option__contents;
+struct boot_option__packed boot_option__contents _packed_;
+assert_cc(offsetof(struct boot_option, title) == offsetof(struct boot_option__packed, title));
+/* sizeof(struct boot_option) != sizeof(struct boot_option__packed), so
+ * the *size* of the structure should not be used anywhere below. */
+
+/* Payload of a device path node of type MEDIA_DEVICE_PATH and sub-type MEDIA_HARDDRIVE_DP. Packed, since
+ * the 64-bit members directly follow the 32-bit partition number. */
+struct drive_path {
+ uint32_t part_nr; /* partition number */
+ uint64_t part_start; /* start of the partition */
+ uint64_t part_size; /* size of the partition */
+ char signature[16]; /* partition signature; an EFI GUID if signature_type is SIGNATURE_TYPE_GUID */
+ uint8_t mbr_type; /* partition table format, e.g. MBR_TYPE_EFI_PARTITION_TABLE_HEADER */
+ uint8_t signature_type; /* how to interpret 'signature', e.g. SIGNATURE_TYPE_GUID */
+} _packed_;
+
+/* A single node of a device path list: a four byte header (type, sub-type, total length of the node in
+ * bytes including this header) followed by a payload that depends on type and sub-type. Only the two
+ * payloads used in this file are modelled: a UTF-16 file path and a hard drive description.
+ *
+ * As with boot_option the member list is instantiated both naturally aligned and packed, and the
+ * assert_cc() below verifies that both variants have the same size. */
+#define device_path__contents \
+ { \
+ uint8_t type; \
+ uint8_t sub_type; \
+ uint16_t length; \
+ union { \
+ uint16_t path[0]; \
+ struct drive_path drive; \
+ }; \
+ }
+
+struct device_path device_path__contents;
+struct device_path__packed device_path__contents _packed_;
+assert_cc(sizeof(struct device_path) == sizeof(struct device_path__packed));
+
+/* Checks whether the firmware advertises support for rebooting into the firmware setup UI, i.e. whether
+ * the EFI_OS_INDICATIONS_BOOT_TO_FW_UI bit is set in the OsIndicationsSupported variable.
+ *
+ * Returns 0 if supported, -EOPNOTSUPP if not (not booted via EFI, variable missing, or bit unset), and
+ * another negative errno-style error if the variable cannot be read or has an unexpected size.
+ *
+ * Only definitive answers are remembered in the static 'cache' (-1 = unknown, 0 = unsupported,
+ * 1 = supported); read errors are not cached, so the next call tries again. */
+int efi_reboot_to_firmware_supported(void) {
+ _cleanup_free_ void *v = NULL;
+ static int cache = -1;
+ uint64_t b;
+ size_t s;
+ int r;
+
+ if (cache > 0)
+ return 0;
+ if (cache == 0)
+ return -EOPNOTSUPP;
+
+ if (!is_efi_boot())
+ goto not_supported;
+
+ r = efi_get_variable(EFI_GLOBAL_VARIABLE(OsIndicationsSupported), NULL, &v, &s);
+ if (r == -ENOENT)
+ goto not_supported; /* variable doesn't exist? it's not supported then */
+ if (r < 0)
+ return r;
+ if (s != sizeof(uint64_t))
+ return -EINVAL;
+
+ b = *(uint64_t*) v;
+ if (!(b & EFI_OS_INDICATIONS_BOOT_TO_FW_UI))
+ goto not_supported; /* bit unset? it's not supported then */
+
+ cache = 1;
+ return 0;
+
+not_supported:
+ cache = 0;
+ return -EOPNOTSUPP;
+}
+
+/* Reads the current value of the OsIndications EFI variable into *ret.
+ *
+ * Returns 0 on success; *ret is set to 0 if the variable does not exist. Returns a negative errno-style
+ * error if reboot-to-firmware is not supported at all (see efi_reboot_to_firmware_supported()), if the
+ * variable cannot be accessed, or if it has an unexpected size.
+ *
+ * The value is cached in static storage together with the stat() data of the variable's file, and is
+ * only read again when stat_inode_unmodified() reports a difference. */
+static int get_os_indications(uint64_t *ret) {
+ static struct stat cache_stat = {};
+ _cleanup_free_ void *v = NULL;
+ static uint64_t cache;
+ struct stat new_stat;
+ size_t s;
+ int r;
+
+ assert(ret);
+
+ /* Let's verify general support first */
+ r = efi_reboot_to_firmware_supported();
+ if (r < 0)
+ return r;
+
+ /* stat() the EFI variable, to see if the mtime changed. If it did we need to cache again. */
+ if (stat(EFIVAR_PATH(EFI_GLOBAL_VARIABLE(OsIndications)), &new_stat) < 0) {
+ if (errno != ENOENT)
+ return -errno;
+
+ /* Doesn't exist? Then we can exit early (also see below) */
+ *ret = 0;
+ return 0;
+
+ } else if (stat_inode_unmodified(&new_stat, &cache_stat)) {
+ /* inode didn't change, we can return the cached value */
+ *ret = cache;
+ return 0;
+ }
+
+ r = efi_get_variable(EFI_GLOBAL_VARIABLE(OsIndications), NULL, &v, &s);
+ if (r == -ENOENT) {
+ /* Some firmware implementations that do support OsIndications and report that with
+ * OsIndicationsSupported will remove the OsIndications variable when it is unset. Let's
+ * pretend it's 0 then, to hide this implementation detail. Note that this call will return
+ * -ENOENT then only if the support for OsIndications is missing entirely, as determined by
+ * efi_reboot_to_firmware_supported() above. */
+ *ret = 0;
+ return 0;
+ }
+ if (r < 0)
+ return r;
+ if (s != sizeof(uint64_t))
+ return -EINVAL;
+
+ /* Only update the cache once the new value was read and validated successfully */
+ cache_stat = new_stat;
+ *ret = cache = *(uint64_t *)v;
+ return 0;
+}
+
+/* Tells whether a reboot into the firmware setup UI is currently requested.
+ *
+ * Returns 1 if the EFI_OS_INDICATIONS_BOOT_TO_FW_UI bit is set in OsIndications, 0 if it is not, and a
+ * negative errno-style error if the variable cannot be queried. */
+int efi_get_reboot_to_firmware(void) {
+        uint64_t indications;
+        int r = get_os_indications(&indications);
+
+        if (r < 0)
+                return r;
+
+        return (indications & EFI_OS_INDICATIONS_BOOT_TO_FW_UI) != 0;
+}
+
+/* Requests (value == true) or cancels (value == false) a reboot into the firmware setup UI by updating
+ * the EFI_OS_INDICATIONS_BOOT_TO_FW_UI bit of the OsIndications variable.
+ *
+ * Returns 0 if the bit already had the requested state, otherwise the result of efi_set_variable(); a
+ * negative errno-style error is returned if the current value cannot be determined. */
+int efi_set_reboot_to_firmware(bool value) {
+        uint64_t current, wanted;
+        int r;
+
+        r = get_os_indications(&current);
+        if (r < 0)
+                return r;
+
+        wanted = UPDATE_FLAG(current, EFI_OS_INDICATIONS_BOOT_TO_FW_UI, value);
+
+        /* Avoid writing to efi vars store if we can due to firmware bugs. */
+        if (wanted == current)
+                return 0;
+
+        return efi_set_variable(EFI_GLOBAL_VARIABLE(OsIndications), &wanted, sizeof(uint64_t));
+}
+
+/* Determines the size of the NUL-terminated UTF-16 string at 's', looking at no more than
+ * 'buf_len_bytes' bytes (a trailing odd byte is ignored).
+ *
+ * Returns the size of the string in bytes *including* the terminating two zero bytes, or -EINVAL if no
+ * terminator is found within the buffer. */
+static ssize_t utf16_size(const uint16_t *s, size_t buf_len_bytes) {
+        const size_t n_units = buf_len_bytes / sizeof(uint16_t);
+
+        for (size_t i = 0; i < n_units; i++)
+                if (s[i] == 0)
+                        return (i + 1) * sizeof(uint16_t);
+
+        return -EINVAL; /* The terminator was not found */
+}
+
+/* Reads the Boot#### load option with the given id and extracts selected pieces of it. Each output
+ * parameter may be NULL if the caller is not interested in it:
+ *
+ *   ret_title      newly allocated UTF-8 copy of the option's title
+ *   ret_part_uuid  partition UUID of the first^Wlast GPT hard drive node found, SD_ID128_NULL if none
+ *   ret_path       newly allocated UTF-8 path of the first file path node, with backslashes converted
+ *                  to slashes; NULL if there is none
+ *   ret_active     whether LOAD_OPTION_ACTIVE is set in the option's attributes
+ *
+ * Returns 0 on success, -EOPNOTSUPP if not booted via EFI, -ENOENT if the variable is too short to hold
+ * the fixed header, -EINVAL if the title is not terminated or the device path list does not fit into
+ * the variable, -ENOMEM on allocation failure, or the error returned by efi_get_variable().
+ *
+ * The variable's contents come from the firmware, hence every node of the device path list is checked
+ * against the size of the list before any of its members beyond the header are accessed. A truncated or
+ * overlong node ends parsing, in the same way as a node with a too small length field does. */
+int efi_get_boot_option(
+                uint16_t id,
+                char **ret_title,
+                sd_id128_t *ret_part_uuid,
+                char **ret_path,
+                bool *ret_active) {
+
+        char variable[STRLEN(EFI_GLOBAL_VARIABLE_STR("Boot")) + 4 + 1];
+        _cleanup_free_ uint8_t *buf = NULL;
+        size_t l;
+        struct boot_option *header;
+        ssize_t title_size;
+        _cleanup_free_ char *s = NULL, *p = NULL;
+        sd_id128_t p_uuid = SD_ID128_NULL;
+        int r;
+
+        if (!is_efi_boot())
+                return -EOPNOTSUPP;
+
+        xsprintf(variable, EFI_GLOBAL_VARIABLE_STR("Boot%04X"), id);
+        r = efi_get_variable(variable, NULL, (void **)&buf, &l);
+        if (r < 0)
+                return r;
+        if (l < offsetof(struct boot_option, title))
+                return -ENOENT;
+
+        header = (struct boot_option *)buf;
+
+        /* title_size includes the terminating NUL, i.e. it is the offset of the device path list
+         * relative to the title */
+        title_size = utf16_size(header->title, l - offsetof(struct boot_option, title));
+        if (title_size < 0)
+                return title_size;
+
+        if (ret_title) {
+                s = utf16_to_utf8(header->title, title_size);
+                if (!s)
+                        return -ENOMEM;
+        }
+
+        if (header->path_len > 0) {
+                uint8_t *dbuf;
+                size_t dnext, doff;
+
+                doff = offsetof(struct boot_option, title) + title_size;
+                dbuf = buf + doff;
+                if (header->path_len > l - doff)
+                        return -EINVAL;
+
+                dnext = 0;
+                while (dnext < header->path_len) {
+                        struct device_path *dpath;
+                        size_t left = header->path_len - dnext;
+
+                        /* Not even room for a node header? Then stop before reading it. */
+                        if (left < offsetof(struct device_path, path))
+                                break;
+
+                        dpath = (struct device_path *)(dbuf + dnext);
+                        if (dpath->length < offsetof(struct device_path, path))
+                                break;
+
+                        /* Node claims to extend beyond the end of the list? Don't follow it. */
+                        if (dpath->length > left)
+                                break;
+
+                        /* Type 0x7F – End of Hardware Device Path, Sub-Type 0xFF – End Entire Device Path */
+                        if (dpath->type == END_DEVICE_PATH_TYPE && dpath->sub_type == END_ENTIRE_DEVICE_PATH_SUBTYPE)
+                                break;
+
+                        dnext += dpath->length;
+
+                        /* Type 0x04 – Media Device Path */
+                        if (dpath->type != MEDIA_DEVICE_PATH)
+                                continue;
+
+                        /* Sub-Type 1 – Hard Drive */
+                        if (dpath->sub_type == MEDIA_HARDDRIVE_DP) {
+                                /* Skip nodes too short to carry the complete hard drive payload */
+                                if (dpath->length < offsetof(struct device_path, drive) + sizeof(struct drive_path))
+                                        continue;
+
+                                /* 0x02 – GUID Partition Table */
+                                if (dpath->drive.mbr_type != MBR_TYPE_EFI_PARTITION_TABLE_HEADER)
+                                        continue;
+
+                                /* 0x02 – GUID signature */
+                                if (dpath->drive.signature_type != SIGNATURE_TYPE_GUID)
+                                        continue;
+
+                                if (ret_part_uuid)
+                                        p_uuid = efi_guid_to_id128(dpath->drive.signature);
+                                continue;
+                        }
+
+                        /* Sub-Type 4 – File Path; only the first one is used */
+                        if (dpath->sub_type == MEDIA_FILEPATH_DP && !p && ret_path) {
+                                p = utf16_to_utf8(dpath->path, dpath->length - offsetof(struct device_path, path));
+                                if (!p)
+                                        return -ENOMEM;
+
+                                efi_tilt_backslashes(p);
+                        }
+                }
+        }
+
+        if (ret_title)
+                *ret_title = TAKE_PTR(s);
+        if (ret_part_uuid)
+                *ret_part_uuid = p_uuid;
+        if (ret_path)
+                *ret_path = TAKE_PTR(p);
+        if (ret_active)
+                *ret_active = header->attr & LOAD_OPTION_ACTIVE;
+
+        return 0;
+}
+
+/* Widens the NUL-terminated byte string 'src' into 16-bit units at 'dest', one unit per byte, and
+ * terminates the result with a zero unit. 'dest' must have room for strlen(src) + 1 units.
+ *
+ * Each byte is converted via unsigned char, so that bytes >= 0x80 end up as 0x0080…0x00FF rather than
+ * being sign-extended to 0xFF80…0xFFFF where plain char is signed. Note that no UTF-8 decoding takes
+ * place: only ASCII input results in correct UTF-16. */
+static void to_utf16(uint16_t *dest, const char *src) {
+        size_t i;
+
+        for (i = 0; src[i] != '\0'; i++)
+                dest[i] = (unsigned char) src[i];
+        dest[i] = 0;
+}
+
+/* Replaces every '/' in the zero-terminated 16-bit string 's' by '\\', in place. Returns 's'. */
+static uint16_t *tilt_slashes(uint16_t *s) {
+        uint16_t *cursor = s;
+
+        while (*cursor != 0) {
+                if (*cursor == '/')
+                        *cursor = '\\';
+                cursor++;
+        }
+
+        return s;
+}
+
+/* Creates or replaces the Boot#### load option with the given id. The option is marked active and its
+ * device path list consists of three nodes: a GPT hard drive node (partition number 'part', start
+ * 'pstart', size 'psize', GUID 'part_uuid'), a file path node for 'path' (slashes are converted to
+ * backslashes), and the end node.
+ *
+ * 'title' and 'path' must not be NULL. They are widened byte by byte to 16-bit units.
+ *
+ * Returns the result of efi_set_variable(), -EOPNOTSUPP if not booted via EFI, -EINVAL if 'path' is so
+ * long that the device path list would not fit the 16-bit length fields of the format, or -ENOMEM. */
+int efi_add_boot_option(
+                uint16_t id,
+                const char *title,
+                uint32_t part,
+                uint64_t pstart,
+                uint64_t psize,
+                sd_id128_t part_uuid,
+                const char *path) {
+
+        size_t size, title_len, path_len, dp_len;
+        _cleanup_free_ char *buf = NULL;
+        struct boot_option *option;
+        struct device_path *devicep;
+        char variable[STRLEN(EFI_GLOBAL_VARIABLE_STR("Boot")) + 4 + 1];
+
+        assert(title);
+        assert(path);
+
+        if (!is_efi_boot())
+                return -EOPNOTSUPP;
+
+        /* Sizes in bytes of the 16-bit strings, including their terminators */
+        title_len = (strlen(title)+1) * 2;
+        path_len = (strlen(path)+1) * 2;
+
+        /* Total size of the device path list: hard drive node + file path node + end node. Both the
+         * list length and the per-node length are stored in 16 bits, hence refuse anything that
+         * would be truncated. The file path node is smaller than the list, so one check suffices. */
+        dp_len = offsetof(struct device_path, drive) + sizeof(struct drive_path) +
+                 offsetof(struct device_path, path) + path_len +
+                 offsetof(struct device_path, path);
+        if (dp_len > UINT16_MAX)
+                return -EINVAL;
+
+        buf = malloc0(offsetof(struct boot_option, title) + title_len +
+                      sizeof(struct drive_path) +
+                      sizeof(struct device_path) + path_len);
+        if (!buf)
+                return -ENOMEM;
+
+        /* header */
+        option = (struct boot_option *)buf;
+        option->attr = LOAD_OPTION_ACTIVE;
+        option->path_len = dp_len;
+        to_utf16(option->title, title);
+        size = offsetof(struct boot_option, title) + title_len;
+
+        /* partition info */
+        devicep = (struct device_path *)(buf + size);
+        devicep->type = MEDIA_DEVICE_PATH;
+        devicep->sub_type = MEDIA_HARDDRIVE_DP;
+        devicep->length = offsetof(struct device_path, drive) + sizeof(struct drive_path);
+        memcpy(&devicep->drive.part_nr, &part, sizeof(uint32_t));
+        memcpy(&devicep->drive.part_start, &pstart, sizeof(uint64_t));
+        memcpy(&devicep->drive.part_size, &psize, sizeof(uint64_t));
+        efi_id128_to_guid(part_uuid, devicep->drive.signature);
+        devicep->drive.mbr_type = MBR_TYPE_EFI_PARTITION_TABLE_HEADER;
+        devicep->drive.signature_type = SIGNATURE_TYPE_GUID;
+        size += devicep->length;
+
+        /* path to loader */
+        devicep = (struct device_path *)(buf + size);
+        devicep->type = MEDIA_DEVICE_PATH;
+        devicep->sub_type = MEDIA_FILEPATH_DP;
+        devicep->length = offsetof(struct device_path, path) + path_len;
+        to_utf16(devicep->path, path);
+        tilt_slashes(devicep->path);
+        size += devicep->length;
+
+        /* end of path */
+        devicep = (struct device_path *)(buf + size);
+        devicep->type = END_DEVICE_PATH_TYPE;
+        devicep->sub_type = END_ENTIRE_DEVICE_PATH_SUBTYPE;
+        devicep->length = offsetof(struct device_path, path);
+        size += devicep->length;
+
+        xsprintf(variable, EFI_GLOBAL_VARIABLE_STR("Boot%04X"), id);
+        return efi_set_variable(variable, buf, size);
+}
+
+/* Removes the Boot#### load option with the given id by setting the variable to empty contents.
+ * Returns the result of efi_set_variable(), or -EOPNOTSUPP if not booted via EFI. */
+int efi_remove_boot_option(uint16_t id) {
+        char variable[STRLEN(EFI_GLOBAL_VARIABLE_STR("Boot")) + 4 + 1];
+
+        if (is_efi_boot()) {
+                xsprintf(variable, EFI_GLOBAL_VARIABLE_STR("Boot%04X"), id);
+                return efi_set_variable(variable, NULL, 0);
+        }
+
+        return -EOPNOTSUPP;
+}
+
+/* Reads the BootOrder variable. On success *ret_order receives a newly allocated array of boot option
+ * ids (to be freed by the caller) and the number of entries is returned.
+ *
+ * Returns -EOPNOTSUPP if not booted via EFI, -ENOENT if the variable is empty, -EINVAL if its size is
+ * not a multiple of two bytes or the number of entries exceeds INT_MAX, or the error returned by
+ * efi_get_variable(). */
+int efi_get_boot_order(uint16_t **ret_order) {
+        _cleanup_free_ void *data = NULL;
+        size_t n_bytes, n_entries;
+        int r;
+
+        assert(ret_order);
+
+        if (!is_efi_boot())
+                return -EOPNOTSUPP;
+
+        r = efi_get_variable(EFI_GLOBAL_VARIABLE(BootOrder), NULL, &data, &n_bytes);
+        if (r < 0)
+                return r;
+
+        if (n_bytes == 0)
+                return -ENOENT;
+
+        if (n_bytes % sizeof(uint16_t) != 0)
+                return -EINVAL;
+
+        n_entries = n_bytes / sizeof(uint16_t);
+        if (n_entries > INT_MAX)
+                return -EINVAL;
+
+        *ret_order = TAKE_PTR(data);
+        return (int) n_entries;
+}
+
+/* Writes the 'n' boot option ids in 'order' to the BootOrder variable. Returns the result of
+ * efi_set_variable(), or -EOPNOTSUPP if not booted via EFI. */
+int efi_set_boot_order(const uint16_t *order, size_t n) {
+        if (is_efi_boot())
+                return efi_set_variable(EFI_GLOBAL_VARIABLE(BootOrder), order, n * sizeof(uint16_t));
+
+        return -EOPNOTSUPP;
+}
+
+/* Parses exactly four hexadecimal digits at 's' into a number in the range 0…0xFFFF. Only the digits
+ * 0-9 and upper-case A-F are accepted; anything else, including lower-case a-f, yields -EINVAL.
+ * Characters after the fourth one are not looked at. */
+static int boot_id_hex(const char s[static 4]) {
+        int id = 0;
+
+        assert(s);
+
+        for (int i = 0; i < 4; i++) {
+                char c = s[i];
+                int digit;
+
+                if (c >= '0' && c <= '9')
+                        digit = c - '0';
+                else if (c >= 'A' && c <= 'F')
+                        digit = c - 'A' + 10;
+                else
+                        return -EINVAL;
+
+                /* Most significant digit comes first */
+                id = (id << 4) | digit;
+        }
+
+        return id;
+}
+
+/* Enumerates the ids of all Boot#### load options by scanning the directory EFIVAR_PATH(".") for
+ * matching variable files. On success *ret_options receives a newly allocated, sorted array of ids
+ * (NULL if none were found) and the number of entries is returned.
+ *
+ * Returns -EOPNOTSUPP if not booted via EFI, -ENOMEM on allocation failure, or a negative errno if the
+ * directory cannot be opened or read. */
+int efi_get_boot_options(uint16_t **ret_options) {
+ _cleanup_closedir_ DIR *dir = NULL;
+ _cleanup_free_ uint16_t *list = NULL;
+ int count = 0;
+
+ assert(ret_options);
+
+ if (!is_efi_boot())
+ return -EOPNOTSUPP;
+
+ dir = opendir(EFIVAR_PATH("."));
+ if (!dir)
+ return -errno;
+
+ FOREACH_DIRENT(de, dir, return -errno) {
+ int id;
+
+ /* A matching name consists of "Boot", four hex digits and the vendor suffix */
+ if (strncmp(de->d_name, "Boot", 4) != 0)
+ continue;
+
+ /* NOTE(review): 45 presumably is 8 characters for "BootXXXX" plus the length of the suffix
+ * generated by EFI_GLOBAL_VARIABLE_STR("") — confirm against the macro's definition */
+ if (strlen(de->d_name) != 45)
+ continue;
+
+ if (strcmp(de->d_name + 8, EFI_GLOBAL_VARIABLE_STR("")) != 0) /* generate variable suffix using macro */
+ continue;
+
+ /* Names whose four digits are not upper-case hex are silently skipped */
+ id = boot_id_hex(de->d_name + 4);
+ if (id < 0)
+ continue;
+
+ if (!GREEDY_REALLOC(list, count + 1))
+ return -ENOMEM;
+
+ list[count++] = id;
+ }
+
+ typesafe_qsort(list, count, cmp_uint16);
+
+ *ret_options = TAKE_PTR(list);
+
+ return count;
+}
+
+/* The result is determined on the first call and kept in the static 'cache' (-1 = not determined yet)
+ * for all later calls. Failures of access() other than ENOENT are logged at debug level and treated as
+ * "does not exist". */
+bool efi_has_tpm2(void) {
+ static int cache = -1;
+
+ /* Returns whether the system has a TPM2 chip which is known to the EFI firmware. */
+
+ if (cache >= 0)
+ return cache;
+
+ /* First, check if we are on an EFI boot at all. */
+ if (!is_efi_boot()) {
+ cache = 0;
+ return cache;
+ }
+
+ /* Then, check if the ACPI table "TPM2" exists, which is the TPM2 event log table, see:
+ * https://trustedcomputinggroup.org/wp-content/uploads/TCG_ACPIGeneralSpecification_v1.20_r8.pdf
+ * This table exists whenever the firmware is hooked up to TPM2. */
+ cache = access("/sys/firmware/acpi/tables/TPM2", F_OK) >= 0;
+ if (cache)
+ return cache;
+
+ /* errno still is the one set by the failed access() above */
+ if (errno != ENOENT)
+ log_debug_errno(errno, "Unable to test whether /sys/firmware/acpi/tables/TPM2 exists, assuming it doesn't: %m");
+
+ /* As the last try, check if the EFI firmware provides the EFI_TCG2_FINAL_EVENTS_TABLE
+ * stored in EFI configuration table, see:
+ * https://trustedcomputinggroup.org/wp-content/uploads/EFI-Protocol-Specification-rev13-160330final.pdf
+ */
+ cache = access("/sys/kernel/security/tpm0/binary_bios_measurements", F_OK) >= 0;
+ if (!cache && errno != ENOENT)
+ log_debug_errno(errno, "Unable to test whether /sys/kernel/security/tpm0/binary_bios_measurements exists, assuming it doesn't: %m");
+
+ return cache;
+}
+
+#endif
+
+/* In-memory layout of a 16 byte EFI GUID: three integer fields followed by eight raw bytes. Packed, so
+ * that the structure may be overlaid onto arbitrarily aligned buffers. The conversion helpers below
+ * access the integer fields in host byte order. */
+struct efi_guid {
+ uint32_t u1;
+ uint16_t u2;
+ uint16_t u3;
+ uint8_t u4[8];
+} _packed_;
+
+/* Converts the 16 byte EFI GUID at 'guid' (must not be NULL) into an sd_id128_t. The three integer
+ * fields are stored most significant byte first, the trailing eight bytes are copied verbatim. */
+sd_id128_t efi_guid_to_id128(const void *guid) {
+        const struct efi_guid *g = ASSERT_PTR(guid); /* cast is safe, because struct efi_guid is packed */
+        sd_id128_t out;
+
+        /* bytes 0…3: u1 */
+        for (unsigned i = 0; i < 4; i++)
+                out.bytes[i] = (g->u1 >> (8 * (3 - i))) & 0xff;
+
+        /* bytes 4…5: u2, bytes 6…7: u3 */
+        for (unsigned i = 0; i < 2; i++) {
+                out.bytes[4 + i] = (g->u2 >> (8 * (1 - i))) & 0xff;
+                out.bytes[6 + i] = (g->u3 >> (8 * (1 - i))) & 0xff;
+        }
+
+        memcpy(&out.bytes[8], g->u4, sizeof(g->u4));
+
+        return out;
+}
+
+/* Converts 'id' into a 16 byte EFI GUID and stores it at 'ret_guid' (must not be NULL, may be
+ * unaligned). This is the inverse of efi_guid_to_id128(). */
+void efi_id128_to_guid(sd_id128_t id, void *ret_guid) {
+        assert(ret_guid);
+
+        struct efi_guid uuid = {
+                /* Convert to uint32_t before shifting: the bytes are promoted to (signed) int
+                 * otherwise, and shifting a value >= 0x80 left by 24 bits is undefined there. */
+                .u1 = (uint32_t) id.bytes[0] << 24 |
+                      (uint32_t) id.bytes[1] << 16 |
+                      (uint32_t) id.bytes[2] << 8 |
+                      (uint32_t) id.bytes[3],
+                .u2 = id.bytes[4] << 8 | id.bytes[5],
+                .u3 = id.bytes[6] << 8 | id.bytes[7],
+        };
+        memcpy(uuid.u4, id.bytes+8, sizeof(uuid.u4));
+        memcpy(ret_guid, &uuid, sizeof(uuid));
+}
diff --git a/src/shared/efi-api.h b/src/shared/efi-api.h
new file mode 100644
index 0000000..09071b2
--- /dev/null
+++ b/src/shared/efi-api.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "efivars-fundamental.h"
+#include "efivars.h"
+#include "string-util.h"
+
+/* Various calls for interfacing with EFI variables from the official UEFI specs. */
+
+#if ENABLE_EFI
+
+int efi_reboot_to_firmware_supported(void);
+int efi_get_reboot_to_firmware(void);
+int efi_set_reboot_to_firmware(bool value);
+
+int efi_get_boot_option(uint16_t nr, char **ret_title, sd_id128_t *ret_part_uuid, char **ret_path, bool *ret_active);
+int efi_add_boot_option(uint16_t id, const char *title, uint32_t part, uint64_t pstart, uint64_t psize, sd_id128_t part_uuid, const char *path);
+int efi_remove_boot_option(uint16_t id);
+int efi_get_boot_order(uint16_t **ret_order);
+int efi_set_boot_order(const uint16_t *order, size_t n);
+int efi_get_boot_options(uint16_t **ret_options);
+
+bool efi_has_tpm2(void);
+
+#else
+
+/* Fallbacks for builds without EFI support: every operation fails with -EOPNOTSUPP without touching
+ * its arguments, and no TPM2 is reported. */
+static inline int efi_reboot_to_firmware_supported(void) {
+ return -EOPNOTSUPP;
+}
+
+static inline int efi_get_reboot_to_firmware(void) {
+ return -EOPNOTSUPP;
+}
+
+static inline int efi_set_reboot_to_firmware(bool value) {
+ return -EOPNOTSUPP;
+}
+
+static inline int efi_get_boot_option(uint16_t nr, char **ret_title, sd_id128_t *ret_part_uuid, char **ret_path, bool *ret_active) {
+ return -EOPNOTSUPP;
+}
+
+static inline int efi_add_boot_option(uint16_t id, const char *title, uint32_t part, uint64_t pstart, uint64_t psize, sd_id128_t part_uuid, const char *path) {
+ return -EOPNOTSUPP;
+}
+
+static inline int efi_remove_boot_option(uint16_t id) {
+ return -EOPNOTSUPP;
+}
+
+static inline int efi_get_boot_order(uint16_t **ret_order) {
+ return -EOPNOTSUPP;
+}
+
+static inline int efi_set_boot_order(const uint16_t *order, size_t n) {
+ return -EOPNOTSUPP;
+}
+
+static inline int efi_get_boot_options(uint16_t **ret_options) {
+ return -EOPNOTSUPP;
+}
+
+static inline bool efi_has_tpm2(void) {
+ return false;
+}
+
+#endif
+
+/* Replaces every backslash in 's' by a forward slash, using string_replace_char(), and passes on its
+ * return value. Available regardless of ENABLE_EFI. */
+static inline char *efi_tilt_backslashes(char *s) {
+ return string_replace_char(s, '\\', '/');
+}
+
+sd_id128_t efi_guid_to_id128(const void *guid);
+void efi_id128_to_guid(sd_id128_t id, void *ret_guid);
diff --git a/src/shared/efi-loader.c b/src/shared/efi-loader.c
new file mode 100644
index 0000000..7d6bda9
--- /dev/null
+++ b/src/shared/efi-loader.c
@@ -0,0 +1,363 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "efi-api.h"
+#include "efi-loader.h"
+#include "env-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "stat-util.h"
+#include "strv.h"
+#include "tpm2-pcr.h"
+#include "utf8.h"
+
+#if ENABLE_EFI
+
+/* Reads the EFI variable 'variable' as a string and parses it as an unsigned 64-bit number, which is
+ * stored in *ret. Returns 0 on success, or the negative error of efi_get_variable_string() or
+ * safe_atou64(); *ret is left untouched on failure. */
+static int read_usec(const char *variable, usec_t *ret) {
+        _cleanup_free_ char *text = NULL;
+        uint64_t value = 0;
+        int r;
+
+        assert(variable);
+        assert(ret);
+
+        r = efi_get_variable_string(variable, &text);
+        if (r >= 0)
+                r = safe_atou64(text, &value);
+        if (r < 0)
+                return r;
+
+        *ret = value;
+        return 0;
+}
+
+/* Reads the two timestamps the boot loader exports via the LoaderTimeInitUSec and LoaderTimeExecUSec
+ * variables. On success *ret_firmware is set to the former and *ret_loader to the latter.
+ *
+ * Returns 0 on success, -EOPNOTSUPP if not booted via EFI, -EIO if the values are implausible, or the
+ * error of reading/parsing either variable. Failures are logged at debug level. */
+int efi_loader_get_boot_usec(usec_t *ret_firmware, usec_t *ret_loader) {
+ uint64_t x, y;
+ int r;
+
+ assert(ret_firmware);
+ assert(ret_loader);
+
+ if (!is_efi_boot())
+ return -EOPNOTSUPP;
+
+ r = read_usec(EFI_LOADER_VARIABLE(LoaderTimeInitUSec), &x);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to read LoaderTimeInitUSec: %m");
+
+ r = read_usec(EFI_LOADER_VARIABLE(LoaderTimeExecUSec), &y);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to read LoaderTimeExecUSec: %m");
+
+ /* Sanity check: the exec timestamp must be set, must not lie before the init timestamp, and the two
+ * must not be more than an hour apart */
+ if (y == 0 || y < x || y - x > USEC_PER_HOUR)
+ return log_debug_errno(SYNTHETIC_ERRNO(EIO),
+ "Bad LoaderTimeInitUSec=%"PRIu64", LoaderTimeExecUSec=%" PRIu64"; refusing.",
+ x, y);
+
+ *ret_firmware = x;
+ *ret_loader = y;
+ return 0;
+}
+
+/* Reads the LoaderDevicePartUUID variable and parses it as a UUID. 'ret' may be NULL, in which case the
+ * variable is only validated.
+ *
+ * Returns 0 on success, -EOPNOTSUPP if not booted via EFI, -EIO if the contents do not parse as a
+ * UUID, or the error of efi_get_variable_string(). */
+int efi_loader_get_device_part_uuid(sd_id128_t *ret) {
+ _cleanup_free_ char *p = NULL;
+ int r;
+ unsigned parsed[16];
+
+ if (!is_efi_boot())
+ return -EOPNOTSUPP;
+
+ r = efi_get_variable_string(EFI_LOADER_VARIABLE(LoaderDevicePartUUID), &p);
+ if (r < 0)
+ return r;
+
+ /* Each byte is scanned into an unsigned first, and narrowed to a byte only below */
+ if (sscanf(p, SD_ID128_UUID_FORMAT_STR,
+ &parsed[0], &parsed[1], &parsed[2], &parsed[3],
+ &parsed[4], &parsed[5], &parsed[6], &parsed[7],
+ &parsed[8], &parsed[9], &parsed[10], &parsed[11],
+ &parsed[12], &parsed[13], &parsed[14], &parsed[15]) != 16)
+ return -EIO;
+
+ if (ret)
+ for (unsigned i = 0; i < ELEMENTSOF(parsed); i++)
+ ret->bytes[i] = parsed[i];
+
+ return 0;
+}
+
+/* Reads the LoaderEntries variable and returns the entry names it lists as a newly allocated string
+ * array in *ret. Names that do not pass efi_loader_entry_name_valid() are dropped with a debug message.
+ *
+ * Returns 0 on success, -EOPNOTSUPP if not booted via EFI, -ENOMEM on allocation failure, or the error
+ * of efi_get_variable(). */
+int efi_loader_get_entries(char ***ret) {
+ _cleanup_free_ char16_t *entries = NULL;
+ _cleanup_strv_free_ char **l = NULL;
+ size_t size;
+ int r;
+
+ assert(ret);
+
+ if (!is_efi_boot())
+ return -EOPNOTSUPP;
+
+ r = efi_get_variable(EFI_LOADER_VARIABLE(LoaderEntries), NULL, (void**) &entries, &size);
+ if (r < 0)
+ return r;
+
+ /* The variable contains a series of individually NUL terminated UTF-16 strings. We gracefully
+ * consider the final NUL byte optional (i.e. the last string may or may not end in a NUL byte).*/
+
+ /* 'i' is the index of the character currently looked at, 'start' the index of the first character
+ * of the string currently being collected */
+ for (size_t i = 0, start = 0;; i++) {
+ _cleanup_free_ char *decoded = NULL;
+ bool end;
+
+ /* Is this the end of the variable's data? */
+ end = i * sizeof(char16_t) >= size;
+
+ /* Are we in the middle of a string? (i.e. not at the end of the variable, nor at a NUL terminator?) If
+ * so, let's go to the next entry. */
+ if (!end && entries[i] != 0)
+ continue;
+
+ /* Empty string at the end of variable? That's the trailer, we are done (i.e. we have a final
+ * NUL terminator). */
+ if (end && start == i)
+ break;
+
+ /* We reached the end of a string, let's decode it into UTF-8 */
+ decoded = utf16_to_utf8(entries + start, (i - start) * sizeof(char16_t));
+ if (!decoded)
+ return -ENOMEM;
+
+ if (efi_loader_entry_name_valid(decoded)) {
+ r = strv_consume(&l, TAKE_PTR(decoded));
+ if (r < 0)
+ return r;
+ } else
+ log_debug("Ignoring invalid loader entry '%s'.", decoded);
+
+ /* Exit the loop if we reached the end of the variable (i.e. we do not have a final NUL
+ * terminator) */
+ if (end)
+ break;
+
+ /* Continue after the NUL byte */
+ start = i + 1;
+ }
+
+ *ret = TAKE_PTR(l);
+ return 0;
+}
+
+/* Determines the feature flags of the boot loader and stores them in *ret.
+ *
+ * Returns 0 on success; *ret is 0 if not booted via EFI or if the loader exports neither
+ * LoaderFeatures nor a LoaderInfo string starting with "systemd-boot". Returns -EINVAL if
+ * LoaderFeatures has an unexpected size, or the error of reading either variable. */
+int efi_loader_get_features(uint64_t *ret) {
+ _cleanup_free_ void *v = NULL;
+ size_t s;
+ int r;
+
+ assert(ret);
+
+ if (!is_efi_boot()) {
+ *ret = 0;
+ return 0;
+ }
+
+ r = efi_get_variable(EFI_LOADER_VARIABLE(LoaderFeatures), NULL, &v, &s);
+ if (r == -ENOENT) {
+ _cleanup_free_ char *info = NULL;
+
+ /* The new (v240+) LoaderFeatures variable is not supported, let's see if it's systemd-boot at all */
+ r = efi_get_variable_string(EFI_LOADER_VARIABLE(LoaderInfo), &info);
+ if (r < 0) {
+ if (r != -ENOENT)
+ return r;
+
+ /* Variable not set, definitely means not systemd-boot */
+
+ } else if (first_word(info, "systemd-boot")) {
+
+ /* An older systemd-boot version. Let's hardcode the feature set, since it was pretty
+ * static in all its versions. */
+
+ *ret = EFI_LOADER_FEATURE_CONFIG_TIMEOUT |
+ EFI_LOADER_FEATURE_ENTRY_DEFAULT |
+ EFI_LOADER_FEATURE_ENTRY_ONESHOT;
+
+ return 0;
+ }
+
+ /* No features supported */
+ *ret = 0;
+ return 0;
+ }
+ if (r < 0)
+ return r;
+
+ if (s != sizeof(uint64_t))
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+ "LoaderFeatures EFI variable doesn't have the right size.");
+
+ /* memcpy() rather than a cast, so that nothing is assumed about the buffer */
+ memcpy(ret, v, sizeof(uint64_t));
+ return 0;
+}
+
+/* Determines the feature flags of the EFI stub and stores them in *ret. Same structure as
+ * efi_loader_get_features(), but for the StubFeatures and StubInfo variables.
+ *
+ * Returns 0 on success; *ret is 0 if not booted via EFI or if the stub exports neither StubFeatures
+ * nor a StubInfo string starting with "systemd-stub". Returns -EINVAL if StubFeatures has an
+ * unexpected size, or the error of reading either variable. */
+int efi_stub_get_features(uint64_t *ret) {
+ _cleanup_free_ void *v = NULL;
+ size_t s;
+ int r;
+
+ assert(ret);
+
+ if (!is_efi_boot()) {
+ *ret = 0;
+ return 0;
+ }
+
+ r = efi_get_variable(EFI_LOADER_VARIABLE(StubFeatures), NULL, &v, &s);
+ if (r == -ENOENT) {
+ _cleanup_free_ char *info = NULL;
+
+ /* The new (v252+) StubFeatures variable is not supported, let's see if it's systemd-stub at all */
+ r = efi_get_variable_string(EFI_LOADER_VARIABLE(StubInfo), &info);
+ if (r < 0) {
+ if (r != -ENOENT)
+ return r;
+
+ /* Variable not set, definitely means not systemd-stub */
+
+ } else if (first_word(info, "systemd-stub")) {
+
+ /* An older systemd-stub version. Let's hardcode the feature set, since it was pretty
+ * static in all its versions. */
+
+ *ret = EFI_STUB_FEATURE_REPORT_BOOT_PARTITION;
+ return 0;
+ }
+
+ /* No features supported */
+ *ret = 0;
+ return 0;
+ }
+ if (r < 0)
+ return r;
+
+ if (s != sizeof(uint64_t))
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+ "StubFeatures EFI variable doesn't have the right size.");
+
+ memcpy(ret, v, sizeof(uint64_t));
+ return 0;
+}
+
+/* 'log_level' is the level used for logging failures of reading or interpreting the StubPcrKernelImage
+ * variable. Only definitive results (0 or 1) are remembered in the static 'cached'; errors are returned
+ * without being cached, so that a later call checks again. */
+int efi_measured_uki(int log_level) {
+ _cleanup_free_ char *pcr_string = NULL;
+ static int cached = -1;
+ unsigned pcr_nr;
+ int r;
+
+ if (cached >= 0)
+ return cached;
+
+ /* Checks if we are booted on a kernel with sd-stub which measured the kernel into PCR 11 on a TPM2
+ * chip. Or in other words, if we are running on a TPM enabled UKI. (TPM 1.2 situations are ignored.)
+ *
+ * Returns == 0 and > 0 depending on the result of the test. Returns -EREMOTE if we detected a stub
+ * being used, but it measured things into a different PCR than we are configured for in
+ * userspace. (i.e. we expect PCR 11 being used for this by both sd-stub and us) */
+
+ r = getenv_bool_secure("SYSTEMD_FORCE_MEASURE"); /* Give user a chance to override the variable test,
+ * for debugging purposes */
+ if (r >= 0)
+ return (cached = r);
+ if (r != -ENXIO)
+ log_debug_errno(r, "Failed to parse $SYSTEMD_FORCE_MEASURE, ignoring: %m");
+
+ if (!efi_has_tpm2())
+ return (cached = 0);
+
+ r = efi_get_variable_string(EFI_LOADER_VARIABLE(StubPcrKernelImage), &pcr_string);
+ if (r == -ENOENT)
+ return (cached = 0);
+ if (r < 0)
+ return log_full_errno(log_level, r,
+ "Failed to get StubPcrKernelImage EFI variable: %m");
+
+ r = safe_atou(pcr_string, &pcr_nr);
+ if (r < 0)
+ return log_full_errno(log_level, r,
+ "Failed to parse StubPcrKernelImage EFI variable: %s", pcr_string);
+ if (pcr_nr != TPM2_PCR_KERNEL_BOOT)
+ return log_full_errno(log_level, SYNTHETIC_ERRNO(EREMOTE),
+ "Kernel stub measured kernel image into PCR %u, which is different than expected %i.",
+ pcr_nr, TPM2_PCR_KERNEL_BOOT);
+
+ return (cached = 1);
+}
+
+/* Reads the LoaderConfigTimeoutOneShot variable, which holds a number of seconds, and stores the value
+ * converted to microseconds in *ret.
+ *
+ * Returns 0 on success, a negative errno if the variable cannot be stat()ed (including -ENOENT if it
+ * does not exist), read or parsed, and -ERANGE if the value cannot be represented in microseconds.
+ *
+ * The parsed value is cached in static storage together with the stat() data of the variable's file
+ * and reused for as long as stat_inode_unmodified() reports no difference. */
+int efi_loader_get_config_timeout_one_shot(usec_t *ret) {
+ _cleanup_free_ char *v = NULL;
+ static struct stat cache_stat = {};
+ struct stat new_stat;
+ static usec_t cache;
+ uint64_t sec;
+ int r;
+
+ assert(ret);
+
+ /* stat() the EFI variable, to see if the mtime changed. If it did, we need to cache again. */
+ if (stat(EFIVAR_PATH(EFI_LOADER_VARIABLE(LoaderConfigTimeoutOneShot)), &new_stat) < 0)
+ return -errno;
+
+ if (stat_inode_unmodified(&new_stat, &cache_stat)) {
+ *ret = cache;
+ return 0;
+ }
+
+ r = efi_get_variable_string(EFI_LOADER_VARIABLE(LoaderConfigTimeoutOneShot), &v);
+ if (r < 0)
+ return r;
+
+ r = safe_atou64(v, &sec);
+ if (r < 0)
+ return r;
+ /* Refuse values for which the multiplication below would exceed USEC_INFINITY */
+ if (sec > USEC_INFINITY / USEC_PER_SEC)
+ return -ERANGE;
+
+ cache_stat = new_stat;
+ *ret = cache = sec * USEC_PER_SEC; /* return in μs */
+ return 0;
+}
+
+/* Refreshes a caller-owned cache of the LoaderEntryOneShot variable. '*cache' holds the cached entry
+ * name (or NULL) and '*cache_stat' the stat() data it was read at. If the variable's file changed since
+ * then, the variable is read again, '*cache' is freed and replaced by the new value, and '*cache_stat'
+ * is updated.
+ *
+ * Returns 0 on success (whether or not the cache had to be refreshed), a negative errno if the variable
+ * cannot be stat()ed or read, and -EINVAL if its contents are not a valid entry name. On failure the
+ * cache is left untouched. */
+int efi_loader_update_entry_one_shot_cache(char **cache, struct stat *cache_stat) {
+ _cleanup_free_ char *v = NULL;
+ struct stat new_stat;
+ int r;
+
+ assert(cache);
+ assert(cache_stat);
+
+ /* stat() the EFI variable, to see if the mtime changed. If it did we need to cache again. */
+ if (stat(EFIVAR_PATH(EFI_LOADER_VARIABLE(LoaderEntryOneShot)), &new_stat) < 0)
+ return -errno;
+
+ if (stat_inode_unmodified(&new_stat, cache_stat))
+ return 0;
+
+ r = efi_get_variable_string(EFI_LOADER_VARIABLE(LoaderEntryOneShot), &v);
+ if (r < 0)
+ return r;
+
+ if (!efi_loader_entry_name_valid(v))
+ return -EINVAL;
+
+ *cache_stat = new_stat;
+ free_and_replace(*cache, v);
+
+ return 0;
+}
+
+#endif
+
+/* Checks whether 's' is acceptable as a boot loader entry name: it has to be a valid file name (so
+ * that entry names fit in filenames) and may consist only of alphanumerical characters and "+-_.". */
+bool efi_loader_entry_name_valid(const char *s) {
+        return filename_is_valid(s) &&
+                in_charset(s, ALPHANUMERICAL "+-_.");
+}
diff --git a/src/shared/efi-loader.h b/src/shared/efi-loader.h
new file mode 100644
index 0000000..c878eea
--- /dev/null
+++ b/src/shared/efi-loader.h
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <sys/stat.h>
+
+#include "efivars-fundamental.h"
+#include "efivars.h"
+
+/* Various calls that interface with EFI variables implementing https://systemd.io/BOOT_LOADER_INTERFACE */
+
+#if ENABLE_EFI
+
+int efi_loader_get_device_part_uuid(sd_id128_t *ret);
+int efi_loader_get_boot_usec(usec_t *ret_firmware, usec_t *ret_loader);
+
+int efi_loader_get_entries(char ***ret);
+
+int efi_loader_get_features(uint64_t *ret);
+int efi_stub_get_features(uint64_t *ret);
+
+int efi_measured_uki(int log_level);
+
+int efi_loader_get_config_timeout_one_shot(usec_t *ret);
+int efi_loader_update_entry_one_shot_cache(char **cache, struct stat *cache_stat);
+
+#else
+
+/* Fallbacks for builds without EFI support: every call fails with -EOPNOTSUPP without touching its
+ * arguments. efi_measured_uki() additionally logs the failure at the requested level. */
+static inline int efi_loader_get_device_part_uuid(sd_id128_t *u) {
+ return -EOPNOTSUPP;
+}
+
+static inline int efi_loader_get_boot_usec(usec_t *firmware, usec_t *loader) {
+ return -EOPNOTSUPP;
+}
+
+static inline int efi_loader_get_entries(char ***ret) {
+ return -EOPNOTSUPP;
+}
+
+static inline int efi_loader_get_features(uint64_t *ret) {
+ return -EOPNOTSUPP;
+}
+
+static inline int efi_stub_get_features(uint64_t *ret) {
+ return -EOPNOTSUPP;
+}
+
+static inline int efi_measured_uki(int log_level) {
+ return log_full_errno(log_level, SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "Compiled without support for EFI");
+}
+
+static inline int efi_loader_get_config_timeout_one_shot(usec_t *ret) {
+ return -EOPNOTSUPP;
+}
+
+static inline int efi_loader_update_entry_one_shot_cache(char **cache, struct stat *cache_stat) {
+ return -EOPNOTSUPP;
+}
+
+#endif
+
+bool efi_loader_entry_name_valid(const char *s);
diff --git a/src/shared/elf-util.c b/src/shared/elf-util.c
new file mode 100644
index 0000000..24ed16e
--- /dev/null
+++ b/src/shared/elf-util.c
@@ -0,0 +1,899 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#if HAVE_ELFUTILS
+
+#include <dwarf.h>
+#include <elfutils/libdwelf.h>
+#include <elfutils/libdwfl.h>
+#include <libelf.h>
+#include <sys/prctl.h>
+#include <sys/resource.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "dlfcn-util.h"
+#include "elf-util.h"
+#include "errno-util.h"
+#include "escape.h"
+#include "fileio.h"
+#include "fd-util.h"
+#include "format-util.h"
+#include "hexdecoct.h"
+#include "io-util.h"
+#include "macro.h"
+#include "memstream-util.h"
+#include "process-util.h"
+#include "rlimit-util.h"
+#include "string-util.h"
+
+#define FRAMES_MAX 64
+#define THREADS_MAX 64
+#define ELF_PACKAGE_METADATA_ID 0xcafe1a7e
+
+/* The amount of data we're willing to write to each of the output pipes. */
+#define COREDUMP_PIPE_MAX (1024*1024U)
+
+static void *dw_dl = NULL;
+static void *elf_dl = NULL;
+
+/* libdw symbols */
+Dwarf_Attribute *(*sym_dwarf_attr_integrate)(Dwarf_Die *, unsigned int, Dwarf_Attribute *);
+const char *(*sym_dwarf_diename)(Dwarf_Die *);
+const char *(*sym_dwarf_formstring)(Dwarf_Attribute *);
+int (*sym_dwarf_getscopes)(Dwarf_Die *, Dwarf_Addr, Dwarf_Die **);
+int (*sym_dwarf_getscopes_die)(Dwarf_Die *, Dwarf_Die **);
+Elf *(*sym_dwelf_elf_begin)(int);
+#if HAVE_DWELF_ELF_E_MACHINE_STRING
+const char *(*sym_dwelf_elf_e_machine_string)(int);
+#endif
+ssize_t (*sym_dwelf_elf_gnu_build_id)(Elf *, const void **);
+int (*sym_dwarf_tag)(Dwarf_Die *);
+Dwfl_Module *(*sym_dwfl_addrmodule)(Dwfl *, Dwarf_Addr);
+Dwfl *(*sym_dwfl_begin)(const Dwfl_Callbacks *);
+int (*sym_dwfl_build_id_find_elf)(Dwfl_Module *, void **, const char *, Dwarf_Addr, char **, Elf **);
+int (*sym_dwfl_core_file_attach)(Dwfl *, Elf *);
+int (*sym_dwfl_core_file_report)(Dwfl *, Elf *, const char *);
+void (*sym_dwfl_end)(Dwfl *);
+const char *(*sym_dwfl_errmsg)(int);
+int (*sym_dwfl_errno)(void);
+bool (*sym_dwfl_frame_pc)(Dwfl_Frame *, Dwarf_Addr *, bool *);
+ptrdiff_t (*sym_dwfl_getmodules)(Dwfl *, int (*)(Dwfl_Module *, void **, const char *, Dwarf_Addr, void *), void *, ptrdiff_t);
+int (*sym_dwfl_getthreads)(Dwfl *, int (*)(Dwfl_Thread *, void *), void *);
+Dwarf_Die *(*sym_dwfl_module_addrdie)(Dwfl_Module *, Dwarf_Addr, Dwarf_Addr *);
+const char *(*sym_dwfl_module_addrname)(Dwfl_Module *, GElf_Addr);
+int (*sym_dwfl_module_build_id)(Dwfl_Module *, const unsigned char **, GElf_Addr *);
+Elf *(*sym_dwfl_module_getelf)(Dwfl_Module *, GElf_Addr *);
+const char *(*sym_dwfl_module_info)(Dwfl_Module *, void ***, Dwarf_Addr *, Dwarf_Addr *, Dwarf_Addr *, Dwarf_Addr *, const char **, const char **);
+int (*sym_dwfl_offline_section_address)(Dwfl_Module *, void **, const char *, Dwarf_Addr, const char *, GElf_Word, const GElf_Shdr *, Dwarf_Addr *);
+int (*sym_dwfl_report_end)(Dwfl *, int (*)(Dwfl_Module *, void *, const char *, Dwarf_Addr, void *), void *);
+int (*sym_dwfl_standard_find_debuginfo)(Dwfl_Module *, void **, const char *, Dwarf_Addr, const char *, const char *, GElf_Word, char **);
+int (*sym_dwfl_thread_getframes)(Dwfl_Thread *, int (*)(Dwfl_Frame *, void *), void *);
+pid_t (*sym_dwfl_thread_tid)(Dwfl_Thread *);
+
+/* libelf symbols */
+Elf *(*sym_elf_begin)(int, Elf_Cmd, Elf *);
+int (*sym_elf_end)(Elf *);
+Elf_Data *(*sym_elf_getdata_rawchunk)(Elf *, int64_t, size_t, Elf_Type);
+GElf_Ehdr *(*sym_gelf_getehdr)(Elf *, GElf_Ehdr *);
+int (*sym_elf_getphdrnum)(Elf *, size_t *);
+const char *(*sym_elf_errmsg)(int);
+int (*sym_elf_errno)(void);
+Elf *(*sym_elf_memory)(char *, size_t);
+unsigned int (*sym_elf_version)(unsigned int);
+GElf_Phdr *(*sym_gelf_getphdr)(Elf *, int, GElf_Phdr *);
+size_t (*sym_gelf_getnote)(Elf_Data *, size_t, GElf_Nhdr *, size_t *, size_t *);
+
+/* Hands the list of libdw symbols below to dlopen_many_sym_or_warn() for "libdw.so.1". A zero or
+ * negative result of that helper is passed through unchanged, any positive result is reported as 1. */
+int dlopen_dw(void) {
+        int k;
+
+        k = dlopen_many_sym_or_warn(
+                        &dw_dl, "libdw.so.1", LOG_DEBUG,
+                        DLSYM_ARG(dwarf_getscopes),
+                        DLSYM_ARG(dwarf_getscopes_die),
+                        DLSYM_ARG(dwarf_tag),
+                        DLSYM_ARG(dwarf_attr_integrate),
+                        DLSYM_ARG(dwarf_formstring),
+                        DLSYM_ARG(dwarf_diename),
+                        DLSYM_ARG(dwelf_elf_gnu_build_id),
+                        DLSYM_ARG(dwelf_elf_begin),
+#if HAVE_DWELF_ELF_E_MACHINE_STRING
+                        DLSYM_ARG(dwelf_elf_e_machine_string),
+#endif
+                        DLSYM_ARG(dwfl_addrmodule),
+                        DLSYM_ARG(dwfl_frame_pc),
+                        DLSYM_ARG(dwfl_module_addrdie),
+                        DLSYM_ARG(dwfl_module_addrname),
+                        DLSYM_ARG(dwfl_module_info),
+                        DLSYM_ARG(dwfl_module_build_id),
+                        DLSYM_ARG(dwfl_module_getelf),
+                        DLSYM_ARG(dwfl_begin),
+                        DLSYM_ARG(dwfl_core_file_report),
+                        DLSYM_ARG(dwfl_report_end),
+                        DLSYM_ARG(dwfl_getmodules),
+                        DLSYM_ARG(dwfl_core_file_attach),
+                        DLSYM_ARG(dwfl_end),
+                        DLSYM_ARG(dwfl_errmsg),
+                        DLSYM_ARG(dwfl_errno),
+                        DLSYM_ARG(dwfl_build_id_find_elf),
+                        DLSYM_ARG(dwfl_standard_find_debuginfo),
+                        DLSYM_ARG(dwfl_thread_tid),
+                        DLSYM_ARG(dwfl_thread_getframes),
+                        DLSYM_ARG(dwfl_getthreads),
+                        DLSYM_ARG(dwfl_offline_section_address));
+
+        return k > 0 ? 1 : k;
+}
+
+/* Hands the list of libelf symbols below to dlopen_many_sym_or_warn() for "libelf.so.1". A zero or
+ * negative result of that helper is passed through unchanged, any positive result is reported as 1. */
+int dlopen_elf(void) {
+        int k;
+
+        k = dlopen_many_sym_or_warn(
+                        &elf_dl, "libelf.so.1", LOG_DEBUG,
+                        DLSYM_ARG(elf_begin),
+                        DLSYM_ARG(elf_end),
+                        DLSYM_ARG(elf_getphdrnum),
+                        DLSYM_ARG(elf_getdata_rawchunk),
+                        DLSYM_ARG(elf_errmsg),
+                        DLSYM_ARG(elf_errno),
+                        DLSYM_ARG(elf_memory),
+                        DLSYM_ARG(elf_version),
+                        DLSYM_ARG(gelf_getehdr),
+                        DLSYM_ARG(gelf_getphdr),
+                        DLSYM_ARG(gelf_getnote));
+
+        return k > 0 ? 1 : k;
+}
+
+typedef struct StackContext { /* State shared between parse_core()/parse_elf() and the callbacks in this file */
+ MemStream m; /* text report; m.f is only written to when it is non-NULL */
+ Dwfl *dwfl; /* released with sym_dwfl_end() in stack_context_done() */
+ Elf *elf; /* released with sym_elf_end() in stack_context_done() */
+ unsigned n_thread; /* threads reported so far, limited by THREADS_MAX */
+ unsigned n_frame; /* frames reported for the current thread, limited by FRAMES_MAX */
+ JsonVariant **package_metadata; /* JSON object that per-module metadata gets merged into */
+ Set **modules; /* names of the modules whose metadata was already recorded */
+} StackContext;
+
+/* Releases what a StackContext holds: first the memstream, then the Dwfl handle, then the Elf handle.
+ * Both handle fields are NULL afterwards. */
+static void stack_context_done(StackContext *c) {
+        Dwfl *dwfl;
+        Elf *elf;
+
+        assert(c);
+
+        memstream_done(&c->m);
+
+        dwfl = c->dwfl;
+        c->dwfl = NULL;
+        if (dwfl)
+                sym_dwfl_end(dwfl);
+
+        elf = c->elf;
+        c->elf = NULL;
+        if (elf)
+                sym_elf_end(elf);
+}
+
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(Elf *, sym_elf_end, NULL); /* NOTE(review): presumably what defines sym_elf_endp(), the _cleanup_() handler used in module_callback() - confirm against the macro definition */
+
+/* Frame callback passed to sym_dwfl_thread_getframes(); userdata is the StackContext. Resolves one
+ * frame to a symbol and module name and, if a text report is being collected, appends one line for
+ * it. Returns DWARF_CB_ABORT once FRAMES_MAX frames were handled for the current thread or if the PC
+ * of the frame cannot be determined, DWARF_CB_OK otherwise. */
+static int frame_callback(Dwfl_Frame *frame, void *userdata) {
+        StackContext *ctx = ASSERT_PTR(userdata);
+        const char *module_name = NULL, *sym = NULL;
+        uint64_t offset_in_module = 0;
+        Dwarf_Addr pc, lookup_pc;
+        Dwfl_Module *mod;
+        bool activation;
+
+        assert(frame);
+
+        if (ctx->n_frame >= FRAMES_MAX)
+                return DWARF_CB_ABORT;
+
+        if (!sym_dwfl_frame_pc(frame, &pc, &activation))
+                return DWARF_CB_ABORT;
+
+        /* Unless this is an activation frame, do the lookups with the address right before the PC. */
+        lookup_pc = activation ? pc : pc - 1;
+
+        mod = sym_dwfl_addrmodule(ctx->dwfl, lookup_pc);
+        if (mod) {
+                Dwarf_Addr mod_start, bias = 0;
+                Dwarf_Die *cu;
+
+                cu = sym_dwfl_module_addrdie(mod, lookup_pc, &bias);
+                if (cu) {
+                        _cleanup_free_ Dwarf_Die *scopes = NULL;
+                        int n_scopes;
+
+                        n_scopes = sym_dwarf_getscopes(cu, lookup_pc - bias, &scopes);
+
+                        /* Take the name of the first function-like scope that has one. A linkage
+                         * name is preferred over the plain DIE name. */
+                        for (int i = 0; scopes && i < n_scopes; i++) {
+                                Dwarf_Die *die = scopes + i;
+                                Dwarf_Attribute attr_mem, *attr;
+
+                                if (!IN_SET(sym_dwarf_tag(die), DW_TAG_subprogram, DW_TAG_inlined_subroutine, DW_TAG_entry_point))
+                                        continue;
+
+                                attr = sym_dwarf_attr_integrate(die, DW_AT_MIPS_linkage_name, &attr_mem);
+                                if (!attr)
+                                        attr = sym_dwarf_attr_integrate(die, DW_AT_linkage_name, &attr_mem);
+                                if (attr)
+                                        sym = sym_dwarf_formstring(attr);
+                                if (!sym)
+                                        sym = sym_dwarf_diename(die);
+
+                                if (sym)
+                                        break;
+                        }
+                }
+
+                /* The scopes gave us no name, ask dwfl_module_addrname() instead. */
+                if (!sym)
+                        sym = sym_dwfl_module_addrname(mod, lookup_pc);
+
+                module_name = sym_dwfl_module_info(mod, NULL, &mod_start, NULL, NULL, NULL, NULL, NULL);
+                offset_in_module = pc - mod_start;
+        }
+
+        if (ctx->m.f)
+                fprintf(ctx->m.f, "#%-2u 0x%016" PRIx64 " %s (%s + 0x%" PRIx64 ")\n", ctx->n_frame, (uint64_t) pc, strna(sym), strna(module_name), offset_in_module);
+        ctx->n_frame++;
+
+        return DWARF_CB_OK;
+}
+
+/* Thread callback passed to sym_dwfl_getthreads(); userdata is the StackContext. Writes the per-thread
+ * header to the text report (if one is being collected) and walks the frames of the thread with
+ * frame_callback(). Returns DWARF_CB_ABORT once THREADS_MAX threads were handled or if the frame walk
+ * fails, DWARF_CB_OK otherwise. */
+static int thread_callback(Dwfl_Thread *thread, void *userdata) {
+        StackContext *ctx = ASSERT_PTR(userdata);
+
+        assert(thread);
+
+        if (ctx->n_thread >= THREADS_MAX)
+                return DWARF_CB_ABORT;
+
+        /* Every thread starts counting its frames from zero. */
+        ctx->n_frame = 0;
+
+        if (ctx->m.f) {
+                /* An empty line separates the traces of consecutive threads. */
+                if (ctx->n_thread != 0)
+                        fputc('\n', ctx->m.f);
+
+                fprintf(ctx->m.f, "Stack trace of thread " PID_FMT ":\n", sym_dwfl_thread_tid(thread));
+        }
+
+        if (sym_dwfl_thread_getframes(thread, frame_callback, ctx) < 0)
+                return DWARF_CB_ABORT;
+
+        ctx->n_thread++;
+
+        return DWARF_CB_OK;
+}
+
+/* Builds "<type> <name>[-<version>][.<arch>]" via strjoin(); type defaults to "package" when NULL.
+ *
+ * The syntax is most suitable for rpm, where the result can be used directly in queries and
+ * rpm/dnf/yum commands. For dpkg and other systems it might not be usable directly, but users should
+ * still be able to figure out the meaning. The caller owns the returned string. */
+static char* build_package_reference(
+                const char *type,
+                const char *name,
+                const char *version,
+                const char *arch) {
+
+        const char *kind = type ?: "package";
+        const char *version_sep = version ? "-" : "";
+        /* arch is meaningful even without version, so always print it */
+        const char *arch_sep = arch ? "." : "";
+
+        return strjoin(kind,
+                       " ",
+                       name,
+                       version_sep,
+                       strempty(version),
+                       arch_sep,
+                       strempty(arch));
+}
+
+/* Appends one "Module <name>..." line to the text report, or does nothing if no report is being
+ * collected. With metadata, the package reference is added if the "name" key is present, and the
+ * build-id is added unless both package name and version are known. */
+static void report_module_metadata(StackContext *c, const char *name, JsonVariant *metadata) {
+        FILE *out;
+
+        assert(c);
+        assert(name);
+
+        out = c->m.f;
+        if (!out)
+                return;
+
+        fprintf(out, "Module %s", name);
+
+        if (metadata) {
+                const char *build_id = json_variant_string(json_variant_by_key(metadata, "buildId"));
+                const char *type = json_variant_string(json_variant_by_key(metadata, "type"));
+                const char *package = json_variant_string(json_variant_by_key(metadata, "name"));
+                const char *version = json_variant_string(json_variant_by_key(metadata, "version"));
+                const char *arch = json_variant_string(json_variant_by_key(metadata, "architecture"));
+
+                /* Version/architecture is only meaningful with a package name.
+                 * Skip the detailed fields if package is unknown. */
+                if (package) {
+                        _cleanup_free_ char *reference = build_package_reference(type, package, version, arch);
+
+                        fprintf(out, " from %s", strnull(reference));
+                }
+
+                if (build_id && (!package || !version))
+                        fprintf(out, ", build-id=%s", build_id);
+        }
+
+        fputs("\n", out);
+}
+
+/* Looks for the package metadata note (note type ELF_PACKAGE_METADATA_ID) in the PT_NOTE program
+ * headers of elf. When found, the JSON payload of the note is parsed, merged with id_json (if any),
+ * written to the text report, stored under the key name in *c->package_metadata, and name is added
+ * to *c->modules.
+ *
+ * name                  - module name, used as JSON key and to avoid handling a module twice
+ * id_json               - optional JSON object carrying the build-id, merged into the metadata
+ * elf                   - ELF handle to inspect
+ * ret_interpreter_found - optional; written on every non-error return. True if a PT_INTERP program
+ *                         header was seen. NOTE(review): once the metadata note is found the scan
+ *                         stops, so PT_INTERP headers placed after that note are not considered.
+ * c                     - parsing context
+ *
+ * Returns 1 if metadata was found and recorded, 0 if there is none (also when the module was handled
+ * before or the program headers cannot be enumerated), negative errno-style value on failure. */
+static int parse_package_metadata(const char *name, JsonVariant *id_json, Elf *elf, bool *ret_interpreter_found, StackContext *c) {
+        bool interpreter_found = false;
+        size_t n_program_headers;
+        int r;
+
+        assert(name);
+        assert(elf);
+        assert(c);
+
+        /* When iterating over PT_LOAD we will visit modules more than once */
+        if (set_contains(*c->modules, name))
+                goto not_found;
+
+        r = sym_elf_getphdrnum(elf, &n_program_headers);
+        if (r < 0) /* Not the handle we are looking for - that's ok, skip it */
+                goto not_found;
+
+        /* Iterate over all program headers in that ELF object. These will have been copied by
+         * the kernel verbatim when the core file is generated. */
+        for (size_t i = 0; i < n_program_headers; ++i) {
+                GElf_Phdr mem, *program_header;
+                GElf_Nhdr note_header;
+                Elf_Data *data;
+
+                /* Package metadata is in PT_NOTE headers. */
+                program_header = sym_gelf_getphdr(elf, i, &mem);
+                if (!program_header || (program_header->p_type != PT_NOTE && program_header->p_type != PT_INTERP))
+                        continue;
+
+                if (program_header->p_type == PT_INTERP) {
+                        interpreter_found = true;
+                        continue;
+                }
+
+                /* Fortunately there is an iterator we can use to walk over the
+                 * elements of a PT_NOTE program header. We are interested in the
+                 * note whose type is ELF_PACKAGE_METADATA_ID. */
+                data = sym_elf_getdata_rawchunk(elf,
+                                                program_header->p_offset,
+                                                program_header->p_filesz,
+                                                ELF_T_NHDR);
+                if (!data)
+                        continue;
+
+                for (size_t note_offset = 0, name_offset, desc_offset;
+                     note_offset < data->d_size &&
+                     (note_offset = sym_gelf_getnote(data, note_offset, &note_header, &name_offset, &desc_offset)) > 0;) {
+
+                        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL, *w = NULL;
+                        const char *payload = (const char *)data->d_buf + desc_offset;
+
+                        if (note_header.n_namesz == 0 || note_header.n_descsz == 0)
+                                continue;
+
+                        /* Package metadata might have different owners, but the
+                         * magic ID is always the same. */
+                        if (note_header.n_type != ELF_PACKAGE_METADATA_ID)
+                                continue;
+
+                        _cleanup_free_ char *payload_0suffixed = NULL;
+                        assert(note_offset > desc_offset);
+                        size_t payload_len = note_offset - desc_offset;
+
+                        /* If we are lucky and the payload is NUL-padded, we don't need to copy the string.
+                         * But if it happens to go all the way until the end of the buffer, make a copy. */
+                        if (payload[payload_len-1] != '\0') {
+                                payload_0suffixed = memdup_suffix0(payload, payload_len);
+                                if (!payload_0suffixed)
+                                        return log_oom();
+                                payload = payload_0suffixed;
+                        }
+
+                        r = json_parse(payload, 0, &v, NULL, NULL);
+                        if (r < 0) {
+                                _cleanup_free_ char *esc = cescape(payload);
+                                return log_error_errno(r, "json_parse on \"%s\" failed: %m", strnull(esc));
+                        }
+
+                        /* If we have a build-id, merge it in the same JSON object so that it appears all
+                         * nicely together in the logs/metadata. */
+                        if (id_json) {
+                                r = json_variant_merge_object(&v, id_json);
+                                if (r < 0)
+                                        return log_error_errno(r, "json_variant_merge of package meta with buildId failed: %m");
+                        }
+
+                        /* Pretty-print to the buffer, so that the metadata goes as plaintext in the
+                         * journal. */
+                        report_module_metadata(c, name, v);
+
+                        /* Then we build a new object using the module name as the key, and merge it
+                         * with the previous parses, so that in the end it all fits together in a single
+                         * JSON blob. */
+                        r = json_build(&w, JSON_BUILD_OBJECT(JSON_BUILD_PAIR(name, JSON_BUILD_VARIANT(v))));
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to build JSON object: %m");
+
+                        /* This merges into the metadata collected so far, not with the buildId, so
+                         * say so in the error message. */
+                        r = json_variant_merge_object(c->package_metadata, w);
+                        if (r < 0)
+                                return log_error_errno(r, "json_variant_merge of package metadata for module %s failed: %m", name);
+
+                        /* Finally stash the name, so we avoid double visits. */
+                        r = set_put_strdup(c->modules, name);
+                        if (r < 0)
+                                return log_error_errno(r, "set_put_strdup failed: %m");
+
+                        if (ret_interpreter_found)
+                                *ret_interpreter_found = interpreter_found;
+
+                        return 1;
+                }
+        }
+
+not_found:
+        /* Output parameters are written on every successful return, including the early exits above. */
+        if (ret_interpreter_found)
+                *ret_interpreter_found = interpreter_found;
+
+        /* Didn't find package metadata for this module - that's ok, just go to the next. */
+        return 0;
+}
+
+/* Get the build-id out of an ELF object or a dwarf core module (mod takes precedence if both are
+ * given) and wrap it into a JSON object of the form { "buildId" : <hex> }. A missing build-id is not
+ * an error: it is only noted in the text report and the returned object is NULL. Returns 0 on
+ * success, negative errno-style value if the JSON object cannot be built. */
+static int parse_buildid(Dwfl_Module *mod, Elf *elf, const char *name, StackContext *c, JsonVariant **ret_id_json) {
+        _cleanup_(json_variant_unrefp) JsonVariant *build_id_json = NULL;
+        const unsigned char *raw_id;
+        GElf_Addr unused_vaddr;
+        ssize_t raw_len;
+
+        assert(mod || elf);
+        assert(name);
+        assert(c);
+
+        raw_len = mod ? sym_dwfl_module_build_id(mod, &raw_id, &unused_vaddr)
+                      : sym_dwelf_elf_gnu_build_id(elf, (const void **)&raw_id);
+
+        if (raw_len > 0) {
+                int r;
+
+                /* We will later parse package metadata json and pass it to our caller. Prepare the
+                 * build-id in json format too, so that it can be appended and parsed cleanly. It
+                 * will then be added as metadata to the journal message with the stack trace. */
+                r = json_build(&build_id_json, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("buildId", JSON_BUILD_HEX(raw_id, raw_len))));
+                if (r < 0)
+                        return log_error_errno(r, "json_build on buildId failed: %m");
+        } else if (c->m.f)
+                /* If we don't find a build-id, note it in the journal message, and try
+                 * anyway to find the package metadata. It's unlikely to have the latter
+                 * without the former, but there's no hard rule. */
+                fprintf(c->m.f, "Module %s without build-id.\n", name);
+
+        if (ret_id_json)
+                *ret_id_json = TAKE_PTR(build_id_json);
+
+        return 0;
+}
+/* Module callback passed to sym_dwfl_getmodules() by parse_core(); arg is the StackContext. Records the
+ * build-id and, when one can be located, the package metadata of the module. Returns DWARF_CB_ABORT on
+ * failure and DWARF_CB_OK otherwise, also when no metadata was found. */
+static int module_callback(Dwfl_Module *mod, void **userdata, const char *name, Dwarf_Addr start, void *arg) {
+ _cleanup_(json_variant_unrefp) JsonVariant *id_json = NULL;
+ StackContext *c = ASSERT_PTR(arg);
+ size_t n_program_headers;
+ GElf_Addr bias;
+ int r;
+ Elf *elf;
+
+ assert(mod);
+
+ if (!name)
+ name = "(unnamed)"; /* For logging purposes */
+
+ /* We are iterating on each "module", which is what dwfl calls ELF objects contained in the
+ * core file, and extracting the build-id first and then the package metadata.
+ * We proceed in a best-effort fashion - not all ELF objects might contain both or either.
+ * The build-id is easy, as libdwfl parses it during the sym_dwfl_core_file_report() call and
+ * stores it separately in an internal library struct. */
+ r = parse_buildid(mod, NULL, name, c, &id_json);
+ if (r < 0)
+ return DWARF_CB_ABORT;
+
+ /* The .note.package metadata is more difficult. From the module, we need to get a reference
+ * to the ELF object first. We might be lucky and just get it from elfutils. */
+ elf = sym_dwfl_module_getelf(mod, &bias);
+ if (elf) {
+ r = parse_package_metadata(name, id_json, elf, NULL, c);
+ if (r < 0)
+ return DWARF_CB_ABORT;
+ if (r > 0)
+ return DWARF_CB_OK; /* metadata recorded, nothing left to do for this module */
+ } else
+ elf = c->elf; /* no ELF handle for the module, search the core file itself */
+
+ /* We did not get the ELF object, or it's just a reference to the core. That is likely
+ * because we didn't get direct access to the executable, and the version of elfutils does
+ * not yet support parsing it out of the core file directly.
+ * So fallback to manual extraction - get the PT_LOAD section from the core,
+ * and if it's the right one we can interpret it as an Elf object, and parse
+ * its notes manually. */
+
+ r = sym_elf_getphdrnum(elf, &n_program_headers);
+ if (r < 0) {
+ log_warning("Could not parse number of program headers from core file: %s",
+ sym_elf_errmsg(-1)); /* -1 retrieves the most recent error */
+ report_module_metadata(c, name, id_json);
+
+ return DWARF_CB_OK;
+ }
+
+ for (size_t i = 0; i < n_program_headers; ++i) {
+ GElf_Phdr mem, *program_header;
+ Elf_Data *data;
+ GElf_Addr end_of_segment;
+
+ /* The core file stores the ELF files in the PT_LOAD segment. */
+ program_header = sym_gelf_getphdr(elf, i, &mem);
+ if (!program_header || program_header->p_type != PT_LOAD)
+ continue;
+
+ /* Check that the end of segment is a valid address. */
+ if (__builtin_add_overflow(program_header->p_vaddr, program_header->p_memsz, &end_of_segment)) {
+ log_error("Abort due to corrupted core dump, end of segment address %#zx + %#zx overflows", (size_t)program_header->p_vaddr, (size_t)program_header->p_memsz);
+ return DWARF_CB_ABORT;
+ }
+
+ /* This PT_LOAD segment doesn't contain the start address, so it can't be the module we are looking for. */
+ if (start < program_header->p_vaddr || start >= end_of_segment)
+ continue;
+
+ /* Now get a usable Elf reference, and parse the notes from it. */
+ data = sym_elf_getdata_rawchunk(elf,
+ program_header->p_offset,
+ program_header->p_filesz,
+ ELF_T_NHDR);
+ if (!data)
+ continue;
+
+ _cleanup_(sym_elf_endp) Elf *memelf = sym_elf_memory(data->d_buf, data->d_size);
+ if (!memelf)
+ continue;
+ r = parse_package_metadata(name, id_json, memelf, NULL, c);
+ if (r < 0)
+ return DWARF_CB_ABORT;
+ if (r > 0)
+ break; /* metadata recorded, the remaining segments need no inspection */
+ }
+
+ return DWARF_CB_OK;
+}
+
+/* Generates the stack trace text and collects per-module metadata for the core file behind fd.
+ * executable is handed to dwfl_core_file_report(). If ret is non-NULL it receives the text report, if
+ * ret_package_metadata is non-NULL it receives the JSON metadata gathered by module_callback().
+ * Returns 0 on success or a negative errno-style value; failures of libelf/libdwfl calls are logged
+ * as warnings with EINVAL. */
+static int parse_core(int fd, const char *executable, char **ret, JsonVariant **ret_package_metadata) {
+
+        const Dwfl_Callbacks dwfl_callbacks = {
+                .find_elf = sym_dwfl_build_id_find_elf,
+                .section_address = sym_dwfl_offline_section_address,
+                .find_debuginfo = sym_dwfl_standard_find_debuginfo,
+        };
+
+        _cleanup_(json_variant_unrefp) JsonVariant *metadata = NULL;
+        _cleanup_set_free_ Set *seen_modules = NULL;
+        _cleanup_(stack_context_done) StackContext ctx = {
+                .package_metadata = &metadata,
+                .modules = &seen_modules,
+        };
+
+        assert(fd >= 0);
+
+        if (lseek(fd, 0, SEEK_SET) < 0)
+                return log_warning_errno(errno, "Failed to seek to beginning of the core file: %m");
+
+        /* The text report is only collected if the caller asked for it. */
+        if (ret && !memstream_init(&ctx.m))
+                return log_oom();
+
+        sym_elf_version(EV_CURRENT);
+
+        ctx.elf = sym_elf_begin(fd, ELF_C_READ_MMAP, NULL);
+        if (!ctx.elf)
+                return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), "Could not parse core file, elf_begin() failed: %s", sym_elf_errmsg(sym_elf_errno()));
+
+        ctx.dwfl = sym_dwfl_begin(&dwfl_callbacks);
+        if (!ctx.dwfl)
+                return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), "Could not parse core file, dwfl_begin() failed: %s", sym_dwfl_errmsg(sym_dwfl_errno()));
+
+        if (sym_dwfl_core_file_report(ctx.dwfl, ctx.elf, executable) < 0)
+                return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), "Could not parse core file, dwfl_core_file_report() failed: %s", sym_dwfl_errmsg(sym_dwfl_errno()));
+
+        if (sym_dwfl_report_end(ctx.dwfl, NULL, NULL) != 0)
+                return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), "Could not parse core file, dwfl_report_end() failed: %s", sym_dwfl_errmsg(sym_dwfl_errno()));
+
+        /* First pass: build-ids and package metadata of all modules. */
+        if (sym_dwfl_getmodules(ctx.dwfl, &module_callback, &ctx, 0) < 0)
+                return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), "Could not parse core file, dwfl_getmodules() failed: %s", sym_dwfl_errmsg(sym_dwfl_errno()));
+
+        if (sym_dwfl_core_file_attach(ctx.dwfl, ctx.elf) < 0)
+                return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), "Could not parse core file, dwfl_core_file_attach() failed: %s", sym_dwfl_errmsg(sym_dwfl_errno()));
+
+        /* Second pass: stack traces of all threads. */
+        if (sym_dwfl_getthreads(ctx.dwfl, thread_callback, &ctx) < 0)
+                return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), "Could not parse core file, dwfl_getthreads() failed: %s", sym_dwfl_errmsg(sym_dwfl_errno()));
+
+        if (ret) {
+                int r = memstream_finalize(&ctx.m, ret, NULL);
+
+                if (r < 0)
+                        return log_warning_errno(r, "Could not parse core file, flushing file buffer failed: %m");
+        }
+
+        if (ret_package_metadata)
+                *ret_package_metadata = TAKE_PTR(metadata);
+
+        return 0;
+}
+/* Inspects the ELF object behind fd. Core files (ET_CORE) are delegated to parse_core(); for anything
+ * else the build-id and package metadata are read directly from the object. If ret is non-NULL it
+ * receives the text report. If ret_package_metadata is non-NULL it receives a JSON object holding
+ * "elfType", possibly "elfArchitecture", and the metadata that was collected. Returns 0 on success or
+ * a negative errno-style value. */
+static int parse_elf(int fd, const char *executable, char **ret, JsonVariant **ret_package_metadata) {
+ _cleanup_(json_variant_unrefp) JsonVariant *package_metadata = NULL, *elf_metadata = NULL;
+ _cleanup_set_free_ Set *modules = NULL;
+ _cleanup_(stack_context_done) StackContext c = {
+ .package_metadata = &package_metadata,
+ .modules = &modules,
+ };
+ const char *elf_type;
+ GElf_Ehdr elf_header;
+ int r;
+
+ assert(fd >= 0);
+
+ if (lseek(fd, 0, SEEK_SET) < 0)
+ return log_warning_errno(errno, "Failed to seek to beginning of the ELF file: %m");
+
+ if (ret && !memstream_init(&c.m))
+ return log_oom();
+
+ sym_elf_version(EV_CURRENT);
+
+ c.elf = sym_elf_begin(fd, ELF_C_READ_MMAP, NULL);
+ if (!c.elf)
+ return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), "Could not parse ELF file, elf_begin() failed: %s", sym_elf_errmsg(sym_elf_errno()));
+
+ if (!sym_gelf_getehdr(c.elf, &elf_header))
+ return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), "Could not parse ELF file, gelf_getehdr() failed: %s", sym_elf_errmsg(sym_elf_errno()));
+
+ if (elf_header.e_type == ET_CORE) {
+ _cleanup_free_ char *out = NULL;
+
+ r = parse_core(fd, executable, ret ? &out : NULL, &package_metadata);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to inspect core file: %m");
+
+ if (out)
+ fprintf(c.m.f, "%s", out); /* copy the report of parse_core() into our own stream */
+
+ elf_type = "coredump";
+ } else {
+ _cleanup_(json_variant_unrefp) JsonVariant *id_json = NULL;
+ const char *e = executable ?: "(unnamed)";
+ bool interpreter_found = false;
+
+ r = parse_buildid(NULL, c.elf, e, &c, &id_json);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to parse build-id of ELF file: %m");
+
+ r = parse_package_metadata(e, id_json, c.elf, &interpreter_found, &c);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to parse package metadata of ELF file: %m");
+
+ /* If we found a build-id and nothing else, return at least that. */
+ if (!package_metadata && id_json) {
+ r = json_build(&package_metadata, JSON_BUILD_OBJECT(JSON_BUILD_PAIR(e, JSON_BUILD_VARIANT(id_json))));
+ if (r < 0)
+ return log_warning_errno(r, "Failed to build JSON object: %m");
+ }
+
+ if (interpreter_found) /* parse_package_metadata() saw a PT_INTERP program header */
+ elf_type = "executable";
+ else
+ elf_type = "library";
+ }
+
+ /* Note that e_type is always DYN for both executables and libraries, so we can't tell them apart from the header,
+ * but we will search for the PT_INTERP section when parsing the metadata. */
+ r = json_build(&elf_metadata, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("elfType", JSON_BUILD_STRING(elf_type))));
+ if (r < 0)
+ return log_warning_errno(r, "Failed to build JSON object: %m");
+
+#if HAVE_DWELF_ELF_E_MACHINE_STRING
+ const char *elf_architecture = sym_dwelf_elf_e_machine_string(elf_header.e_machine);
+ if (elf_architecture) {
+ _cleanup_(json_variant_unrefp) JsonVariant *json_architecture = NULL;
+
+ r = json_build(&json_architecture,
+ JSON_BUILD_OBJECT(JSON_BUILD_PAIR("elfArchitecture", JSON_BUILD_STRING(elf_architecture))));
+ if (r < 0)
+ return log_warning_errno(r, "Failed to build JSON object: %m");
+
+ r = json_variant_merge_object(&elf_metadata, json_architecture);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to merge JSON objects: %m");
+
+ if (ret)
+ fprintf(c.m.f, "ELF object binary architecture: %s\n", elf_architecture);
+ }
+#endif
+
+ /* We always at least have the ELF type, so merge that (and possibly the arch). */
+ r = json_variant_merge_object(&elf_metadata, package_metadata);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to merge JSON objects: %m");
+
+ if (ret) {
+ r = memstream_finalize(&c.m, ret, NULL);
+ if (r < 0)
+ return log_warning_errno(r, "Could not parse ELF file, flushing file buffer failed: %m");
+ }
+
+ if (ret_package_metadata)
+ *ret_package_metadata = TAKE_PTR(elf_metadata);
+
+ return 0;
+}
+
+/* Parses the ELF object or core file behind fd in a forked helper process and hands the results back
+ * through pipes.
+ *
+ * fd                   - open file descriptor of the object to inspect, must be >= 0
+ * executable           - name passed through to parse_elf(); may be NULL
+ * fork_disable_dump    - if true, the helper calls prctl(PR_SET_DUMPABLE, 0) before parsing
+ * ret                  - if non-NULL, receives the text report produced by the helper
+ * ret_package_metadata - if non-NULL, receives the JSON metadata, or NULL if the helper sent none
+ *
+ * Returns 0 on success and a negative errno-style value on failure. If the helper failed and left an
+ * error code in the error pipe, that code is returned; if it failed without leaving one, the error
+ * of safe_fork_full() is returned unchanged. */
+int parse_elf_object(int fd, const char *executable, bool fork_disable_dump, char **ret, JsonVariant **ret_package_metadata) {
+        _cleanup_close_pair_ int error_pipe[2] = EBADF_PAIR,
+                                 return_pipe[2] = EBADF_PAIR,
+                                 json_pipe[2] = EBADF_PAIR;
+        _cleanup_(json_variant_unrefp) JsonVariant *package_metadata = NULL;
+        _cleanup_free_ char *buf = NULL;
+        int r;
+
+        assert(fd >= 0);
+
+        r = dlopen_dw();
+        if (r < 0)
+                return r;
+
+        r = dlopen_elf();
+        if (r < 0)
+                return r;
+
+        r = RET_NERRNO(pipe2(error_pipe, O_CLOEXEC|O_NONBLOCK));
+        if (r < 0)
+                return r;
+
+        if (ret) {
+                r = RET_NERRNO(pipe2(return_pipe, O_CLOEXEC|O_NONBLOCK));
+                if (r < 0)
+                        return r;
+        }
+
+        if (ret_package_metadata) {
+                r = RET_NERRNO(pipe2(json_pipe, O_CLOEXEC|O_NONBLOCK));
+                if (r < 0)
+                        return r;
+        }
+
+        /* Parsing possibly malformed data is crash-happy, so fork. In case we crash,
+         * the core file will not be lost, and the messages will still be attached to
+         * the journal. Reading the elf object might be slow, but it still has an upper
+         * bound since the core files have an upper size limit. It's also not doing any
+         * system call or interacting with the system in any way, besides reading from
+         * the file descriptor and writing into these three pipes. */
+        r = safe_fork_full("(sd-parse-elf)",
+                           NULL,
+                           (int[]){ fd, error_pipe[1], return_pipe[1], json_pipe[1] },
+                           4,
+                           FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE|FORK_NEW_USERNS|FORK_WAIT|FORK_REOPEN_LOG,
+                           NULL);
+        if (r < 0) {
+                if (r == -EPROTO) { /* We should have the errno from the child, but don't clobber original error */
+                        ssize_t k;
+                        int e;
+
+                        k = read(error_pipe[0], &e, sizeof(e));
+                        if (k < 0) {
+                                /* The pipe is non-blocking and we still hold its write end, hence an
+                                 * empty pipe shows up as EAGAIN rather than as EOF. In that case the
+                                 * child left no error code, and we return the original error below. */
+                                if (errno != EAGAIN)
+                                        return -errno;
+                        } else if ((size_t) k == sizeof(e))
+                                return e; /* propagate error sent to us from child */
+                        else if (k != 0)
+                                return -EIO; /* short read, the error code is unusable */
+                }
+
+                return r;
+        }
+        if (r == 0) {
+                /* We want to avoid loops, given this can be called from systemd-coredump */
+                if (fork_disable_dump) {
+                        r = RET_NERRNO(prctl(PR_SET_DUMPABLE, 0));
+                        if (r < 0)
+                                goto child_fail;
+                }
+
+                r = parse_elf(fd, executable, ret ? &buf : NULL, ret_package_metadata ? &package_metadata : NULL);
+                if (r < 0)
+                        goto child_fail;
+
+                if (buf) {
+                        size_t len = strlen(buf);
+
+                        if (len > COREDUMP_PIPE_MAX) {
+                                /* This is iffy. A backtrace can be a few hundred kilobytes, but too much is
+                                 * too much. Let's log a warning and ignore the rest. */
+                                log_warning("Generated backtrace is %zu bytes (more than the limit of %u bytes), backtrace will be truncated.",
+                                            len, COREDUMP_PIPE_MAX);
+                                len = COREDUMP_PIPE_MAX;
+                        }
+
+                        /* Bump the space for the returned string.
+                         * Failure is ignored, because partial output is still useful. */
+                        (void) fcntl(return_pipe[1], F_SETPIPE_SZ, len);
+
+                        r = loop_write(return_pipe[1], buf, len);
+                        if (r == -EAGAIN)
+                                log_warning("Write failed, backtrace will be truncated.");
+                        else if (r < 0)
+                                goto child_fail;
+
+                        return_pipe[1] = safe_close(return_pipe[1]);
+                }
+
+                if (package_metadata) {
+                        _cleanup_fclose_ FILE *json_out = NULL;
+
+                        /* Bump the space for the returned string. We don't know how much space we'll need in
+                         * advance, so we'll just try to write as much as possible and maybe fail later. */
+                        (void) fcntl(json_pipe[1], F_SETPIPE_SZ, COREDUMP_PIPE_MAX);
+
+                        json_out = take_fdopen(&json_pipe[1], "w");
+                        if (!json_out) {
+                                r = -errno;
+                                goto child_fail;
+                        }
+
+                        r = json_variant_dump(package_metadata, JSON_FORMAT_FLUSH, json_out, NULL);
+                        if (r < 0)
+                                log_warning_errno(r, "Failed to write JSON package metadata, ignoring: %m");
+                }
+
+                _exit(EXIT_SUCCESS);
+
+        child_fail:
+                /* Tell the parent why we failed; it picks this up in the -EPROTO path above. */
+                (void) write(error_pipe[1], &r, sizeof(r));
+                _exit(EXIT_FAILURE);
+        }
+
+        /* Parent: close our copies of the write ends, so that reading below ends at EOF. */
+        error_pipe[1] = safe_close(error_pipe[1]);
+        return_pipe[1] = safe_close(return_pipe[1]);
+        json_pipe[1] = safe_close(json_pipe[1]);
+
+        if (ret) {
+                _cleanup_fclose_ FILE *in = NULL;
+
+                in = take_fdopen(&return_pipe[0], "r");
+                if (!in)
+                        return -errno;
+
+                r = read_full_stream(in, &buf, NULL);
+                if (r < 0)
+                        return r;
+        }
+
+        if (ret_package_metadata) {
+                _cleanup_fclose_ FILE *json_in = NULL;
+
+                json_in = take_fdopen(&json_pipe[0], "r");
+                if (!json_in)
+                        return -errno;
+
+                r = json_parse_file(json_in, NULL, 0, &package_metadata, NULL, NULL);
+                if (r < 0 && r != -ENODATA) /* ENODATA: json was empty, so we got nothing, but that's ok */
+                        log_warning_errno(r, "Failed to read or parse json metadata, ignoring: %m");
+        }
+
+        if (ret)
+                *ret = TAKE_PTR(buf);
+        if (ret_package_metadata)
+                *ret_package_metadata = TAKE_PTR(package_metadata);
+
+        return 0;
+}
+
+#endif
diff --git a/src/shared/elf-util.h b/src/shared/elf-util.h
new file mode 100644
index 0000000..b28e64c
--- /dev/null
+++ b/src/shared/elf-util.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "json.h"
+
+#if HAVE_ELFUTILS
+int dlopen_dw(void);
+int dlopen_elf(void);
+
+/* Parse an ELF object in a forked process, so that errors while iterating over
+ * untrusted and potentially malicious data do not propagate to the main caller's process.
+ * If fork_disable_dump, the child process will not dump core if it crashes. */
+int parse_elf_object(int fd, const char *executable, bool fork_disable_dump, char **ret, JsonVariant **ret_package_metadata);
+#else
+static inline int parse_elf_object(int fd, const char *executable, bool fork_disable_dump, char **ret, JsonVariant **ret_package_metadata) { /* Stub for builds without HAVE_ELFUTILS: only logs an error, writes neither output parameter */
+ return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "elfutils disabled, parsing ELF objects not supported");
+}
+#endif
diff --git a/src/shared/enable-mempool.c b/src/shared/enable-mempool.c
new file mode 100644
index 0000000..fd582c0
--- /dev/null
+++ b/src/shared/enable-mempool.c
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <stdbool.h>
+
+#include "env-util.h"
+#include "mempool.h"
+#include "process-util.h"
+
+/* Tells whether the memory pool may be used. Always false outside of the main thread. On the main
+ * thread the answer is derived once from $SYSTEMD_MEMPOOL and cached: the pool counts as enabled
+ * unless getenv_bool() returns exactly 0 for the variable. */
+bool mempool_enabled(void) {
+        static int cached = -1;
+
+        if (is_main_thread()) {
+                if (cached < 0) {
+                        int v = getenv_bool("SYSTEMD_MEMPOOL");
+
+                        cached = (v != 0);
+                }
+
+                return cached;
+        }
+
+        return false;
+}
diff --git a/src/shared/env-file-label.c b/src/shared/env-file-label.c
new file mode 100644
index 0000000..5917b63
--- /dev/null
+++ b/src/shared/env-file-label.c
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/stat.h>
+
+#include "env-file-label.h"
+#include "env-file.h"
+#include "selinux-util.h"
+
+/* Wraps write_env_file() between mac_selinux_create_file_prepare(fname, S_IFREG) and
+ * mac_selinux_create_file_clear(). If the prepare step fails its error is returned and nothing is
+ * written; otherwise the result of write_env_file() is returned. */
+int write_env_file_label(int dir_fd, const char *fname, char **headers, char **l) {
+        int r = mac_selinux_create_file_prepare(fname, S_IFREG);
+
+        if (r >= 0) {
+                r = write_env_file(dir_fd, fname, headers, l);
+
+                mac_selinux_create_file_clear();
+        }
+
+        return r;
+}
+
+/* Wraps write_vconsole_conf() for /etc/vconsole.conf between mac_selinux_create_file_prepare() and
+ * mac_selinux_create_file_clear(). If the prepare step fails its error is returned and nothing is
+ * written; otherwise the result of write_vconsole_conf() is returned. */
+int write_vconsole_conf_label(char **l) {
+        int r = mac_selinux_create_file_prepare("/etc/vconsole.conf", S_IFREG);
+
+        if (r >= 0) {
+                r = write_vconsole_conf(AT_FDCWD, "/etc/vconsole.conf", l);
+
+                mac_selinux_create_file_clear();
+        }
+
+        return r;
+}
diff --git a/src/shared/env-file-label.h b/src/shared/env-file-label.h
new file mode 100644
index 0000000..5ba45e4
--- /dev/null
+++ b/src/shared/env-file-label.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+/* These functions are split out of fileio.h (and not for example just flags to the functions they wrap) in order to
+ * optimize linking: This way, -lselinux is needed only for the callers of these functions that need selinux, but not
+ * for all */
+
+int write_env_file_label(int dir_fd, const char *fname, char **headers, char **l);
+
+int write_vconsole_conf_label(char **l);
diff --git a/src/shared/ethtool-link-mode.py b/src/shared/ethtool-link-mode.py
new file mode 100644
index 0000000..aac1576
--- /dev/null
+++ b/src/shared/ethtool-link-mode.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: LGPL-2.1-or-later
+# Prints the ethtool link-mode names found in a header, either as C array initializers or as XML table rows.
+import re
+import shlex
+import subprocess
+import sys
+# Names that must not simply be the lower-cased, dash-separated enum suffix.
+OVERRIDES = {
+ 'autoneg' : 'autonegotiation',
+}
+# Exactly three arguments are unpacked: <mode> <cpp command> <header>. Mode '--xml' selects XML output.
+mode, cpp, header = sys.argv[1:]
+xml = mode == '--xml'
+# Run the given preprocessor command with '-include <header> -' and an empty stdin, capturing its output as text.
+command = [*shlex.split(cpp), '-include', header, '-']
+out = subprocess.check_output(command, stdin=subprocess.DEVNULL, universal_newlines=True)
+# Skip ahead to the opening line of the enum; the shared iterator lets the second loop continue right after it.
+lines = iter(out.splitlines())
+for line in lines:
+ if line.startswith('enum ethtool_link_mode_bit_indices {'):
+ break
+# Collect one tuple per matching enumerator until the closing brace; non-matching lines are skipped.
+entries = []
+for line in lines:
+ if line.startswith('}'):
+ break
+ # ETHTOOL_LINK_MODE_10baseT_Half_BIT = 0,
+ m = re.match(r'^\s*(ETHTOOL_LINK_MODE_((\d*).*)_BIT)\s*=\s*(\d+),', line)
+ if not m:
+ continue
+ enum, name, speed, value = m.groups()
+# Groups: full enumerator, part between the prefix and _BIT, its leading digits (may be empty), numeric value.
+ name = name.lower().replace('_', '-')
+ name = OVERRIDES.get(name, name)
+# The last dash-separated component is the duplex when it is 'half' or 'full'; otherwise duplex stays empty.
+ duplex = name.split('-')[-1].lower()
+ if duplex not in {'half', 'full'}:
+ duplex = ''
+
+ entries += [(enum, name, speed, value, duplex)]
+
+if xml:
+ print(' <tbody>')
+# Sort key: numeric leading speed (entries without one sort last via 1e20), then duplex, name, value string.
+ entries.sort(key=lambda entry: (int(entry[2]) if entry[2] else 1e20, entry[4], entry[1], entry[3]))
+
+for enum, name, speed, value, duplex in entries:
+ if xml:
+ print(f'''\
+ <row><entry><option>{name}</option></entry>
+ <entry>{speed}</entry><entry>{duplex}</entry></row>
+ ''')
+ else:
+ enum = f'[{enum}]'
+ print(f' {enum:50} = "{name}",')
+
+if xml:
+ print(' </tbody>')
+# Sanity check on the parse: fewer than 99 entries raises AssertionError, after the output was already printed.
+assert len(entries) >= 99
diff --git a/src/shared/ethtool-util.c b/src/shared/ethtool-util.c
new file mode 100644
index 0000000..dce9e00
--- /dev/null
+++ b/src/shared/ethtool-util.c
@@ -0,0 +1,1423 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <net/if.h>
+#include <sys/ioctl.h>
+#include <linux/ethtool.h>
+#include <linux/netdevice.h>
+#include <linux/sockios.h>
+
+#include "conf-parser.h"
+#include "ethtool-util.h"
+#include "extract-word.h"
+#include "fd-util.h"
+#include "log.h"
+#include "memory-util.h"
+#include "socket-util.h"
+#include "string-table.h"
+#include "strv.h"
+#include "strxcpyx.h"
+
+static const char* const duplex_table[_DUP_MAX] = { /* Names for Duplex values, indexed by the enum constant. */
+ [DUP_FULL] = "full",
+ [DUP_HALF] = "half"
+};
+ /* Both macros below are given the "duplex" prefix and the Duplex type; presumably they generate the string<->enum helpers and a config parser that reports the given message — see string-table.h and conf-parser.h. */
+DEFINE_STRING_TABLE_LOOKUP(duplex, Duplex);
+DEFINE_CONFIG_PARSE_ENUM(config_parse_duplex, duplex, Duplex, "Failed to parse duplex setting");
+
+static const struct { /* Pairs each WAKE_* bit with its name; wol_options_to_string_alloc() emits names in this order. */
+ uint32_t opt;
+ const char *name;
+} wol_option_map[] = {
+ { WAKE_PHY, "phy" },
+ { WAKE_UCAST, "unicast", },
+ { WAKE_MCAST, "multicast", },
+ { WAKE_BCAST, "broadcast", },
+ { WAKE_ARP, "arp", },
+ { WAKE_MAGIC, "magic", },
+ { WAKE_MAGICSECURE, "secureon", },
+};
+
+/* Formats a Wake-on-LAN option mask as a comma-separated list of names from wol_option_map.
+ *
+ * opts == UINT32_MAX: *ret is set to NULL and 0 is returned.
+ * No known bit set:   *ret is a newly allocated "off" and 1 is returned.
+ * Otherwise:          *ret is the newly allocated list and 1 is returned.
+ * Returns -ENOMEM if an allocation fails; *ret is not written in that case. */
+int wol_options_to_string_alloc(uint32_t opts, char **ret) {
+        _cleanup_free_ char *joined = NULL;
+
+        assert(ret);
+
+        if (opts == UINT32_MAX) {
+                *ret = NULL;
+                return 0;
+        }
+
+        for (size_t n = 0; n < ELEMENTSOF(wol_option_map); n++) {
+                if ((opts & wol_option_map[n].opt) == 0)
+                        continue;
+
+                if (!strextend_with_separator(&joined, ",", wol_option_map[n].name))
+                        return -ENOMEM;
+        }
+
+        /* Nothing matched, so spell out that Wake-on-LAN is disabled. */
+        if (!joined) {
+                joined = strdup("off");
+                if (!joined)
+                        return -ENOMEM;
+        }
+
+        *ret = TAKE_PTR(joined);
+        return 1;
+}
+
+static const char* const port_table[] = { /* Names for NetDevPort values, indexed by the enum constant. */
+ [NET_DEV_PORT_TP] = "tp",
+ [NET_DEV_PORT_AUI] = "aui",
+ [NET_DEV_PORT_MII] = "mii",
+ [NET_DEV_PORT_FIBRE] = "fibre",
+ [NET_DEV_PORT_BNC] = "bnc",
+};
+ /* Same macro pair as used for duplex_table, here with the "port" prefix and the NetDevPort type. */
+DEFINE_STRING_TABLE_LOOKUP(port, NetDevPort);
+DEFINE_CONFIG_PARSE_ENUM(config_parse_port, port, NetDevPort, "Failed to parse Port setting");
+
+static const char* const mdi_table[] = { /* Names for the ETH_TP_MDI_* values, indexed by the constant. */
+ [ETH_TP_MDI_INVALID] = "unknown",
+ [ETH_TP_MDI] = "mdi",
+ [ETH_TP_MDI_X] = "mdi-x",
+ [ETH_TP_MDI_AUTO] = "auto",
+};
+ /* Only the to-string variant of the lookup macro is used here, with plain int as the value type. */
+DEFINE_STRING_TABLE_LOOKUP_TO_STRING(mdi, int);
+
+static const char* const netdev_feature_table[_NET_DEV_FEAT_MAX] = { /* Feature strings that ethtool_set_features() looks up in the device's ETH_SS_FEATURES string set. */
+ [NET_DEV_FEAT_SG] = "tx-scatter-gather",
+ [NET_DEV_FEAT_IP_CSUM] = "tx-checksum-ipv4",
+ [NET_DEV_FEAT_HW_CSUM] = "tx-checksum-ip-generic",
+ [NET_DEV_FEAT_IPV6_CSUM] = "tx-checksum-ipv6",
+ [NET_DEV_FEAT_HIGHDMA] = "highdma",
+ [NET_DEV_FEAT_FRAGLIST] = "tx-scatter-gather-fraglist",
+ [NET_DEV_FEAT_HW_VLAN_CTAG_TX] = "tx-vlan-hw-insert",
+ [NET_DEV_FEAT_HW_VLAN_CTAG_RX] = "rx-vlan-hw-parse",
+ [NET_DEV_FEAT_HW_VLAN_CTAG_FILTER] = "rx-vlan-filter",
+ [NET_DEV_FEAT_HW_VLAN_STAG_TX] = "tx-vlan-stag-hw-insert",
+ [NET_DEV_FEAT_HW_VLAN_STAG_RX] = "rx-vlan-stag-hw-parse",
+ [NET_DEV_FEAT_HW_VLAN_STAG_FILTER] = "rx-vlan-stag-filter",
+ [NET_DEV_FEAT_VLAN_CHALLENGED] = "vlan-challenged",
+ [NET_DEV_FEAT_GSO] = "tx-generic-segmentation",
+ [NET_DEV_FEAT_LLTX] = "tx-lockless",
+ [NET_DEV_FEAT_NETNS_LOCAL] = "netns-local",
+ [NET_DEV_FEAT_GRO] = "rx-gro",
+ [NET_DEV_FEAT_GRO_HW] = "rx-gro-hw",
+ [NET_DEV_FEAT_LRO] = "rx-lro",
+ [NET_DEV_FEAT_TSO] = "tx-tcp-segmentation",
+ [NET_DEV_FEAT_GSO_ROBUST] = "tx-gso-robust",
+ [NET_DEV_FEAT_TSO_ECN] = "tx-tcp-ecn-segmentation",
+ [NET_DEV_FEAT_TSO_MANGLEID] = "tx-tcp-mangleid-segmentation",
+ [NET_DEV_FEAT_TSO6] = "tx-tcp6-segmentation",
+ [NET_DEV_FEAT_FSO] = "tx-fcoe-segmentation",
+ [NET_DEV_FEAT_GSO_GRE] = "tx-gre-segmentation",
+ [NET_DEV_FEAT_GSO_GRE_CSUM] = "tx-gre-csum-segmentation",
+ [NET_DEV_FEAT_GSO_IPXIP4] = "tx-ipxip4-segmentation",
+ [NET_DEV_FEAT_GSO_IPXIP6] = "tx-ipxip6-segmentation",
+ [NET_DEV_FEAT_GSO_UDP_TUNNEL] = "tx-udp_tnl-segmentation",
+ [NET_DEV_FEAT_GSO_UDP_TUNNEL_CSUM] = "tx-udp_tnl-csum-segmentation",
+ [NET_DEV_FEAT_GSO_PARTIAL] = "tx-gso-partial",
+ [NET_DEV_FEAT_GSO_TUNNEL_REMCSUM] = "tx-tunnel-remcsum-segmentation",
+ [NET_DEV_FEAT_GSO_SCTP] = "tx-sctp-segmentation",
+ [NET_DEV_FEAT_GSO_ESP] = "tx-esp-segmentation",
+ [NET_DEV_FEAT_GSO_UDP_L4] = "tx-udp-segmentation",
+ [NET_DEV_FEAT_GSO_FRAGLIST] = "tx-gso-list",
+ [NET_DEV_FEAT_FCOE_CRC] = "tx-checksum-fcoe-crc",
+ [NET_DEV_FEAT_SCTP_CRC] = "tx-checksum-sctp",
+ [NET_DEV_FEAT_FCOE_MTU] = "fcoe-mtu",
+ [NET_DEV_FEAT_NTUPLE] = "rx-ntuple-filter",
+ [NET_DEV_FEAT_RXHASH] = "rx-hashing",
+ [NET_DEV_FEAT_RXCSUM] = "rx-checksum",
+ [NET_DEV_FEAT_NOCACHE_COPY] = "tx-nocache-copy",
+ [NET_DEV_FEAT_LOOPBACK] = "loopback",
+ [NET_DEV_FEAT_RXFCS] = "rx-fcs",
+ [NET_DEV_FEAT_RXALL] = "rx-all",
+ [NET_DEV_FEAT_HW_L2FW_DOFFLOAD] = "l2-fwd-offload",
+ [NET_DEV_FEAT_HW_TC] = "hw-tc-offload",
+ [NET_DEV_FEAT_HW_ESP] = "esp-hw-offload",
+ [NET_DEV_FEAT_HW_ESP_TX_CSUM] = "esp-tx-csum-hw-offload",
+ [NET_DEV_FEAT_RX_UDP_TUNNEL_PORT] = "rx-udp_tunnel-port-offload",
+ [NET_DEV_FEAT_HW_TLS_RECORD] = "tls-hw-record",
+ [NET_DEV_FEAT_HW_TLS_TX] = "tls-hw-tx-offload",
+ [NET_DEV_FEAT_HW_TLS_RX] = "tls-hw-rx-offload",
+ [NET_DEV_FEAT_GRO_FRAGLIST] = "rx-gro-list",
+ [NET_DEV_FEAT_HW_MACSEC] = "macsec-hw-offload",
+ [NET_DEV_FEAT_GRO_UDP_FWD] = "rx-udp-gro-forwarding",
+ [NET_DEV_FEAT_HW_HSR_TAG_INS] = "hsr-tag-ins-offload",
+ [NET_DEV_FEAT_HW_HSR_TAG_RM] = "hsr-tag-rm-offload",
+ [NET_DEV_FEAT_HW_HSR_FWD] = "hsr-fwd-offload",
+ [NET_DEV_FEAT_HW_HSR_DUP] = "hsr-dup-offload",
+ /* Entries at index _NET_DEV_FEAT_SIMPLE_MAX and above are matched as prefixes by set_features_multiple_bit(); the ones above are matched exactly by set_features_bit(). */
+ [NET_DEV_FEAT_TXCSUM] = "tx-checksum-", /* The suffix "-" means any feature beginning with "tx-checksum-" */
+};
+
+static const char* const ethtool_link_mode_bit_table[] = { /* Contents come from the included header; presumably generated by ethtool-link-mode.py, whose non-XML output has this [ENUM] = "name" shape — confirm in the build files. */
+# include "ethtool-link-mode.h"
+};
+/* Make sure the array is large enough to fit all bits */
+assert_cc((ELEMENTSOF(ethtool_link_mode_bit_table)-1) / 32 < N_ADVERTISE);
+ /* The check above requires the 32-bit word holding the highest table index to lie within N_ADVERTISE words. */
+DEFINE_STRING_TABLE_LOOKUP(ethtool_link_mode_bit, enum ethtool_link_mode_bit_indices);
+
+/* Makes sure *ethtool_fd refers to an open control socket. An already valid (non-negative)
+ * descriptor is left as is; otherwise a new one is obtained from socket_ioctl_fd() and stored.
+ * Returns 0 on success, or the logged negative error from socket_ioctl_fd(). */
+static int ethtool_connect(int *ethtool_fd) {
+        assert(ethtool_fd);
+
+        /* Only open a socket when the caller does not already hold one. */
+        if (*ethtool_fd < 0) {
+                int sock = socket_ioctl_fd();
+                if (sock < 0)
+                        return log_debug_errno(sock, "ethtool: could not create control socket: %m");
+
+                *ethtool_fd = sock;
+        }
+
+        return 0;
+}
+
+/* Looks up the driver name of interface ifname with ETHTOOL_GDRVINFO.
+ *
+ * On success returns 0 and stores a newly allocated copy of the name in *ret (caller frees).
+ * Returns -ENODATA if the reported name is empty, -ENOMEM if the copy cannot be allocated, or a
+ * negative errno from connecting or from the ioctl. *ret is only written on success. */
+int ethtool_get_driver(int *ethtool_fd, const char *ifname, char **ret) {
+        struct ethtool_drvinfo info = {
+                .cmd = ETHTOOL_GDRVINFO,
+        };
+        struct ifreq req = {
+                .ifr_data = (void*) &info,
+        };
+        char *name;
+        int r;
+
+        assert(ethtool_fd);
+        assert(ifname);
+        assert(ret);
+
+        r = ethtool_connect(ethtool_fd);
+        if (r < 0)
+                return r;
+
+        strscpy(req.ifr_name, sizeof(req.ifr_name), ifname);
+
+        r = RET_NERRNO(ioctl(*ethtool_fd, SIOCETHTOOL, &req));
+        if (r < 0)
+                return r;
+
+        if (isempty(info.driver))
+                return -ENODATA;
+
+        name = strdup(info.driver);
+        if (!name)
+                return -ENOMEM;
+
+        *ret = name;
+        return 0;
+}
+
+int ethtool_get_link_info(
+ int *ethtool_fd,
+ const char *ifname,
+ int *ret_autonegotiation,
+ uint64_t *ret_speed,
+ Duplex *ret_duplex,
+ NetDevPort *ret_port) {
+ /* Issues one ETHTOOL_GSET ioctl for ifname and copies out only the fields whose ret_* pointer is
+ * non-NULL. Returns 0 on success, or a negative errno from ethtool_connect() or the ioctl. */
+ struct ethtool_cmd ecmd = {
+ .cmd = ETHTOOL_GSET,
+ };
+ struct ifreq ifr = {
+ .ifr_data = (void*) &ecmd,
+ };
+ int r;
+
+ assert(ethtool_fd);
+ assert(ifname);
+
+ r = ethtool_connect(ethtool_fd);
+ if (r < 0)
+ return r;
+
+ strscpy(ifr.ifr_name, sizeof(ifr.ifr_name), ifname);
+
+ if (ioctl(*ethtool_fd, SIOCETHTOOL, &ifr) < 0)
+ return -errno;
+
+ if (ret_autonegotiation)
+ *ret_autonegotiation = ecmd.autoneg;
+
+ if (ret_speed) {
+ uint32_t speed;
+ /* SPEED_UNKNOWN is reported as UINT64_MAX; any other value is scaled by 1000 * 1000 (presumably Mbit/s to bit/s). */
+ speed = ethtool_cmd_speed(&ecmd);
+ *ret_speed = speed == (uint32_t) SPEED_UNKNOWN ?
+ UINT64_MAX : (uint64_t) speed * 1000 * 1000;
+ }
+ /* duplex and port are copied as the raw values the kernel reported, without range checks. */
+ if (ret_duplex)
+ *ret_duplex = ecmd.duplex;
+
+ if (ret_port)
+ *ret_port = ecmd.port;
+
+ return 0;
+}
+
+int ethtool_get_permanent_hw_addr(int *ethtool_fd, const char *ifname, struct hw_addr_data *ret) {
+ _cleanup_close_ int fd = -EBADF;
+ struct {
+ struct ethtool_perm_addr addr;
+ uint8_t space[HW_ADDR_MAX_SIZE];
+ } epaddr = {
+ .addr.cmd = ETHTOOL_GPERMADDR,
+ .addr.size = HW_ADDR_MAX_SIZE,
+ };
+ struct ifreq ifr = {
+ .ifr_data = (caddr_t) &epaddr,
+ };
+ int r;
+ /* Reads the permanent hardware address of ifname with ETHTOOL_GPERMADDR into *ret (length and bytes).
+ * Returns 0 on success, -ENODATA if the reported size is 0, -EINVAL if it exceeds HW_ADDR_MAX_SIZE,
+ * or a negative errno from connecting or from the ioctl. */
+ assert(ifname);
+ assert(ret);
+ /* ethtool_fd may be NULL here: the local fd is used instead, and is declared _cleanup_close_ for that case. */
+ if (!ethtool_fd)
+ ethtool_fd = &fd;
+ r = ethtool_connect(ethtool_fd);
+ if (r < 0)
+ return r;
+
+ strscpy(ifr.ifr_name, sizeof(ifr.ifr_name), ifname);
+
+ if (ioctl(*ethtool_fd, SIOCETHTOOL, &ifr) < 0)
+ return -errno;
+
+ if (epaddr.addr.size == 0)
+ return -ENODATA;
+ /* The request offered HW_ADDR_MAX_SIZE bytes of room; reject anything claiming more before copying. */
+ if (epaddr.addr.size > HW_ADDR_MAX_SIZE)
+ return -EINVAL;
+
+ ret->length = epaddr.addr.size;
+ memcpy(ret->bytes, epaddr.addr.data, epaddr.addr.size);
+ return 0;
+}
+
+#define UPDATE(dest, val, updated) \
+ do { \
+ typeof(val) _v = (val); \
+ if (dest != _v) \
+ updated = true; \
+ dest = _v; \
+ } while (false)
+ /* UPDATE (above) assigns val to dest and sets updated to true when the value differs; updated is never
+ * reset to false. UPDATE_WITH_MAX (below) does the same, but first replaces a value of 0, or one above
+ * max, with max. Both evaluate val once; dest is used unparenthesized, so pass a plain lvalue. */
+#define UPDATE_WITH_MAX(dest, max, val, updated) \
+ do { \
+ typeof(dest) _v = (val); \
+ typeof(dest) _max = (max); \
+ if (_v == 0 || _v > _max) \
+ _v = _max; \
+ if (dest != _v) \
+ updated = true; \
+ dest = _v; \
+ } while (false)
+
+int ethtool_set_wol(
+ int *ethtool_fd,
+ const char *ifname,
+ uint32_t wolopts,
+ const uint8_t password[SOPASS_MAX]) {
+ /* Applies Wake-on-LAN options and/or a SecureOn password to ifname. wolopts == UINT32_MAX means
+ * "no options given"; password may be NULL. Current settings are read first and a set request is
+ * only sent if something differs. Returns 0 on success or when nothing had to change, otherwise
+ * a negative errno. */
+ struct ethtool_wolinfo ecmd = {
+ .cmd = ETHTOOL_GWOL,
+ };
+ struct ifreq ifr = {
+ .ifr_data = (void*) &ecmd,
+ };
+ bool need_update = false;
+ int r;
+
+ assert(ethtool_fd);
+ assert(ifname);
+
+ if (wolopts == UINT32_MAX && !password)
+ /* Nothing requested. Return earlier. */
+ return 0;
+
+ r = ethtool_connect(ethtool_fd);
+ if (r < 0)
+ return r;
+
+ strscpy(ifr.ifr_name, sizeof(ifr.ifr_name), ifname);
+ /* ecmd may end up holding the password; CLEANUP_ERASE presumably wipes it on return — confirm in memory-util.h. */
+ CLEANUP_ERASE(ecmd);
+
+ if (ioctl(*ethtool_fd, SIOCETHTOOL, &ifr) < 0)
+ return -errno;
+
+ if (wolopts == UINT32_MAX) {
+ /* When password is specified without valid WoL options specified, then enable
+ * WAKE_MAGICSECURE flag if supported. */
+ wolopts = ecmd.wolopts;
+ if (password && FLAGS_SET(ecmd.supported, WAKE_MAGICSECURE))
+ wolopts |= WAKE_MAGICSECURE;
+ }
+ /* Requested bits the device does not list as supported are logged at debug level and then dropped. */
+ if ((wolopts & ~ecmd.supported) != 0) {
+ _cleanup_free_ char *str = NULL;
+
+ (void) wol_options_to_string_alloc(wolopts & ~ecmd.supported, &str);
+ log_debug("Network interface %s does not support requested Wake on LAN options \"%s\", ignoring.",
+ ifname, strna(str));
+
+ wolopts &= ecmd.supported;
+ }
+
+ if (!FLAGS_SET(wolopts, WAKE_MAGICSECURE))
+ /* When WAKE_MAGICSECURE flag is not set, then ignore password. */
+ password = NULL;
+ /* Compare against what the device reported so that an unchanged configuration sends no set request. */
+ UPDATE(ecmd.wolopts, wolopts, need_update);
+ if (password &&
+ memcmp(ecmd.sopass, password, sizeof(ecmd.sopass)) != 0) {
+ memcpy(ecmd.sopass, password, sizeof(ecmd.sopass));
+ need_update = true;
+ }
+
+ if (!need_update)
+ return 0;
+ /* Reuse the buffer filled by the get request, now carrying the modified fields. */
+ ecmd.cmd = ETHTOOL_SWOL;
+ return RET_NERRNO(ioctl(*ethtool_fd, SIOCETHTOOL, &ifr));
+}
+
+int ethtool_set_nic_buffer_size(int *ethtool_fd, const char *ifname, const netdev_ring_param *ring) {
+ struct ethtool_ringparam ecmd = {
+ .cmd = ETHTOOL_GRINGPARAM,
+ };
+ struct ifreq ifr = {
+ .ifr_data = (void*) &ecmd,
+ };
+ bool need_update = false;
+ int r;
+ /* Applies the ring sizes marked as set in *ring to ifname. Current values are read with
+ * ETHTOOL_GRINGPARAM and ETHTOOL_SRINGPARAM is only issued if a value changes. Returns 0 on
+ * success or when nothing had to change, otherwise a negative errno. */
+ assert(ethtool_fd);
+ assert(ifname);
+ assert(ring);
+ /* Nothing was requested, so do not even open the socket. */
+ if (!ring->rx.set &&
+ !ring->rx_mini.set &&
+ !ring->rx_jumbo.set &&
+ !ring->tx.set)
+ return 0;
+
+ r = ethtool_connect(ethtool_fd);
+ if (r < 0)
+ return r;
+
+ strscpy(ifr.ifr_name, sizeof(ifr.ifr_name), ifname);
+
+ if (ioctl(*ethtool_fd, SIOCETHTOOL, &ifr) < 0)
+ return -errno;
+ /* UPDATE_WITH_MAX turns a requested 0, or a value above the reported *_max_pending, into that maximum. */
+ if (ring->rx.set)
+ UPDATE_WITH_MAX(ecmd.rx_pending, ecmd.rx_max_pending, ring->rx.value, need_update);
+
+ if (ring->rx_mini.set)
+ UPDATE_WITH_MAX(ecmd.rx_mini_pending, ecmd.rx_mini_max_pending, ring->rx_mini.value, need_update);
+
+ if (ring->rx_jumbo.set)
+ UPDATE_WITH_MAX(ecmd.rx_jumbo_pending, ecmd.rx_jumbo_max_pending, ring->rx_jumbo.value, need_update);
+
+ if (ring->tx.set)
+ UPDATE_WITH_MAX(ecmd.tx_pending, ecmd.tx_max_pending, ring->tx.value, need_update);
+
+ if (!need_update)
+ return 0;
+
+ ecmd.cmd = ETHTOOL_SRINGPARAM;
+ return RET_NERRNO(ioctl(*ethtool_fd, SIOCETHTOOL, &ifr));
+}
+
+/* Fetches one ethtool string set of interface ifname.
+ *
+ * Two requests are made: ETHTOOL_GSSET_INFO to learn how many strings the set holds, then
+ * ETHTOOL_GSTRINGS into a buffer sized for that many ETH_GSTRING_LEN-byte entries.
+ *
+ * On success returns 0 and stores the newly allocated result in *ret (caller frees).
+ * Returns -EOPNOTSUPP if the kernel reports no such set or an empty one, -EOVERFLOW if the reported
+ * count is too large to size safely, -ENOMEM on allocation failure, or a negative errno from the
+ * ioctls. */
+static int get_stringset(int ethtool_fd, const char *ifname, enum ethtool_stringset stringset_id, struct ethtool_gstrings **ret) {
+        _cleanup_free_ struct ethtool_gstrings *strings = NULL;
+        struct {
+                struct ethtool_sset_info info;
+                uint32_t space;
+        } buffer = {
+                .info.cmd = ETHTOOL_GSSET_INFO,
+                .info.sset_mask = UINT64_C(1) << stringset_id,
+        };
+        struct ifreq ifr = {
+                .ifr_data = (void*) &buffer,
+        };
+        uint32_t len;
+
+        assert(ethtool_fd >= 0);
+        assert(ifname);
+        assert(ret);
+
+        strscpy(ifr.ifr_name, sizeof(ifr.ifr_name), ifname);
+
+        if (ioctl(ethtool_fd, SIOCETHTOOL, &ifr) < 0)
+                return -errno;
+
+        /* The kernel clears the mask bit of any set it does not provide. */
+        if (buffer.info.sset_mask == 0)
+                return -EOPNOTSUPP;
+
+        /* data[] is a trailing array of the inner struct; its first element lands in 'space'. */
+#pragma GCC diagnostic push
+#if HAVE_ZERO_LENGTH_BOUNDS
+#  pragma GCC diagnostic ignored "-Wzero-length-bounds"
+#endif
+        len = buffer.info.data[0];
+#pragma GCC diagnostic pop
+        if (len == 0)
+                return -EOPNOTSUPP;
+
+        /* len * ETH_GSTRING_LEN used to be computed in 32 bits, so a huge count would wrap and
+         * under-size the buffer the kernel then fills. Users of the result index it with 32-bit
+         * arithmetic as well, so refuse any count whose byte size does not fit in 32 bits. */
+        if (len > UINT32_MAX / ETH_GSTRING_LEN)
+                return -EOVERFLOW;
+
+        strings = malloc0(offsetof(struct ethtool_gstrings, data) + (size_t) len * ETH_GSTRING_LEN);
+        if (!strings)
+                return -ENOMEM;
+
+        strings->cmd = ETHTOOL_GSTRINGS;
+        strings->string_set = stringset_id;
+        strings->len = len;
+
+        ifr.ifr_data = (void*) strings;
+
+        if (ioctl(ethtool_fd, SIOCETHTOOL, &ifr) < 0)
+                return -errno;
+
+        *ret = TAKE_PTR(strings);
+        return 0;
+}
+
+/* Reads the feature bitmaps of interface ifname with ETHTOOL_GFEATURES.
+ *
+ * n_features is the number of feature bits (must be > 0); the request is sized for
+ * DIV_ROUND_UP(n_features, 32) blocks. On success returns 0 and stores the newly allocated
+ * result in *ret (caller frees). Returns -ENOMEM or a negative errno from the ioctl. */
+static int get_features(int ethtool_fd, const char *ifname, uint32_t n_features, struct ethtool_gfeatures **ret) {
+        _cleanup_free_ struct ethtool_gfeatures *result = NULL;
+        struct ifreq req = {};
+        uint32_t n_blocks;
+
+        assert(ethtool_fd >= 0);
+        assert(ifname);
+        assert(ret);
+        assert(n_features > 0);
+
+        /* One block covers 32 feature bits. */
+        n_blocks = DIV_ROUND_UP(n_features, 32U);
+
+        result = malloc0(offsetof(struct ethtool_gfeatures, features) +
+                         n_blocks * sizeof(result->features[0]));
+        if (!result)
+                return -ENOMEM;
+
+        result->cmd = ETHTOOL_GFEATURES;
+        result->size = n_blocks;
+
+        req.ifr_data = (void*) result;
+        strscpy(req.ifr_name, sizeof(req.ifr_name), ifname);
+
+        if (ioctl(ethtool_fd, SIOCETHTOOL, &req) < 0)
+                return -errno;
+
+        *ret = TAKE_PTR(result);
+        return 0;
+}
+
+/* Records the request to switch one exactly-named feature on or off.
+ *
+ * The first entry of 'strings' equal to 'feature' (compared over at most ETH_GSTRING_LEN bytes)
+ * selects the bit; that bit is marked valid in 'sfeatures' and its requested state set from 'flag'.
+ *
+ * Returns 0 if the bit was recorded or if flag < 0 (nothing requested), -EOPNOTSUPP if the device
+ * reports the feature as unavailable or never changeable, -ENODATA if no entry matches. */
+static int set_features_bit(
+                const struct ethtool_gstrings *strings,
+                const struct ethtool_gfeatures *gfeatures,
+                struct ethtool_sfeatures *sfeatures,
+                const char *feature,
+                int flag) {
+
+        assert(strings);
+        assert(gfeatures);
+        assert(sfeatures);
+        assert(feature);
+
+        if (flag < 0)
+                return 0;
+
+        for (uint32_t idx = 0; idx < strings->len; idx++) {
+                const char *candidate = (const char*) &strings->data[idx * ETH_GSTRING_LEN];
+                uint32_t word = idx / 32, bit = UINT32_C(1) << (idx % 32);
+
+                if (strneq(candidate, feature, ETH_GSTRING_LEN)) {
+                        /* Only bits that are available and not pinned may be requested. */
+                        if (!(FLAGS_SET(gfeatures->features[word].available, bit) &&
+                              !FLAGS_SET(gfeatures->features[word].never_changed, bit)))
+                                return -EOPNOTSUPP;
+
+                        sfeatures->features[word].valid |= bit;
+                        SET_FLAG(sfeatures->features[word].requested, bit, flag);
+
+                        return 0;
+                }
+        }
+
+        return -ENODATA;
+}
+
+static int set_features_multiple_bit(
+ const struct ethtool_gstrings *strings,
+ const struct ethtool_gfeatures *gfeatures,
+ struct ethtool_sfeatures *sfeatures,
+ const char *feature,
+ int flag) {
+ /* Like set_features_bit(), but 'feature' is a prefix: every entry of 'strings' starting with it is
+ * considered. Returns 0 if at least one bit was recorded or if flag < 0; otherwise -EOPNOTSUPP if
+ * some match was unavailable or never changeable, else -ENODATA. */
+ bool found = false;
+ int r = -ENODATA;
+
+ assert(strings);
+ assert(gfeatures);
+ assert(sfeatures);
+ assert(feature);
+
+ if (flag < 0)
+ return 0;
+
+ for (uint32_t i = 0; i < strings->len; i++) {
+ uint32_t block, mask;
+
+ if (!startswith((const char*) &strings->data[i * ETH_GSTRING_LEN], feature))
+ continue;
+ /* Feature i lives in 32-bit block i / 32, at bit i % 32. */
+ block = i / 32;
+ mask = UINT32_C(1) << (i % 32);
+ /* Unlike the exact-match variant, an unsupported match does not end the search; it is only remembered in r. */
+ if (!FLAGS_SET(gfeatures->features[block].available, mask) ||
+ FLAGS_SET(gfeatures->features[block].never_changed, mask)) {
+ r = -EOPNOTSUPP;
+ continue;
+ }
+
+ /* This bit was already marked valid, e.g. explicitly by set_features_bit(); leave it alone. */
+ if (FLAGS_SET(sfeatures->features[block].valid, mask))
+ continue;
+
+ sfeatures->features[block].valid |= mask;
+ SET_FLAG(sfeatures->features[block].requested, mask, flag);
+
+ found = true;
+ }
+
+ return found ? 0 : r;
+}
+
+/* Applies the requested offload feature settings to interface ifname.
+ *
+ * features[] has one entry per netdev_feature_table index: a negative value leaves that feature
+ * untouched, any other value requests it off (0) or on (non-zero). If every entry is negative the
+ * function returns 0 without opening a socket.
+ *
+ * Features that cannot be resolved or changed are logged at debug level and skipped; the remaining
+ * ones are sent in a single ETHTOOL_SFEATURES request. Returns 0 on success, otherwise a negative
+ * errno (all failures are logged at debug level). */
+int ethtool_set_features(int *ethtool_fd, const char *ifname, const int features[static _NET_DEV_FEAT_MAX]) {
+        _cleanup_free_ struct ethtool_gstrings *strings = NULL;
+        _cleanup_free_ struct ethtool_gfeatures *gfeatures = NULL;
+        _cleanup_free_ struct ethtool_sfeatures *sfeatures = NULL;
+        struct ifreq ifr;
+        bool have = false;
+        int r;
+
+        assert(ethtool_fd);
+        assert(ifname);
+        assert(features);
+
+        for (size_t i = 0; i < _NET_DEV_FEAT_MAX; i++)
+                if (features[i] >= 0) {
+                        have = true;
+                        break;
+                }
+
+        if (!have)
+                return 0;
+
+        r = ethtool_connect(ethtool_fd);
+        if (r < 0)
+                return r;
+
+        /* The feature names give the bit position of each feature on this particular device. */
+        r = get_stringset(*ethtool_fd, ifname, ETH_SS_FEATURES, &strings);
+        if (r < 0)
+                return log_debug_errno(r, "ethtool: could not get ethtool feature strings: %m");
+
+        r = get_features(*ethtool_fd, ifname, strings->len, &gfeatures);
+        if (r < 0)
+                return log_debug_errno(r, "ethtool: could not get ethtool features for %s: %m", ifname);
+
+        sfeatures = malloc0(offsetof(struct ethtool_sfeatures, features) +
+                            DIV_ROUND_UP(strings->len, 32U) * sizeof(sfeatures->features[0]));
+        if (!sfeatures)
+                return log_oom_debug();
+
+        sfeatures->cmd = ETHTOOL_SFEATURES;
+        sfeatures->size = DIV_ROUND_UP(strings->len, 32U);
+
+        /* Exactly-named features go first, so that the prefix entries handled afterwards do not
+         * override a bit that was requested explicitly. */
+        for (size_t i = 0; i < _NET_DEV_FEAT_SIMPLE_MAX; i++) {
+                r = set_features_bit(strings, gfeatures, sfeatures, netdev_feature_table[i], features[i]);
+                if (r < 0)
+                        log_debug_errno(r, "ethtool: could not set feature %s for %s, ignoring: %m", netdev_feature_table[i], ifname);
+        }
+
+        for (size_t i = _NET_DEV_FEAT_SIMPLE_MAX; i < _NET_DEV_FEAT_MAX; i++) {
+                r = set_features_multiple_bit(strings, gfeatures, sfeatures, netdev_feature_table[i], features[i]);
+                if (r < 0)
+                        log_debug_errno(r, "ethtool: could not set feature %s for %s, ignoring: %m", netdev_feature_table[i], ifname);
+        }
+
+        ifr = (struct ifreq) {
+                .ifr_data = (void*) sfeatures,
+        };
+        strscpy(ifr.ifr_name, sizeof(ifr.ifr_name), ifname);
+
+        /* The message now ends in ": %m" like its siblings, so the errno passed in is actually shown. */
+        if (ioctl(*ethtool_fd, SIOCETHTOOL, &ifr) < 0)
+                return log_debug_errno(errno, "ethtool: could not set ethtool features for %s: %m", ifname);
+
+        return 0;
+}
+
+static int get_glinksettings(int fd, struct ifreq *ifr, struct ethtool_link_usettings **ret) {
+ struct ecmd {
+ struct ethtool_link_settings req;
+ uint32_t link_mode_data[3 * ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NU32];
+ } ecmd = {
+ .req.cmd = ETHTOOL_GLINKSETTINGS,
+ };
+ struct ethtool_link_usettings *u;
+ unsigned offset;
+ /* Reads link settings with ETHTOOL_GLINKSETTINGS. On success returns 0 and stores a newly allocated
+ * struct in *ret (caller frees). Returns -EOPNOTSUPP if either reply is not of the expected form,
+ * -ENOMEM, or a negative errno from the ioctls. ifr->ifr_data is overwritten. */
+ assert(fd >= 0);
+ assert(ifr);
+ assert(ret);
+
+ /* The interaction user/kernel via the new API requires a small ETHTOOL_GLINKSETTINGS
+ handshake first to agree on the length of the link mode bitmaps. If kernel doesn't
+ agree with user, it returns the bitmap length it is expecting from user as a negative
+ length (and cmd field is 0). When kernel and user agree, kernel returns valid info in
+ all fields (ie. link mode length > 0 and cmd is ETHTOOL_GLINKSETTINGS). Based on
+ https://github.com/torvalds/linux/commit/3f1ac7a700d039c61d8d8b99f28d605d489a60cf
+ */
+
+ ifr->ifr_data = (void *) &ecmd;
+ /* First request: the word count is still 0, so a negative count is the only acceptable answer. */
+ if (ioctl(fd, SIOCETHTOOL, ifr) < 0)
+ return -errno;
+
+ if (ecmd.req.link_mode_masks_nwords >= 0 || ecmd.req.cmd != ETHTOOL_GLINKSETTINGS)
+ return -EOPNOTSUPP;
+ /* Second request: ask again with the length the kernel proposed, sign flipped. */
+ ecmd.req.link_mode_masks_nwords = -ecmd.req.link_mode_masks_nwords;
+
+ ifr->ifr_data = (void *) &ecmd;
+
+ if (ioctl(fd, SIOCETHTOOL, ifr) < 0)
+ return -errno;
+
+ if (ecmd.req.link_mode_masks_nwords <= 0 || ecmd.req.cmd != ETHTOOL_GLINKSETTINGS)
+ return -EOPNOTSUPP;
+
+ u = new(struct ethtool_link_usettings, 1);
+ if (!u)
+ return -ENOMEM;
+
+ *u = (struct ethtool_link_usettings) {
+ .base = ecmd.req,
+ };
+ /* link_mode_data holds three consecutive bitmaps of nwords 32-bit words each: supported, advertising, lp_advertising. */
+ offset = 0;
+ memcpy(u->link_modes.supported, &ecmd.link_mode_data[offset], 4 * ecmd.req.link_mode_masks_nwords);
+
+ offset += ecmd.req.link_mode_masks_nwords;
+ memcpy(u->link_modes.advertising, &ecmd.link_mode_data[offset], 4 * ecmd.req.link_mode_masks_nwords);
+
+ offset += ecmd.req.link_mode_masks_nwords;
+ memcpy(u->link_modes.lp_advertising, &ecmd.link_mode_data[offset], 4 * ecmd.req.link_mode_masks_nwords);
+
+ *ret = u;
+
+ return 0;
+}
+
+static int get_gset(int fd, struct ifreq *ifr, struct ethtool_link_usettings **ret) {
+ struct ethtool_link_usettings *e;
+ struct ethtool_cmd ecmd = {
+ .cmd = ETHTOOL_GSET,
+ };
+ /* Reads link settings with the legacy ETHTOOL_GSET command and converts them into the same
+ * ethtool_link_usettings form that get_glinksettings() produces. On success returns 0 and stores
+ * a newly allocated struct in *ret (caller frees); otherwise -ENOMEM or a negative errno. */
+ assert(fd >= 0);
+ assert(ifr);
+ assert(ret);
+
+ ifr->ifr_data = (void *) &ecmd;
+
+ if (ioctl(fd, SIOCETHTOOL, ifr) < 0)
+ return -errno;
+
+ e = new(struct ethtool_link_usettings, 1);
+ if (!e)
+ return -ENOMEM;
+ /* base.cmd is set to ETHTOOL_GSET so that the caller can later pick the matching setter; the legacy masks are one word each. */
+ *e = (struct ethtool_link_usettings) {
+ .base.cmd = ETHTOOL_GSET,
+ .base.link_mode_masks_nwords = 1,
+ .base.speed = ethtool_cmd_speed(&ecmd),
+ .base.duplex = ecmd.duplex,
+ .base.port = ecmd.port,
+ .base.phy_address = ecmd.phy_address,
+ .base.autoneg = ecmd.autoneg,
+ .base.mdio_support = ecmd.mdio_support,
+ .base.eth_tp_mdix = ecmd.eth_tp_mdix,
+ .base.eth_tp_mdix_ctrl = ecmd.eth_tp_mdix_ctrl,
+
+ .link_modes.supported[0] = ecmd.supported,
+ .link_modes.advertising[0] = ecmd.advertising,
+ .link_modes.lp_advertising[0] = ecmd.lp_advertising,
+ };
+
+ *ret = e;
+
+ return 0;
+}
+
+static int set_slinksettings(int fd, struct ifreq *ifr, const struct ethtool_link_usettings *u) {
+ struct {
+ struct ethtool_link_settings req;
+ uint32_t link_mode_data[3 * ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NU32];
+ } ecmd = {};
+ unsigned offset;
+ /* Writes *u back with ETHTOOL_SLINKSETTINGS. Returns -EINVAL unless *u was obtained via
+ * ETHTOOL_GLINKSETTINGS with a positive word count; otherwise 0 or a negative errno from the ioctl. */
+ assert(fd >= 0);
+ assert(ifr);
+ assert(u);
+
+ if (u->base.cmd != ETHTOOL_GLINKSETTINGS || u->base.link_mode_masks_nwords <= 0)
+ return -EINVAL;
+ /* Pack the three bitmaps back to back, nwords 32-bit words each, in the order supported, advertising, lp_advertising. */
+ ecmd.req = u->base;
+ ecmd.req.cmd = ETHTOOL_SLINKSETTINGS;
+ offset = 0;
+ memcpy(&ecmd.link_mode_data[offset], u->link_modes.supported, 4 * ecmd.req.link_mode_masks_nwords);
+
+ offset += ecmd.req.link_mode_masks_nwords;
+ memcpy(&ecmd.link_mode_data[offset], u->link_modes.advertising, 4 * ecmd.req.link_mode_masks_nwords);
+
+ offset += ecmd.req.link_mode_masks_nwords;
+ memcpy(&ecmd.link_mode_data[offset], u->link_modes.lp_advertising, 4 * ecmd.req.link_mode_masks_nwords);
+
+ ifr->ifr_data = (void *) &ecmd;
+
+ return RET_NERRNO(ioctl(fd, SIOCETHTOOL, ifr));
+}
+
+static int set_sset(int fd, struct ifreq *ifr, const struct ethtool_link_usettings *u) {
+ struct ethtool_cmd ecmd = {
+ .cmd = ETHTOOL_SSET,
+ };
+ /* Writes *u back with the legacy ETHTOOL_SSET command. Returns -EINVAL unless *u was obtained via
+ * ETHTOOL_GSET with a positive word count; otherwise 0 or a negative errno from the ioctl. */
+ assert(fd >= 0);
+ assert(ifr);
+ assert(u);
+
+ if (u->base.cmd != ETHTOOL_GSET || u->base.link_mode_masks_nwords <= 0)
+ return -EINVAL;
+ /* Only the first word of each link mode bitmap is carried over into the legacy structure. */
+ ecmd.supported = u->link_modes.supported[0];
+ ecmd.advertising = u->link_modes.advertising[0];
+ ecmd.lp_advertising = u->link_modes.lp_advertising[0];
+
+ ethtool_cmd_speed_set(&ecmd, u->base.speed);
+
+ ecmd.duplex = u->base.duplex;
+ ecmd.port = u->base.port;
+ ecmd.phy_address = u->base.phy_address;
+ ecmd.autoneg = u->base.autoneg;
+ ecmd.mdio_support = u->base.mdio_support;
+ ecmd.eth_tp_mdix = u->base.eth_tp_mdix;
+ ecmd.eth_tp_mdix_ctrl = u->base.eth_tp_mdix_ctrl;
+
+ ifr->ifr_data = (void *) &ecmd;
+
+ return RET_NERRNO(ioctl(fd, SIOCETHTOOL, ifr));
+}
+
+int ethtool_set_glinksettings(
+ int *fd,
+ const char *ifname,
+ int autonegotiation,
+ const uint32_t advertise[static N_ADVERTISE],
+ uint64_t speed,
+ Duplex duplex,
+ NetDevPort port,
+ uint8_t mdi) {
+ /* Applies link settings to ifname. "Unset" is expressed as: autonegotiation < 0, advertise all
+ * zero, speed == 0, duplex < 0, port < 0, mdi == ETH_TP_MDI_INVALID. Current settings are read
+ * first and written back only if something changes. Returns 0 on success or when nothing had to
+ * change, otherwise a negative errno. */
+ _cleanup_free_ struct ethtool_link_usettings *u = NULL;
+ struct ifreq ifr = {};
+ bool changed = false;
+ int r;
+
+ assert(fd);
+ assert(ifname);
+ assert(advertise);
+ /* Everything unset: return without opening the socket. */
+ if (autonegotiation < 0 && memeqzero(advertise, sizeof(uint32_t) * N_ADVERTISE) &&
+ speed == 0 && duplex < 0 && port < 0 && mdi == ETH_TP_MDI_INVALID)
+ return 0;
+
+ /* If autonegotiation is disabled, the speed and duplex represent the fixed link mode and are
+ * writable if the driver supports multiple link modes. If it is enabled then they are
+ * read-only. If the link is up they represent the negotiated link mode; if the link is down,
+ * the speed is 0, %SPEED_UNKNOWN or the highest enabled speed and @duplex is %DUPLEX_UNKNOWN
+ * or the best enabled duplex mode. */
+
+ if (speed > 0 || duplex >= 0 || port >= 0) {
+ if (autonegotiation == AUTONEG_ENABLE || !memeqzero(advertise, sizeof(uint32_t) * N_ADVERTISE)) {
+ log_debug("ethtool: autonegotiation is enabled, ignoring speed, duplex, or port settings.");
+ speed = 0;
+ duplex = _DUP_INVALID;
+ port = _NET_DEV_PORT_INVALID;
+ } else {
+ log_debug("ethtool: setting speed, duplex, or port, disabling autonegotiation.");
+ autonegotiation = AUTONEG_DISABLE;
+ }
+ }
+
+ r = ethtool_connect(fd);
+ if (r < 0)
+ return r;
+
+ strscpy(ifr.ifr_name, sizeof(ifr.ifr_name), ifname);
+ /* Try ETHTOOL_GLINKSETTINGS first and fall back to the legacy ETHTOOL_GSET on any failure. */
+ r = get_glinksettings(*fd, &ifr, &u);
+ if (r < 0) {
+ r = get_gset(*fd, &ifr, &u);
+ if (r < 0)
+ return log_debug_errno(r, "ethtool: Cannot get device settings for %s: %m", ifname);
+ }
+ /* speed is divided by 1000000, rounding up, before it is stored in base.speed. */
+ if (speed > 0)
+ UPDATE(u->base.speed, DIV_ROUND_UP(speed, 1000000), changed);
+
+ if (duplex >= 0)
+ UPDATE(u->base.duplex, duplex, changed);
+
+ if (port >= 0)
+ UPDATE(u->base.port, port, changed);
+
+ if (autonegotiation >= 0)
+ UPDATE(u->base.autoneg, autonegotiation, changed);
+ /* A non-empty advertise mask forces autonegotiation on. The first N_ADVERTISE words of the advertising
+ * bitmap are replaced and the rest of its ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NBYTES bytes are cleared. */
+ if (!memeqzero(advertise, sizeof(uint32_t) * N_ADVERTISE)) {
+ UPDATE(u->base.autoneg, AUTONEG_ENABLE, changed);
+
+ changed = changed ||
+ memcmp(&u->link_modes.advertising, advertise, sizeof(uint32_t) * N_ADVERTISE) != 0 ||
+ !memeqzero((uint8_t*) &u->link_modes.advertising + sizeof(uint32_t) * N_ADVERTISE,
+ ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NBYTES - sizeof(uint32_t) * N_ADVERTISE);
+ memcpy(&u->link_modes.advertising, advertise, sizeof(uint32_t) * N_ADVERTISE);
+ memzero((uint8_t*) &u->link_modes.advertising + sizeof(uint32_t) * N_ADVERTISE,
+ ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NBYTES - sizeof(uint32_t) * N_ADVERTISE);
+ }
+ /* A reported eth_tp_mdix_ctrl of ETH_TP_MDI_INVALID is taken to mean the device cannot change MDI. */
+ if (mdi != ETH_TP_MDI_INVALID) {
+ if (u->base.eth_tp_mdix_ctrl == ETH_TP_MDI_INVALID)
+ log_debug("ethtool: setting MDI not supported for %s, ignoring.", ifname);
+ else
+ UPDATE(u->base.eth_tp_mdix_ctrl, mdi, changed);
+ }
+
+ if (!changed)
+ return 0;
+ /* base.cmd records which getter filled *u; write back through the matching setter. */
+ if (u->base.cmd == ETHTOOL_GLINKSETTINGS)
+ r = set_slinksettings(*fd, &ifr, u);
+ else
+ r = set_sset(*fd, &ifr, u);
+ if (r < 0)
+ return log_debug_errno(r, "ethtool: Cannot set device settings for %s: %m", ifname);
+
+ return r;
+}
+
+int ethtool_set_channels(int *fd, const char *ifname, const netdev_channels *channels) {
+ struct ethtool_channels ecmd = {
+ .cmd = ETHTOOL_GCHANNELS,
+ };
+ struct ifreq ifr = {
+ .ifr_data = (void*) &ecmd,
+ };
+ bool need_update = false;
+ int r;
+ /* Applies the channel counts marked as set in *channels to ifname. Current values are read with
+ * ETHTOOL_GCHANNELS and ETHTOOL_SCHANNELS is only issued if a value changes. Returns 0 on success
+ * or when nothing had to change, otherwise a negative errno. */
+ assert(fd);
+ assert(ifname);
+ assert(channels);
+ /* Nothing was requested, so do not even open the socket. */
+ if (!channels->rx.set &&
+ !channels->tx.set &&
+ !channels->other.set &&
+ !channels->combined.set)
+ return 0;
+
+ r = ethtool_connect(fd);
+ if (r < 0)
+ return r;
+
+ strscpy(ifr.ifr_name, sizeof(ifr.ifr_name), ifname);
+
+ if (ioctl(*fd, SIOCETHTOOL, &ifr) < 0)
+ return -errno;
+ /* UPDATE_WITH_MAX turns a requested 0, or a value above the reported max_*, into that maximum. */
+ if (channels->rx.set)
+ UPDATE_WITH_MAX(ecmd.rx_count, ecmd.max_rx, channels->rx.value, need_update);
+
+ if (channels->tx.set)
+ UPDATE_WITH_MAX(ecmd.tx_count, ecmd.max_tx, channels->tx.value, need_update);
+
+ if (channels->other.set)
+ UPDATE_WITH_MAX(ecmd.other_count, ecmd.max_other, channels->other.value, need_update);
+
+ if (channels->combined.set)
+ UPDATE_WITH_MAX(ecmd.combined_count, ecmd.max_combined, channels->combined.value, need_update);
+
+ if (!need_update)
+ return 0;
+
+ ecmd.cmd = ETHTOOL_SCHANNELS;
+ return RET_NERRNO(ioctl(*fd, SIOCETHTOOL, &ifr));
+}
+
+int ethtool_set_flow_control(int *fd, const char *ifname, int rx, int tx, int autoneg) {
+ struct ethtool_pauseparam ecmd = {
+ .cmd = ETHTOOL_GPAUSEPARAM,
+ };
+ struct ifreq ifr = {
+ .ifr_data = (void*) &ecmd,
+ };
+ bool need_update = false;
+ int r;
+ /* Applies pause-frame settings to ifname. Each of rx, tx and autoneg is left untouched when negative.
+ * Current values are read with ETHTOOL_GPAUSEPARAM and ETHTOOL_SPAUSEPARAM is only issued if a value
+ * changes. Returns 0 on success or when nothing had to change, otherwise a negative errno. */
+ assert(fd);
+ assert(ifname);
+ /* Nothing was requested, so do not even open the socket. */
+ if (rx < 0 && tx < 0 && autoneg < 0)
+ return 0;
+
+ r = ethtool_connect(fd);
+ if (r < 0)
+ return r;
+
+ strscpy(ifr.ifr_name, sizeof(ifr.ifr_name), ifname);
+
+ if (ioctl(*fd, SIOCETHTOOL, &ifr) < 0)
+ return -errno;
+ /* The non-negative int is stored as given (cast to uint32_t); it is not normalized to 0 or 1 here. */
+ if (rx >= 0)
+ UPDATE(ecmd.rx_pause, (uint32_t) rx, need_update);
+
+ if (tx >= 0)
+ UPDATE(ecmd.tx_pause, (uint32_t) tx, need_update);
+
+ if (autoneg >= 0)
+ UPDATE(ecmd.autoneg, (uint32_t) autoneg, need_update);
+
+ if (!need_update)
+ return 0;
+
+ ecmd.cmd = ETHTOOL_SPAUSEPARAM;
+ return RET_NERRNO(ioctl(*fd, SIOCETHTOOL, &ifr));
+}
+
+int ethtool_set_nic_coalesce_settings(int *ethtool_fd, const char *ifname, const netdev_coalesce_param *coalesce) {
+ struct ethtool_coalesce ecmd = {
+ .cmd = ETHTOOL_GCOALESCE,
+ };
+ struct ifreq ifr = {
+ .ifr_data = (void*) &ecmd,
+ };
+ bool need_update = false;
+ int r;
+
+ assert(ethtool_fd);
+ assert(ifname);
+ assert(coalesce);
+
+ if (coalesce->use_adaptive_rx_coalesce < 0 &&
+ coalesce->use_adaptive_tx_coalesce < 0 &&
+ !coalesce->rx_coalesce_usecs.set &&
+ !coalesce->rx_max_coalesced_frames.set &&
+ !coalesce->rx_coalesce_usecs_irq.set &&
+ !coalesce->rx_max_coalesced_frames_irq.set &&
+ !coalesce->tx_coalesce_usecs.set &&
+ !coalesce->tx_max_coalesced_frames.set &&
+ !coalesce->tx_coalesce_usecs_irq.set &&
+ !coalesce->tx_max_coalesced_frames_irq.set &&
+ !coalesce->stats_block_coalesce_usecs.set &&
+ !coalesce->pkt_rate_low.set &&
+ !coalesce->rx_coalesce_usecs_low.set &&
+ !coalesce->rx_max_coalesced_frames_low.set &&
+ !coalesce->tx_coalesce_usecs_low.set &&
+ !coalesce->tx_max_coalesced_frames_low.set &&
+ !coalesce->pkt_rate_high.set &&
+ !coalesce->rx_coalesce_usecs_high.set &&
+ !coalesce->rx_max_coalesced_frames_high.set &&
+ !coalesce->tx_coalesce_usecs_high.set &&
+ !coalesce->tx_max_coalesced_frames_high.set &&
+ !coalesce->rate_sample_interval.set)
+ return 0;
+
+ r = ethtool_connect(ethtool_fd);
+ if (r < 0)
+ return r;
+
+ strscpy(ifr.ifr_name, sizeof(ifr.ifr_name), ifname);
+
+ if (ioctl(*ethtool_fd, SIOCETHTOOL, &ifr) < 0)
+ return -errno;
+
+ if (coalesce->use_adaptive_rx_coalesce >= 0)
+ UPDATE(ecmd.use_adaptive_rx_coalesce, (uint32_t) coalesce->use_adaptive_rx_coalesce, need_update);
+
+ if (coalesce->use_adaptive_tx_coalesce >= 0)
+ UPDATE(ecmd.use_adaptive_tx_coalesce, (uint32_t) coalesce->use_adaptive_tx_coalesce, need_update);
+
+ if (coalesce->rx_coalesce_usecs.set)
+ UPDATE(ecmd.rx_coalesce_usecs, coalesce->rx_coalesce_usecs.value, need_update);
+
+ if (coalesce->rx_max_coalesced_frames.set)
+ UPDATE(ecmd.rx_max_coalesced_frames, coalesce->rx_max_coalesced_frames.value, need_update);
+
+ if (coalesce->rx_coalesce_usecs_irq.set)
+ UPDATE(ecmd.rx_coalesce_usecs_irq, coalesce->rx_coalesce_usecs_irq.value, need_update);
+
+ if (coalesce->rx_max_coalesced_frames_irq.set)
+ UPDATE(ecmd.rx_max_coalesced_frames_irq, coalesce->rx_max_coalesced_frames_irq.value, need_update);
+
+ if (coalesce->tx_coalesce_usecs.set)
+ UPDATE(ecmd.tx_coalesce_usecs, coalesce->tx_coalesce_usecs.value, need_update);
+
+ if (coalesce->tx_max_coalesced_frames.set)
+ UPDATE(ecmd.tx_max_coalesced_frames, coalesce->tx_max_coalesced_frames.value, need_update);
+
+ if (coalesce->tx_coalesce_usecs_irq.set)
+ UPDATE(ecmd.tx_coalesce_usecs_irq, coalesce->tx_coalesce_usecs_irq.value, need_update);
+
+ if (coalesce->tx_max_coalesced_frames_irq.set)
+ UPDATE(ecmd.tx_max_coalesced_frames_irq, coalesce->tx_max_coalesced_frames_irq.value, need_update);
+
+ if (coalesce->stats_block_coalesce_usecs.set)
+ UPDATE(ecmd.stats_block_coalesce_usecs, coalesce->stats_block_coalesce_usecs.value, need_update);
+
+ if (coalesce->pkt_rate_low.set)
+ UPDATE(ecmd.pkt_rate_low, coalesce->pkt_rate_low.value, need_update);
+
+ if (coalesce->rx_coalesce_usecs_low.set)
+ UPDATE(ecmd.rx_coalesce_usecs_low, coalesce->rx_coalesce_usecs_low.value, need_update);
+
+ if (coalesce->rx_max_coalesced_frames_low.set)
+ UPDATE(ecmd.rx_max_coalesced_frames_low, coalesce->rx_max_coalesced_frames_low.value, need_update);
+
+ if (coalesce->tx_coalesce_usecs_low.set)
+ UPDATE(ecmd.tx_coalesce_usecs_low, coalesce->tx_coalesce_usecs_low.value, need_update);
+
+ if (coalesce->tx_max_coalesced_frames_low.set)
+ UPDATE(ecmd.tx_max_coalesced_frames_low, coalesce->tx_max_coalesced_frames_low.value, need_update);
+
+ if (coalesce->pkt_rate_high.set)
+ UPDATE(ecmd.pkt_rate_high, coalesce->pkt_rate_high.value, need_update);
+
+ if (coalesce->rx_coalesce_usecs_high.set)
+ UPDATE(ecmd.rx_coalesce_usecs_high, coalesce->rx_coalesce_usecs_high.value, need_update);
+
+ if (coalesce->rx_max_coalesced_frames_high.set)
+ UPDATE(ecmd.rx_max_coalesced_frames_high, coalesce->rx_max_coalesced_frames_high.value, need_update);
+
+ if (coalesce->tx_coalesce_usecs_high.set)
+ UPDATE(ecmd.tx_coalesce_usecs_high, coalesce->tx_coalesce_usecs_high.value, need_update);
+
+ if (coalesce->tx_max_coalesced_frames_high.set)
+ UPDATE(ecmd.tx_max_coalesced_frames_high, coalesce->tx_max_coalesced_frames_high.value, need_update);
+
+ if (coalesce->rate_sample_interval.set)
+ UPDATE(ecmd.rate_sample_interval, DIV_ROUND_UP(coalesce->rate_sample_interval.value, USEC_PER_SEC), need_update);
+
+ if (!need_update)
+ return 0;
+
+ ecmd.cmd = ETHTOOL_SCOALESCE;
+ return RET_NERRNO(ioctl(*ethtool_fd, SIOCETHTOOL, &ifr));
+}
+
+/* Config parser for a list of link modes to advertise.
+ *
+ * 'data' points to an array of N_ADVERTISE uint32_t words used as a bitmap indexed by
+ * enum ethtool_link_mode_bit_indices. An empty value clears the bitmap. Words are split off with
+ * extract_first_word(); words that do not name a known mode are logged and skipped. Returns 0,
+ * except on allocation failure where the result of log_oom() is returned. */
+int config_parse_advertise(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        uint32_t *bitmap = ASSERT_PTR(data);
+        const char *cursor;
+
+        assert(filename);
+        assert(section);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                /* An empty assignment drops every mode collected so far. */
+                memzero(bitmap, sizeof(uint32_t) * N_ADVERTISE);
+                return 0;
+        }
+
+        cursor = rvalue;
+        for (;;) {
+                _cleanup_free_ char *word = NULL;
+                enum ethtool_link_mode_bit_indices bit;
+                int r;
+
+                r = extract_first_word(&cursor, &word, NULL, 0);
+                if (r == 0)
+                        break;
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Failed to split advertise modes '%s', ignoring assignment: %m", rvalue);
+                        return 0;
+                }
+
+                bit = ethtool_link_mode_bit_from_string(word);
+                /* The kernel enum has no negative enumerators, hence the comparison must be done on an
+                 * explicitly signed value; otherwise a failed lookup would slip through and be used as
+                 * an array index below. */
+                if ((int) bit < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, bit,
+                                   "Failed to parse advertise mode, ignoring: %s", word);
+                        continue;
+                }
+
+                bitmap[bit / 32] |= 1UL << (bit % 32);
+        }
+
+        return 0;
+}
+
+/* Config parser for the MDI/MDI-X setting; stores one of the ETH_TP_MDI_* constants in the uint8_t
+ * that 'data' points to. An empty value selects ETH_TP_MDI_INVALID. Unknown values are logged and
+ * leave the destination untouched. Always returns 0. */
+int config_parse_mdi(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        uint8_t *target = ASSERT_PTR(data);
+
+        assert(filename);
+        assert(rvalue);
+
+        if (isempty(rvalue))
+                *target = ETH_TP_MDI_INVALID;
+        else if (STR_IN_SET(rvalue, "mdi", "straight"))
+                *target = ETH_TP_MDI;
+        else if (STR_IN_SET(rvalue, "mdi-x", "mdix", "crossover"))
+                *target = ETH_TP_MDI_X;
+        else if (streq(rvalue, "auto"))
+                *target = ETH_TP_MDI_AUTO;
+        else
+                log_syntax(unit, LOG_WARNING, filename, line, 0,
+                           "Failed to parse %s= setting, ignoring assignment: %s", lvalue, rvalue);
+
+        return 0;
+}
+
+/* Config parser for ring buffer sizes and channel counts, stored in a u32_opt.
+ *
+ * An empty value marks the option as unset. The literal "max" marks it as set with value 0.
+ * Otherwise the value must parse as an unsigned 32-bit integer of at least 1; anything else is
+ * logged and ignored. Always returns 0. */
+int config_parse_ring_buffer_or_channel(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        u32_opt *dst = ASSERT_PTR(data);
+        uint32_t parsed;
+        int r;
+
+        assert(filename);
+        assert(section);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                *dst = (u32_opt) { .value = 0, .set = false };
+                return 0;
+        }
+
+        if (streq(rvalue, "max")) {
+                *dst = (u32_opt) { .value = 0, .set = true };
+                return 0;
+        }
+
+        r = safe_atou32(rvalue, &parsed);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to parse %s=, ignoring: %s", lvalue, rvalue);
+                return 0;
+        }
+
+        /* Zero is reserved for "max", so it is not accepted as an explicit number. */
+        if (parsed == 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0,
+                           "Invalid %s= value, ignoring: %s", lvalue, rvalue);
+                return 0;
+        }
+
+        *dst = (u32_opt) { .value = parsed, .set = true };
+        return 0;
+}
+
+/* Config parser for wake-on-lan modes; 'data' points to a uint32_t option mask.
+ *
+ * An empty value stores UINT32_MAX (meaning: do not touch the WOL option), "off" stores 0.
+ * Otherwise each word is looked up in wol_option_map and the matching bits are collected; unknown
+ * words are logged and skipped. The collected bits replace a UINT32_MAX destination and are OR-ed
+ * into any other previous value. Returns 0, or the result of log_oom() on allocation failure. */
+int config_parse_wol(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        uint32_t *opts = data, collected = 0;
+
+        assert(filename);
+        assert(section);
+        assert(lvalue);
+        assert(rvalue);
+        assert(data);
+
+        if (isempty(rvalue)) {
+                *opts = UINT32_MAX; /* Do not update WOL option. */
+                return 0;
+        }
+
+        if (streq(rvalue, "off")) {
+                *opts = 0; /* Disable WOL. */
+                return 0;
+        }
+
+        for (const char *cursor = rvalue;;) {
+                _cleanup_free_ char *word = NULL;
+                size_t i;
+                int r;
+
+                r = extract_first_word(&cursor, &word, NULL, 0);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Failed to split wake-on-lan modes '%s', ignoring assignment: %m", rvalue);
+                        return 0;
+                }
+                if (r == 0)
+                        break;
+
+                /* Linear search; i ends up past the table when nothing matched. */
+                for (i = 0; i < ELEMENTSOF(wol_option_map); i++)
+                        if (streq(word, wol_option_map[i].name))
+                                break;
+
+                if (i >= ELEMENTSOF(wol_option_map)) {
+                        log_syntax(unit, LOG_WARNING, filename, line, 0,
+                                   "Unknown wake-on-lan mode '%s', ignoring.", word);
+                        continue;
+                }
+
+                collected |= wol_option_map[i].opt;
+        }
+
+        *opts = *opts == UINT32_MAX ? collected : (*opts | collected);
+
+        return 0;
+}
+
+/* Config parser for plain unsigned 32-bit coalesce settings, stored in a u32_opt.
+ *
+ * An empty value resets the option to unset (value 0). A value that does not parse as an unsigned
+ * 32-bit integer is logged and ignored. Always returns 0. */
+int config_parse_coalesce_u32(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Validate the arguments up front, like the other parsers in this file, instead of
+         * dereferencing 'data' unchecked. */
+        u32_opt *dst = ASSERT_PTR(data);
+        uint32_t k;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                dst->value = 0;
+                dst->set = false;
+                return 0;
+        }
+
+        r = safe_atou32(rvalue, &k);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to parse %s=, ignoring: %s", lvalue, rvalue);
+                return 0;
+        }
+
+        dst->value = k;
+        dst->set = true;
+        return 0;
+}
+
+/* Config parser for time-valued coalesce settings, stored in a u32_opt.
+ *
+ * The value is parsed with parse_sec() and stored as-is (truncated to uint32_t after a range
+ * check). An empty value resets the option to unset. Values above UINT32_MAX are rejected, and for
+ * StatisticsBlockCoalesceSec= and CoalescePacketRateSampleIntervalSec= a value below 1 is rejected
+ * too. Rejected or unparsable values are logged and ignored. Always returns 0. */
+int config_parse_coalesce_sec(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        /* Validate the arguments up front, like the other parsers in this file, instead of
+         * dereferencing 'data' and passing 'lvalue' to STR_IN_SET() unchecked. */
+        u32_opt *dst = ASSERT_PTR(data);
+        usec_t usec;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                dst->value = 0;
+                dst->set = false;
+                return 0;
+        }
+
+        r = parse_sec(rvalue, &usec);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to parse coalesce setting value, ignoring: %s", rvalue);
+                return 0;
+        }
+
+        /* The destination is only 32 bits wide. */
+        if (usec > UINT32_MAX) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0,
+                           "Too large %s= value, ignoring: %s", lvalue, rvalue);
+                return 0;
+        }
+
+        if (STR_IN_SET(lvalue, "StatisticsBlockCoalesceSec", "CoalescePacketRateSampleIntervalSec") && usec < 1) {
+                log_syntax(unit, LOG_WARNING, filename, line, 0,
+                           "Invalid %s= value, ignoring: %s", lvalue, rvalue);
+                return 0;
+        }
+
+        dst->value = (uint32_t) usec;
+        dst->set = true;
+
+        return 0;
+}
diff --git a/src/shared/ethtool-util.h b/src/shared/ethtool-util.h
new file mode 100644
index 0000000..5303cd7
--- /dev/null
+++ b/src/shared/ethtool-util.h
@@ -0,0 +1,205 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <macro.h>
+#include <net/ethernet.h>
+#include <linux/ethtool.h>
+
+#include "conf-parser.h"
+#include "ether-addr-util.h"
+
+#define N_ADVERTISE 4
+
+/* we can't use DUPLEX_ prefix, as it
+ * clashes with <linux/ethtool.h>.
+ *
+ * The two real values are taken directly from the kernel's DUPLEX_HALF/DUPLEX_FULL constants;
+ * _DUP_INVALID is -EINVAL. */
+typedef enum Duplex {
+ DUP_HALF = DUPLEX_HALF,
+ DUP_FULL = DUPLEX_FULL,
+ _DUP_MAX,
+ _DUP_INVALID = -EINVAL,
+} Duplex;
+
+typedef enum NetDevFeature { /* Indices of the network device features handled here; ethtool_set_features() takes an int array with _NET_DEV_FEAT_MAX entries. */
+ NET_DEV_FEAT_SG,
+ NET_DEV_FEAT_IP_CSUM,
+ NET_DEV_FEAT_HW_CSUM,
+ NET_DEV_FEAT_IPV6_CSUM,
+ NET_DEV_FEAT_HIGHDMA,
+ NET_DEV_FEAT_FRAGLIST,
+ NET_DEV_FEAT_HW_VLAN_CTAG_TX,
+ NET_DEV_FEAT_HW_VLAN_CTAG_RX,
+ NET_DEV_FEAT_HW_VLAN_CTAG_FILTER,
+ NET_DEV_FEAT_HW_VLAN_STAG_TX,
+ NET_DEV_FEAT_HW_VLAN_STAG_RX,
+ NET_DEV_FEAT_HW_VLAN_STAG_FILTER,
+ NET_DEV_FEAT_VLAN_CHALLENGED,
+ NET_DEV_FEAT_GSO,
+ NET_DEV_FEAT_LLTX,
+ NET_DEV_FEAT_NETNS_LOCAL,
+ NET_DEV_FEAT_GRO,
+ NET_DEV_FEAT_GRO_HW,
+ NET_DEV_FEAT_LRO,
+ NET_DEV_FEAT_TSO,
+ NET_DEV_FEAT_GSO_ROBUST,
+ NET_DEV_FEAT_TSO_ECN,
+ NET_DEV_FEAT_TSO_MANGLEID,
+ NET_DEV_FEAT_TSO6,
+ NET_DEV_FEAT_FSO,
+ NET_DEV_FEAT_GSO_GRE,
+ NET_DEV_FEAT_GSO_GRE_CSUM,
+ NET_DEV_FEAT_GSO_IPXIP4,
+ NET_DEV_FEAT_GSO_IPXIP6,
+ NET_DEV_FEAT_GSO_UDP_TUNNEL,
+ NET_DEV_FEAT_GSO_UDP_TUNNEL_CSUM,
+ NET_DEV_FEAT_GSO_PARTIAL,
+ NET_DEV_FEAT_GSO_TUNNEL_REMCSUM,
+ NET_DEV_FEAT_GSO_SCTP,
+ NET_DEV_FEAT_GSO_ESP,
+ NET_DEV_FEAT_GSO_UDP_L4,
+ NET_DEV_FEAT_GSO_FRAGLIST,
+ NET_DEV_FEAT_FCOE_CRC,
+ NET_DEV_FEAT_SCTP_CRC,
+ NET_DEV_FEAT_FCOE_MTU,
+ NET_DEV_FEAT_NTUPLE,
+ NET_DEV_FEAT_RXHASH,
+ NET_DEV_FEAT_RXCSUM,
+ NET_DEV_FEAT_NOCACHE_COPY,
+ NET_DEV_FEAT_LOOPBACK,
+ NET_DEV_FEAT_RXFCS,
+ NET_DEV_FEAT_RXALL,
+ NET_DEV_FEAT_HW_L2FW_DOFFLOAD,
+ NET_DEV_FEAT_HW_TC,
+ NET_DEV_FEAT_HW_ESP,
+ NET_DEV_FEAT_HW_ESP_TX_CSUM,
+ NET_DEV_FEAT_RX_UDP_TUNNEL_PORT,
+ NET_DEV_FEAT_HW_TLS_RECORD,
+ NET_DEV_FEAT_HW_TLS_TX,
+ NET_DEV_FEAT_HW_TLS_RX,
+ NET_DEV_FEAT_GRO_FRAGLIST,
+ NET_DEV_FEAT_HW_MACSEC,
+ NET_DEV_FEAT_GRO_UDP_FWD,
+ NET_DEV_FEAT_HW_HSR_TAG_INS,
+ NET_DEV_FEAT_HW_HSR_TAG_RM,
+ NET_DEV_FEAT_HW_HSR_FWD,
+ NET_DEV_FEAT_HW_HSR_DUP,
+ _NET_DEV_FEAT_SIMPLE_MAX, /* number of entries above this line */
+
+ NET_DEV_FEAT_TXCSUM = _NET_DEV_FEAT_SIMPLE_MAX, /* first entry past the "simple" ones; NOTE(review): presumably handled specially by the implementation - confirm in ethtool-util.c */
+ _NET_DEV_FEAT_MAX,
+ _NET_DEV_FEAT_INVALID = -EINVAL,
+} NetDevFeature;
+
+typedef enum NetDevPort { /* Port types; every value is taken directly from the PORT_* constants of <linux/ethtool.h>. */
+ NET_DEV_PORT_TP = PORT_TP,
+ NET_DEV_PORT_AUI = PORT_AUI,
+ NET_DEV_PORT_MII = PORT_MII,
+ NET_DEV_PORT_FIBRE = PORT_FIBRE,
+ NET_DEV_PORT_BNC = PORT_BNC,
+ NET_DEV_PORT_DA = PORT_DA,
+ NET_DEV_PORT_NONE = PORT_NONE,
+ NET_DEV_PORT_OTHER = PORT_OTHER,
+ _NET_DEV_PORT_MAX, /* one past PORT_OTHER */
+ _NET_DEV_PORT_INVALID = -EINVAL,
+} NetDevPort;
+
+#define ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NU32 (SCHAR_MAX) /* number of uint32_t words reserved per link mode mask below */
+#define ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NBYTES (4 * ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NU32) /* the same size expressed in bytes */
+
+/* layout of the struct passed from/to userland:
+ * the kernel's struct ethtool_link_settings header, followed by three link mode masks, each
+ * sized for ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NU32 words. */
+struct ethtool_link_usettings {
+ struct ethtool_link_settings base;
+
+ struct {
+ uint32_t supported[ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NU32];
+ uint32_t advertising[ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NU32];
+ uint32_t lp_advertising[ETHTOOL_LINK_MODE_MASK_MAX_KERNEL_NU32];
+ } link_modes;
+};
+
+typedef struct u32_opt { /* An optional uint32_t: 'value' is only meaningful while 'set' is true. */
+ uint32_t value; /* a value of 0 indicates the hardware advertised maximum should be used. */
+ bool set; /* false: option not configured, leave the device setting alone */
+} u32_opt;
+
+typedef struct netdev_channels { /* Optional channel counts, taken by ethtool_set_channels(). */
+ u32_opt rx;
+ u32_opt tx;
+ u32_opt other;
+ u32_opt combined;
+} netdev_channels;
+
+typedef struct netdev_ring_param { /* Optional ring buffer sizes, taken by ethtool_set_nic_buffer_size(). */
+ u32_opt rx;
+ u32_opt rx_mini;
+ u32_opt rx_jumbo;
+ u32_opt tx;
+} netdev_ring_param;
+
+typedef struct netdev_coalesce_param { /* Optional coalescing settings, taken by ethtool_set_nic_coalesce_settings(); member names follow struct ethtool_coalesce. */
+ u32_opt rx_coalesce_usecs;
+ u32_opt rx_max_coalesced_frames;
+ u32_opt rx_coalesce_usecs_irq;
+ u32_opt rx_max_coalesced_frames_irq;
+ u32_opt tx_coalesce_usecs;
+ u32_opt tx_max_coalesced_frames;
+ u32_opt tx_coalesce_usecs_irq;
+ u32_opt tx_max_coalesced_frames_irq;
+ u32_opt stats_block_coalesce_usecs;
+ int use_adaptive_rx_coalesce; /* negative: not configured; otherwise cast to uint32_t and applied */
+ int use_adaptive_tx_coalesce; /* negative: not configured; otherwise cast to uint32_t and applied */
+ u32_opt pkt_rate_low;
+ u32_opt rx_coalesce_usecs_low;
+ u32_opt rx_max_coalesced_frames_low;
+ u32_opt tx_coalesce_usecs_low;
+ u32_opt tx_max_coalesced_frames_low;
+ u32_opt pkt_rate_high;
+ u32_opt rx_coalesce_usecs_high;
+ u32_opt rx_max_coalesced_frames_high;
+ u32_opt tx_coalesce_usecs_high;
+ u32_opt tx_max_coalesced_frames_high;
+ u32_opt rate_sample_interval; /* divided by USEC_PER_SEC (rounded up) when applied */
+} netdev_coalesce_param;
+
+int ethtool_get_driver(int *ethtool_fd, const char *ifname, char **ret);
+int ethtool_get_link_info(int *ethtool_fd, const char *ifname,
+ int *ret_autonegotiation, uint64_t *ret_speed,
+ Duplex *ret_duplex, NetDevPort *ret_port);
+int ethtool_get_permanent_hw_addr(int *ethtool_fd, const char *ifname, struct hw_addr_data *ret);
+int ethtool_set_wol(int *ethtool_fd, const char *ifname, uint32_t wolopts, const uint8_t password[SOPASS_MAX]);
+int ethtool_set_nic_buffer_size(int *ethtool_fd, const char *ifname, const netdev_ring_param *ring);
+int ethtool_set_features(int *ethtool_fd, const char *ifname, const int features[static _NET_DEV_FEAT_MAX]);
+int ethtool_set_glinksettings(
+ int *fd,
+ const char *ifname,
+ int autonegotiation,
+ const uint32_t advertise[static N_ADVERTISE],
+ uint64_t speed,
+ Duplex duplex,
+ NetDevPort port,
+ uint8_t mdi);
+int ethtool_set_channels(int *ethtool_fd, const char *ifname, const netdev_channels *channels);
+int ethtool_set_flow_control(int *fd, const char *ifname, int rx, int tx, int autoneg);
+int ethtool_set_nic_coalesce_settings(int *ethtool_fd, const char *ifname, const netdev_coalesce_param *coalesce);
+
+const char *duplex_to_string(Duplex d) _const_;
+Duplex duplex_from_string(const char *d) _pure_;
+
+int wol_options_to_string_alloc(uint32_t opts, char **ret);
+
+const char *port_to_string(NetDevPort port) _const_;
+NetDevPort port_from_string(const char *port) _pure_;
+
+const char *mdi_to_string(int mdi) _const_;
+
+const char *ethtool_link_mode_bit_to_string(enum ethtool_link_mode_bit_indices val) _const_;
+enum ethtool_link_mode_bit_indices ethtool_link_mode_bit_from_string(const char *str) _pure_;
+
+CONFIG_PARSER_PROTOTYPE(config_parse_duplex);
+CONFIG_PARSER_PROTOTYPE(config_parse_wol);
+CONFIG_PARSER_PROTOTYPE(config_parse_port);
+CONFIG_PARSER_PROTOTYPE(config_parse_mdi);
+CONFIG_PARSER_PROTOTYPE(config_parse_advertise);
+CONFIG_PARSER_PROTOTYPE(config_parse_ring_buffer_or_channel);
+CONFIG_PARSER_PROTOTYPE(config_parse_coalesce_u32);
+CONFIG_PARSER_PROTOTYPE(config_parse_coalesce_sec);
+CONFIG_PARSER_PROTOTYPE(config_parse_nic_coalesce_setting);
diff --git a/src/shared/exec-util.c b/src/shared/exec-util.c
new file mode 100644
index 0000000..c27f3a5
--- /dev/null
+++ b/src/shared/exec-util.c
@@ -0,0 +1,605 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <dirent.h>
+#include <errno.h>
+#include <sys/prctl.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <stdio.h>
+
+#include "alloc-util.h"
+#include "conf-files.h"
+#include "env-file.h"
+#include "env-util.h"
+#include "errno-util.h"
+#include "escape.h"
+#include "exec-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "hashmap.h"
+#include "macro.h"
+#include "missing_syscall.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "serialize.h"
+#include "set.h"
+#include "signal-util.h"
+#include "stat-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "strv.h"
+#include "terminal-util.h"
+#include "tmpfile-util.h"
+
+#define EXIT_SKIP_REMAINING 77 /* exit status with which an executable asks do_execute() to skip the remaining ones; only honoured with EXEC_DIR_SKIP_REMAINING */
+
+/* Put this test here for a lack of better place */
+assert_cc(EAGAIN == EWOULDBLOCK);
+
+/* Forks off a child that executes 'path' with 'argv'; argv[0] is always overridden with 'path'.
+ *
+ * Returns 0 without forking if null_or_empty_path() reports the path as empty (a mask), 1 with the
+ * child's PID stored in *pid once the child has been forked, and a negative value if safe_fork()
+ * fails. If 'stdout_fd' is valid, the child's stdout is redirected to it before executing. */
+static int do_spawn(const char *path, char *argv[], int stdout_fd, pid_t *pid, bool set_systemd_exec_pid) {
+        char *fallback_argv[2];
+        pid_t child;
+        int r;
+
+        if (null_or_empty_path(path) > 0) {
+                log_debug("%s is empty (a mask).", path);
+                return 0;
+        }
+
+        r = safe_fork("(direxec)", FORK_DEATHSIG_SIGTERM|FORK_LOG|FORK_RLIMIT_NOFILE_SAFE, &child);
+        if (r < 0)
+                return r;
+        if (r > 0) {
+                /* Parent: hand the PID back to the caller. */
+                *pid = child;
+                return 1;
+        }
+
+        /* Child from here on; every path below ends in execv() or _exit(). */
+
+        if (stdout_fd >= 0 &&
+            rearrange_stdio(STDIN_FILENO, TAKE_FD(stdout_fd), STDERR_FILENO) < 0)
+                _exit(EXIT_FAILURE);
+
+        if (set_systemd_exec_pid) {
+                r = setenv_systemd_exec_pid(false);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to set $SYSTEMD_EXEC_PID, ignoring: %m");
+        }
+
+        if (argv)
+                argv[0] = (char*) path;
+        else {
+                /* No argument vector supplied: synthesize one that only names the executable. */
+                fallback_argv[0] = (char*) path;
+                fallback_argv[1] = NULL;
+                argv = fallback_argv;
+        }
+
+        execv(path, argv);
+        log_error_errno(errno, "Failed to execute %s: %m", path);
+        _exit(EXIT_FAILURE);
+}
+
+static int do_execute(
+ char* const* paths,
+ const char *root,
+ usec_t timeout,
+ gather_stdout_callback_t const callbacks[_STDOUT_CONSUME_MAX],
+ void* const callback_args[_STDOUT_CONSUME_MAX],
+ int output_fd,
+ char *argv[],
+ char *envp[],
+ ExecDirFlags flags) {
+
+ _cleanup_hashmap_free_free_ Hashmap *pids = NULL;
+ bool parallel_execution;
+ int r;
+
+ /* Runs every entry of 'paths' (each joined onto 'root') via do_spawn() and waits for them.
+ * Returns 0 on success, a negative errno-style value on internal failure, or the positive
+ * exit status of a failed executable unless EXEC_DIR_IGNORE_ERRORS is set. With callbacks,
+ * each executable's stdout is captured in a serialization fd and handed to
+ * callbacks[STDOUT_GENERATE]; at the end callbacks[STDOUT_COLLECT] is invoked on 'output_fd'.
+ *
+ * We fork this all off from a child process so that we can somewhat cleanly make
+ * use of SIGALRM to set a time limit.
+ *
+ * We attempt to perform parallel execution if configured by the user, however
+ * if `callbacks` is nonnull, execution must be serial.
+ */
+ parallel_execution = FLAGS_SET(flags, EXEC_DIR_PARALLEL) && !callbacks;
+
+ if (parallel_execution) {
+ pids = hashmap_new(NULL); /* maps child PID -> path string; only populated in parallel mode */
+ if (!pids)
+ return log_oom();
+ }
+
+ /* Abort execution of this process after the timeout. We simply rely on SIGALRM as
+ * default action terminating the process, and turn on alarm(). */
+
+ if (timeout != USEC_INFINITY)
+ alarm(DIV_ROUND_UP(timeout, USEC_PER_SEC));
+
+ STRV_FOREACH(e, envp)
+ if (putenv(*e) != 0)
+ return log_error_errno(errno, "Failed to set environment variable: %m");
+
+ STRV_FOREACH(path, paths) {
+ _cleanup_free_ char *t = NULL;
+ _cleanup_close_ int fd = -EBADF; /* per-executable serialization fd, only opened when callbacks are in use */
+ pid_t pid;
+
+ t = path_join(root, *path);
+ if (!t)
+ return log_oom();
+
+ if (callbacks) {
+ _cleanup_free_ char *bn = NULL;
+
+ r = path_extract_filename(*path, &bn);
+ if (r < 0)
+ return log_error_errno(r, "Failed to extract filename from path '%s': %m", *path);
+
+ fd = open_serialization_fd(bn);
+ if (fd < 0)
+ return log_error_errno(fd, "Failed to open serialization file: %m");
+ }
+
+ if (DEBUG_LOGGING) {
+ _cleanup_free_ char *args = NULL;
+ if (argv)
+ args = quote_command_line(strv_skip(argv, 1), SHELL_ESCAPE_EMPTY);
+
+ log_debug("About to execute %s%s%s", t, argv ? " " : "", argv ? strnull(args) : "");
+ }
+
+ r = do_spawn(t, argv, fd, &pid, FLAGS_SET(flags, EXEC_DIR_SET_SYSTEMD_EXEC_PID));
+ if (r <= 0) /* masked path (0) or fork failure (< 0): move on to the next executable */
+ continue;
+
+ if (parallel_execution) {
+ r = hashmap_put(pids, PID_TO_PTR(pid), t);
+ if (r < 0)
+ return log_oom();
+ t = NULL; /* the string now belongs to the hashmap; disarm the local cleanup */
+ } else {
+ bool skip_remaining = false;
+
+ r = wait_for_terminate_and_check(t, pid, WAIT_LOG_ABNORMAL);
+ if (r < 0)
+ return r;
+ if (r > 0) { /* positive: the executable's non-zero exit status */
+ if (FLAGS_SET(flags, EXEC_DIR_SKIP_REMAINING) && r == EXIT_SKIP_REMAINING) {
+ log_info("%s succeeded with exit status %i, not executing remaining executables.", *path, r);
+ skip_remaining = true;
+ } else if (FLAGS_SET(flags, EXEC_DIR_IGNORE_ERRORS))
+ log_warning("%s failed with exit status %i, ignoring.", *path, r);
+ else {
+ log_error("%s failed with exit status %i.", *path, r);
+ return r;
+ }
+ }
+
+ if (callbacks) {
+ if (lseek(fd, 0, SEEK_SET) < 0) /* rewind so the callback reads the captured output from the start */
+ return log_error_errno(errno, "Failed to seek on serialization fd: %m");
+
+ r = callbacks[STDOUT_GENERATE](TAKE_FD(fd), callback_args[STDOUT_GENERATE]);
+ if (r < 0)
+ return log_error_errno(r, "Failed to process output from %s: %m", *path);
+ }
+
+ if (skip_remaining) /* checked only after the output of the skipping executable was processed */
+ break;
+ }
+ }
+
+ if (callbacks) {
+ r = callbacks[STDOUT_COLLECT](output_fd, callback_args[STDOUT_COLLECT]);
+ if (r < 0)
+ return log_error_errno(r, "Callback two failed: %m");
+ }
+
+ while (!hashmap_isempty(pids)) { /* reap whatever was queued in parallel mode */
+ _cleanup_free_ char *t = NULL;
+ pid_t pid;
+
+ pid = PTR_TO_PID(hashmap_first_key(pids));
+ assert(pid > 0);
+
+ t = hashmap_remove(pids, PID_TO_PTR(pid));
+ assert(t);
+
+ r = wait_for_terminate_and_check(t, pid, WAIT_LOG);
+ if (r < 0)
+ return r;
+ if (!FLAGS_SET(flags, EXEC_DIR_IGNORE_ERRORS) && r > 0)
+ return r;
+ }
+
+ return 0;
+}
+
+/* Executes all binaries listed in 'paths' from a forked-off helper process and waits for it.
+ *
+ * Returns 0 if 'paths' is empty or everything succeeded, a negative value on internal failure, or
+ * the helper's positive exit status unless EXEC_DIR_IGNORE_ERRORS is set. If 'callbacks' is given,
+ * all three callbacks must be set and 'name' is used to open the serialization fd that carries the
+ * helper's collected output back to callbacks[STDOUT_CONSUME] in this process. */
+int execute_strv(
+                const char *name,
+                char* const* paths,
+                const char *root,
+                usec_t timeout,
+                gather_stdout_callback_t const callbacks[_STDOUT_CONSUME_MAX],
+                void* const callback_args[_STDOUT_CONSUME_MAX],
+                char *argv[],
+                char *envp[],
+                ExecDirFlags flags) {
+
+        _cleanup_close_ int collect_fd = -EBADF;
+        pid_t helper;
+        int r;
+
+        /* The two flags may not be combined. */
+        assert(!FLAGS_SET(flags, EXEC_DIR_PARALLEL | EXEC_DIR_SKIP_REMAINING));
+
+        if (strv_isempty(paths))
+                return 0;
+
+        if (callbacks) {
+                assert(name);
+                assert(callback_args);
+                assert(callbacks[STDOUT_GENERATE]);
+                assert(callbacks[STDOUT_COLLECT]);
+                assert(callbacks[STDOUT_CONSUME]);
+
+                collect_fd = open_serialization_fd(name);
+                if (collect_fd < 0)
+                        return log_error_errno(collect_fd, "Failed to open serialization file: %m");
+        }
+
+        /* Executes all binaries in the directories serially or in parallel and waits for
+         * them to finish. Optionally a timeout is applied. If a file with the same name
+         * exists in more than one directory, the earliest one wins. */
+
+        r = safe_fork("(sd-exec-strv)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|FORK_LOG, &helper);
+        if (r < 0)
+                return r;
+        if (r == 0) {
+                int status;
+
+                /* Helper process: run everything, then report the outcome via the exit status. */
+                status = do_execute(paths, root, timeout, callbacks, callback_args, collect_fd, argv, envp, flags);
+                if (status < 0)
+                        _exit(EXIT_FAILURE);
+                _exit(status);
+        }
+
+        r = wait_for_terminate_and_check("(sd-exec-strv)", helper, 0);
+        if (r < 0)
+                return r;
+        if (r > 0 && !FLAGS_SET(flags, EXEC_DIR_IGNORE_ERRORS))
+                return r;
+
+        if (!callbacks)
+                return 0;
+
+        if (lseek(collect_fd, 0, SEEK_SET) < 0)
+                return log_error_errno(errno, "Failed to rewind serialization fd: %m");
+
+        r = callbacks[STDOUT_CONSUME](TAKE_FD(collect_fd), callback_args[STDOUT_CONSUME]);
+        if (r < 0)
+                return log_error_errno(r, "Failed to parse returned data: %m");
+
+        return 0;
+}
+
+/* Enumerates the executables found in 'directories' with conf_files_list_strv() and runs them
+ * through execute_strv(). When callbacks are used, the file name component of the first directory
+ * serves as the serialization name. Returns 0 if nothing was found, otherwise whatever
+ * execute_strv() returns, or a negative value if enumeration or name extraction fails. */
+int execute_directories(
+                const char* const* directories,
+                usec_t timeout,
+                gather_stdout_callback_t const callbacks[_STDOUT_CONSUME_MAX],
+                void* const callback_args[_STDOUT_CONSUME_MAX],
+                char *argv[],
+                char *envp[],
+                ExecDirFlags flags) {
+
+        _cleanup_strv_free_ char **executables = NULL;
+        _cleanup_free_ char *serialization_name = NULL;
+        int r;
+
+        assert(!strv_isempty((char**) directories));
+
+        r = conf_files_list_strv(
+                        &executables,
+                        NULL,
+                        NULL,
+                        CONF_FILES_EXECUTABLE|CONF_FILES_REGULAR|CONF_FILES_FILTER_MASKED,
+                        directories);
+        if (r < 0)
+                return log_error_errno(r, "Failed to enumerate executables: %m");
+
+        if (strv_isempty(executables)) {
+                log_debug("No executables found.");
+                return 0;
+        }
+
+        if (callbacks) {
+                r = path_extract_filename(directories[0], &serialization_name);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to extract file name from '%s': %m", directories[0]);
+        }
+
+        return execute_strv(serialization_name, executables, NULL, timeout, callbacks, callback_args, argv, envp, flags);
+}
+
+/* STDOUT_GENERATE callback: reads VAR=value assignments from 'fd' and applies each valid one both
+ * to the string vector 'arg' points to and to the process environment. Assignments with an invalid
+ * variable name are logged and skipped. 'fd' is consumed in all cases, also on error. Returns 0 on
+ * success, negative errno-style value on failure. */
+static int gather_environment_generate(int fd, void *arg) {
+        _cleanup_strv_free_ char **pairs = NULL;
+        _cleanup_fclose_ FILE *stream = NULL;
+        char ***env = ASSERT_PTR(arg);
+        int r;
+
+        stream = fdopen(fd, "r");
+        if (!stream) {
+                /* fdopen() did not take over the fd, so close it ourselves. */
+                safe_close(fd);
+                return -errno;
+        }
+
+        r = load_env_file_pairs(stream, NULL, &pairs);
+        if (r < 0)
+                return r;
+
+        STRV_FOREACH_PAIR(key, value, pairs) {
+                if (!env_name_is_valid(*key)) {
+                        log_warning("Invalid variable assignment \"%s=...\", ignoring.", *key);
+                        continue;
+                }
+
+                r = strv_env_assign(env, *key, *value);
+                if (r < 0)
+                        return r;
+
+                /* Keep our own environment in sync, so that later executables see the variable too. */
+                if (setenv(*key, *value, true) < 0)
+                        return -errno;
+        }
+
+        return 0;
+}
+
+/* STDOUT_COLLECT callback: serializes the string vector 'arg' points to onto 'fd', one "env" entry
+ * per element, and flushes the stream. 'fd' is consumed. Returns 0 on success, negative on failure. */
+static int gather_environment_collect(int fd, void *arg) {
+        char ***env = ASSERT_PTR(arg);
+        _cleanup_fclose_ FILE *out = NULL;
+        int r;
+
+        out = fdopen(fd, "w");
+        if (!out) {
+                /* fdopen() did not take over the fd, so close it ourselves. */
+                safe_close(fd);
+                return -errno;
+        }
+
+        r = serialize_strv(out, "env", *env);
+        if (r < 0)
+                return r;
+
+        /* Make sure everything really hit the fd before reporting success. */
+        r = fflush_and_check(out);
+        if (r < 0)
+                return r;
+
+        return 0;
+}
+
+/* STDOUT_CONSUME callback: reads "env=..." lines from 'fd' and feeds each payload to
+ * deserialize_environment() to update the string vector 'arg' points to. Malformed lines are
+ * logged at debug level and skipped; the first such error is remembered and returned once all
+ * lines were processed. A read error aborts immediately. 'fd' is consumed. */
+static int gather_environment_consume(int fd, void *arg) {
+        char ***env = ASSERT_PTR(arg);
+        _cleanup_fclose_ FILE *in = NULL;
+        int first_error = 0;
+
+        in = fdopen(fd, "r");
+        if (!in) {
+                /* fdopen() did not take over the fd, so close it ourselves. */
+                safe_close(fd);
+                return -errno;
+        }
+
+        for (;;) {
+                _cleanup_free_ char *line = NULL;
+                const char *payload;
+                int k;
+
+                k = read_line(in, LONG_LINE_MAX, &line);
+                if (k < 0)
+                        return k;
+                if (k == 0)
+                        return first_error;
+
+                payload = startswith(line, "env=");
+                if (payload) {
+                        k = deserialize_environment(payload, env);
+                        if (k >= 0)
+                                continue;
+
+                        log_debug_errno(k, "Invalid serialization line \"%s\": %m", line);
+                } else {
+                        log_debug("Serialization line \"%s\" unexpectedly didn't start with \"env=\".", line);
+                        k = -EINVAL;
+                }
+
+                /* Only the first failure is reported; keep going with the remaining lines. */
+                if (first_error == 0)
+                        first_error = k;
+        }
+}
+
+/* Parses every string in 'ex_opts' with exec_command_flags_from_string() and stores the OR of all
+ * resulting flags in *flags. On the first string that fails to parse, the negative value from the
+ * lookup is returned and *flags is left untouched. Returns 0 on success. */
+int exec_command_flags_from_strv(char **ex_opts, ExecCommandFlags *flags) {
+        ExecCommandFlags combined = 0;
+
+        assert(flags);
+
+        STRV_FOREACH(opt, ex_opts) {
+                ExecCommandFlags one = exec_command_flags_from_string(*opt);
+
+                if (one < 0)
+                        return one;
+
+                combined |= one;
+        }
+
+        *flags = combined;
+        return 0;
+}
+
+/* Converts a flag mask into a newly allocated string vector, one entry per set bit, lowest bit
+ * first. Returns 'flags' itself if it is negative, -EINVAL if a set bit has no name, a negative
+ * value if extending the vector fails, and 0 on success with the vector stored in *ex_opts. */
+int exec_command_flags_to_strv(ExecCommandFlags flags, char ***ex_opts) {
+        _cleanup_strv_free_ char **names = NULL;
+        ExecCommandFlags remaining = flags;
+        unsigned bit = 0;
+
+        assert(ex_opts);
+
+        if (flags < 0)
+                return flags;
+
+        /* Walk the bits upwards, clearing each one from 'remaining' until nothing is left. */
+        while (remaining != 0) {
+                if (FLAGS_SET(flags, (1 << bit))) {
+                        const char *name;
+                        int r;
+
+                        name = exec_command_flags_to_string(1 << bit);
+                        if (!name)
+                                return -EINVAL;
+
+                        r = strv_extend(&names, name);
+                        if (r < 0)
+                                return r;
+                }
+
+                remaining &= ~(1 << bit);
+                bit++;
+        }
+
+        *ex_opts = TAKE_PTR(names);
+
+        return 0;
+}
+
+/* Callback set for gathering environment variables from executables, indexed by the STDOUT_*
+ * stage constants. */
+const gather_stdout_callback_t gather_environment[_STDOUT_CONSUME_MAX] = {
+        [STDOUT_GENERATE] = gather_environment_generate,
+        [STDOUT_COLLECT]  = gather_environment_collect,
+        [STDOUT_CONSUME]  = gather_environment_consume,
+};
+
+static const char* const exec_command_strings[] = { /* entry at index idx names the flag bit (1 << idx); see the two lookup functions below */
+ "ignore-failure", /* EXEC_COMMAND_IGNORE_FAILURE */
+ "privileged", /* EXEC_COMMAND_FULLY_PRIVILEGED */
+ "no-setuid", /* EXEC_COMMAND_NO_SETUID */
+ "ambient", /* EXEC_COMMAND_AMBIENT_MAGIC */
+ "no-env-expand", /* EXEC_COMMAND_NO_ENV_EXPAND */
+};
+
+/* Returns the name of a single flag bit, or NULL if 'i' is not exactly one of the known bits. */
+const char* exec_command_flags_to_string(ExecCommandFlags i) {
+        size_t idx = 0;
+
+        while (idx < ELEMENTSOF(exec_command_strings)) {
+                if ((1 << idx) == i)
+                        return exec_command_strings[idx];
+                idx++;
+        }
+
+        return NULL;
+}
+
+/* Looks up 's' in exec_command_strings and returns the matching flag bit, or
+ * _EXEC_COMMAND_FLAGS_INVALID if the lookup fails. */
+ExecCommandFlags exec_command_flags_from_string(const char *s) {
+        ssize_t idx = string_table_lookup(exec_command_strings, ELEMENTSOF(exec_command_strings), s);
+
+        return idx < 0 ? _EXEC_COMMAND_FLAGS_INVALID : (ExecCommandFlags) (1 << idx);
+}
+
+int fexecve_or_execve(int executable_fd, const char *executable, char *const argv[], char *const envp[]) {
+ /* Executes a program given both as an open fd and as a path. When built with ENABLE_FEXECVE,
+ * execveat() on the fd is tried first and execve() on the path is only used as fallback for
+ * the errors listed below; otherwise execve() is called directly. Does not return on
+ * success; on failure returns -errno of the last exec call that was attempted.
+ *
+ * Refuse invalid fds, regardless if fexecve() use is enabled or not */
+ if (executable_fd < 0)
+ return -EBADF;
+
+ /* Block any attempts on exploiting Linux' liberal argv[] handling, i.e. CVE-2021-4034 and suchlike */
+ if (isempty(executable) || strv_isempty(argv))
+ return -EINVAL;
+
+#if ENABLE_FEXECVE
+
+ execveat(executable_fd, "", argv, envp, AT_EMPTY_PATH);
+
+ if (IN_SET(errno, ENOSYS, ENOENT) || ERRNO_IS_PRIVILEGE(errno)) /* note: this 'if' governs the execve() call after the #endif */
+ /* Old kernel or a script or an overzealous seccomp filter? Let's fall back to execve().
+ *
+ * fexecve(3): "If fd refers to a script (i.e., it is an executable text file that names a
+ * script interpreter with a first line that begins with the characters #!) and the
+ * close-on-exec flag has been set for fd, then fexecve() fails with the error ENOENT. This
+ * error occurs because, by the time the script interpreter is executed, fd has already been
+ * closed because of the close-on-exec flag. Thus, the close-on-exec flag can't be set on fd
+ * if it refers to a script."
+ *
+ * Unfortunately, if we unset close-on-exec, the script will be executed just fine, but (at
+ * least in case of bash) the script name, $0, will be shown as /dev/fd/nnn, which breaks
+ * scripts which make use of $0. Thus, let's fall back to execve() in this case.
+ */
+#endif
+ execve(executable, argv, envp);
+ return -errno;
+}
+
+int fork_agent(const char *name, const int except[], size_t n_except, pid_t *ret_pid, const char *path, ...) {
+ bool stdout_is_tty, stderr_is_tty;
+ size_t n, i;
+ va_list ap;
+ char **l;
+ int r;
+
+ assert(path);
+
+ /* Spawns a temporary TTY agent, making sure it goes away when we go away.
+ *
+ * The variadic arguments are a NULL-terminated list of char* that becomes the argument
+ * vector passed to execv(path, ...). In the parent this returns 0 after a successful fork
+ * (the PID is passed back through safe_fork_full() via ret_pid) or the negative value from
+ * safe_fork_full() on failure. The child never returns: it either executes 'path' or
+ * exits with EXIT_FAILURE. */
+
+ r = safe_fork_full(name,
+ NULL,
+ except,
+ n_except,
+ FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|FORK_CLOSE_ALL_FDS|FORK_REOPEN_LOG|FORK_RLIMIT_NOFILE_SAFE,
+ ret_pid);
+ if (r < 0)
+ return r;
+ if (r > 0)
+ return 0;
+
+ /* In the child: */
+
+ stdout_is_tty = isatty(STDOUT_FILENO);
+ stderr_is_tty = isatty(STDERR_FILENO);
+
+ if (!stdout_is_tty || !stderr_is_tty) {
+ int fd;
+
+ /* Detach from stdout/stderr and reopen /dev/tty for them. This is important to ensure that
+ * when systemctl is started via popen() or a similar call that expects to read EOF we
+ * actually do generate EOF and not delay this indefinitely by keeping an unused copy of
+ * stdout/stderr around. */
+ fd = open("/dev/tty", O_WRONLY);
+ if (fd < 0) {
+ if (errno != ENXIO) {
+ log_error_errno(errno, "Failed to open /dev/tty: %m");
+ _exit(EXIT_FAILURE);
+ }
+
+ /* If we get ENXIO here we have no controlling TTY even though stdout/stderr are
+ * connected to a TTY. That's a weird setup, but let's handle it gracefully: let's
+ * skip the forking of the agents, given the TTY setup is not in order.
+ * NOTE(review): as written, this branch does not exit; execution continues to the
+ * execv() below with stdout/stderr left as they are. */
+ } else {
+ if (!stdout_is_tty && dup2(fd, STDOUT_FILENO) < 0) {
+ log_error_errno(errno, "Failed to dup2 /dev/tty: %m");
+ _exit(EXIT_FAILURE);
+ }
+
+ if (!stderr_is_tty && dup2(fd, STDERR_FILENO) < 0) {
+ log_error_errno(errno, "Failed to dup2 /dev/tty: %m");
+ _exit(EXIT_FAILURE);
+ }
+
+ fd = safe_close_above_stdio(fd);
+ }
+ }
+
+ /* Count arguments (everything up to, but not including, the terminating NULL) */
+ va_start(ap, path);
+ for (n = 0; va_arg(ap, char*); n++)
+ ;
+ va_end(ap);
+
+ /* Allocate strv (one extra slot for the terminating NULL) */
+ l = newa(char*, n + 1);
+
+ /* Fill in arguments; the loop runs to i == n inclusive so the terminating NULL is copied too */
+ va_start(ap, path);
+ for (i = 0; i <= n; i++)
+ l[i] = va_arg(ap, char*);
+ va_end(ap);
+
+ execv(path, l);
+ _exit(EXIT_FAILURE);
+}
diff --git a/src/shared/exec-util.h b/src/shared/exec-util.h
new file mode 100644
index 0000000..b99336e
--- /dev/null
+++ b/src/shared/exec-util.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "time-util.h"
+
+typedef int (*gather_stdout_callback_t) (int fd, void *arg);
+
+enum {
+ STDOUT_GENERATE, /* from generators to helper process */
+ STDOUT_COLLECT, /* from helper process to main process */
+ STDOUT_CONSUME, /* process data in main process */
+ _STDOUT_CONSUME_MAX, /* number of stages above; used as the length of the callbacks[]/callback_args[] parameters declared in this header */
+};
+
+typedef enum {
+ EXEC_DIR_NONE = 0, /* No execdir flags */
+ EXEC_DIR_PARALLEL = 1 << 0, /* Execute scripts in parallel, if possible */
+ EXEC_DIR_IGNORE_ERRORS = 1 << 1, /* Ignore non-zero exit status of scripts */
+ EXEC_DIR_SET_SYSTEMD_EXEC_PID = 1 << 2, /* Set $SYSTEMD_EXEC_PID environment variable */
+ EXEC_DIR_SKIP_REMAINING = 1 << 3, /* Skip the remaining executions when one exits with 77 */
+} ExecDirFlags;
+
+typedef enum ExecCommandFlags {
+ EXEC_COMMAND_IGNORE_FAILURE = 1 << 0,
+ EXEC_COMMAND_FULLY_PRIVILEGED = 1 << 1,
+ EXEC_COMMAND_NO_SETUID = 1 << 2,
+ EXEC_COMMAND_AMBIENT_MAGIC = 1 << 3,
+ EXEC_COMMAND_NO_ENV_EXPAND = 1 << 4,
+ _EXEC_COMMAND_FLAGS_INVALID = -EINVAL, /* negative errno-style sentinel, distinct from every valid combination of the bit flags above */
+} ExecCommandFlags;
+
+int execute_strv(
+ const char *name,
+ char* const* paths,
+ const char *root,
+ usec_t timeout,
+ gather_stdout_callback_t const callbacks[_STDOUT_CONSUME_MAX],
+ void* const callback_args[_STDOUT_CONSUME_MAX],
+ char *argv[],
+ char *envp[],
+ ExecDirFlags flags);
+
+int execute_directories(
+ const char* const* directories,
+ usec_t timeout,
+ gather_stdout_callback_t const callbacks[_STDOUT_CONSUME_MAX],
+ void* const callback_args[_STDOUT_CONSUME_MAX],
+ char *argv[],
+ char *envp[],
+ ExecDirFlags flags);
+
+int exec_command_flags_from_strv(char **ex_opts, ExecCommandFlags *flags);
+int exec_command_flags_to_strv(ExecCommandFlags flags, char ***ex_opts);
+
+extern const gather_stdout_callback_t gather_environment[_STDOUT_CONSUME_MAX];
+
+const char* exec_command_flags_to_string(ExecCommandFlags i);
+ExecCommandFlags exec_command_flags_from_string(const char *s);
+
+int fexecve_or_execve(int executable_fd, const char *executable, char *const argv[], char *const envp[]);
+
+int fork_agent(const char *name, const int except[], size_t n_except, pid_t *ret_pid, const char *path, ...) _sentinel_;
diff --git a/src/shared/exit-status.c b/src/shared/exit-status.c
new file mode 100644
index 0000000..0ac688b
--- /dev/null
+++ b/src/shared/exit-status.c
@@ -0,0 +1,179 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <signal.h>
+#include <stdlib.h>
+#include <sysexits.h>
+
+#include "exit-status.h"
+#include "macro.h"
+#include "parse-util.h"
+#include "set.h"
+#include "string-util.h"
+
+const ExitStatusMapping exit_status_mappings[256] = {
+ /* Exit status ranges:
+ *
+ * 0…1 │ ISO C, EXIT_SUCCESS + EXIT_FAILURE
+ * 2…7 │ LSB exit codes for init scripts
+ * 8…63 │ (Currently unmapped)
+ * 64…78 │ BSD defined exit codes
+ * 79…199 │ (Currently unmapped)
+ * 200…245 │ systemd's private error codes (might be extended to 254 in future development)
+ * 246…254 │ (Currently unmapped, but see above)
+ *
+ * 255 │ EXIT_EXCEPTION (We use this to propagate exit-by-signal events. It's frequently used by other apps (like bash)
+ * │ to indicate exit reason that cannot really be expressed in a single exit status value — such as a propagated
+ * │ signal or such, and we follow that logic here.)
+ */
+
+ [EXIT_SUCCESS] = { "SUCCESS", EXIT_STATUS_LIBC },
+ [EXIT_FAILURE] = { "FAILURE", EXIT_STATUS_LIBC },
+
+ [EXIT_CHDIR] = { "CHDIR", EXIT_STATUS_SYSTEMD },
+ [EXIT_NICE] = { "NICE", EXIT_STATUS_SYSTEMD },
+ [EXIT_FDS] = { "FDS", EXIT_STATUS_SYSTEMD },
+ [EXIT_EXEC] = { "EXEC", EXIT_STATUS_SYSTEMD },
+ [EXIT_MEMORY] = { "MEMORY", EXIT_STATUS_SYSTEMD },
+ [EXIT_LIMITS] = { "LIMITS", EXIT_STATUS_SYSTEMD },
+ [EXIT_OOM_ADJUST] = { "OOM_ADJUST", EXIT_STATUS_SYSTEMD },
+ [EXIT_SIGNAL_MASK] = { "SIGNAL_MASK", EXIT_STATUS_SYSTEMD },
+ [EXIT_STDIN] = { "STDIN", EXIT_STATUS_SYSTEMD },
+ [EXIT_STDOUT] = { "STDOUT", EXIT_STATUS_SYSTEMD },
+ [EXIT_CHROOT] = { "CHROOT", EXIT_STATUS_SYSTEMD },
+ [EXIT_IOPRIO] = { "IOPRIO", EXIT_STATUS_SYSTEMD },
+ [EXIT_TIMERSLACK] = { "TIMERSLACK", EXIT_STATUS_SYSTEMD },
+ [EXIT_SECUREBITS] = { "SECUREBITS", EXIT_STATUS_SYSTEMD },
+ [EXIT_SETSCHEDULER] = { "SETSCHEDULER", EXIT_STATUS_SYSTEMD },
+ [EXIT_CPUAFFINITY] = { "CPUAFFINITY", EXIT_STATUS_SYSTEMD },
+ [EXIT_GROUP] = { "GROUP", EXIT_STATUS_SYSTEMD },
+ [EXIT_USER] = { "USER", EXIT_STATUS_SYSTEMD },
+ [EXIT_CAPABILITIES] = { "CAPABILITIES", EXIT_STATUS_SYSTEMD },
+ [EXIT_CGROUP] = { "CGROUP", EXIT_STATUS_SYSTEMD },
+ [EXIT_SETSID] = { "SETSID", EXIT_STATUS_SYSTEMD },
+ [EXIT_CONFIRM] = { "CONFIRM", EXIT_STATUS_SYSTEMD },
+ [EXIT_STDERR] = { "STDERR", EXIT_STATUS_SYSTEMD },
+ [EXIT_PAM] = { "PAM", EXIT_STATUS_SYSTEMD },
+ [EXIT_NETWORK] = { "NETWORK", EXIT_STATUS_SYSTEMD },
+ [EXIT_NAMESPACE] = { "NAMESPACE", EXIT_STATUS_SYSTEMD },
+ [EXIT_NO_NEW_PRIVILEGES] = { "NO_NEW_PRIVILEGES", EXIT_STATUS_SYSTEMD },
+ [EXIT_SECCOMP] = { "SECCOMP", EXIT_STATUS_SYSTEMD },
+ [EXIT_SELINUX_CONTEXT] = { "SELINUX_CONTEXT", EXIT_STATUS_SYSTEMD },
+ [EXIT_PERSONALITY] = { "PERSONALITY", EXIT_STATUS_SYSTEMD },
+ [EXIT_APPARMOR_PROFILE] = { "APPARMOR", EXIT_STATUS_SYSTEMD },
+ [EXIT_ADDRESS_FAMILIES] = { "ADDRESS_FAMILIES", EXIT_STATUS_SYSTEMD },
+ [EXIT_RUNTIME_DIRECTORY] = { "RUNTIME_DIRECTORY", EXIT_STATUS_SYSTEMD },
+ [EXIT_CHOWN] = { "CHOWN", EXIT_STATUS_SYSTEMD },
+ [EXIT_SMACK_PROCESS_LABEL] = { "SMACK_PROCESS_LABEL", EXIT_STATUS_SYSTEMD },
+ [EXIT_KEYRING] = { "KEYRING", EXIT_STATUS_SYSTEMD },
+ [EXIT_STATE_DIRECTORY] = { "STATE_DIRECTORY", EXIT_STATUS_SYSTEMD },
+ [EXIT_CACHE_DIRECTORY] = { "CACHE_DIRECTORY", EXIT_STATUS_SYSTEMD },
+ [EXIT_LOGS_DIRECTORY] = { "LOGS_DIRECTORY", EXIT_STATUS_SYSTEMD },
+ [EXIT_CONFIGURATION_DIRECTORY] = { "CONFIGURATION_DIRECTORY", EXIT_STATUS_SYSTEMD },
+ [EXIT_NUMA_POLICY] = { "NUMA_POLICY", EXIT_STATUS_SYSTEMD },
+ [EXIT_CREDENTIALS] = { "CREDENTIALS", EXIT_STATUS_SYSTEMD },
+ [EXIT_BPF] = { "BPF", EXIT_STATUS_SYSTEMD },
+ [EXIT_KSM] = { "KSM", EXIT_STATUS_SYSTEMD },
+
+ [EXIT_EXCEPTION] = { "EXCEPTION", EXIT_STATUS_SYSTEMD },
+
+ [EXIT_INVALIDARGUMENT] = { "INVALIDARGUMENT", EXIT_STATUS_LSB },
+ [EXIT_NOTIMPLEMENTED] = { "NOTIMPLEMENTED", EXIT_STATUS_LSB },
+ [EXIT_NOPERMISSION] = { "NOPERMISSION", EXIT_STATUS_LSB },
+ [EXIT_NOTINSTALLED] = { "NOTINSTALLED", EXIT_STATUS_LSB },
+ [EXIT_NOTCONFIGURED] = { "NOTCONFIGURED", EXIT_STATUS_LSB },
+ [EXIT_NOTRUNNING] = { "NOTRUNNING", EXIT_STATUS_LSB },
+
+ [EX_USAGE] = { "USAGE", EXIT_STATUS_BSD },
+ [EX_DATAERR] = { "DATAERR", EXIT_STATUS_BSD },
+ [EX_NOINPUT] = { "NOINPUT", EXIT_STATUS_BSD },
+ [EX_NOUSER] = { "NOUSER", EXIT_STATUS_BSD },
+ [EX_NOHOST] = { "NOHOST", EXIT_STATUS_BSD },
+ [EX_UNAVAILABLE] = { "UNAVAILABLE", EXIT_STATUS_BSD },
+ [EX_SOFTWARE] = { "SOFTWARE", EXIT_STATUS_BSD },
+ [EX_OSERR] = { "OSERR", EXIT_STATUS_BSD },
+ [EX_OSFILE] = { "OSFILE", EXIT_STATUS_BSD },
+ [EX_CANTCREAT] = { "CANTCREAT", EXIT_STATUS_BSD },
+ [EX_IOERR] = { "IOERR", EXIT_STATUS_BSD },
+ [EX_TEMPFAIL] = { "TEMPFAIL", EXIT_STATUS_BSD },
+ [EX_PROTOCOL] = { "PROTOCOL", EXIT_STATUS_BSD },
+ [EX_NOPERM] = { "NOPERM", EXIT_STATUS_BSD },
+ [EX_CONFIG] = { "CONFIG", EXIT_STATUS_BSD },
+};
+
+const char* exit_status_to_string(int code, ExitStatusClass class) {
+        /* Resolves an exit status to its symbolic name, but only if the status belongs to one of the
+         * classes selected in the 'class' mask. Yields NULL for out-of-range codes and for codes whose
+         * class is not selected (which includes all unmapped codes). */
+
+        if (code < 0)
+                return NULL;
+        if ((size_t) code >= ELEMENTSOF(exit_status_mappings))
+                return NULL;
+
+        const ExitStatusMapping *m = &exit_status_mappings[code];
+
+        if (!(class & m->class))
+                return NULL;
+
+        return m->name;
+}
+
+const char* exit_status_class(int code) {
+        ExitStatusClass c;
+
+        /* Returns a human-readable name for the class the given exit status is mapped to, or NULL if the
+         * code is out of range or not mapped to exactly one of the known classes. */
+
+        if (code < 0 || (size_t) code >= ELEMENTSOF(exit_status_mappings))
+                return NULL;
+
+        c = exit_status_mappings[code].class;
+
+        if (c == EXIT_STATUS_LIBC)
+                return "libc";
+        if (c == EXIT_STATUS_SYSTEMD)
+                return "systemd";
+        if (c == EXIT_STATUS_LSB)
+                return "LSB";
+        if (c == EXIT_STATUS_BSD)
+                return "BSD";
+
+        return NULL;
+}
+
+int exit_status_from_string(const char *s) {
+        uint8_t val;
+        int r;
+
+        /* Translates either a symbolic name from exit_status_mappings[] or a decimal number in the
+         * range 0…255 into the numeric exit status. Returns the status (>= 0) on success, or a negative
+         * errno-style value if the string is neither a known name nor a valid 8-bit number. */
+
+        /* Unmapped table slots carry a NULL name and streq_ptr(NULL, NULL) is true, hence a NULL input
+         * would otherwise "match" the first unmapped slot. Refuse it explicitly. */
+        if (!s)
+                return -EINVAL;
+
+        for (size_t i = 0; i < ELEMENTSOF(exit_status_mappings); i++)
+                if (streq_ptr(s, exit_status_mappings[i].name))
+                        return i;
+
+        r = safe_atou8(s, &val);
+        if (r < 0)
+                return r;
+
+        return val;
+}
+
+bool is_clean_exit(int code, int status, ExitClean clean, const ExitStatusSet *success_status) {
+        /* Decides whether a process termination described by the (code, status) pair counts as clean.
+         * 'success_status' optionally lists additional exit statuses/signals to accept. */
+
+        switch (code) {
+
+        case CLD_EXITED:
+                if (status == 0)
+                        return true;
+
+                return success_status && bitmap_isset(&success_status->status, status);
+
+        case CLD_KILLED:
+                /* A daemon that installs no handler for these signals is not considered to have shut
+                 * down uncleanly when killed by one of them. */
+                if (clean == EXIT_CLEAN_DAEMON && IN_SET(status, SIGHUP, SIGINT, SIGTERM, SIGPIPE))
+                        return true;
+
+                return success_status && bitmap_isset(&success_status->signal, status);
+
+        default:
+                return false;
+        }
+}
+
+void exit_status_set_free(ExitStatusSet *x) {
+        assert(x);
+
+        /* Empties both bitmaps; the ExitStatusSet structure itself is not released here. */
+        bitmap_clear(&x->signal);
+        bitmap_clear(&x->status);
+}
+
+bool exit_status_set_is_empty(const ExitStatusSet *x) {
+        /* A NULL set counts as empty; otherwise both bitmaps must be clear. */
+        return !x || (bitmap_isclear(&x->status) && bitmap_isclear(&x->signal));
+}
+
+bool exit_status_set_test(const ExitStatusSet *x, int code, int status) {
+        /* Checks whether the termination (code, status) is listed in the set: exit statuses are looked
+         * up in x->status, signals (for killed/dumped processes) in x->signal. */
+
+        switch (code) {
+
+        case CLD_EXITED:
+                return bitmap_isset(&x->status, status);
+
+        case CLD_KILLED:
+        case CLD_DUMPED:
+                return bitmap_isset(&x->signal, status);
+
+        default:
+                return false;
+        }
+}
diff --git a/src/shared/exit-status.h b/src/shared/exit-status.h
new file mode 100644
index 0000000..c22cba0
--- /dev/null
+++ b/src/shared/exit-status.h
@@ -0,0 +1,113 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "bitmap.h"
+#include "hashmap.h"
+#include "macro.h"
+
+/* This defines pretty names for the LSB 'start' verb exit codes. Note that they shouldn't be confused with
+ * the LSB 'status' verb exit codes which are defined very differently. For details see:
+ *
+ * https://refspecs.linuxbase.org/LSB_5.0.0/LSB-Core-generic/LSB-Core-generic/iniscrptact.html
+ */
+
+enum {
+ /* EXIT_SUCCESS defined by libc */
+ /* EXIT_FAILURE defined by libc */
+ EXIT_INVALIDARGUMENT = 2,
+ EXIT_NOTIMPLEMENTED = 3,
+ EXIT_NOPERMISSION = 4,
+ EXIT_NOTINSTALLED = 5,
+ EXIT_NOTCONFIGURED = 6,
+ EXIT_NOTRUNNING = 7,
+
+ /* BSD's sysexits.h defines a couple EX_xyz exit codes in the range 64 … 78 */
+
+ /* The LSB suggests that error codes >= 200 are "reserved". We use them here under the assumption
+ * that they hence are unused by init scripts. */
+ EXIT_CHDIR = 200,
+ EXIT_NICE,
+ EXIT_FDS,
+ EXIT_EXEC,
+ EXIT_MEMORY,
+ EXIT_LIMITS,
+ EXIT_OOM_ADJUST,
+ EXIT_SIGNAL_MASK,
+ EXIT_STDIN,
+ EXIT_STDOUT,
+ EXIT_CHROOT, /* 210 */
+ EXIT_IOPRIO,
+ EXIT_TIMERSLACK,
+ EXIT_SECUREBITS,
+ EXIT_SETSCHEDULER,
+ EXIT_CPUAFFINITY,
+ EXIT_GROUP,
+ EXIT_USER,
+ EXIT_CAPABILITIES,
+ EXIT_CGROUP,
+ EXIT_SETSID, /* 220 */
+ EXIT_CONFIRM,
+ EXIT_STDERR,
+ _EXIT_RESERVED, /* used to be tcpwrap, don't reuse! */
+ EXIT_PAM,
+ EXIT_NETWORK,
+ EXIT_NAMESPACE,
+ EXIT_NO_NEW_PRIVILEGES,
+ EXIT_SECCOMP,
+ EXIT_SELINUX_CONTEXT,
+ EXIT_PERSONALITY, /* 230 */
+ EXIT_APPARMOR_PROFILE,
+ EXIT_ADDRESS_FAMILIES,
+ EXIT_RUNTIME_DIRECTORY,
+ _EXIT_RESERVED2, /* used to be used by kdbus, don't reuse */
+ EXIT_CHOWN,
+ EXIT_SMACK_PROCESS_LABEL,
+ EXIT_KEYRING,
+ EXIT_STATE_DIRECTORY,
+ EXIT_CACHE_DIRECTORY,
+ EXIT_LOGS_DIRECTORY, /* 240 */
+ EXIT_CONFIGURATION_DIRECTORY,
+ EXIT_NUMA_POLICY,
+ EXIT_CREDENTIALS,
+ EXIT_BPF,
+ EXIT_KSM, /* 245 */
+
+ EXIT_EXCEPTION = 255, /* Whenever we want to propagate an abnormal/signal exit, in line with bash */
+};
+
+typedef enum ExitStatusClass {
+ EXIT_STATUS_LIBC = 1 << 0, /* libc EXIT_SUCCESS/EXIT_FAILURE */
+ EXIT_STATUS_SYSTEMD = 1 << 1, /* systemd's own exit codes */
+ EXIT_STATUS_LSB = 1 << 2, /* LSB exit codes */
+ EXIT_STATUS_BSD = 1 << 3, /* BSD (EX_xyz) exit codes */
+ EXIT_STATUS_FULL = EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD | EXIT_STATUS_LSB | EXIT_STATUS_BSD,
+} ExitStatusClass;
+
+typedef struct ExitStatusSet {
+ Bitmap status; /* exit statuses; consulted by exit_status_set_test() when code is CLD_EXITED */
+ Bitmap signal; /* signal numbers; consulted by exit_status_set_test() when code is CLD_KILLED or CLD_DUMPED */
+} ExitStatusSet;
+
+const char* exit_status_to_string(int code, ExitStatusClass class) _const_;
+const char* exit_status_class(int code) _const_;
+int exit_status_from_string(const char *s) _pure_;
+
+typedef struct ExitStatusMapping {
+ const char *name; /* symbolic name of the exit status */
+ ExitStatusClass class; /* class the exit status belongs to */
+} ExitStatusMapping;
+
+extern const ExitStatusMapping exit_status_mappings[256];
+
+typedef enum ExitClean {
+ EXIT_CLEAN_DAEMON, /* is_clean_exit() additionally accepts death by SIGHUP, SIGINT, SIGTERM or SIGPIPE */
+ EXIT_CLEAN_COMMAND, /* no signal is accepted implicitly by is_clean_exit() */
+} ExitClean;
+
+bool is_clean_exit(int code, int status, ExitClean clean, const ExitStatusSet *success_status);
+
+void exit_status_set_free(ExitStatusSet *x);
+bool exit_status_set_is_empty(const ExitStatusSet *x);
+bool exit_status_set_test(const ExitStatusSet *x, int code, int status);
diff --git a/src/shared/extension-util.c b/src/shared/extension-util.c
new file mode 100644
index 0000000..d8b16b9
--- /dev/null
+++ b/src/shared/extension-util.c
@@ -0,0 +1,166 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "architecture.h"
+#include "chase.h"
+#include "env-util.h"
+#include "extension-util.h"
+#include "log.h"
+#include "os-util.h"
+#include "strv.h"
+
+int extension_release_validate(
+                const char *name,
+                const char *host_os_release_id,
+                const char *host_os_release_version_id,
+                const char *host_os_extension_release_level,
+                const char *host_extension_scope,
+                char **extension_release,
+                ImageClass image_class) {
+
+        const char *extension_release_id = NULL, *extension_release_level = NULL, *extension_architecture = NULL;
+        const char *extension_level = image_class == IMAGE_CONFEXT ? "CONFEXT_LEVEL" : "SYSEXT_LEVEL";
+        const char *extension_scope = image_class == IMAGE_CONFEXT ? "CONFEXT_SCOPE" : "SYSEXT_SCOPE";
+
+        /* Checks whether the extension described by the key/value pairs in 'extension_release' may be
+         * applied on a host with the given os-release data. 'name' is only used for logging.
+         *
+         * Returns 1 if the extension matches the host, 0 if it does not (or carries no release data),
+         * and -ENOMEM if the scope list could not be allocated. */
+
+        assert(name);
+        assert(!isempty(host_os_release_id));
+
+        /* Now that we can look into the extension/confext image, let's see if the OS version is compatible */
+        if (strv_isempty(extension_release)) {
+                log_debug("Extension '%s' carries no release data, ignoring.", name);
+                return 0;
+        }
+
+        if (host_extension_scope) {
+                _cleanup_strv_free_ char **scope_list = NULL;
+                const char *scope;
+                bool valid;
+
+                scope = strv_env_pairs_get(extension_release, extension_scope);
+                if (scope) {
+                        scope_list = strv_split(scope, WHITESPACE);
+                        if (!scope_list)
+                                return -ENOMEM;
+                }
+
+                /* By default extensions are good for attachment in portable services and on the system */
+                valid = strv_contains(
+                                scope_list ?: STRV_MAKE("system", "portable"),
+                                host_extension_scope);
+                if (!valid) {
+                        log_debug("Extension '%s' is not suitable for scope %s, ignoring.", name, host_extension_scope);
+                        return 0;
+                }
+        }
+
+        /* When the architecture field is present and not '_any' it must match the host - for now just look at uname but in
+         * the future we could check if the kernel also supports 32 bit or binfmt has a translator set up for the architecture */
+        extension_architecture = strv_env_pairs_get(extension_release, "ARCHITECTURE");
+        if (!isempty(extension_architecture) && !streq(extension_architecture, "_any") &&
+            !streq(architecture_to_string(uname_architecture()), extension_architecture)) {
+                log_debug("Extension '%s' is for architecture '%s', but deployed on top of '%s'.",
+                          name, extension_architecture, architecture_to_string(uname_architecture()));
+                return 0;
+        }
+
+        extension_release_id = strv_env_pairs_get(extension_release, "ID");
+        if (isempty(extension_release_id)) {
+                log_debug("Extension '%s' does not contain ID in release file but requested to match '%s' or be '_any'",
+                          name, host_os_release_id);
+                return 0;
+        }
+
+        /* A sysext(or confext) with no host OS dependency (static binaries or scripts) can match
+         * '_any' host OS, and VERSION_ID or SYSEXT_LEVEL(or CONFEXT_LEVEL) are not required anywhere */
+        if (streq(extension_release_id, "_any")) {
+                log_debug("Extension '%s' matches '_any' OS.", name);
+                return 1;
+        }
+
+        if (!streq(host_os_release_id, extension_release_id)) {
+                log_debug("Extension '%s' is for OS '%s', but deployed on top of '%s'.",
+                          name, extension_release_id, host_os_release_id);
+                return 0;
+        }
+
+        /* Rolling releases do not typically set VERSION_ID (eg: ArchLinux) */
+        if (isempty(host_os_release_version_id) && isempty(host_os_extension_release_level)) {
+                log_debug("No version info on the host (rolling release?), but ID in %s matched.", name);
+                return 1;
+        }
+
+        /* From here on at least one of the host's VERSION_ID and extension level is set.
+         *
+         * If the extension has a sysext API level declared, then it must match the host API
+         * level. Otherwise, compare OS version as a whole */
+        extension_release_level = strv_env_pairs_get(extension_release, extension_level);
+        if (!isempty(host_os_extension_release_level) && !isempty(extension_release_level)) {
+                if (!streq_ptr(host_os_extension_release_level, extension_release_level)) {
+                        log_debug("Extension '%s' is for API level '%s', but running on API level '%s'",
+                                  name, strna(extension_release_level), strna(host_os_extension_release_level));
+                        return 0;
+                }
+        } else if (!isempty(host_os_release_version_id)) {
+                const char *extension_release_version_id;
+
+                extension_release_version_id = strv_env_pairs_get(extension_release, "VERSION_ID");
+                if (isempty(extension_release_version_id)) {
+                        log_debug("Extension '%s' does not contain VERSION_ID in release file but requested to match '%s'",
+                                  name, strna(host_os_release_version_id));
+                        return 0;
+                }
+
+                if (!streq_ptr(host_os_release_version_id, extension_release_version_id)) {
+                        log_debug("Extension '%s' is for OS '%s', but deployed on top of '%s'.",
+                                  name, strna(extension_release_version_id), strna(host_os_release_version_id));
+                        return 0;
+                }
+        }
+
+        log_debug("Version info of extension '%s' matches host.", name);
+        return 1;
+}
+
+int parse_env_extension_hierarchies(char ***ret_hierarchies, const char *hierarchy_env) {
+        _cleanup_strv_free_ char **l = NULL;
+        int r;
+
+        /* Reads the path list stored in the environment variable named by 'hierarchy_env'. If the lookup
+         * yields -ENXIO, a built-in default is substituted for the three known variable names; any other
+         * name keeps failing with -ENXIO. On success returns 0 and hands ownership of the string vector
+         * to the caller via 'ret_hierarchies'. Returns -ENOMEM if a default list cannot be allocated. */
+
+        assert(ret_hierarchies);
+        assert(hierarchy_env);
+
+        r = getenv_path_list(hierarchy_env, &l);
+        if (r == -ENXIO) {
+                if (streq(hierarchy_env, "SYSTEMD_CONFEXT_HIERARCHIES"))
+                        /* Default for confext when unset */
+                        l = strv_new("/etc");
+                else if (streq(hierarchy_env, "SYSTEMD_SYSEXT_HIERARCHIES"))
+                        /* Default for sysext when unset */
+                        l = strv_new("/usr", "/opt");
+                else if (streq(hierarchy_env, "SYSTEMD_SYSEXT_AND_CONFEXT_HIERARCHIES"))
+                        /* Combined sysext and confext directories */
+                        l = strv_new("/usr", "/opt", "/etc");
+                else
+                        return -ENXIO;
+
+                /* strv_new() returns NULL on allocation failure; don't report that as success */
+                if (!l)
+                        return -ENOMEM;
+        } else if (r < 0)
+                return r;
+
+        *ret_hierarchies = TAKE_PTR(l);
+        return 0;
+}
+
+int extension_has_forbidden_content(const char *root) {
+        int r;
+
+        /* Extension images must not ship '/usr/lib/os-release', since that would override the release
+         * file of the underlying OS. (A copy in /etc/os-release is harmless, as that is not merged.)
+         * Returns 1 if the forbidden file is present, 0 if not, negative errno on lookup failure. */
+
+        r = chase("/usr/lib/os-release", root, CHASE_PREFIX_ROOT, NULL, NULL);
+        if (r == -ENOENT)
+                return 0;
+        if (r < 0)
+                return log_debug_errno(r, "Failed to determine whether '/usr/lib/os-release' exists in the extension: %m");
+        if (r == 0)
+                return 0;
+
+        log_debug("Extension contains '/usr/lib/os-release', which is not allowed, refusing.");
+        return 1;
+}
diff --git a/src/shared/extension-util.h b/src/shared/extension-util.h
new file mode 100644
index 0000000..3cad219
--- /dev/null
+++ b/src/shared/extension-util.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "os-util.h"
+
+/* Given an image name (for logging purposes), a set of os-release values from the host and a key-value pair
+ * vector of extension-release variables, check that the distro and (system extension level or distro
+ * version) match and return 1, and 0 otherwise. */
+int extension_release_validate(
+ const char *name,
+ const char *host_os_release_id,
+ const char *host_os_release_version_id,
+ const char *host_os_extension_release_level,
+ const char *host_extension_scope,
+ char **extension_release,
+ ImageClass image_class);
+
+/* Parse hierarchy variables and if not set, return "/usr /opt" for sysext and "/etc" for confext */
+int parse_env_extension_hierarchies(char ***ret_hierarchies, const char *hierarchy_env);
+
+/* Insist that extension images do not overwrite the underlying OS release file (it's fine if they place one
+ * in /etc/os-release, i.e. where things don't matter, as they aren't merged.) */
+int extension_has_forbidden_content(const char *root);
diff --git a/src/shared/fdisk-util.c b/src/shared/fdisk-util.c
new file mode 100644
index 0000000..20f32d1
--- /dev/null
+++ b/src/shared/fdisk-util.c
@@ -0,0 +1,163 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "dissect-image.h"
+#include "extract-word.h"
+#include "fd-util.h"
+#include "fdisk-util.h"
+#include "parse-util.h"
+
+#if HAVE_LIBFDISK
+
+int fdisk_new_context_at(
+                int dir_fd,
+                const char *path,
+                bool read_only,
+                uint32_t sector_size,
+                struct fdisk_context **ret) {
+
+        _cleanup_(fdisk_unref_contextp) struct fdisk_context *ctx = NULL;
+        _cleanup_close_ int opened_fd = -EBADF;
+        int r;
+
+        /* Creates a libfdisk context and assigns it the device referenced by dir_fd/path. If 'path' is
+         * empty, 'dir_fd' itself is used as the device. A 'sector_size' of UINT32_MAX requests probing,
+         * 0 leaves the choice to libfdisk, anything else is passed on as the user sector size. */
+
+        assert(dir_fd >= 0 || dir_fd == AT_FDCWD);
+        assert(ret);
+
+        if (!isempty(path)) {
+                int open_flags = (read_only ? O_RDONLY : O_RDWR)|O_CLOEXEC;
+
+                opened_fd = openat(dir_fd, path, open_flags);
+                if (opened_fd < 0)
+                        return -errno;
+
+                /* From here on operate on the fd we just opened */
+                dir_fd = opened_fd;
+        }
+
+        ctx = fdisk_new_context();
+        if (!ctx)
+                return -ENOMEM;
+
+        if (sector_size == UINT32_MAX) {
+                r = probe_sector_size_prefer_ioctl(dir_fd, &sector_size);
+                if (r < 0)
+                        return r;
+        }
+
+        if (sector_size != 0) {
+                r = fdisk_save_user_sector_size(ctx, /* phy= */ 0, sector_size);
+                if (r < 0)
+                        return r;
+        }
+
+        r = fdisk_assign_device(ctx, FORMAT_PROC_FD_PATH(dir_fd), read_only);
+        if (r < 0)
+                return r;
+
+        *ret = TAKE_PTR(ctx);
+        return 0;
+}
+
+int fdisk_partition_get_uuid_as_id128(struct fdisk_partition *p, sd_id128_t *ret) {
+        const char *uuid_str;
+
+        /* Parses the partition's UUID string into an sd_id128_t; -ENXIO if the partition has none. */
+
+        assert(p);
+        assert(ret);
+
+        uuid_str = fdisk_partition_get_uuid(p);
+        if (uuid_str)
+                return sd_id128_from_string(uuid_str, ret);
+
+        return -ENXIO;
+}
+
+int fdisk_partition_get_type_as_id128(struct fdisk_partition *p, sd_id128_t *ret) {
+        struct fdisk_parttype *type;
+        const char *type_str;
+
+        /* Parses the string form of the partition's type into an sd_id128_t; -ENXIO if the partition
+         * has no type or the type has no string representation. */
+
+        assert(p);
+        assert(ret);
+
+        type = fdisk_partition_get_type(p);
+        if (!type)
+                return -ENXIO;
+
+        type_str = fdisk_parttype_get_string(type);
+        if (!type_str)
+                return -ENXIO;
+
+        return sd_id128_from_string(type_str, ret);
+}
+
+int fdisk_partition_get_attrs_as_uint64(struct fdisk_partition *pa, uint64_t *ret) {
+        uint64_t mask = 0;
+        const char *attrs;
+        int r;
+
+        /* Converts the comma-separated attribute string libfdisk reports for the partition into a
+         * 64-bit flags mask. Well-known names map to their SD_GPT_FLAG_* bit, numeric entries
+         * (optionally prefixed with "GUID:") set the bit of that index. Unknown entries are skipped. */
+
+        assert(pa);
+        assert(ret);
+
+        attrs = fdisk_partition_get_attrs(pa);
+        if (!attrs) {
+                *ret = 0;
+                return 0;
+        }
+
+        for (;;) {
+                _cleanup_free_ char *tok = NULL;
+                const char *num;
+                unsigned bit;
+
+                r = extract_first_word(&attrs, &tok, ",", EXTRACT_DONT_COALESCE_SEPARATORS);
+                if (r < 0)
+                        return r;
+                if (r == 0)
+                        break;
+
+                if (streq(tok, "RequiredPartition")) {
+                        mask |= SD_GPT_FLAG_REQUIRED_PARTITION;
+                        continue;
+                }
+
+                if (streq(tok, "NoBlockIOProtocol")) {
+                        mask |= SD_GPT_FLAG_NO_BLOCK_IO_PROTOCOL;
+                        continue;
+                }
+
+                if (streq(tok, "LegacyBIOSBootable")) {
+                        mask |= SD_GPT_FLAG_LEGACY_BIOS_BOOTABLE;
+                        continue;
+                }
+
+                /* Drop "GUID" prefix if specified */
+                num = startswith(tok, "GUID:") ?: tok;
+
+                if (safe_atou(num, &bit) < 0)
+                        log_debug("Unknown partition flag '%s', ignoring.", tok);
+                else if (bit >= sizeof(mask)*8)
+                        /* Partition flags on GPT are 64-bit wide, anything beyond cannot be stored */
+                        log_debug("Partition flag above bit 63 (%s), ignoring.", tok);
+                else
+                        mask |= UINT64_C(1) << bit;
+        }
+
+        *ret = mask;
+        return 0;
+}
+
+int fdisk_partition_set_attrs_as_uint64(struct fdisk_partition *pa, uint64_t flags) {
+        _cleanup_free_ char *list = NULL;
+        int r;
+
+        /* Serializes the set bits of 'flags' as a comma-separated list of bit indexes and hands that
+         * string (or the empty string if no bit is set) to libfdisk. */
+
+        assert(pa);
+
+        for (unsigned bit = 0; bit < sizeof(flags) * 8; bit++)
+                if (FLAGS_SET(flags, UINT64_C(1) << bit)) {
+                        r = strextendf_with_separator(&list, ",", "%u", bit);
+                        if (r < 0)
+                                return r;
+                }
+
+        return fdisk_partition_set_attrs(pa, strempty(list));
+}
+
+#endif
diff --git a/src/shared/fdisk-util.h b/src/shared/fdisk-util.h
new file mode 100644
index 0000000..a72a596
--- /dev/null
+++ b/src/shared/fdisk-util.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#if HAVE_LIBFDISK
+
+#include <libfdisk.h>
+
+#include "sd-id128.h"
+
+#include "macro.h"
+
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(struct fdisk_context*, fdisk_unref_context, NULL);
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(struct fdisk_partition*, fdisk_unref_partition, NULL);
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(struct fdisk_parttype*, fdisk_unref_parttype, NULL);
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(struct fdisk_table*, fdisk_unref_table, NULL);
+
+int fdisk_new_context_at(int dir_fd, const char *path, bool read_only, uint32_t sector_size, struct fdisk_context **ret);
+
+int fdisk_partition_get_uuid_as_id128(struct fdisk_partition *p, sd_id128_t *ret);
+int fdisk_partition_get_type_as_id128(struct fdisk_partition *p, sd_id128_t *ret);
+
+int fdisk_partition_get_attrs_as_uint64(struct fdisk_partition *pa, uint64_t *ret);
+int fdisk_partition_set_attrs_as_uint64(struct fdisk_partition *pa, uint64_t flags);
+
+#endif
diff --git a/src/shared/fdset.c b/src/shared/fdset.c
new file mode 100644
index 0000000..e5b8e92
--- /dev/null
+++ b/src/shared/fdset.c
@@ -0,0 +1,323 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stddef.h>
+
+#include "sd-daemon.h"
+
+#include "alloc-util.h"
+#include "dirent-util.h"
+#include "fd-util.h"
+#include "fdset.h"
+#include "log.h"
+#include "macro.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "set.h"
+#include "stat-util.h"
+
+/* An FDSet is a Set in disguise: these casts convert between the two views. The argument is
+ * parenthesized so that the cast applies to the whole expression passed in. */
+#define MAKE_SET(s) ((Set*) (s))
+#define MAKE_FDSET(s) ((FDSet*) (s))
+
+FDSet *fdset_new(void) {
+        /* Allocates the backing Set and returns it as FDSet (NULL if the allocation failed). */
+        Set *backing = set_new(NULL);
+
+        return MAKE_FDSET(backing);
+}
+
+static void fdset_shallow_freep(FDSet **s) {
+        /* Cleanup helper: releases only the container and, unlike fdset_free(), leaves the fds stored
+         * in it open. */
+        assert(s);
+
+        set_free(MAKE_SET(*s));
+}
+
+int fdset_new_array(FDSet **ret, const int fds[], size_t n_fds) {
+        _cleanup_(fdset_shallow_freep) FDSet *set = NULL;
+        size_t idx = 0;
+        int r;
+
+        /* Builds a new fdset out of the given array. On failure the fds in the array are left open. */
+
+        assert(ret);
+        assert(fds || n_fds == 0);
+
+        set = fdset_new();
+        if (!set)
+                return -ENOMEM;
+
+        while (idx < n_fds) {
+                r = fdset_put(set, fds[idx]);
+                if (r < 0)
+                        return r;
+
+                idx++;
+        }
+
+        *ret = TAKE_PTR(set);
+        return 0;
+}
+
+void fdset_close(FDSet *s) {
+        /* Removes every fd from the set and closes it. The set itself stays allocated. */
+
+        for (;;) {
+                void *entry;
+                int fd;
+
+                entry = set_steal_first(MAKE_SET(s));
+                if (!entry)
+                        break;
+
+                fd = PTR_TO_FD(entry);
+
+                /* The set may contain fds we do not really own (for example valgrind's, picked up via
+                 * fdset_new_fill()), hence failures of close() are deliberately ignored below. */
+
+                /* On reload, duplicates of the private bus connection fds and similar are closed here.
+                 * That has no effect as they are only duplicates, so the debug message below is
+                 * expected in that case. */
+
+                if (DEBUG_LOGGING) {
+                        _cleanup_free_ char *path = NULL;
+
+                        (void) fd_get_path(fd, &path);
+                        log_debug("Closing set fd %i (%s)", fd, strna(path));
+                }
+
+                (void) close_nointr(fd);
+        }
+}
+
+FDSet* fdset_free(FDSet *s) {
+        Set *backing = MAKE_SET(s);
+
+        /* Close all fds still tracked, then release the container. Always returns NULL. */
+        fdset_close(s);
+        set_free(backing);
+
+        return NULL;
+}
+
+int fdset_put(FDSet *s, int fd) {
+        assert(s);
+        assert(fd >= 0);
+
+        /* INT_MAX is refused, as it would overflow in FD_TO_PTR() */
+        if (fd != INT_MAX)
+                return set_put(MAKE_SET(s), FD_TO_PTR(fd));
+
+        return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Refusing invalid fd: %d", fd);
+}
+
+int fdset_consume(FDSet *s, int fd) {
+        int r;
+
+        /* Like fdset_put(), but closes the fd if it could not be added to the set. */
+
+        assert(s);
+        assert(fd >= 0);
+
+        r = fdset_put(s, fd);
+        if (r >= 0)
+                return r;
+
+        safe_close(fd);
+        return r;
+}
+
+int fdset_put_dup(FDSet *s, int fd) {
+        _cleanup_close_ int dup_fd = -EBADF;
+        int r;
+
+        /* Duplicates 'fd' (with O_CLOEXEC set, numbered 3 or above), stores the duplicate in the set and
+         * returns it. If storing fails the duplicate is closed again by the cleanup handler. */
+
+        assert(s);
+        assert(fd >= 0);
+
+        dup_fd = fcntl(fd, F_DUPFD_CLOEXEC, 3);
+        if (dup_fd < 0)
+                return -errno;
+
+        r = fdset_put(s, dup_fd);
+        if (r < 0)
+                return r;
+
+        return TAKE_FD(dup_fd);
+}
+
+bool fdset_contains(FDSet *s, int fd) {
+        assert(s);
+        assert(fd >= 0);
+
+        /* INT_MAX can never be stored, as it would overflow in FD_TO_PTR() */
+        if (fd == INT_MAX) {
+                log_debug("Refusing invalid fd: %d", fd);
+                return false;
+        }
+
+        return set_get(MAKE_SET(s), FD_TO_PTR(fd)) != NULL;
+}
+
+int fdset_remove(FDSet *s, int fd) {
+        assert(s);
+        assert(fd >= 0);
+
+        /* INT_MAX can never be stored, as it would overflow in FD_TO_PTR() */
+        if (fd == INT_MAX)
+                return log_debug_errno(SYNTHETIC_ERRNO(ENOENT), "Refusing invalid fd: %d", fd);
+
+        /* Hand the fd back to the caller if it was in the set; it is not closed here. */
+        if (!set_remove(MAKE_SET(s), FD_TO_PTR(fd)))
+                return -ENOENT;
+
+        return fd;
+}
+
+int fdset_new_fill(
+                int filter_cloexec, /* if < 0 takes all fds, otherwise only those with O_CLOEXEC set (1) or unset (0) */
+                FDSet **ret) {
+
+        _cleanup_(fdset_shallow_freep) FDSet *set = NULL;
+        _cleanup_closedir_ DIR *dir = NULL;
+        int r;
+
+        assert(ret);
+
+        /* Builds an fdset from the entries of /proc/self/fd, leaving out stdin/stdout/stderr and the fd
+         * used for the enumeration itself. All collected fds end up with CLOEXEC set. */
+
+        dir = opendir("/proc/self/fd");
+        if (!dir) {
+                if (errno == ENOENT && proc_mounted() == 0)
+                        return -ENOSYS;
+
+                return -errno;
+        }
+
+        set = fdset_new();
+        if (!set)
+                return -ENOMEM;
+
+        FOREACH_DIRENT(de, dir, return -errno) {
+                int fd;
+
+                if (!IN_SET(de->d_type, DT_LNK, DT_UNKNOWN))
+                        continue;
+
+                fd = parse_fd(de->d_name);
+                if (fd < 0)
+                        return fd;
+
+                if (fd < 3 || fd == dirfd(dir))
+                        continue;
+
+                if (filter_cloexec >= 0) {
+                        int fd_flags;
+
+                        /* The caller asked to filter by O_CLOEXEC. This allows collecting fds that were
+                         * passed in while ignoring those created locally, under the assumption that only
+                         * the latter have O_CLOEXEC set. */
+
+                        fd_flags = fcntl(fd, F_GETFD);
+                        if (fd_flags < 0)
+                                return -errno;
+
+                        if (FLAGS_SET(fd_flags, FD_CLOEXEC) != !!filter_cloexec)
+                                continue;
+                }
+
+                /* Setting CLOEXEC is only necessary if non-CLOEXEC fds may have been collected. */
+                if (filter_cloexec <= 0) {
+                        r = fd_cloexec(fd, true);
+                        if (r < 0)
+                                return r;
+                }
+
+                r = fdset_put(set, fd);
+                if (r < 0)
+                        return r;
+        }
+
+        *ret = TAKE_PTR(set);
+        return 0;
+}
+
+int fdset_cloexec(FDSet *fds, bool b) {
+        void *entry;
+
+        /* Turns CLOEXEC on or off for every fd in the set, stopping at the first failure. */
+
+        assert(fds);
+
+        SET_FOREACH(entry, MAKE_SET(fds)) {
+                int r = fd_cloexec(PTR_TO_FD(entry), b);
+
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+int fdset_new_listen_fds(FDSet **ret, bool unset) {
+        _cleanup_(fdset_shallow_freep) FDSet *set = NULL;
+        int n_fds, r;
+
+        assert(ret);
+
+        /* Builds an fdset holding the fds reported by sd_listen_fds(), which are numbered
+         * consecutively starting at SD_LISTEN_FDS_START. */
+
+        set = fdset_new();
+        if (!set)
+                return -ENOMEM;
+
+        n_fds = sd_listen_fds(unset);
+        for (int i = 0; i < n_fds; i++) {
+                r = fdset_put(set, SD_LISTEN_FDS_START + i);
+                if (r < 0)
+                        return r;
+        }
+
+        *ret = TAKE_PTR(set);
+        return 0;
+}
+
+int fdset_to_array(FDSet *fds, int **ret) {
+        unsigned filled = 0, total;
+        void *entry;
+        int *array;
+
+        /* Copies the fds of the set into a newly allocated array. Returns the number of entries; for an
+         * empty set no array is allocated and *ret is set to NULL. */
+
+        assert(ret);
+
+        total = fdset_size(fds);
+        if (total > INT_MAX) /* The count must fit into the "int" we return */
+                return -ENOMEM;
+        if (total == 0) {
+                *ret = NULL;
+                return 0;
+        }
+
+        array = new(int, total);
+        if (!array)
+                return -ENOMEM;
+
+        SET_FOREACH(entry, MAKE_SET(fds)) {
+                array[filled] = PTR_TO_FD(entry);
+                filled++;
+        }
+
+        assert(filled == total);
+
+        *ret = TAKE_PTR(array);
+        return (int) total;
+}
+
+int fdset_close_others(FDSet *fds) {
+        _cleanup_free_ int *keep = NULL;
+        int n_keep;
+
+        /* Passes the fds of the set as the exception list to close_all_fds(). */
+
+        n_keep = fdset_to_array(fds, &keep);
+        if (n_keep < 0)
+                return n_keep;
+
+        return close_all_fds(keep, n_keep);
+}
+
+unsigned fdset_size(FDSet *fds) {
+        /* Number of fds currently stored in the set. */
+        Set *backing = MAKE_SET(fds);
+
+        return set_size(backing);
+}
+
+bool fdset_isempty(FDSet *fds) {
+        /* True if the set holds no fds. */
+        Set *backing = MAKE_SET(fds);
+
+        return set_isempty(backing);
+}
+
+int fdset_iterate(FDSet *s, Iterator *i) {
+        void *entry;
+
+        /* Advances the iterator and returns the next fd, or -ENOENT once the set is exhausted. */
+
+        if (set_iterate(MAKE_SET(s), i, &entry))
+                return PTR_TO_FD(entry);
+
+        return -ENOENT;
+}
+
+int fdset_steal_first(FDSet *fds) {
+        void *entry = set_steal_first(MAKE_SET(fds));
+
+        /* Removes one fd from the set and returns it without closing it; -ENOENT if the set is empty. */
+
+        if (entry)
+                return PTR_TO_FD(entry);
+
+        return -ENOENT;
+}
diff --git a/src/shared/fdset.h b/src/shared/fdset.h
new file mode 100644
index 0000000..70a764f
--- /dev/null
+++ b/src/shared/fdset.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "hashmap.h"
+#include "macro.h"
+#include "set.h"
+
+/* Opaque set of file descriptors. */
+typedef struct FDSet FDSet;
+
+FDSet* fdset_new(void);
+FDSet* fdset_free(FDSet *s);
+
+int fdset_put(FDSet *s, int fd);
+int fdset_consume(FDSet *s, int fd);
+int fdset_put_dup(FDSet *s, int fd);
+
+bool fdset_contains(FDSet *s, int fd);
+int fdset_remove(FDSet *s, int fd);
+
+int fdset_new_array(FDSet **ret, const int *fds, size_t n_fds);
+int fdset_new_fill(int filter_cloexec, FDSet **ret);
+/* Creates a set from the descriptors counted by sd_listen_fds(unset), starting at SD_LISTEN_FDS_START. */
+int fdset_new_listen_fds(FDSet **ret, bool unset);
+
+/* Calls fd_cloexec(fd, b) on each descriptor in the set; returns 0 or the first negative result. */
+int fdset_cloexec(FDSet *fds, bool b);
+
+/* Stores a newly allocated array of the set's descriptors in *ret (NULL if the set is empty) and
+ * returns the number of entries, or a negative errno-style value on failure. */
+int fdset_to_array(FDSet *fds, int **ret);
+
+int fdset_close_others(FDSet *fds);
+
+unsigned fdset_size(FDSet *fds);
+bool fdset_isempty(FDSet *fds);
+
+/* Returns the next descriptor for iterator 'i', or -ENOENT when there is none. */
+int fdset_iterate(FDSet *s, Iterator *i);
+
+/* Returns a descriptor taken via set_steal_first(), or -ENOENT if there is none. */
+int fdset_steal_first(FDSet *fds);
+
+void fdset_close(FDSet *fds);
+
+/* Loops over the descriptors in 'fds', assigning each one to 'fd'. The loop ends as soon as
+ * fdset_iterate() returns a negative value. FDSET_FOREACH() passes a UNIQ_T()-generated name for
+ * the loop-local iterator variable. */
+#define _FDSET_FOREACH(fd, fds, i) \
+ for (Iterator i = ITERATOR_FIRST; ((fd) = fdset_iterate((fds), &i)) >= 0; )
+#define FDSET_FOREACH(fd, fds) \
+ _FDSET_FOREACH(fd, fds, UNIQ_T(i, UNIQ))
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(FDSet*, fdset_free);
+#define _cleanup_fdset_free_ _cleanup_(fdset_freep)
diff --git a/src/shared/fileio-label.c b/src/shared/fileio-label.c
new file mode 100644
index 0000000..572b8f6
--- /dev/null
+++ b/src/shared/fileio-label.c
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/stat.h>
+
+#include "fileio-label.h"
+#include "fileio.h"
+#include "selinux-util.h"
+
+/* Writes 'line' to file 'fn' via write_string_file_ts() with the CREATE and ATOMIC flags set,
+ * bracketed by mac_selinux_create_file_prepare()/mac_selinux_create_file_clear(). 'ts' is passed
+ * through to write_string_file_ts() unchanged. Returns the negative result of the prepare step
+ * if that fails, otherwise the result of the write. */
+int write_string_file_atomic_label_ts(const char *fn, const char *line, struct timespec *ts) {
+ int k;
+
+ k = mac_selinux_create_file_prepare(fn, S_IFREG);
+ if (k >= 0) {
+ k = write_string_file_ts(fn, line, WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_ATOMIC, ts);
+
+ /* Undo the prepare step regardless of whether writing worked */
+ mac_selinux_create_file_clear();
+ }
+
+ return k;
+}
+
+int create_shutdown_run_nologin_or_warn(void) {
+ int r;
+
+ /* This is used twice: once in systemd-user-sessions.service, in order to block logins when we
+ * actually go down, and once in systemd-logind.service when shutdowns are scheduled, and logins are
+ * to be turned off a bit in advance. We use the same wording of the message in both cases.
+ *
+ * Traditionally, there was only /etc/nologin, and we managed that. Then, in PAM 1.1
+ * support for /run/nologin was added as alternative
+ * (https://github.com/linux-pam/linux-pam/commit/e9e593f6ddeaf975b7fe8446d184e6bc387d450b).
+ * 13 years later we stopped managing /etc/nologin, leaving it for the administrator to manage.
+ */
+
+ r = write_string_file_atomic_label("/run/nologin",
+ "System is going down. Unprivileged users are not permitted to log in anymore. "
+ "For technical details, see pam_nologin(8).");
+ if (r < 0)
+ return log_error_errno(r, "Failed to create /run/nologin: %m");
+
+ return 0;
+}
diff --git a/src/shared/fileio-label.h b/src/shared/fileio-label.h
new file mode 100644
index 0000000..03b4a16
--- /dev/null
+++ b/src/shared/fileio-label.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdio.h>
+
+/* These functions are split out of fileio.h (and not for example just flags to the functions they wrap) in order to
+ * optimize linking: This way, -lselinux is needed only for the callers of these functions that need selinux, but not
+ * for all */
+
+/* Writes 'line' to 'fn'; 'ts' is forwarded to the underlying write helper. */
+int write_string_file_atomic_label_ts(const char *fn, const char *line, struct timespec *ts);
+/* Same as write_string_file_atomic_label_ts(), with NULL passed for 'ts'. */
+static inline int write_string_file_atomic_label(const char *fn, const char *line) {
+ return write_string_file_atomic_label_ts(fn, line, NULL);
+}
+
+/* Creates /run/nologin; logs on failure. */
+int create_shutdown_run_nologin_or_warn(void);
diff --git a/src/shared/find-esp.c b/src/shared/find-esp.c
new file mode 100644
index 0000000..db87084
--- /dev/null
+++ b/src/shared/find-esp.c
@@ -0,0 +1,909 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <linux/magic.h>
+#include <sys/vfs.h>
+
+#include "sd-device.h"
+#include "sd-id128.h"
+
+#include "alloc-util.h"
+#include "blkid-util.h"
+#include "btrfs-util.h"
+#include "chase.h"
+#include "device-util.h"
+#include "devnum-util.h"
+#include "env-util.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "find-esp.h"
+#include "gpt.h"
+#include "mount-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "stat-util.h"
+#include "string-util.h"
+#include "virt.h"
+
+/* Bit flags steering the verification done by the verify_esp*() and verify_xbootldr*() helpers
+ * in this file. */
+typedef enum VerifyESPFlags {
+ VERIFY_ESP_SEARCHING = 1 << 0, /* Downgrade various "not found" logs to debug level */
+ VERIFY_ESP_UNPRIVILEGED_MODE = 1 << 1, /* Call into udev rather than blkid */
+ VERIFY_ESP_SKIP_FSTYPE_CHECK = 1 << 2, /* Skip filesystem check */
+ VERIFY_ESP_SKIP_DEVICE_CHECK = 1 << 3, /* Skip device node check */
+} VerifyESPFlags;
+
+/* Computes the initial verification flags.
+ *
+ * unprivileged_mode: > 0 forces unprivileged mode, 0 forces privileged mode, < 0 selects
+ * unprivileged mode whenever the effective UID is not 0.
+ * env_name_for_relaxing: name of a boolean environment variable; if it parses as true both the
+ * file system type check and the device check are skipped. A parse failure other than -ENXIO
+ * is logged at debug level and otherwise ignored.
+ *
+ * Independently of the above, the device check is skipped if detect_container() returns a
+ * positive value. */
+static VerifyESPFlags verify_esp_flags_init(int unprivileged_mode, const char *env_name_for_relaxing) {
+ VerifyESPFlags result = 0;
+ bool unprivileged;
+ int relax;
+
+ assert(env_name_for_relaxing);
+
+ unprivileged = unprivileged_mode < 0 ? geteuid() != 0 : unprivileged_mode != 0;
+ if (unprivileged)
+ result |= VERIFY_ESP_UNPRIVILEGED_MODE;
+
+ relax = getenv_bool(env_name_for_relaxing);
+ if (relax > 0)
+ result |= VERIFY_ESP_SKIP_FSTYPE_CHECK | VERIFY_ESP_SKIP_DEVICE_CHECK;
+ else if (relax < 0 && relax != -ENXIO)
+ log_debug_errno(relax, "Failed to parse $%s environment variable, assuming false.", env_name_for_relaxing);
+
+ if (detect_container() > 0)
+ result |= VERIFY_ESP_SKIP_DEVICE_CHECK;
+
+ return result;
+}
+
+/* Verifies via libblkid that block device 'devid' carries a "vfat" file system on a "gpt"
+ * partition whose type matches SD_GPT_ESP, and extracts partition number, offset, size and UUID
+ * from the probe results.
+ *
+ * Each of the ret_* pointers may be NULL. When built without HAVE_BLKID nothing is probed and
+ * zeroes/SD_ID128_NULL are returned. If VERIFY_ESP_SEARCHING is set, "this does not look like an
+ * ESP" outcomes are logged at debug level and reported as -EADDRNOTAVAIL rather than -ENODEV.
+ *
+ * Returns 0 on success, a negative errno-style value on failure. */
+static int verify_esp_blkid(
+ dev_t devid,
+ VerifyESPFlags flags,
+ uint32_t *ret_part,
+ uint64_t *ret_pstart,
+ uint64_t *ret_psize,
+ sd_id128_t *ret_uuid) {
+
+ sd_id128_t uuid = SD_ID128_NULL;
+ uint64_t pstart = 0, psize = 0;
+ uint32_t part = 0;
+
+#if HAVE_BLKID
+ _cleanup_(blkid_free_probep) blkid_probe b = NULL;
+ _cleanup_free_ char *node = NULL;
+ bool searching = FLAGS_SET(flags, VERIFY_ESP_SEARCHING);
+ const char *v;
+ int r;
+
+ r = devname_from_devnum(S_IFBLK, devid, &node);
+ if (r < 0)
+ return log_error_errno(r, "Failed to get device path for " DEVNUM_FORMAT_STR ": %m", DEVNUM_FORMAT_VAL(devid));
+
+ errno = 0;
+ b = blkid_new_probe_from_filename(node);
+ if (!b)
+ return log_error_errno(errno ?: SYNTHETIC_ERRNO(ENOMEM), "Failed to open file system \"%s\": %m", node);
+
+ blkid_probe_enable_superblocks(b, 1);
+ blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);
+ blkid_probe_enable_partitions(b, 1);
+ blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
+
+ /* Use the named safeprobe results, like verify_xbootldr_blkid() does, instead of bare numbers */
+ errno = 0;
+ r = blkid_do_safeprobe(b);
+ if (r == _BLKID_SAFEPROBE_AMBIGUOUS)
+ return log_error_errno(SYNTHETIC_ERRNO(ENODEV), "File system \"%s\" is ambiguous.", node);
+ if (r == _BLKID_SAFEPROBE_NOT_FOUND)
+ return log_error_errno(SYNTHETIC_ERRNO(ENODEV), "File system \"%s\" does not contain a label.", node);
+ if (r != _BLKID_SAFEPROBE_FOUND)
+ return log_error_errno(errno ?: SYNTHETIC_ERRNO(EIO), "Failed to probe file system \"%s\": %m", node);
+
+ r = blkid_probe_lookup_value(b, "TYPE", &v, NULL);
+ if (r != 0)
+ return log_full_errno(searching ? LOG_DEBUG : LOG_ERR,
+ SYNTHETIC_ERRNO(searching ? EADDRNOTAVAIL : ENODEV),
+ "No filesystem found on \"%s\": %m", node);
+ if (!streq(v, "vfat"))
+ return log_full_errno(searching ? LOG_DEBUG : LOG_ERR,
+ SYNTHETIC_ERRNO(searching ? EADDRNOTAVAIL : ENODEV),
+ "File system \"%s\" is not FAT.", node);
+
+ r = blkid_probe_lookup_value(b, "PART_ENTRY_SCHEME", &v, NULL);
+ if (r != 0)
+ return log_full_errno(searching ? LOG_DEBUG : LOG_ERR,
+ SYNTHETIC_ERRNO(searching ? EADDRNOTAVAIL : ENODEV),
+ "File system \"%s\" is not located on a partitioned block device.", node);
+ if (!streq(v, "gpt"))
+ return log_full_errno(searching ? LOG_DEBUG : LOG_ERR,
+ SYNTHETIC_ERRNO(searching ? EADDRNOTAVAIL : ENODEV),
+ "File system \"%s\" is not on a GPT partition table.", node);
+
+ errno = 0;
+ r = blkid_probe_lookup_value(b, "PART_ENTRY_TYPE", &v, NULL);
+ if (r != 0)
+ /* The fallback is a made-up error, hence mark it synthetic like all the other lookups here do */
+ return log_error_errno(errno ?: SYNTHETIC_ERRNO(EIO), "Failed to probe partition type UUID of \"%s\": %m", node);
+ if (sd_id128_string_equal(v, SD_GPT_ESP) <= 0)
+ return log_full_errno(searching ? LOG_DEBUG : LOG_ERR,
+ SYNTHETIC_ERRNO(searching ? EADDRNOTAVAIL : ENODEV),
+ "File system \"%s\" has wrong type for an EFI System Partition (ESP).", node);
+
+ errno = 0;
+ r = blkid_probe_lookup_value(b, "PART_ENTRY_UUID", &v, NULL);
+ if (r != 0)
+ return log_error_errno(errno ?: SYNTHETIC_ERRNO(EIO), "Failed to probe partition entry UUID of \"%s\": %m", node);
+ r = sd_id128_from_string(v, &uuid);
+ if (r < 0)
+ return log_error_errno(r, "Partition \"%s\" has invalid UUID \"%s\".", node, v);
+
+ errno = 0;
+ r = blkid_probe_lookup_value(b, "PART_ENTRY_NUMBER", &v, NULL);
+ if (r != 0)
+ return log_error_errno(errno ?: SYNTHETIC_ERRNO(EIO), "Failed to probe partition number of \"%s\": %m", node);
+ r = safe_atou32(v, &part);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse PART_ENTRY_NUMBER field.");
+
+ errno = 0;
+ r = blkid_probe_lookup_value(b, "PART_ENTRY_OFFSET", &v, NULL);
+ if (r != 0)
+ return log_error_errno(errno ?: SYNTHETIC_ERRNO(EIO), "Failed to probe partition offset of \"%s\": %m", node);
+ r = safe_atou64(v, &pstart);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse PART_ENTRY_OFFSET field.");
+
+ errno = 0;
+ r = blkid_probe_lookup_value(b, "PART_ENTRY_SIZE", &v, NULL);
+ if (r != 0)
+ return log_error_errno(errno ?: SYNTHETIC_ERRNO(EIO), "Failed to probe partition size of \"%s\": %m", node);
+ r = safe_atou64(v, &psize);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse PART_ENTRY_SIZE field.");
+#endif
+
+ if (ret_part)
+ *ret_part = part;
+ if (ret_pstart)
+ *ret_pstart = pstart;
+ if (ret_psize)
+ *ret_psize = psize;
+ if (ret_uuid)
+ *ret_uuid = uuid;
+
+ return 0;
+}
+
+/* Counterpart of verify_esp_blkid() that reads the partition metadata from the properties of the
+ * sd_device object for block device 'devid' instead of probing the device directly.
+ *
+ * Checks that ID_FS_TYPE is "vfat", ID_PART_ENTRY_SCHEME is "gpt" and ID_PART_ENTRY_TYPE matches
+ * SD_GPT_ESP, then parses ID_PART_ENTRY_UUID, _NUMBER, _OFFSET and _SIZE. Each of the ret_*
+ * pointers may be NULL.
+ *
+ * If VERIFY_ESP_SEARCHING is set, mismatches are logged at debug level and reported as
+ * -EADDRNOTAVAIL rather than -ENODEV. Returns 0 on success, a negative errno-style value on
+ * failure. */
+static int verify_esp_udev(
+ dev_t devid,
+ VerifyESPFlags flags,
+ uint32_t *ret_part,
+ uint64_t *ret_pstart,
+ uint64_t *ret_psize,
+ sd_id128_t *ret_uuid) {
+
+ bool searching = FLAGS_SET(flags, VERIFY_ESP_SEARCHING);
+ _cleanup_(sd_device_unrefp) sd_device *d = NULL;
+ sd_id128_t uuid = SD_ID128_NULL;
+ uint64_t pstart = 0, psize = 0;
+ uint32_t part = 0;
+ const char *node, *v;
+ int r;
+
+ r = sd_device_new_from_devnum(&d, 'b', devid);
+ if (r < 0)
+ return log_error_errno(r, "Failed to get device from device number: %m");
+
+ r = sd_device_get_devname(d, &node);
+ if (r < 0)
+ return log_device_error_errno(d, r, "Failed to get device node: %m");
+
+ r = sd_device_get_property_value(d, "ID_FS_TYPE", &v);
+ if (r < 0)
+ return log_device_error_errno(d, r, "Failed to get device property: %m");
+ if (!streq(v, "vfat"))
+ return log_device_full_errno(d,
+ searching ? LOG_DEBUG : LOG_ERR,
+ SYNTHETIC_ERRNO(searching ? EADDRNOTAVAIL : ENODEV),
+ "File system \"%s\" is not FAT.", node );
+
+ /* A missing partition scheme property (-ENOENT) while searching is treated as "not an ESP" */
+ r = sd_device_get_property_value(d, "ID_PART_ENTRY_SCHEME", &v);
+ if (r < 0)
+ return log_device_full_errno(d,
+ searching && r == -ENOENT ? LOG_DEBUG : LOG_ERR,
+ searching && r == -ENOENT ? SYNTHETIC_ERRNO(EADDRNOTAVAIL) : r,
+ "Failed to get device property: %m");
+ if (!streq(v, "gpt"))
+ return log_device_full_errno(d,
+ searching ? LOG_DEBUG : LOG_ERR,
+ SYNTHETIC_ERRNO(searching ? EADDRNOTAVAIL : ENODEV),
+ "File system \"%s\" is not on a GPT partition table.", node);
+
+ r = sd_device_get_property_value(d, "ID_PART_ENTRY_TYPE", &v);
+ if (r < 0)
+ return log_device_error_errno(d, r, "Failed to get device property: %m");
+ if (sd_id128_string_equal(v, SD_GPT_ESP) <= 0)
+ return log_device_full_errno(d,
+ searching ? LOG_DEBUG : LOG_ERR,
+ SYNTHETIC_ERRNO(searching ? EADDRNOTAVAIL : ENODEV),
+ "File system \"%s\" has wrong type for an EFI System Partition (ESP).", node);
+
+ r = sd_device_get_property_value(d, "ID_PART_ENTRY_UUID", &v);
+ if (r < 0)
+ return log_device_error_errno(d, r, "Failed to get device property: %m");
+ r = sd_id128_from_string(v, &uuid);
+ if (r < 0)
+ return log_device_error_errno(d, r, "Partition \"%s\" has invalid UUID \"%s\".", node, v);
+
+ r = sd_device_get_property_value(d, "ID_PART_ENTRY_NUMBER", &v);
+ if (r < 0)
+ return log_device_error_errno(d, r, "Failed to get device property: %m");
+ r = safe_atou32(v, &part);
+ if (r < 0)
+ return log_device_error_errno(d, r, "Failed to parse PART_ENTRY_NUMBER field.");
+
+ r = sd_device_get_property_value(d, "ID_PART_ENTRY_OFFSET", &v);
+ if (r < 0)
+ return log_device_error_errno(d, r, "Failed to get device property: %m");
+ r = safe_atou64(v, &pstart);
+ if (r < 0)
+ return log_device_error_errno(d, r, "Failed to parse PART_ENTRY_OFFSET field.");
+
+ r = sd_device_get_property_value(d, "ID_PART_ENTRY_SIZE", &v);
+ if (r < 0)
+ return log_device_error_errno(d, r, "Failed to get device property: %m");
+ r = safe_atou64(v, &psize);
+ if (r < 0)
+ return log_device_error_errno(d, r, "Failed to parse PART_ENTRY_SIZE field.");
+
+ /* Only write the output parameters once everything was validated */
+ if (ret_part)
+ *ret_part = part;
+ if (ret_pstart)
+ *ret_pstart = pstart;
+ if (ret_psize)
+ *ret_psize = psize;
+ if (ret_uuid)
+ *ret_uuid = uuid;
+
+ return 0;
+}
+
+/* Verifies that the last component of 'path', looked up relative to 'dir_fd', is the root
+ * directory of its file system.
+ *
+ * dir_fd: directory the final component of 'path' is resolved against.
+ * path: full path; used for log messages, and its last component is what gets checked.
+ * flags: VERIFY_ESP_SEARCHING and VERIFY_ESP_UNPRIVILEGED_MODE downgrade some log messages to
+ * debug level; with VERIFY_ESP_SEARCHING a "not the root" result is returned as
+ * -EADDRNOTAVAIL instead of -ENODEV.
+ * ret_dev: if non-NULL, receives the device number of the directory. If statx reports a zero
+ * major number, btrfs_get_block_device_at() is asked for the device instead.
+ *
+ * Returns 0 on success, a negative errno-style value on failure. */
+static int verify_fsroot_dir(
+ int dir_fd,
+ const char *path,
+ VerifyESPFlags flags,
+ dev_t *ret_dev) {
+
+ bool searching = FLAGS_SET(flags, VERIFY_ESP_SEARCHING),
+ unprivileged_mode = FLAGS_SET(flags, VERIFY_ESP_UNPRIVILEGED_MODE);
+ _cleanup_free_ char *f = NULL;
+ STRUCT_NEW_STATX_DEFINE(sxa);
+ STRUCT_NEW_STATX_DEFINE(sxb);
+ int r;
+
+ /* Checks if the specified directory is at the root of its file system, and returns device
+ * major/minor of the device, if it is. */
+
+ assert(dir_fd >= 0);
+ assert(path);
+
+ /* We pass the full path from the root directory file descriptor so we can use it for logging, but
+ * dir_fd points to the parent directory of the final component of the given path, so we extract the
+ * filename and operate on that. */
+
+ /* -EADDRNOTAVAIL is tolerated here; 'f' then stays NULL and dir_fd itself is examined below
+ * via AT_EMPTY_PATH */
+ r = path_extract_filename(path, &f);
+ if (r < 0 && r != -EADDRNOTAVAIL)
+ return log_error_errno(r, "Failed to extract filename of %s: %m", path);
+
+ r = statx_fallback(dir_fd, strempty(f), AT_SYMLINK_NOFOLLOW|(isempty(f) ? AT_EMPTY_PATH : 0),
+ STATX_TYPE|STATX_INO|STATX_MNT_ID, &sxa.sx);
+ if (r < 0)
+ return log_full_errno((searching && r == -ENOENT) ||
+ (unprivileged_mode && ERRNO_IS_PRIVILEGE(r)) ? LOG_DEBUG : LOG_ERR, r,
+ "Failed to determine block device node of \"%s\": %m", path);
+
+ assert(S_ISDIR(sxa.sx.stx_mode)); /* We used O_DIRECTORY above, when opening, so this must hold */
+
+ if (FLAGS_SET(sxa.sx.stx_attributes_mask, STATX_ATTR_MOUNT_ROOT)) {
+
+ /* If we have STATX_ATTR_MOUNT_ROOT, we are happy, that's all we need. We operate under the
+ * assumption that a top of a mount point is also the top of the file system. (Which of
+ * course is strictly speaking not always true...) */
+
+ if (!FLAGS_SET(sxa.sx.stx_attributes, STATX_ATTR_MOUNT_ROOT))
+ return log_full_errno(searching ? LOG_DEBUG : LOG_ERR,
+ SYNTHETIC_ERRNO(searching ? EADDRNOTAVAIL : ENODEV),
+ "Directory \"%s\" is not the root of the file system.", path);
+
+ goto success;
+ }
+
+ /* Now let's look at the parent */
+ r = statx_fallback(dir_fd, "", AT_EMPTY_PATH, STATX_TYPE|STATX_INO|STATX_MNT_ID, &sxb.sx);
+ if (r < 0)
+ return log_full_errno(unprivileged_mode && ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_ERR, r,
+ "Failed to determine block device node of parent of \"%s\": %m", path);
+
+ if (statx_inode_same(&sxa.sx, &sxb.sx)) /* for the root dir inode nr for both inodes will be the same */
+ goto success;
+
+ /* Directory and its parent are on the same mount, hence the directory is no mount point */
+ if (statx_mount_same(&sxa.nsx, &sxb.nsx))
+ return log_full_errno(searching ? LOG_DEBUG : LOG_ERR,
+ SYNTHETIC_ERRNO(searching ? EADDRNOTAVAIL : ENODEV),
+ "Directory \"%s\" is not the root of the file system.", path);
+
+success:
+ if (!ret_dev)
+ return 0;
+
+ if (sxa.sx.stx_dev_major == 0) /* Hmm, maybe a btrfs device, and the caller asked for the backing device? Then let's try to get it. */
+ return btrfs_get_block_device_at(dir_fd, strempty(f), ret_dev);
+
+ *ret_dev = makedev(sxa.sx.stx_dev_major, sxa.sx.stx_dev_minor);
+ return 0;
+}
+
+/* Verifies that 'path' (resolved relative to 'rfd', which is treated as root directory) is the
+ * mount point of an EFI System Partition.
+ *
+ * Unless VERIFY_ESP_SKIP_FSTYPE_CHECK is set the file system type must be MSDOS_SUPER_MAGIC. The
+ * directory must be the root of its file system. Unless VERIFY_ESP_SKIP_DEVICE_CHECK is set the
+ * backing block device is examined too: via udev if VERIFY_ESP_UNPRIVILEGED_MODE is set, via
+ * blkid otherwise.
+ *
+ * All ret_* parameters are optional. On success ret_path receives the resolved path. If the
+ * device check is skipped the remaining outputs are set to zero/SD_ID128_NULL.
+ *
+ * Returns 0 on success, a negative errno-style value on failure. */
+static int verify_esp(
+ int rfd,
+ const char *path,
+ char **ret_path,
+ uint32_t *ret_part,
+ uint64_t *ret_pstart,
+ uint64_t *ret_psize,
+ sd_id128_t *ret_uuid,
+ dev_t *ret_devid,
+ VerifyESPFlags flags) {
+
+ bool searching = FLAGS_SET(flags, VERIFY_ESP_SEARCHING),
+ unprivileged_mode = FLAGS_SET(flags, VERIFY_ESP_UNPRIVILEGED_MODE);
+ _cleanup_free_ char *p = NULL;
+ _cleanup_close_ int pfd = -EBADF;
+ dev_t devid = 0;
+ int r;
+
+ assert(rfd >= 0 || rfd == AT_FDCWD);
+ assert(path);
+
+ /* This logs about all errors, except:
+ *
+ * -ENOENT → if 'searching' is set, and the dir doesn't exist
+ * -EADDRNOTAVAIL → if 'searching' is set, and the dir doesn't look like an ESP
+ * -EACCES → if 'unprivileged_mode' is set, and we have trouble accessing the thing
+ */
+
+ /* Non-root user can only check the status, so if an error occurred in the following, it does not cause any
+ * issues. Let's also, silence the error messages. */
+
+ r = chaseat(rfd, path, CHASE_AT_RESOLVE_IN_ROOT|CHASE_PARENT, &p, &pfd);
+ if (r < 0)
+ return log_full_errno((searching && r == -ENOENT) ||
+ (unprivileged_mode && ERRNO_IS_PRIVILEGE(r)) ? LOG_DEBUG : LOG_ERR,
+ r, "Failed to open parent directory of \"%s\": %m", path);
+
+ if (!FLAGS_SET(flags, VERIFY_ESP_SKIP_FSTYPE_CHECK)) {
+ _cleanup_free_ char *f = NULL;
+ struct statfs sfs;
+
+ r = path_extract_filename(p, &f);
+ if (r < 0 && r != -EADDRNOTAVAIL)
+ return log_error_errno(r, "Failed to extract filename of %s: %m", p);
+
+ /* Trigger any automounts so that xstatfsat() operates on the mount instead of the mountpoint
+ * directory. */
+ r = trigger_automount_at(pfd, f);
+ if (r < 0)
+ return log_error_errno(r, "Failed to trigger automount at %s: %m", p);
+
+ r = xstatfsat(pfd, strempty(f), &sfs);
+ if (r < 0)
+ /* If we are searching for the mount point, don't generate a log message if we can't
+ * find the path. Privilege errors in unprivileged mode are classified with
+ * ERRNO_IS_PRIVILEGE(), the same way as for the chaseat() call above. */
+ return log_full_errno((searching && r == -ENOENT) ||
+ (unprivileged_mode && ERRNO_IS_PRIVILEGE(r)) ? LOG_DEBUG : LOG_ERR, r,
+ "Failed to check file system type of \"%s\": %m", p);
+
+ if (!F_TYPE_EQUAL(sfs.f_type, MSDOS_SUPER_MAGIC))
+ return log_full_errno(searching ? LOG_DEBUG : LOG_ERR,
+ SYNTHETIC_ERRNO(searching ? EADDRNOTAVAIL : ENODEV),
+ "File system \"%s\" is not a FAT EFI System Partition (ESP) file system.", p);
+ }
+
+ r = verify_fsroot_dir(pfd, p, flags, FLAGS_SET(flags, VERIFY_ESP_SKIP_DEVICE_CHECK) ? NULL : &devid);
+ if (r < 0)
+ return r;
+
+ /* In a container we don't have access to block devices, skip this part of the verification, we trust
+ * the container manager set everything up correctly on its own. */
+ if (FLAGS_SET(flags, VERIFY_ESP_SKIP_DEVICE_CHECK))
+ goto finish;
+
+ if (devnum_is_zero(devid))
+ return log_full_errno(searching ? LOG_DEBUG : LOG_ERR,
+ SYNTHETIC_ERRNO(searching ? EADDRNOTAVAIL : ENODEV),
+ "Could not determine backing block device of directory \"%s\" (btrfs RAID?).", p);
+
+ /* If we are unprivileged we ask udev for the metadata about the partition. If we are privileged we
+ * use blkid instead. Why? Because this code is called from 'bootctl' which is pretty much an
+ * emergency recovery tool that should also work when udev isn't up (i.e. from the emergency shell),
+ * however blkid can't work if we have no privileges to access block devices directly, which is why
+ * we use udev in that case. */
+ if (unprivileged_mode)
+ r = verify_esp_udev(devid, flags, ret_part, ret_pstart, ret_psize, ret_uuid);
+ else
+ r = verify_esp_blkid(devid, flags, ret_part, ret_pstart, ret_psize, ret_uuid);
+ if (r < 0)
+ return r;
+
+ if (ret_path)
+ *ret_path = TAKE_PTR(p);
+ if (ret_devid)
+ *ret_devid = devid;
+
+ return 0;
+
+finish:
+ /* Device check skipped: report the path, and neutral values for everything else */
+ if (ret_path)
+ *ret_path = TAKE_PTR(p);
+ if (ret_part)
+ *ret_part = 0;
+ if (ret_pstart)
+ *ret_pstart = 0;
+ if (ret_psize)
+ *ret_psize = 0;
+ if (ret_uuid)
+ *ret_uuid = SD_ID128_NULL;
+ if (ret_devid)
+ *ret_devid = 0;
+
+ return 0;
+}
+
+/* Locates the EFI System Partition below the root directory referenced by 'rfd'.
+ *
+ * If 'path' is non-NULL only that path is verified. Otherwise, if $SYSTEMD_ESP_PATH is set, that
+ * path is used after checking merely that it is absolute and refers to a directory. Otherwise
+ * "/efi", "/boot" and "/boot/efi" are tried in this order and the first one that verifies is
+ * used.
+ *
+ * 'unprivileged_mode' is passed on to verify_esp_flags_init(). All ret_* parameters are optional.
+ * Returns 0 on success, -ENOKEY if none of the default locations qualified, or another negative
+ * errno-style value on failure. */
+int find_esp_and_warn_at(
+ int rfd,
+ const char *path,
+ int unprivileged_mode,
+ char **ret_path,
+ uint32_t *ret_part,
+ uint64_t *ret_pstart,
+ uint64_t *ret_psize,
+ sd_id128_t *ret_uuid,
+ dev_t *ret_devid) {
+
+ VerifyESPFlags flags;
+ int r;
+
+ /* This logs about all errors except:
+ *
+ * -ENOKEY → when we can't find the partition
+ * -EACCES → when unprivileged_mode is true, and we can't access something
+ */
+
+ assert(rfd >= 0 || rfd == AT_FDCWD);
+
+ flags = verify_esp_flags_init(unprivileged_mode, "SYSTEMD_RELAX_ESP_CHECKS");
+
+ if (path)
+ return verify_esp(rfd, path, ret_path, ret_part, ret_pstart, ret_psize, ret_uuid, ret_devid, flags);
+
+ path = getenv("SYSTEMD_ESP_PATH");
+ if (path) {
+ _cleanup_free_ char *p = NULL;
+ _cleanup_close_ int fd = -EBADF;
+ struct stat st;
+
+ if (!path_is_valid(path) || !path_is_absolute(path))
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+ "$SYSTEMD_ESP_PATH does not refer to an absolute path, refusing to use it: %s",
+ path);
+
+ r = chaseat(rfd, path, CHASE_AT_RESOLVE_IN_ROOT, &p, &fd);
+ if (r < 0)
+ return log_error_errno(r, "Failed to resolve path %s: %m", path);
+
+ /* Note: when the user explicitly configured things with an env var we won't validate the
+ * path beyond checking it refers to a directory. After all we want this to be useful for
+ * testing. */
+
+ if (fstat(fd, &st) < 0)
+ return log_error_errno(errno, "Failed to stat '%s': %m", p);
+ if (!S_ISDIR(st.st_mode))
+ return log_error_errno(SYNTHETIC_ERRNO(ENOTDIR), "ESP path '%s' is not a directory.", p);
+
+ /* No partition metadata is collected on this code path, hence return neutral values */
+ if (ret_path)
+ *ret_path = TAKE_PTR(p);
+ if (ret_part)
+ *ret_part = 0;
+ if (ret_pstart)
+ *ret_pstart = 0;
+ if (ret_psize)
+ *ret_psize = 0;
+ if (ret_uuid)
+ *ret_uuid = SD_ID128_NULL;
+ if (ret_devid)
+ *ret_devid = st.st_dev;
+
+ return 0;
+ }
+
+ FOREACH_STRING(dir, "/efi", "/boot", "/boot/efi") {
+ r = verify_esp(rfd, dir, ret_path, ret_part, ret_pstart, ret_psize, ret_uuid, ret_devid,
+ flags | VERIFY_ESP_SEARCHING);
+ if (r >= 0)
+ return 0;
+ if (!IN_SET(r, -ENOENT, -EADDRNOTAVAIL, -ENOTDIR, -ENOTTY)) /* This one is not it */
+ return r;
+ }
+
+ /* No logging here */
+ return -ENOKEY;
+}
+
+/* Path-based wrapper around find_esp_and_warn_at(): opens 'root' (via empty_to_root()) as an
+ * O_PATH directory fd and searches below it. The path that was found is passed through
+ * chaseat_prefix_root() together with 'root' before it is returned in *ret_path.
+ *
+ * All ret_* parameters are optional and are written only if everything succeeded. Returns 0 on
+ * success, -errno if 'root' cannot be opened, or the negative result of a failing helper. */
+int find_esp_and_warn(
+ const char *root,
+ const char *path,
+ int unprivileged_mode,
+ char **ret_path,
+ uint32_t *ret_part,
+ uint64_t *ret_pstart,
+ uint64_t *ret_psize,
+ sd_id128_t *ret_uuid,
+ dev_t *ret_devid) {
+
+ _cleanup_close_ int root_fd = -EBADF;
+ _cleanup_free_ char *found_path = NULL;
+ uint64_t found_pstart, found_psize;
+ uint32_t found_part;
+ sd_id128_t found_uuid;
+ dev_t found_devid;
+ int k;
+
+ root_fd = open(empty_to_root(root), O_PATH|O_DIRECTORY|O_CLOEXEC);
+ if (root_fd < 0)
+ return -errno;
+
+ /* Collect the results in locals first, so the caller's variables stay untouched on failure */
+ k = find_esp_and_warn_at(
+ root_fd,
+ path,
+ unprivileged_mode,
+ ret_path ? &found_path : NULL,
+ ret_part ? &found_part : NULL,
+ ret_pstart ? &found_pstart : NULL,
+ ret_psize ? &found_psize : NULL,
+ ret_uuid ? &found_uuid : NULL,
+ ret_devid ? &found_devid : NULL);
+ if (k < 0)
+ return k;
+
+ if (ret_path) {
+ k = chaseat_prefix_root(found_path, root, ret_path);
+ if (k < 0)
+ return k;
+ }
+
+ if (ret_part)
+ *ret_part = found_part;
+
+ if (ret_pstart)
+ *ret_pstart = found_pstart;
+
+ if (ret_psize)
+ *ret_psize = found_psize;
+
+ if (ret_uuid)
+ *ret_uuid = found_uuid;
+
+ if (ret_devid)
+ *ret_devid = found_devid;
+
+ return 0;
+}
+
+/* Verifies via libblkid that block device 'devid' is an XBOOTLDR partition: either a "gpt"
+ * partition whose type matches SD_GPT_XBOOTLDR, or a "dos" partition of type "0xea". For GPT the
+ * partition UUID is parsed and returned in *ret_uuid (optional); for DOS, and when built without
+ * HAVE_BLKID, SD_ID128_NULL is returned there.
+ *
+ * If VERIFY_ESP_SEARCHING is set, "this is not XBOOTLDR" outcomes are logged at debug level and
+ * reported as -EADDRNOTAVAIL. Returns 0 on success, a negative errno-style value on failure. */
+static int verify_xbootldr_blkid(
+ dev_t devid,
+ VerifyESPFlags flags,
+ sd_id128_t *ret_uuid) {
+
+ sd_id128_t uuid = SD_ID128_NULL;
+
+#if HAVE_BLKID
+ bool searching = FLAGS_SET(flags, VERIFY_ESP_SEARCHING);
+ _cleanup_(blkid_free_probep) blkid_probe b = NULL;
+ _cleanup_free_ char *node = NULL;
+ const char *type, *v;
+ int r;
+
+ r = devname_from_devnum(S_IFBLK, devid, &node);
+ if (r < 0)
+ return log_error_errno(r, "Failed to get block device path for " DEVNUM_FORMAT_STR ": %m",
+ DEVNUM_FORMAT_VAL(devid));
+
+ errno = 0;
+ b = blkid_new_probe_from_filename(node);
+ if (!b)
+ return log_error_errno(errno_or_else(ENOMEM), "%s: Failed to create blkid probe: %m", node);
+
+ blkid_probe_enable_partitions(b, 1);
+ blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
+
+ errno = 0;
+ r = blkid_do_safeprobe(b);
+ if (r == _BLKID_SAFEPROBE_AMBIGUOUS)
+ return log_error_errno(SYNTHETIC_ERRNO(ENODEV), "%s: File system is ambiguous.", node);
+ if (r == _BLKID_SAFEPROBE_NOT_FOUND)
+ return log_error_errno(SYNTHETIC_ERRNO(ENODEV), "%s: File system does not contain a label.", node);
+ if (r == _BLKID_SAFEPROBE_ERROR)
+ return log_error_errno(errno_or_else(EIO), "%s: Failed to probe file system: %m", node);
+
+ assert(r == _BLKID_SAFEPROBE_FOUND);
+
+ r = blkid_probe_lookup_value(b, "PART_ENTRY_SCHEME", &type, NULL);
+ if (r != 0)
+ return log_full_errno(searching ? LOG_DEBUG : LOG_ERR,
+ searching ? SYNTHETIC_ERRNO(EADDRNOTAVAIL) : SYNTHETIC_ERRNO(EIO),
+ "%s: Failed to probe PART_ENTRY_SCHEME: %m", node);
+ if (streq(type, "gpt")) {
+
+ errno = 0;
+ r = blkid_probe_lookup_value(b, "PART_ENTRY_TYPE", &v, NULL);
+ if (r != 0)
+ return log_error_errno(errno_or_else(EIO), "%s: Failed to probe PART_ENTRY_TYPE: %m", node);
+ if (sd_id128_string_equal(v, SD_GPT_XBOOTLDR) <= 0)
+ return log_full_errno(searching ? LOG_DEBUG : LOG_ERR,
+ searching ? SYNTHETIC_ERRNO(EADDRNOTAVAIL) : SYNTHETIC_ERRNO(ENODEV),
+ "%s: Partition has wrong PART_ENTRY_TYPE=%s for XBOOTLDR partition.", node, v);
+
+ errno = 0;
+ r = blkid_probe_lookup_value(b, "PART_ENTRY_UUID", &v, NULL);
+ if (r != 0)
+ return log_error_errno(errno_or_else(EIO), "%s: Failed to probe PART_ENTRY_UUID: %m", node);
+ r = sd_id128_from_string(v, &uuid);
+ if (r < 0)
+ /* 'v' holds the PART_ENTRY_UUID value at this point, so name that field in the message */
+ return log_error_errno(r, "%s: Partition has invalid UUID PART_ENTRY_UUID=%s: %m", node, v);
+
+ } else if (streq(type, "dos")) {
+
+ errno = 0;
+ r = blkid_probe_lookup_value(b, "PART_ENTRY_TYPE", &v, NULL);
+ if (r != 0)
+ return log_error_errno(errno_or_else(EIO), "%s: Failed to probe PART_ENTRY_TYPE: %m", node);
+ if (!streq(v, "0xea"))
+ return log_full_errno(searching ? LOG_DEBUG : LOG_ERR,
+ searching ? SYNTHETIC_ERRNO(EADDRNOTAVAIL) : SYNTHETIC_ERRNO(ENODEV),
+ "%s: Wrong PART_ENTRY_TYPE=%s for XBOOTLDR partition.", node, v);
+
+ } else
+ return log_full_errno(searching ? LOG_DEBUG : LOG_ERR,
+ searching ? SYNTHETIC_ERRNO(EADDRNOTAVAIL) : SYNTHETIC_ERRNO(ENODEV),
+ "%s: Not on a GPT or DOS partition table (PART_ENTRY_SCHEME=%s).", node, type);
+#endif
+
+ if (ret_uuid)
+ *ret_uuid = uuid;
+
+ return 0;
+}
+
+/* Counterpart of verify_xbootldr_blkid() that reads the partition metadata from the properties
+ * of the sd_device object for block device 'devid'. Accepts either a "gpt" partition whose
+ * ID_PART_ENTRY_TYPE matches SD_GPT_XBOOTLDR, or a "dos" partition with ID_PART_ENTRY_TYPE
+ * "0xea". For GPT the partition UUID is returned in *ret_uuid (optional), otherwise
+ * SD_ID128_NULL.
+ *
+ * If VERIFY_ESP_SEARCHING is set, "this is not XBOOTLDR" outcomes are logged at debug level and
+ * reported as -EADDRNOTAVAIL. Returns 0 on success, a negative errno-style value on failure. */
+static int verify_xbootldr_udev(
+ dev_t devid,
+ VerifyESPFlags flags,
+ sd_id128_t *ret_uuid) {
+
+ bool searching = FLAGS_SET(flags, VERIFY_ESP_SEARCHING);
+ _cleanup_(sd_device_unrefp) sd_device *d = NULL;
+ sd_id128_t uuid = SD_ID128_NULL;
+ const char *node, *type, *v;
+ int r;
+
+ r = sd_device_new_from_devnum(&d, 'b', devid);
+ if (r < 0)
+ return log_error_errno(r, "Failed to get block device for " DEVNUM_FORMAT_STR ": %m", DEVNUM_FORMAT_VAL(devid));
+
+ r = sd_device_get_devname(d, &node);
+ if (r < 0)
+ return log_device_error_errno(d, r, "Failed to get device node: %m");
+
+ r = sd_device_get_property_value(d, "ID_PART_ENTRY_SCHEME", &type);
+ if (r < 0)
+ return log_device_full_errno(d,
+ searching && r == -ENOENT ? LOG_DEBUG : LOG_ERR,
+ searching && r == -ENOENT ? SYNTHETIC_ERRNO(EADDRNOTAVAIL) : r,
+ "Failed to query ID_PART_ENTRY_SCHEME: %m");
+
+ if (streq(type, "gpt")) {
+
+ r = sd_device_get_property_value(d, "ID_PART_ENTRY_TYPE", &v);
+ if (r < 0)
+ return log_device_error_errno(d, r, "Failed to query ID_PART_ENTRY_TYPE: %m");
+
+ r = sd_id128_string_equal(v, SD_GPT_XBOOTLDR);
+ if (r < 0)
+ return log_device_error_errno(d, r, "Failed to parse ID_PART_ENTRY_TYPE=%s: %m", v);
+ if (r == 0)
+ return log_device_full_errno(
+ d,
+ searching ? LOG_DEBUG : LOG_ERR,
+ searching ? SYNTHETIC_ERRNO(EADDRNOTAVAIL) : SYNTHETIC_ERRNO(ENODEV),
+ "Partition has wrong ID_PART_ENTRY_TYPE=%s for XBOOTLDR partition.", v);
+
+ r = sd_device_get_property_value(d, "ID_PART_ENTRY_UUID", &v);
+ if (r < 0)
+ return log_device_error_errno(d, r, "Failed to query ID_PART_ENTRY_UUID: %m");
+ r = sd_id128_from_string(v, &uuid);
+ if (r < 0)
+ /* 'v' holds the ID_PART_ENTRY_UUID value at this point, so name that property in the message */
+ return log_device_error_errno(d, r, "Partition has invalid UUID ID_PART_ENTRY_UUID=%s: %m", v);
+
+ } else if (streq(type, "dos")) {
+
+ r = sd_device_get_property_value(d, "ID_PART_ENTRY_TYPE", &v);
+ if (r < 0)
+ return log_device_error_errno(d, r, "Failed to query ID_PART_ENTRY_TYPE: %m");
+ if (!streq(v, "0xea"))
+ return log_device_full_errno(
+ d,
+ searching ? LOG_DEBUG : LOG_ERR,
+ searching ? SYNTHETIC_ERRNO(EADDRNOTAVAIL) : SYNTHETIC_ERRNO(ENODEV),
+ "Wrong ID_PART_ENTRY_TYPE=%s for XBOOTLDR partition.", v);
+
+ } else
+ return log_device_full_errno(
+ d,
+ searching ? LOG_DEBUG : LOG_ERR,
+ searching ? SYNTHETIC_ERRNO(EADDRNOTAVAIL) : SYNTHETIC_ERRNO(ENODEV),
+ "Not on a GPT or DOS partition table (ID_PART_ENTRY_SCHEME=%s).", type);
+
+ if (ret_uuid)
+ *ret_uuid = uuid;
+
+ return 0;
+}
+
+/* Verifies that 'path' (resolved relative to 'rfd', which is treated as root directory) is the
+ * mount point of an XBOOTLDR partition.
+ *
+ * The directory must be the root of its file system. Unless VERIFY_ESP_SKIP_DEVICE_CHECK is set
+ * the backing block device is examined too: via udev if VERIFY_ESP_UNPRIVILEGED_MODE is set, via
+ * blkid otherwise.
+ *
+ * All ret_* parameters are optional. On success ret_path receives the resolved path. If the
+ * device check is skipped, SD_ID128_NULL and a zero device number are returned.
+ *
+ * Returns 0 on success, a negative errno-style value on failure. */
+static int verify_xbootldr(
+ int rfd,
+ const char *path,
+ VerifyESPFlags flags,
+ char **ret_path,
+ sd_id128_t *ret_uuid,
+ dev_t *ret_devid) {
+
+ _cleanup_free_ char *p = NULL;
+ _cleanup_close_ int pfd = -EBADF;
+ bool searching = FLAGS_SET(flags, VERIFY_ESP_SEARCHING),
+ unprivileged_mode = FLAGS_SET(flags, VERIFY_ESP_UNPRIVILEGED_MODE);
+ dev_t devid = 0;
+ int r;
+
+ assert(rfd >= 0 || rfd == AT_FDCWD);
+ assert(path);
+
+ r = chaseat(rfd, path, CHASE_AT_RESOLVE_IN_ROOT|CHASE_PARENT, &p, &pfd);
+ if (r < 0)
+ return log_full_errno((searching && r == -ENOENT) ||
+ (unprivileged_mode && ERRNO_IS_PRIVILEGE(r)) ? LOG_DEBUG : LOG_ERR,
+ r, "Failed to open parent directory of \"%s\": %m", path);
+
+ /* The device number is only requested if the device check is going to be performed */
+ r = verify_fsroot_dir(pfd, p, flags, FLAGS_SET(flags, VERIFY_ESP_SKIP_DEVICE_CHECK) ? NULL : &devid);
+ if (r < 0)
+ return r;
+
+ if (FLAGS_SET(flags, VERIFY_ESP_SKIP_DEVICE_CHECK))
+ goto finish;
+
+ if (devnum_is_zero(devid))
+ return log_full_errno(searching ? LOG_DEBUG : LOG_ERR,
+ SYNTHETIC_ERRNO(searching ? EADDRNOTAVAIL : ENODEV),
+ "Could not determine backing block device of directory \"%s\" (btrfs RAID?).%s",
+ p,
+ searching ? "" :
+ "\nHint: set $SYSTEMD_RELAX_XBOOTLDR_CHECKS=yes environment variable "
+ "to bypass this and further verifications for the directory.");
+
+ if (unprivileged_mode)
+ r = verify_xbootldr_udev(devid, flags, ret_uuid);
+ else
+ r = verify_xbootldr_blkid(devid, flags, ret_uuid);
+ if (r < 0)
+ return r;
+
+ if (ret_path)
+ *ret_path = TAKE_PTR(p);
+ if (ret_devid)
+ *ret_devid = devid;
+
+ return 0;
+
+finish:
+ /* Device check skipped: report the path, and neutral values for everything else */
+ if (ret_path)
+ *ret_path = TAKE_PTR(p);
+ if (ret_uuid)
+ *ret_uuid = SD_ID128_NULL;
+ if (ret_devid)
+ *ret_devid = 0;
+
+ return 0;
+}
+
+/* Locates the XBOOTLDR partition below the root directory referenced by 'rfd'.
+ *
+ * If 'path' is non-NULL only that path is verified. Otherwise, if $SYSTEMD_XBOOTLDR_PATH is set,
+ * that path is used after checking merely that it is absolute and refers to a directory.
+ * Otherwise "/boot" is verified.
+ *
+ * 'unprivileged_mode' is passed on to verify_esp_flags_init(). All ret_* parameters are optional.
+ * Returns 0 on success, -ENOKEY if "/boot" did not qualify, or another negative errno-style value
+ * on failure. */
+int find_xbootldr_and_warn_at(
+ int rfd,
+ const char *path,
+ int unprivileged_mode,
+ char **ret_path,
+ sd_id128_t *ret_uuid,
+ dev_t *ret_devid) {
+
+ VerifyESPFlags flags;
+ int r;
+
+ /* Similar to find_esp_and_warn(), but finds the XBOOTLDR partition. Returns the same errors. */
+
+ assert(rfd >= 0 || rfd == AT_FDCWD);
+
+ flags = verify_esp_flags_init(unprivileged_mode, "SYSTEMD_RELAX_XBOOTLDR_CHECKS");
+
+ if (path)
+ return verify_xbootldr(rfd, path, flags, ret_path, ret_uuid, ret_devid);
+
+ path = getenv("SYSTEMD_XBOOTLDR_PATH");
+ if (path) {
+ _cleanup_free_ char *p = NULL;
+ _cleanup_close_ int fd = -EBADF;
+ struct stat st;
+
+ if (!path_is_valid(path) || !path_is_absolute(path))
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+ "$SYSTEMD_XBOOTLDR_PATH does not refer to an absolute path, refusing to use it: %s",
+ path);
+
+ r = chaseat(rfd, path, CHASE_AT_RESOLVE_IN_ROOT, &p, &fd);
+ if (r < 0)
+ /* 'p' is still NULL when resolving failed, hence log the path we were asked to resolve */
+ return log_error_errno(r, "Failed to resolve path %s: %m", path);
+
+ if (fstat(fd, &st) < 0)
+ return log_error_errno(errno, "Failed to stat '%s': %m", p);
+ if (!S_ISDIR(st.st_mode))
+ return log_error_errno(SYNTHETIC_ERRNO(ENOTDIR), "XBOOTLDR path '%s' is not a directory.", p);
+
+ /* No partition metadata is collected on this code path, hence return a null UUID */
+ if (ret_path)
+ *ret_path = TAKE_PTR(p);
+ if (ret_uuid)
+ *ret_uuid = SD_ID128_NULL;
+ if (ret_devid)
+ *ret_devid = st.st_dev;
+
+ return 0;
+ }
+
+ r = verify_xbootldr(rfd, "/boot", flags | VERIFY_ESP_SEARCHING, ret_path, ret_uuid, ret_devid);
+ if (r < 0) {
+ if (!IN_SET(r, -ENOENT, -EADDRNOTAVAIL, -ENOTDIR, -ENOTTY)) /* This one is not it */
+ return r;
+
+ return -ENOKEY;
+ }
+
+ return 0;
+}
+
+/* Path-based wrapper around find_xbootldr_and_warn_at(): opens 'root' (via empty_to_root()) as
+ * an O_PATH directory fd and searches below it. The path that was found is passed through
+ * chaseat_prefix_root() together with 'root' before it is returned in *ret_path.
+ *
+ * All ret_* parameters are optional and are written only if everything succeeded. Returns 0 on
+ * success, -errno if 'root' cannot be opened, or the negative result of a failing helper. */
+int find_xbootldr_and_warn(
+ const char *root,
+ const char *path,
+ int unprivileged_mode,
+ char **ret_path,
+ sd_id128_t *ret_uuid,
+ dev_t *ret_devid) {
+
+ _cleanup_close_ int root_fd = -EBADF;
+ _cleanup_free_ char *found_path = NULL;
+ sd_id128_t found_uuid;
+ dev_t found_devid;
+ int k;
+
+ root_fd = open(empty_to_root(root), O_PATH|O_DIRECTORY|O_CLOEXEC);
+ if (root_fd < 0)
+ return -errno;
+
+ /* Collect the results in locals first, so the caller's variables stay untouched on failure */
+ k = find_xbootldr_and_warn_at(
+ root_fd,
+ path,
+ unprivileged_mode,
+ ret_path ? &found_path : NULL,
+ ret_uuid ? &found_uuid : NULL,
+ ret_devid ? &found_devid : NULL);
+ if (k < 0)
+ return k;
+
+ if (ret_path) {
+ k = chaseat_prefix_root(found_path, root, ret_path);
+ if (k < 0)
+ return k;
+ }
+
+ if (ret_uuid)
+ *ret_uuid = found_uuid;
+
+ if (ret_devid)
+ *ret_devid = found_devid;
+
+ return 0;
+}
diff --git a/src/shared/find-esp.h b/src/shared/find-esp.h
new file mode 100644
index 0000000..2e132a7
--- /dev/null
+++ b/src/shared/find-esp.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#pragma once
+
+#include <inttypes.h>
+#include <stdbool.h>
+#include <sys/types.h>
+
+#include "sd-id128.h"
+
+int find_esp_and_warn_at(int rfd, const char *path, int unprivileged_mode, char **ret_path, uint32_t *ret_part, uint64_t *ret_pstart, uint64_t *ret_psize, sd_id128_t *ret_uuid, dev_t *ret_devid);
+int find_esp_and_warn(const char *root, const char *path, int unprivileged_mode, char **ret_path, uint32_t *ret_part, uint64_t *ret_pstart, uint64_t *ret_psize, sd_id128_t *ret_uuid, dev_t *ret_devid);
+
+int find_xbootldr_and_warn_at(int rfd, const char *path, int unprivileged_mode, char **ret_path, sd_id128_t *ret_uuid, dev_t *ret_devid);
+int find_xbootldr_and_warn(const char *root, const char *path, int unprivileged_mode, char **ret_path, sd_id128_t *ret_uuid, dev_t *ret_devid);
diff --git a/src/shared/firewall-util-iptables.c b/src/shared/firewall-util-iptables.c
new file mode 100644
index 0000000..b70b740
--- /dev/null
+++ b/src/shared/firewall-util-iptables.c
@@ -0,0 +1,392 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+/* Temporary work-around for broken glibc vs. linux kernel header definitions
+ * This is already fixed upstream, remove this when distributions have updated.
+ */
+#define _NET_IF_H 1
+
+#include <arpa/inet.h>
+#include <endian.h>
+#include <errno.h>
+#include <stddef.h>
+#include <string.h>
+#include <net/if.h>
+#ifndef IFNAMSIZ
+#define IFNAMSIZ 16
+#endif
+#include <linux/if.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter/nf_nat.h>
+#include <linux/netfilter/xt_addrtype.h>
+#include <libiptc/libiptc.h>
+
+#include "alloc-util.h"
+#include "dlfcn-util.h"
+#include "firewall-util.h"
+#include "firewall-util-private.h"
+#include "in-addr-util.h"
+#include "macro.h"
+#include "socket-util.h"
+
+ /* libiptc entry points used by this file. The code below calls them through their sym_ prefixed
+ * names, and dlopen_iptc() lists the very same set in its DLSYM_ARG() arguments. */
+ static DLSYM_FUNCTION(iptc_check_entry);
+ static DLSYM_FUNCTION(iptc_commit);
+ static DLSYM_FUNCTION(iptc_delete_entry);
+ static DLSYM_FUNCTION(iptc_free);
+ static DLSYM_FUNCTION(iptc_init);
+ static DLSYM_FUNCTION(iptc_insert_entry);
+ static DLSYM_FUNCTION(iptc_strerror);
+
+ /* Library handle slot that dlopen_iptc() passes to dlopen_many_sym_or_warn(). */
+ static void *iptc_dl = NULL;
+
+ /* Presumably what provides sym_iptc_freep, which the functions below use with _cleanup_() on
+ * struct xtc_handle pointers - confirm against the macro definition. */
+ DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(struct xtc_handle*, sym_iptc_free, NULL);
+
+ static int entry_fill_basics(
+                 struct ipt_entry *entry,
+                 int protocol,
+                 const char *in_interface,
+                 const union in_addr_union *source,
+                 unsigned source_prefixlen,
+                 const char *out_interface,
+                 const union in_addr_union *destination,
+                 unsigned destination_prefixlen) {
+
+         /* Fills the generic IPv4 match part of 'entry': the protocol, and - for every optional argument
+          * that is set - the interface name with its mask, and the address with its netmask.
+          *
+          * Returns -EINVAL if an interface name is rejected by ifname_valid(), 0 otherwise. */
+
+         assert(entry);
+
+         if ((in_interface && !ifname_valid(in_interface)) ||
+             (out_interface && !ifname_valid(out_interface)))
+                 return -EINVAL;
+
+         entry->ip.proto = protocol;
+
+         if (in_interface) {
+                 size_t n = strlen(in_interface);
+
+                 assert(n < sizeof entry->ip.iniface);
+                 assert(n < sizeof entry->ip.iniface_mask);
+
+                 /* Copy the name together with its NUL byte and flag exactly those bytes in the mask. */
+                 memcpy(entry->ip.iniface, in_interface, n + 1);
+                 memset(entry->ip.iniface_mask, 0xFF, n + 1);
+         }
+
+         if (source) {
+                 entry->ip.src = source->in;
+                 in4_addr_prefixlen_to_netmask(&entry->ip.smsk, source_prefixlen);
+         }
+
+         if (out_interface) {
+                 size_t n = strlen(out_interface);
+
+                 assert(n < sizeof entry->ip.outiface);
+                 assert(n < sizeof entry->ip.outiface_mask);
+
+                 memcpy(entry->ip.outiface, out_interface, n + 1);
+                 memset(entry->ip.outiface_mask, 0xFF, n + 1);
+         }
+
+         if (destination) {
+                 entry->ip.dst = destination->in;
+                 in4_addr_prefixlen_to_netmask(&entry->ip.dmsk, destination_prefixlen);
+         }
+
+         return 0;
+ }
+
+ /* Adds (add == true) or removes (add == false) a MASQUERADE rule for source/source_prefixlen in the
+ * "POSTROUTING" chain of the handle obtained from fw_iptables_init_nat().
+ *
+ * Returns -EOPNOTSUPP for any family other than AF_INET, -EINVAL when no source is given or the
+ * prefix length is 0, a negated errno when a libiptc call fails, and 0 otherwise. If the rule is
+ * already present (add) or already gone (remove), 0 is returned without committing anything. */
+ int fw_iptables_add_masquerade(
+ bool add,
+ int af,
+ const union in_addr_union *source,
+ unsigned source_prefixlen) {
+
+ static const xt_chainlabel chain = "POSTROUTING";
+ _cleanup_(sym_iptc_freep) struct xtc_handle *h = NULL;
+ struct ipt_entry *entry, *mask;
+ struct ipt_entry_target *t;
+ size_t sz;
+ struct nf_nat_ipv4_multi_range_compat *mr;
+ int r, protocol = 0;
+ /* These parts of the rule are never set in this function; they are handed to
+ * entry_fill_basics() as they are. */
+ const char *out_interface = NULL;
+ const union in_addr_union *destination = NULL;
+ unsigned destination_prefixlen = 0;
+
+ if (af != AF_INET)
+ return -EOPNOTSUPP;
+
+ if (!source || source_prefixlen == 0)
+ return -EINVAL;
+
+ r = fw_iptables_init_nat(&h);
+ if (r < 0)
+ return r;
+
+ /* One buffer holds the entry header, the target header and the NAT range, each XT_ALIGN()ed. */
+ sz = XT_ALIGN(sizeof(struct ipt_entry)) +
+ XT_ALIGN(sizeof(struct ipt_entry_target)) +
+ XT_ALIGN(sizeof(struct nf_nat_ipv4_multi_range_compat));
+
+ /* Put together the entry we want to add or remove */
+ entry = alloca0(sz);
+ entry->next_offset = sz;
+ entry->target_offset = XT_ALIGN(sizeof(struct ipt_entry));
+ r = entry_fill_basics(entry, protocol, NULL, source, source_prefixlen, out_interface, destination, destination_prefixlen);
+ if (r < 0)
+ return r;
+
+ /* Fill in target part */
+ t = ipt_get_target(entry);
+ t->u.target_size =
+ XT_ALIGN(sizeof(struct ipt_entry_target)) +
+ XT_ALIGN(sizeof(struct nf_nat_ipv4_multi_range_compat));
+ strncpy(t->u.user.name, "MASQUERADE", sizeof(t->u.user.name));
+ mr = (struct nf_nat_ipv4_multi_range_compat*) t->data;
+ mr->rangesize = 1;
+
+ /* Create a search mask entry */
+ mask = alloca_safe(sz);
+ memset(mask, 0xFF, sz);
+
+ if (add) {
+ /* A non-zero result means the rule is already there: nothing to change, nothing to commit. */
+ if (sym_iptc_check_entry(chain, entry, (unsigned char*) mask, h))
+ return 0;
+ if (errno != ENOENT) /* if other error than not existing yet, fail */
+ return -errno;
+
+ if (!sym_iptc_insert_entry(chain, entry, 0, h))
+ return -errno;
+ } else {
+ if (!sym_iptc_delete_entry(chain, entry, (unsigned char*) mask, h)) {
+ if (errno == ENOENT) /* if it's already gone, all is good! */
+ return 0;
+
+ return -errno;
+ }
+ }
+
+ /* Only reached when the chain was actually modified above. */
+ if (!sym_iptc_commit(h))
+ return -errno;
+
+ return 0;
+ }
+
+ /* Adds or removes the iptables DNAT rules that redirect a local port to remote:remote_port.
+  *
+  * Two rules are built from the same template: one for "PREROUTING", and one for "OUTPUT" that
+  * additionally carries an inverted 127.0.0.0/8 destination match. When adding with a previous_remote
+  * that differs from remote, the rules pointing to previous_remote are deleted before the single
+  * commit at the end.
+  *
+  * add              true to install the rules, false to remove them
+  * af               must be AF_INET, anything else yields -EOPNOTSUPP
+  * protocol         IPPROTO_TCP or IPPROTO_UDP, anything else yields -EOPNOTSUPP
+  * local_port       destination port to match; 0 yields -EINVAL
+  * remote           address to redirect to; dereferenced unconditionally, so it must not be NULL
+  * remote_port      port to redirect to; 0 yields -EINVAL
+  * previous_remote  optional address of an earlier mapping to drop; only allowed together with add
+  *
+  * Returns 0 on success, otherwise a negative errno-style value. Rules that already exist on add, or
+  * that are already missing on delete, are not treated as errors. */
+ int fw_iptables_add_local_dnat(
+                 bool add,
+                 int af,
+                 int protocol,
+                 uint16_t local_port,
+                 const union in_addr_union *remote,
+                 uint16_t remote_port,
+                 const union in_addr_union *previous_remote) {
+
+         static const xt_chainlabel chain_pre = "PREROUTING", chain_output = "OUTPUT";
+         _cleanup_(sym_iptc_freep) struct xtc_handle *h = NULL;
+         struct ipt_entry *entry, *mask;
+         struct ipt_entry_target *t;
+         struct ipt_entry_match *m;
+         struct xt_addrtype_info_v1 *at;
+         struct nf_nat_ipv4_multi_range_compat *mr;
+         size_t sz, msz;
+         int r;
+         /* These parts of the rule are never set in this function; they are handed on as they are. */
+         const char *in_interface = NULL;
+         const union in_addr_union *source = NULL;
+         unsigned source_prefixlen = 0;
+         const union in_addr_union *destination = NULL;
+         unsigned destination_prefixlen = 0;
+
+         assert(add || !previous_remote);
+
+         if (af != AF_INET)
+                 return -EOPNOTSUPP;
+
+         if (!IN_SET(protocol, IPPROTO_TCP, IPPROTO_UDP))
+                 return -EOPNOTSUPP;
+
+         if (local_port <= 0)
+                 return -EINVAL;
+
+         if (remote_port <= 0)
+                 return -EINVAL;
+
+         r = fw_iptables_init_nat(&h);
+         if (r < 0)
+                 return r;
+
+         /* Layout: entry header | protocol match (tcp or udp) | addrtype match | target | NAT range */
+         sz = XT_ALIGN(sizeof(struct ipt_entry)) +
+              XT_ALIGN(sizeof(struct ipt_entry_match)) +
+              XT_ALIGN(sizeof(struct xt_addrtype_info_v1)) +
+              XT_ALIGN(sizeof(struct ipt_entry_target)) +
+              XT_ALIGN(sizeof(struct nf_nat_ipv4_multi_range_compat));
+
+         if (protocol == IPPROTO_TCP)
+                 msz = XT_ALIGN(sizeof(struct ipt_entry_match)) +
+                       XT_ALIGN(sizeof(struct xt_tcp));
+         else
+                 msz = XT_ALIGN(sizeof(struct ipt_entry_match)) +
+                       XT_ALIGN(sizeof(struct xt_udp));
+
+         sz += msz;
+
+         /* Fill in basic part */
+         entry = alloca0(sz);
+         entry->next_offset = sz;
+         entry->target_offset =
+                 XT_ALIGN(sizeof(struct ipt_entry)) +
+                 XT_ALIGN(sizeof(struct ipt_entry_match)) +
+                 XT_ALIGN(sizeof(struct xt_addrtype_info_v1)) +
+                 msz;
+         r = entry_fill_basics(entry, protocol, in_interface, source, source_prefixlen, NULL, destination, destination_prefixlen);
+         if (r < 0)
+                 return r;
+
+         /* Fill in first match */
+         m = (struct ipt_entry_match*) ((uint8_t*) entry + XT_ALIGN(sizeof(struct ipt_entry)));
+         m->u.match_size = msz;
+         if (protocol == IPPROTO_TCP) {
+                 struct xt_tcp *tcp;
+
+                 strncpy(m->u.user.name, "tcp", sizeof(m->u.user.name));
+                 tcp = (struct xt_tcp*) m->data;
+                 tcp->dpts[0] = tcp->dpts[1] = local_port;
+                 tcp->spts[0] = 0;
+                 tcp->spts[1] = 0xFFFF;
+
+         } else {
+                 struct xt_udp *udp;
+
+                 strncpy(m->u.user.name, "udp", sizeof(m->u.user.name));
+                 udp = (struct xt_udp*) m->data;
+                 udp->dpts[0] = udp->dpts[1] = local_port;
+                 udp->spts[0] = 0;
+                 udp->spts[1] = 0xFFFF;
+         }
+
+         /* Fill in second match */
+         m = (struct ipt_entry_match*) ((uint8_t*) entry + XT_ALIGN(sizeof(struct ipt_entry)) + msz);
+         m->u.match_size =
+                 XT_ALIGN(sizeof(struct ipt_entry_match)) +
+                 XT_ALIGN(sizeof(struct xt_addrtype_info_v1));
+         strncpy(m->u.user.name, "addrtype", sizeof(m->u.user.name));
+         m->u.user.revision = 1;
+         at = (struct xt_addrtype_info_v1*) m->data;
+         at->dest = XT_ADDRTYPE_LOCAL;
+
+         /* Fill in target part */
+         t = ipt_get_target(entry);
+         t->u.target_size =
+                 XT_ALIGN(sizeof(struct ipt_entry_target)) +
+                 XT_ALIGN(sizeof(struct nf_nat_ipv4_multi_range_compat));
+         strncpy(t->u.user.name, "DNAT", sizeof(t->u.user.name));
+         mr = (struct nf_nat_ipv4_multi_range_compat*) t->data;
+         mr->rangesize = 1;
+         mr->range[0].flags = NF_NAT_RANGE_PROTO_SPECIFIED|NF_NAT_RANGE_MAP_IPS;
+         mr->range[0].min_ip = mr->range[0].max_ip = remote->in.s_addr;
+         if (protocol == IPPROTO_TCP)
+                 mr->range[0].min.tcp.port = mr->range[0].max.tcp.port = htobe16(remote_port);
+         else
+                 mr->range[0].min.udp.port = mr->range[0].max.udp.port = htobe16(remote_port);
+
+         /* Search mask with every byte significant. The buffer is overwritten in full right away, so
+          * it does not need to be zeroed first. */
+         mask = alloca_safe(sz);
+         memset(mask, 0xFF, sz);
+
+         if (add) {
+                 /* Add the PREROUTING rule, if it is missing so far */
+                 if (!sym_iptc_check_entry(chain_pre, entry, (unsigned char*) mask, h)) {
+                         /* Report the actual libiptc error, like every other branch in this function. */
+                         if (errno != ENOENT)
+                                 return -errno;
+
+                         if (!sym_iptc_insert_entry(chain_pre, entry, 0, h))
+                                 return -errno;
+                 }
+
+                 /* If a previous remote is set, remove its entry */
+                 if (previous_remote && previous_remote->in.s_addr != remote->in.s_addr) {
+                         mr->range[0].min_ip = mr->range[0].max_ip = previous_remote->in.s_addr;
+
+                         if (!sym_iptc_delete_entry(chain_pre, entry, (unsigned char*) mask, h)) {
+                                 if (errno != ENOENT)
+                                         return -errno;
+                         }
+
+                         /* Point the template back at the new remote for the OUTPUT rule below. */
+                         mr->range[0].min_ip = mr->range[0].max_ip = remote->in.s_addr;
+                 }
+
+                 /* Add the OUTPUT rule, if it is missing so far */
+                 if (!in_interface) {
+
+                         /* Don't apply onto loopback addresses */
+                         if (!destination) {
+                                 entry->ip.dst.s_addr = htobe32(0x7F000000);
+                                 entry->ip.dmsk.s_addr = htobe32(0xFF000000);
+                                 entry->ip.invflags = IPT_INV_DSTIP;
+                         }
+
+                         if (!sym_iptc_check_entry(chain_output, entry, (unsigned char*) mask, h)) {
+                                 if (errno != ENOENT)
+                                         return -errno;
+
+                                 if (!sym_iptc_insert_entry(chain_output, entry, 0, h))
+                                         return -errno;
+                         }
+
+                         /* If a previous remote is set, remove its entry */
+                         if (previous_remote && previous_remote->in.s_addr != remote->in.s_addr) {
+                                 mr->range[0].min_ip = mr->range[0].max_ip = previous_remote->in.s_addr;
+
+                                 if (!sym_iptc_delete_entry(chain_output, entry, (unsigned char*) mask, h)) {
+                                         if (errno != ENOENT)
+                                                 return -errno;
+                                 }
+                         }
+                 }
+         } else {
+                 if (!sym_iptc_delete_entry(chain_pre, entry, (unsigned char*) mask, h)) {
+                         if (errno != ENOENT)
+                                 return -errno;
+                 }
+
+                 if (!in_interface) {
+                         if (!destination) {
+                                 entry->ip.dst.s_addr = htobe32(0x7F000000);
+                                 entry->ip.dmsk.s_addr = htobe32(0xFF000000);
+                                 entry->ip.invflags = IPT_INV_DSTIP;
+                         }
+
+                         if (!sym_iptc_delete_entry(chain_output, entry, (unsigned char*) mask, h)) {
+                                 if (errno != ENOENT)
+                                         return -errno;
+                         }
+                 }
+         }
+
+         if (!sym_iptc_commit(h))
+                 return -errno;
+
+         return 0;
+ }
+
+ /* Asks dlopen_many_sym_or_warn() to load "libip4tc.so.2" into iptc_dl together with every libiptc
+ * symbol this file uses, passing LOG_DEBUG as log level. Returns whatever that helper returns. */
+ static int dlopen_iptc(void) {
+ return dlopen_many_sym_or_warn(
+ &iptc_dl,
+ "libip4tc.so.2", LOG_DEBUG,
+ DLSYM_ARG(iptc_check_entry),
+ DLSYM_ARG(iptc_commit),
+ DLSYM_ARG(iptc_delete_entry),
+ DLSYM_ARG(iptc_free),
+ DLSYM_ARG(iptc_init),
+ DLSYM_ARG(iptc_insert_entry),
+ DLSYM_ARG(iptc_strerror));
+ }
+
+ int fw_iptables_init_nat(struct xtc_handle **ret) {
+         _cleanup_(sym_iptc_freep) struct xtc_handle *nat = NULL;
+         int r;
+
+         /* Loads libiptc via dlopen_iptc() and opens the "nat" table. On success the handle is stored in
+          * *ret if ret is given; without ret it is released again by the cleanup handler, which makes the
+          * call usable as a pure availability check. Returns 0 on success, negative on failure. */
+
+         r = dlopen_iptc();
+         if (r < 0)
+                 return r;
+
+         nat = sym_iptc_init("nat");
+         if (!nat)
+                 return log_debug_errno(errno, "Failed to init \"nat\" table: %s", sym_iptc_strerror(errno));
+
+         if (!ret)
+                 return 0;
+
+         *ret = TAKE_PTR(nat);
+         return 0;
+ }
diff --git a/src/shared/firewall-util-nft.c b/src/shared/firewall-util-nft.c
new file mode 100644
index 0000000..fe986ed
--- /dev/null
+++ b/src/shared/firewall-util-nft.c
@@ -0,0 +1,1372 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <arpa/inet.h>
+#include <endian.h>
+#include <errno.h>
+#include <stddef.h>
+#include <string.h>
+#include <linux/netfilter/nf_tables.h>
+#include <linux/netfilter/nf_nat.h>
+#include <linux/netfilter_ipv4.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+
+#include "sd-netlink.h"
+
+#include "alloc-util.h"
+#include "escape.h"
+#include "extract-word.h"
+#include "firewall-util.h"
+#include "firewall-util-private.h"
+#include "in-addr-util.h"
+#include "macro.h"
+#include "netlink-internal.h"
+#include "netlink-util.h"
+#include "socket-util.h"
+#include "string-table.h"
+#include "time-util.h"
+
+#define NFT_SYSTEMD_DNAT_MAP_NAME "map_port_ipport"
+#define NFT_SYSTEMD_TABLE_NAME "io.systemd.nat"
+#define NFT_SYSTEMD_MASQ_SET_NAME "masq_saddr"
+
+#define NFNL_DEFAULT_TIMEOUT_USECS (1ULL * USEC_PER_SEC)
+
+#define UDP_DPORT_OFFSET 2
+
+ static sd_netlink_message **netlink_message_unref_many(sd_netlink_message **m) {
+         /* Walks the NULL-terminated array 'm', passes each slot to sd_netlink_message_unref() and
+          * stores the result back into the slot. The array itself is not freed; 'm' is returned as
+          * passed in (NULL in, NULL out). */
+
+         if (m) {
+                 sd_netlink_message **slot = m;
+
+                 while (*slot) {
+                         *slot = sd_netlink_message_unref(*slot);
+                         slot++;
+                 }
+         }
+
+         return m;
+ }
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(sd_netlink_message**, netlink_message_unref_many);
+
+ static int nfnl_open_expr_container(sd_netlink_message *m, const char *name) {
+         int r;
+
+         /* Opens a new NFTA_LIST_ELEM array entry in 'm' and, inside it, the NFTA_EXPR_DATA union
+          * container selected by 'name'. Both stay open; nfnl_close_expr_container() closes them. */
+
+         assert(m);
+         assert(name);
+
+         r = sd_netlink_message_open_array(m, NFTA_LIST_ELEM);
+
+         return r < 0 ? r : sd_netlink_message_open_container_union(m, NFTA_EXPR_DATA, name);
+ }
+
+ static int nfnl_close_expr_container(sd_netlink_message *m) {
+         int r;
+
+         /* Counterpart of nfnl_open_expr_container(): closes the two containers it opened, innermost
+          * first. */
+
+         assert(m);
+
+         r = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
+
+         return r < 0 ? r : sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */
+ }
+
+ static int nfnl_add_expr_fib(
+                 sd_netlink_message *m,
+                 uint32_t nft_fib_flags,
+                 enum nft_fib_result result,
+                 enum nft_registers dreg) {
+
+         int r;
+
+         /* Appends a "fib" expression to 'm' carrying NFTA_FIB_FLAGS, NFTA_FIB_RESULT and NFTA_FIB_DREG,
+          * each converted with htobe32(). The first failing step ends the sequence and its error is
+          * returned. */
+
+         assert(m);
+
+         r = nfnl_open_expr_container(m, "fib");
+         if (r >= 0)
+                 r = sd_netlink_message_append_u32(m, NFTA_FIB_FLAGS, htobe32(nft_fib_flags));
+         if (r >= 0)
+                 r = sd_netlink_message_append_u32(m, NFTA_FIB_RESULT, htobe32(result));
+         if (r >= 0)
+                 r = sd_netlink_message_append_u32(m, NFTA_FIB_DREG, htobe32(dreg));
+         if (r < 0)
+                 return r;
+
+         return nfnl_close_expr_container(m);
+ }
+
+ static int nfnl_add_expr_meta(
+                 sd_netlink_message *m,
+                 enum nft_meta_keys key,
+                 enum nft_registers dreg) {
+
+         int r;
+
+         /* Appends a "meta" expression to 'm' carrying NFTA_META_KEY and NFTA_META_DREG, both
+          * converted with htobe32(). The first failing step ends the sequence and its error is
+          * returned. */
+
+         assert(m);
+
+         r = nfnl_open_expr_container(m, "meta");
+         if (r >= 0)
+                 r = sd_netlink_message_append_u32(m, NFTA_META_KEY, htobe32(key));
+         if (r >= 0)
+                 r = sd_netlink_message_append_u32(m, NFTA_META_DREG, htobe32(dreg));
+         if (r < 0)
+                 return r;
+
+         return nfnl_close_expr_container(m);
+ }
+
+ static int nfnl_add_expr_payload(
+                 sd_netlink_message *m,
+                 enum nft_payload_bases pb,
+                 uint32_t offset,
+                 uint32_t len,
+                 enum nft_registers dreg) {
+
+         int r;
+
+         /* Appends a "payload" expression to 'm' carrying, in this order, NFTA_PAYLOAD_DREG,
+          * NFTA_PAYLOAD_BASE, NFTA_PAYLOAD_OFFSET and NFTA_PAYLOAD_LEN, each converted with htobe32().
+          * The first failing step ends the sequence and its error is returned. */
+
+         assert(m);
+
+         r = nfnl_open_expr_container(m, "payload");
+         if (r >= 0)
+                 r = sd_netlink_message_append_u32(m, NFTA_PAYLOAD_DREG, htobe32(dreg));
+         if (r >= 0)
+                 r = sd_netlink_message_append_u32(m, NFTA_PAYLOAD_BASE, htobe32(pb));
+         if (r >= 0)
+                 r = sd_netlink_message_append_u32(m, NFTA_PAYLOAD_OFFSET, htobe32(offset));
+         if (r >= 0)
+                 r = sd_netlink_message_append_u32(m, NFTA_PAYLOAD_LEN, htobe32(len));
+         if (r < 0)
+                 return r;
+
+         return nfnl_close_expr_container(m);
+ }
+
+ static int nfnl_add_expr_lookup(
+                 sd_netlink_message *m,
+                 const char *set_name,
+                 enum nft_registers sreg,
+                 enum nft_registers dreg) {
+
+         int r;
+
+         /* Appends a "lookup" expression to 'm' that names the set in NFTA_LOOKUP_SET and the source
+          * register in NFTA_LOOKUP_SREG. NFTA_LOOKUP_DREG is only added when dreg is non-zero. The
+          * first failing step ends the sequence and its error is returned. */
+
+         assert(m);
+         assert(set_name);
+
+         r = nfnl_open_expr_container(m, "lookup");
+         if (r >= 0)
+                 r = sd_netlink_message_append_string(m, NFTA_LOOKUP_SET, set_name);
+         if (r >= 0)
+                 r = sd_netlink_message_append_u32(m, NFTA_LOOKUP_SREG, htobe32(sreg));
+         if (r >= 0 && dreg != 0)
+                 r = sd_netlink_message_append_u32(m, NFTA_LOOKUP_DREG, htobe32(dreg));
+         if (r < 0)
+                 return r;
+
+         return nfnl_close_expr_container(m);
+ }
+
+ static int nfnl_add_expr_cmp(
+                 sd_netlink_message *m,
+                 enum nft_cmp_ops cmp_op,
+                 enum nft_registers sreg,
+                 const void *data,
+                 size_t dlen) {
+
+         int r;
+
+         /* Appends a "cmp" expression to 'm': operator in NFTA_CMP_OP, source register in
+          * NFTA_CMP_SREG, and the 'dlen' bytes at 'data' as NFTA_DATA_VALUE inside NFTA_CMP_DATA. The
+          * data bytes are passed on as given. The first failing step ends the sequence and its error
+          * is returned. */
+
+         assert(m);
+         assert(data);
+
+         r = nfnl_open_expr_container(m, "cmp");
+         if (r >= 0)
+                 r = sd_netlink_message_append_u32(m, NFTA_CMP_OP, htobe32(cmp_op));
+         if (r >= 0)
+                 r = sd_netlink_message_append_u32(m, NFTA_CMP_SREG, htobe32(sreg));
+         if (r >= 0)
+                 r = sd_netlink_message_append_container_data(m, NFTA_CMP_DATA, NFTA_DATA_VALUE, data, dlen);
+         if (r < 0)
+                 return r;
+
+         return nfnl_close_expr_container(m);
+ }
+
+ static int nfnl_add_expr_bitwise(
+                 sd_netlink_message *m,
+                 enum nft_registers sreg,
+                 enum nft_registers dreg,
+                 const void *and,
+                 const void *xor,
+                 uint32_t len) {
+
+         int r;
+
+         /* Appends a "bitwise" expression to 'm': source and destination register, the operand length,
+          * and 'len' bytes each from 'and' (as NFTA_BITWISE_MASK) and 'xor' (as NFTA_BITWISE_XOR). The
+          * first failing step ends the sequence and its error is returned. */
+
+         assert(m);
+         assert(and);
+         assert(xor);
+
+         r = nfnl_open_expr_container(m, "bitwise");
+         if (r >= 0)
+                 r = sd_netlink_message_append_u32(m, NFTA_BITWISE_SREG, htobe32(sreg));
+         if (r >= 0)
+                 r = sd_netlink_message_append_u32(m, NFTA_BITWISE_DREG, htobe32(dreg));
+         if (r >= 0)
+                 r = sd_netlink_message_append_u32(m, NFTA_BITWISE_LEN, htobe32(len));
+         if (r >= 0)
+                 r = sd_netlink_message_append_container_data(m, NFTA_BITWISE_MASK, NFTA_DATA_VALUE, and, len);
+         if (r >= 0)
+                 r = sd_netlink_message_append_container_data(m, NFTA_BITWISE_XOR, NFTA_DATA_VALUE, xor, len);
+         if (r < 0)
+                 return r;
+
+         return nfnl_close_expr_container(m);
+ }
+
+ static int nfnl_add_expr_dnat(
+                 sd_netlink_message *m,
+                 int family,
+                 enum nft_registers areg,
+                 enum nft_registers preg) {
+
+         int r;
+
+         /* Appends a "nat" expression of type NFT_NAT_DNAT to 'm' for the given family. 'areg' is
+          * passed as NFTA_NAT_REG_ADDR_MIN and 'preg' as NFTA_NAT_REG_PROTO_MIN. The first failing step
+          * ends the sequence and its error is returned. */
+
+         assert(m);
+
+         r = nfnl_open_expr_container(m, "nat");
+         if (r >= 0)
+                 r = sd_netlink_message_append_u32(m, NFTA_NAT_TYPE, htobe32(NFT_NAT_DNAT));
+         if (r >= 0)
+                 r = sd_netlink_message_append_u32(m, NFTA_NAT_FAMILY, htobe32(family));
+         if (r >= 0)
+                 r = sd_netlink_message_append_u32(m, NFTA_NAT_REG_ADDR_MIN, htobe32(areg));
+         if (r >= 0)
+                 r = sd_netlink_message_append_u32(m, NFTA_NAT_REG_PROTO_MIN, htobe32(preg));
+         if (r < 0)
+                 return r;
+
+         return nfnl_close_expr_container(m);
+ }
+
+ static int nfnl_add_expr_masq(sd_netlink_message *m) {
+         int r;
+
+         /* Appends a "masq" expression to 'm'. Unlike the other expression builders only the expression
+          * name is written (NFTA_EXPR_NAME); no NFTA_EXPR_DATA container is opened. */
+
+         /* Same precondition check as in all sibling nfnl_add_expr_*() helpers. */
+         assert(m);
+
+         r = sd_netlink_message_open_array(m, NFTA_LIST_ELEM);
+         if (r < 0)
+                 return r;
+
+         r = sd_netlink_message_append_string(m, NFTA_EXPR_NAME, "masq");
+         if (r < 0)
+                 return r;
+
+         return sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */
+ }
+
+ static int sd_nfnl_message_new_masq_rule(
+                 sd_netlink *nfnl,
+                 sd_netlink_message **ret,
+                 int family,
+                 const char *chain) {
+
+         _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
+         uint32_t saddr_offset, saddr_len;
+         int r;
+
+         /* Builds the masquerading rule for 'chain' in the systemd NAT table:
+          *
+          * -t nat -A POSTROUTING -p protocol -s source/pflen -o out_interface -d destination/pflen -j MASQUERADE
+          *
+          * The rule loads the source address of the packet, looks it up in the masquerade set and, as
+          * last expression, masquerades. On success the message is returned in *ret. */
+
+         assert(nfnl);
+         assert(ret);
+         assert(IN_SET(family, AF_INET, AF_INET6));
+         assert(chain);
+
+         /* Where the source address lives in the network header, and how large it is. */
+         if (family == AF_INET) {
+                 saddr_offset = offsetof(struct iphdr, saddr);
+                 saddr_len = sizeof(uint32_t);
+         } else {
+                 saddr_offset = offsetof(struct ip6_hdr, ip6_src.s6_addr);
+                 saddr_len = sizeof(struct in6_addr);
+         }
+
+         r = sd_nfnl_nft_message_new_rule(nfnl, &m, family, NFT_SYSTEMD_TABLE_NAME, chain);
+         if (r >= 0)
+                 r = sd_netlink_message_open_container(m, NFTA_RULE_EXPRESSIONS);
+
+         /* 1st statement: ip saddr @masq_saddr. Place iph->saddr in reg1, resp. ipv6 in reg1..reg4. */
+         if (r >= 0)
+                 r = nfnl_add_expr_payload(m, NFT_PAYLOAD_NETWORK_HEADER, saddr_offset, saddr_len, NFT_REG32_01);
+
+         /* 1st statement (cont.): use the reg1 content for the lookup in the @masq_saddr set. */
+         if (r >= 0)
+                 r = nfnl_add_expr_lookup(m, NFT_SYSTEMD_MASQ_SET_NAME, NFT_REG32_01, 0);
+
+         /* 2nd statement: masq. Only executed by kernel if the previous lookup was successful. */
+         if (r >= 0)
+                 r = nfnl_add_expr_masq(m);
+
+         if (r >= 0)
+                 r = sd_netlink_message_close_container(m); /* NFTA_RULE_EXPRESSIONS */
+         if (r < 0)
+                 return r;
+
+         *ret = TAKE_PTR(m);
+         return 0;
+ }
+
+ /* Builds the DNAT rule for 'chain' in the systemd NAT table and returns the message in *ret. The
+ * rule only continues for packets whose destination address is of type RTN_LOCAL, then uses
+ * 'l4proto . dport' as key into the DNAT map and applies dnat with the address and port the map
+ * yields. Returns 0 on success, or the error of the first failing step. */
+ static int sd_nfnl_message_new_dnat_rule_pre(
+ sd_netlink *nfnl,
+ sd_netlink_message **ret,
+ int family,
+ const char *chain) {
+
+ _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
+ enum nft_registers proto_reg;
+ uint32_t local = RTN_LOCAL;
+ int r;
+
+ /* -t nat -A PREROUTING -p protocol --dport local_port -i in_interface -s source/pflen
+ * -d destination/pflen -j DNAT --to-destination remote_addr:remote_port */
+
+ assert(nfnl);
+ assert(ret);
+ assert(IN_SET(family, AF_INET, AF_INET6));
+ assert(chain);
+
+ r = sd_nfnl_nft_message_new_rule(nfnl, &m, family, NFT_SYSTEMD_TABLE_NAME, chain);
+ if (r < 0)
+ return r;
+
+ r = sd_netlink_message_open_container(m, NFTA_RULE_EXPRESSIONS);
+ if (r < 0)
+ return r;
+
+ /* 1st statement: fib daddr type local */
+ r = nfnl_add_expr_fib(m, NFTA_FIB_F_DADDR, NFT_FIB_RESULT_ADDRTYPE, NFT_REG32_01);
+ if (r < 0)
+ return r;
+
+ /* 1st statement (cont.): compare RTN_LOCAL */
+ r = nfnl_add_expr_cmp(m, NFT_CMP_EQ, NFT_REG32_01, &local, sizeof(local));
+ if (r < 0)
+ return r;
+
+ /* 2nd statement: build the lookup key, starting with meta l4proto in reg1 */
+ r = nfnl_add_expr_meta(m, NFT_META_L4PROTO, NFT_REG32_01);
+ if (r < 0)
+ return r;
+
+ /* 2nd statement (cont.): load sizeof(uint16_t) bytes at UDP_DPORT_OFFSET of the transport
+ * header into reg2 */
+ r = nfnl_add_expr_payload(m, NFT_PAYLOAD_TRANSPORT_HEADER, UDP_DPORT_OFFSET,
+ sizeof(uint16_t), NFT_REG32_02);
+ if (r < 0)
+ return r;
+
+ /* 2nd statement (cont.): lookup 'l4proto . dport', e.g. 'tcp . 22' as key and
+ * store address and port for the dnat mapping in REG1/REG2. */
+ r = nfnl_add_expr_lookup(m, NFT_SYSTEMD_DNAT_MAP_NAME, NFT_REG32_01, NFT_REG32_01);
+ if (r < 0)
+ return r;
+
+ /* 3rd statement: dnat to the address starting at reg1; the port register passed on is reg2
+ * for AF_INET and reg5 otherwise. */
+ proto_reg = family == AF_INET ? NFT_REG32_02 : NFT_REG32_05;
+ r = nfnl_add_expr_dnat(m, family, NFT_REG32_01, proto_reg);
+ if (r < 0)
+ return r;
+
+ r = sd_netlink_message_close_container(m); /* NFTA_RULE_EXPRESSIONS */
+ if (r < 0)
+ return r;
+
+ *ret = TAKE_PTR(m);
+ return 0;
+ }
+
+ /* Builds the DNAT rule for 'chain' in the systemd NAT table and returns the message in *ret. The
+ * rule first requires that the destination is not a loopback address (127.0.0.0/8 for AF_INET,
+ * IN6ADDR_LOOPBACK_INIT otherwise), then compares the output interface index against 1, and finally
+ * performs the same map lookup plus dnat as the rule built by sd_nfnl_message_new_dnat_rule_pre().
+ * Returns 0 on success, or the error of the first failing step. */
+ static int sd_nfnl_message_new_dnat_rule_out(
+ sd_netlink *nfnl,
+ sd_netlink_message **ret,
+ int family,
+ const char *chain) {
+
+ static const uint32_t zero = 0, one = 1;
+ _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
+ enum nft_registers proto_reg;
+ int r;
+
+ assert(nfnl);
+ assert(ret);
+ assert(IN_SET(family, AF_INET, AF_INET6));
+ assert(chain);
+
+ r = sd_nfnl_nft_message_new_rule(nfnl, &m, family, NFT_SYSTEMD_TABLE_NAME, chain);
+ if (r < 0)
+ return r;
+
+ r = sd_netlink_message_open_container(m, NFTA_RULE_EXPRESSIONS);
+ if (r < 0)
+ return r;
+
+ /* 1st statement: exclude 127.0.0.0/8: ip daddr != 127.0.0.0/8, resp. avoid ::1 */
+ if (family == AF_INET) {
+ uint32_t lonet = htobe32(UINT32_C(0x7F000000)), lomask = htobe32(UINT32_C(0xff000000));
+
+ r = nfnl_add_expr_payload(m, NFT_PAYLOAD_NETWORK_HEADER, offsetof(struct iphdr, daddr),
+ sizeof(lonet), NFT_REG32_01);
+ if (r < 0)
+ return r;
+ /* 1st statement (cont.): bitops/prefix */
+ r = nfnl_add_expr_bitwise(m, NFT_REG32_01, NFT_REG32_01, &lomask, &zero, sizeof(lomask));
+ if (r < 0)
+ return r;
+
+ /* 1st statement (cont.): compare reg1 with 127/8 */
+ r = nfnl_add_expr_cmp(m, NFT_CMP_NEQ, NFT_REG32_01, &lonet, sizeof(lonet));
+ } else {
+ struct in6_addr loaddr = IN6ADDR_LOOPBACK_INIT;
+
+ r = nfnl_add_expr_payload(m, NFT_PAYLOAD_NETWORK_HEADER, offsetof(struct ip6_hdr, ip6_dst.s6_addr),
+ sizeof(loaddr), NFT_REG32_01);
+ if (r < 0)
+ return r;
+
+ r = nfnl_add_expr_cmp(m, NFT_CMP_NEQ, NFT_REG32_01, &loaddr, sizeof(loaddr));
+ }
+ /* Checks the result of the last nfnl_add_expr_cmp() call of whichever branch ran. */
+ if (r < 0)
+ return r;
+
+ /* 2nd statement: meta oif lo */
+ r = nfnl_add_expr_meta(m, NFT_META_OIF, NFT_REG32_01);
+ if (r < 0)
+ return r;
+
+ /* 2nd statement (cont.): compare to lo ifindex (1) */
+ r = nfnl_add_expr_cmp(m, NFT_CMP_EQ, NFT_REG32_01, &one, sizeof(one));
+ if (r < 0)
+ return r;
+
+ /* 3rd statement: meta l4proto . th dport dnat ip . port to map @map_port_ipport */
+ r = nfnl_add_expr_meta(m, NFT_META_L4PROTO, NFT_REG32_01);
+ if (r < 0)
+ return r;
+
+ /* 3rd statement (cont): store the port number in reg2 */
+ r = nfnl_add_expr_payload(m, NFT_PAYLOAD_TRANSPORT_HEADER, UDP_DPORT_OFFSET,
+ sizeof(uint16_t), NFT_REG32_02);
+ if (r < 0)
+ return r;
+
+ /* 3rd statement (cont): use reg1 and reg2 and retrieve
+ * the new destination ip and port number.
+ *
+ * reg1 and reg2 are clobbered and will then contain the new
+ * address/port number. */
+ r = nfnl_add_expr_lookup(m, NFT_SYSTEMD_DNAT_MAP_NAME, NFT_REG32_01, NFT_REG32_01);
+ if (r < 0)
+ return r;
+
+ /* 4th statement: dnat connection to address/port retrieved by the
+ * preceding expression. The port register passed on is reg2 for AF_INET and reg5 otherwise. */
+ proto_reg = family == AF_INET ? NFT_REG32_02 : NFT_REG32_05;
+ r = nfnl_add_expr_dnat(m, family, NFT_REG32_01, proto_reg);
+ if (r < 0)
+ return r;
+
+ r = sd_netlink_message_close_container(m); /* NFTA_RULE_EXPRESSIONS */
+ if (r < 0)
+ return r;
+
+ *ret = TAKE_PTR(m);
+ return 0;
+ }
+
+ static int nft_new_set(
+                 struct sd_netlink *nfnl,
+                 sd_netlink_message **ret,
+                 int family,
+                 const char *set_name,
+                 uint32_t set_id,
+                 uint32_t flags,
+                 uint32_t type,
+                 uint32_t klen) {
+
+         _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
+         int r;
+
+         /* Creates a "new set" message for 'set_name' in the systemd NAT table. NFTA_SET_FLAGS is only
+          * added when 'flags' is non-zero; NFTA_SET_KEY_TYPE always carries 'type'. On success the
+          * message is returned in *ret and 0 is returned, otherwise a negative error. */
+
+         assert(nfnl);
+         assert(ret);
+         assert(IN_SET(family, AF_INET, AF_INET6));
+         assert(set_name);
+
+         r = sd_nfnl_nft_message_new_set(nfnl, &m, family, NFT_SYSTEMD_TABLE_NAME, set_name, set_id, klen);
+         if (r < 0)
+                 return r;
+
+         if (flags != 0) {
+                 r = sd_netlink_message_append_u32(m, NFTA_SET_FLAGS, htobe32(flags));
+                 if (r < 0)
+                         return r;
+         }
+
+         r = sd_netlink_message_append_u32(m, NFTA_SET_KEY_TYPE, htobe32(type));
+         if (r < 0)
+                 return r;
+
+         *ret = TAKE_PTR(m);
+
+         /* Report success as a plain 0, like the other message constructors in this file, instead of
+          * leaking the return value of the last append call. */
+         return 0;
+ }
+
+ static int nft_new_map(
+                 struct sd_netlink *nfnl,
+                 sd_netlink_message **ret,
+                 int family,
+                 const char *set_name,
+                 uint32_t set_id,
+                 uint32_t flags,
+                 uint32_t type,
+                 uint32_t klen,
+                 uint32_t dtype,
+                 uint32_t dlen) {
+
+         _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
+         int r;
+
+         /* A map is created as a set with NFT_SET_MAP added to its flags, plus a description of the
+          * data side: NFTA_SET_DATA_TYPE and NFTA_SET_DATA_LEN. On success the message is returned in
+          * *ret. */
+
+         assert(nfnl);
+         assert(ret);
+         assert(IN_SET(family, AF_INET, AF_INET6));
+         assert(set_name);
+
+         r = nft_new_set(nfnl, &m, family, set_name, set_id, flags | NFT_SET_MAP, type, klen);
+         if (r >= 0)
+                 r = sd_netlink_message_append_u32(m, NFTA_SET_DATA_TYPE, htobe32(dtype));
+         if (r >= 0)
+                 r = sd_netlink_message_append_u32(m, NFTA_SET_DATA_LEN, htobe32(dlen));
+         if (r < 0)
+                 return r;
+
+         *ret = TAKE_PTR(m);
+         return 0;
+ }
+
+ /* Shared implementation of nft_add_element() and nft_del_element(): builds a set element message
+  * for 'set_name' in 'table_name' that contains exactly one element made of key/klen and the
+  * optional data/dlen. 'add' selects whether the message adds or deletes the element. On success
+  * the message is returned in *ret. */
+ static int nft_build_element_message(
+                 sd_netlink *nfnl,
+                 sd_netlink_message **ret,
+                 bool add,
+                 int nfproto,
+                 const char *table_name,
+                 const char *set_name,
+                 const void *key,
+                 uint32_t klen,
+                 const void *data,
+                 uint32_t dlen) {
+
+         _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
+         int r;
+
+         assert(nfnl);
+         assert(ret);
+         assert(nfproto_is_valid(nfproto));
+         assert(table_name);
+         assert(set_name);
+         assert(key);
+         assert(data || dlen == 0);
+
+         r = sd_nfnl_nft_message_new_setelems(nfnl, &m, add, nfproto, table_name, set_name);
+         if (r < 0)
+                 return r;
+
+         r = sd_netlink_message_open_container(m, NFTA_SET_ELEM_LIST_ELEMENTS);
+         if (r < 0)
+                 return r;
+
+         r = sd_nfnl_nft_message_append_setelem(m, 0, key, klen, data, dlen, 0);
+         if (r < 0)
+                 return r;
+
+         /* could theoretically append more set elements here */
+
+         r = sd_netlink_message_close_container(m); /* NFTA_SET_ELEM_LIST_ELEMENTS */
+         if (r < 0)
+                 return r;
+
+         *ret = TAKE_PTR(m);
+         return 0;
+ }
+
+ static int nft_add_element(
+                 sd_netlink *nfnl,
+                 sd_netlink_message **ret,
+                 int nfproto,
+                 const char *table_name,
+                 const char *set_name,
+                 const void *key,
+                 uint32_t klen,
+                 const void *data,
+                 uint32_t dlen) {
+
+         /*
+          * Ideally there would be an API that provides:
+          *
+          * 1) an init function to add the main ruleset skeleton
+          * 2) a function that populates the sets with all known address/port pairs to s/dnat for
+          * 3) a function that can remove address/port pairs again.
+          *
+          * At this time, the existing API is used which is built on a
+          * 'add/delete a rule' paradigm.
+          *
+          * This is replicated here, and each element gets added to the set
+          * one-by-one.
+          */
+         return nft_build_element_message(nfnl, ret, /* add = */ true, nfproto, table_name, set_name, key, klen, data, dlen);
+ }
+
+ /* Counterpart of nft_add_element(): builds the message that deletes a single element. */
+ static int nft_del_element(
+                 sd_netlink *nfnl,
+                 sd_netlink_message **ret,
+                 int nfproto,
+                 const char *table_name,
+                 const char *set_name,
+                 const void *key,
+                 uint32_t klen,
+                 const void *data,
+                 uint32_t dlen) {
+
+         return nft_build_element_message(nfnl, ret, /* add = */ false, nfproto, table_name, set_name, key, klen, data, dlen);
+ }
+
+ /* This is needed so 'nft' userspace tool can properly format the contents
+  * of the set/map when someone uses 'nft' to inspect their content.
+  *
+  * The values cannot be changed, they are part of the nft tool type identifier ABI. */
+ #define TYPE_BITS 6
+
+ enum nft_key_types {
+         TYPE_IPADDR = 7,
+         TYPE_IP6ADDR = 8,
+         TYPE_INET_PROTOCOL = 12,
+         TYPE_INET_SERVICE = 13,
+ };
+
+ /* Packs two key type identifiers into one value: 'a' is shifted up by TYPE_BITS and 'b' takes
+  * the low bits. */
+ static uint32_t concat_types2(enum nft_key_types a, enum nft_key_types b) {
+         return ((uint32_t) a << TYPE_BITS) | (uint32_t) b;
+ }
+
+ /* Queues the messages that set up the systemd NAT table for one address family - the table, the
+ * "prerouting", "output" and "postrouting" base chains, the masquerade set, the DNAT map and one
+ * rule per chain - and submits them in a single sd_nfnl_call_batch() call. An -EEXIST result of the
+ * batch is treated as success. */
+ static int fw_nftables_init_family(sd_netlink *nfnl, int family) {
+ /* Nine messages are queued below; the array is zero-initialized, so at least one trailing NULL
+ * remains to terminate it for netlink_message_unref_many(). */
+ sd_netlink_message *messages[10] = {};
+ _unused_ _cleanup_(netlink_message_unref_manyp) sd_netlink_message **unref = messages;
+ size_t msgcnt = 0, ip_type_size;
+ uint32_t set_id = 0;
+ int ip_type, r;
+
+ assert(nfnl);
+ assert(IN_SET(family, AF_INET, AF_INET6));
+
+ /* Set F_EXCL so table add fails if the table already exists. */
+ r = sd_nfnl_nft_message_new_table(nfnl, &messages[msgcnt++], family, NFT_SYSTEMD_TABLE_NAME);
+ if (r < 0)
+ return r;
+
+ r = sd_nfnl_nft_message_new_basechain(nfnl, &messages[msgcnt++], family, NFT_SYSTEMD_TABLE_NAME,
+ "prerouting", "nat",
+ NF_INET_PRE_ROUTING, NF_IP_PRI_NAT_DST + 1);
+ if (r < 0)
+ return r;
+
+ r = sd_nfnl_nft_message_new_basechain(nfnl, &messages[msgcnt++], family, NFT_SYSTEMD_TABLE_NAME,
+ "output", "nat",
+ NF_INET_LOCAL_OUT, NF_IP_PRI_NAT_DST + 1);
+ if (r < 0)
+ return r;
+
+ r = sd_nfnl_nft_message_new_basechain(nfnl, &messages[msgcnt++], family, NFT_SYSTEMD_TABLE_NAME,
+ "postrouting", "nat",
+ NF_INET_POST_ROUTING, NF_IP_PRI_NAT_SRC + 1);
+ if (r < 0)
+ return r;
+
+ /* Key type and key size of an address of this family, used for the set and the map below. */
+ if (family == AF_INET) {
+ ip_type_size = sizeof(uint32_t);
+ ip_type = TYPE_IPADDR;
+ } else {
+ assert(family == AF_INET6);
+ ip_type_size = sizeof(struct in6_addr);
+ ip_type = TYPE_IP6ADDR;
+ }
+ /* set to store ip address ranges we should masquerade for */
+ r = nft_new_set(nfnl, &messages[msgcnt++], family, NFT_SYSTEMD_MASQ_SET_NAME, ++set_id, NFT_SET_INTERVAL, ip_type, ip_type_size);
+ if (r < 0)
+ return r;
+
+ /*
+ * map to store ip address:port pair to dnat to. elements in concatenation
+ * are rounded up to 4 bytes.
+ *
+ * Example: ip protocol . tcp daddr is sizeof(uint32_t) + sizeof(uint32_t), not
+ * sizeof(uint8_t) + sizeof(uint16_t).
+ */
+ r = nft_new_map(nfnl, &messages[msgcnt++], family, NFT_SYSTEMD_DNAT_MAP_NAME, ++set_id, 0,
+ concat_types2(TYPE_INET_PROTOCOL, TYPE_INET_SERVICE), sizeof(uint32_t) * 2,
+ concat_types2(ip_type, TYPE_INET_SERVICE), ip_type_size + sizeof(uint32_t));
+ if (r < 0)
+ return r;
+
+ r = sd_nfnl_message_new_dnat_rule_pre(nfnl, &messages[msgcnt++], family, "prerouting");
+ if (r < 0)
+ return r;
+
+ r = sd_nfnl_message_new_dnat_rule_out(nfnl, &messages[msgcnt++], family, "output");
+ if (r < 0)
+ return r;
+
+ r = sd_nfnl_message_new_masq_rule(nfnl, &messages[msgcnt++], family, "postrouting");
+ if (r < 0)
+ return r;
+
+ /* Strictly smaller: the slot after the last message has to stay NULL. */
+ assert(msgcnt < ELEMENTSOF(messages));
+ r = sd_nfnl_call_batch(nfnl, messages, msgcnt, NFNL_DEFAULT_TIMEOUT_USECS, NULL);
+ if (r < 0 && r != -EEXIST)
+ return r;
+
+ return 0;
+ }
+
+/* Opens a netfilter netlink socket and, on success, stores it in ctx->nfnl. When init_tables is true the
+ * per-family setup is run first for AF_INET and, if socket_ipv6_is_supported() says so, for AF_INET6.
+ * Returns 0 on success or a negative error; on failure ctx->nfnl is left untouched. */
+int fw_nftables_init_full(FirewallContext *ctx, bool init_tables) {
+        _cleanup_(sd_netlink_unrefp) sd_netlink *sock = NULL;
+        int r;
+
+        assert(ctx);
+        assert(!ctx->nfnl);
+
+        r = sd_nfnl_socket_open(&sock);
+        if (r < 0)
+                return r;
+
+        if (!init_tables) {
+                /* Caller only wants the socket, no table setup. */
+                ctx->nfnl = TAKE_PTR(sock);
+                return 0;
+        }
+
+        r = fw_nftables_init_family(sock, AF_INET);
+        if (r < 0)
+                return r;
+
+        /* Only the IPv6 failure is logged here; the IPv4 error is passed up silently. */
+        if (socket_ipv6_is_supported()) {
+                r = fw_nftables_init_family(sock, AF_INET6);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to init ipv6 NAT: %m");
+        }
+
+        ctx->nfnl = TAKE_PTR(sock);
+        return 0;
+}
+
+/* Convenience wrapper around fw_nftables_init_full() that always requests table setup. */
+int fw_nftables_init(FirewallContext *ctx) {
+        const bool init_tables = true;
+
+        return fw_nftables_init_full(ctx, init_tables);
+}
+
+/* Releases the context's reference on its netlink socket; ctx->nfnl is replaced by whatever
+ * sd_netlink_unref() returns. */
+void fw_nftables_exit(FirewallContext *ctx) {
+        sd_netlink *remaining;
+
+        assert(ctx);
+
+        remaining = sd_netlink_unref(ctx->nfnl);
+        ctx->nfnl = remaining;
+}
+
+/* Appends the IPv4 range described by source/prefixlen to message m as two set elements inside an
+ * NFTA_SET_ELEM_LIST_ELEMENTS container: element 0 is the masked network address, element 1 (flagged
+ * NFT_SET_ELEM_INTERVAL_END) is the first address after the range. Both are emitted via htobe32().
+ *
+ * prefixlen must be in 0…32. Returns the result of closing the container, or the first negative error
+ * reported by one of the netlink helpers. */
+static int nft_message_append_setelem_iprange(
+                sd_netlink_message *m,
+                const union in_addr_union *source,
+                unsigned int prefixlen) {
+
+        uint32_t span, mask, start, end;
+        unsigned int nplen;
+        int r;
+
+        assert(m);
+        assert(source);
+        assert(prefixlen <= 32);
+
+        nplen = 32 - prefixlen;
+
+        /* Number of addresses covered by the prefix. Shifting a 32-bit value by 32 bits is undefined
+         * behaviour, hence a /0 prefix is special-cased: its size of 2^32 is represented by the
+         * wrapped-around value 0, which yields an all-zero mask below. */
+        span = nplen < 32 ? 1U << nplen : 0U;
+
+        mask = htobe32(~(span - 1U));
+        start = source->in.s_addr & mask;
+
+        r = sd_netlink_message_open_container(m, NFTA_SET_ELEM_LIST_ELEMENTS);
+        if (r < 0)
+                return r;
+
+        r = sd_nfnl_nft_message_append_setelem(m, 0, &start, sizeof(start), NULL, 0, 0);
+        if (r < 0)
+                return r;
+
+        /* First address past the range, computed in host byte order. If the addition wraps around the end
+         * of the address space the end marker becomes 0. */
+        end = be32toh(start) + span;
+        if (end < be32toh(start))
+                end = 0U;
+        end = htobe32(end);
+
+        r = sd_nfnl_nft_message_append_setelem(m, 1, &end, sizeof(end), NULL, 0, NFT_SET_ELEM_INTERVAL_END);
+        if (r < 0)
+                return r;
+
+        return sd_netlink_message_close_container(m); /* NFTA_SET_ELEM_LIST_ELEMENTS */
+}
+
+/* Appends the IPv6 range described by source/prefixlen to message m. The two boundaries are obtained from
+ * in_addr_prefix_range() and written as set elements 0 and 1 (the latter flagged
+ * NFT_SET_ELEM_INTERVAL_END) inside an NFTA_SET_ELEM_LIST_ELEMENTS container. Returns the result of
+ * closing the container, or the first negative error encountered. */
+static int nft_message_append_setelem_ip6range(
+                sd_netlink_message *m,
+                const union in_addr_union *source,
+                unsigned int prefixlen) {
+
+        union in_addr_union range_start, range_end;
+        int r;
+
+        assert(m);
+        assert(source);
+
+        r = in_addr_prefix_range(AF_INET6, source, prefixlen, &range_start, &range_end);
+        if (r < 0)
+                return r;
+
+        r = sd_netlink_message_open_container(m, NFTA_SET_ELEM_LIST_ELEMENTS);
+        if (r < 0)
+                return r;
+
+        r = sd_nfnl_nft_message_append_setelem(
+                        m, 0,
+                        &range_start.in6, sizeof(range_start.in6),
+                        NULL, 0,
+                        0);
+        if (r < 0)
+                return r;
+
+        r = sd_nfnl_nft_message_append_setelem(
+                        m, 1,
+                        &range_end.in6, sizeof(range_end.in6),
+                        NULL, 0,
+                        NFT_SET_ELEM_INTERVAL_END);
+        if (r < 0)
+                return r;
+
+        return sd_netlink_message_close_container(m); /* NFTA_SET_ELEM_LIST_ELEMENTS */
+}
+
+/* Adds (add=true) or removes (add=false) the address range source/source_prefixlen to/from the nftables
+ * set 'set' in 'table' of family 'nfproto'. 'af' selects how the range is encoded (AF_INET or AF_INET6).
+ *
+ * Returns -EINVAL if source is NULL, if the prefix length is 0, if an IPv6 prefix is shorter than 8 bits,
+ * or if an IPv4 prefix is longer than 32 bits. Otherwise returns the result of the netlink batch call or
+ * the first negative error from building the message. */
+int nft_set_element_modify_iprange(
+                FirewallContext *ctx,
+                bool add,
+                int nfproto,
+                int af,
+                const char *table,
+                const char *set,
+                const union in_addr_union *source,
+                unsigned int source_prefixlen) {
+
+        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
+        int r;
+
+        assert(ctx);
+        assert(ctx->nfnl);
+        assert(IN_SET(af, AF_INET, AF_INET6));
+        assert(nfproto_is_valid(nfproto));
+        assert(table);
+        assert(set);
+
+        if (!source || source_prefixlen == 0)
+                return -EINVAL;
+
+        /* The IPv4 encoder asserts on prefixlen <= 32; refuse oversized prefixes here so that bad input
+         * results in an error rather than an abort. */
+        if (af == AF_INET && source_prefixlen > 32)
+                return -EINVAL;
+
+        if (af == AF_INET6 && source_prefixlen < 8)
+                return -EINVAL;
+
+        r = sd_nfnl_nft_message_new_setelems(ctx->nfnl, &m, add, nfproto, table, set);
+        if (r < 0)
+                return r;
+
+        if (af == AF_INET)
+                r = nft_message_append_setelem_iprange(m, source, source_prefixlen);
+        else
+                r = nft_message_append_setelem_ip6range(m, source, source_prefixlen);
+        if (r < 0)
+                return r;
+
+        return sd_nfnl_call_batch(ctx->nfnl, &m, 1, NFNL_DEFAULT_TIMEOUT_USECS, NULL);
+}
+
+/* Adds (add=true) or removes (add=false) a single address to/from the nftables set 'set' in 'table' of
+ * family 'nfproto'. FAMILY_ADDRESS_SIZE(af) bytes are taken from 'source' as the element key. Returns
+ * -EINVAL if source is NULL, otherwise the result of the netlink batch call or the first negative error
+ * from building the message. */
+int nft_set_element_modify_ip(
+                FirewallContext *ctx,
+                bool add,
+                int nfproto,
+                int af,
+                const char *table,
+                const char *set,
+                const union in_addr_union *source) {
+
+        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *msg = NULL;
+        int r;
+
+        assert(ctx->nfnl);
+        assert(IN_SET(af, AF_INET, AF_INET6));
+        assert(nfproto_is_valid(nfproto));
+        assert(table);
+        assert(set);
+
+        if (!source)
+                return -EINVAL;
+
+        r = sd_nfnl_nft_message_new_setelems(ctx->nfnl, &msg, add, nfproto, table, set);
+        if (r < 0)
+                return r;
+
+        /* A single element, wrapped into the element list container. */
+        r = sd_netlink_message_open_container(msg, NFTA_SET_ELEM_LIST_ELEMENTS);
+        if (r < 0)
+                return r;
+
+        r = sd_nfnl_nft_message_append_setelem(
+                        msg, 0,
+                        source, FAMILY_ADDRESS_SIZE(af),
+                        NULL, 0,
+                        0);
+        if (r < 0)
+                return r;
+
+        r = sd_netlink_message_close_container(msg); /* NFTA_SET_ELEM_LIST_ELEMENTS */
+        if (r < 0)
+                return r;
+
+        return sd_nfnl_call_batch(ctx->nfnl, &msg, 1, NFNL_DEFAULT_TIMEOUT_USECS, NULL);
+}
+
+/* Adds (add=true) or removes (add=false) an element given as raw bytes (element/element_size) to/from
+ * the nftables set 'set' in 'table' of family 'nfproto'. No data part is attached to the element.
+ * Returns the result of the netlink batch call, or the negative error from building the message. */
+int nft_set_element_modify_any(
+                FirewallContext *ctx,
+                bool add,
+                int nfproto,
+                const char *table,
+                const char *set,
+                const void *element,
+                size_t element_size) {
+
+        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *msg = NULL;
+        int r;
+
+        assert(ctx);
+        assert(ctx->nfnl);
+        assert(nfproto_is_valid(nfproto));
+        assert(table);
+        assert(set);
+        assert(element);
+
+        r = add ? nft_add_element(ctx->nfnl, &msg, nfproto, table, set, element, element_size, NULL, 0)
+                : nft_del_element(ctx->nfnl, &msg, nfproto, table, set, element, element_size, NULL, 0);
+        if (r < 0)
+                return r;
+
+        return sd_nfnl_call_batch(ctx->nfnl, &msg, 1, NFNL_DEFAULT_TIMEOUT_USECS, NULL);
+}
+
+static int af_to_nfproto(int af) {
+ assert(IN_SET(af, AF_INET, AF_INET6));
+
+ switch (af) {
+ case AF_INET:
+ return NFPROTO_IPV4;
+ case AF_INET6:
+ return NFPROTO_IPV6;
+ default:
+ assert_not_reached();
+ }
+}
+
+/* Adds (add=true) or removes (add=false) the range source/source_prefixlen to/from the set
+ * NFT_SYSTEMD_MASQ_SET_NAME in table NFT_SYSTEMD_TABLE_NAME. Returns -EOPNOTSUPP for AF_INET6 when
+ * socket_ipv6_is_supported() reports no IPv6 support. If the first attempt fails with -ENOENT, the
+ * per-family setup is replayed and the operation is retried once. */
+int fw_nftables_add_masquerade(
+                FirewallContext *ctx,
+                bool add,
+                int af,
+                const union in_addr_union *source,
+                unsigned int source_prefixlen) {
+
+        int nfproto, r;
+
+        assert(ctx);
+        assert(ctx->nfnl);
+        assert(IN_SET(af, AF_INET, AF_INET6));
+
+        if (!socket_ipv6_is_supported() && af == AF_INET6)
+                return -EOPNOTSUPP;
+
+        nfproto = af_to_nfproto(af);
+
+        r = nft_set_element_modify_iprange(
+                        ctx, add, nfproto, af,
+                        NFT_SYSTEMD_TABLE_NAME, NFT_SYSTEMD_MASQ_SET_NAME,
+                        source, source_prefixlen);
+        if (r == -ENOENT) {
+                /* When someone runs 'nft flush ruleset' in the same network namespace, the systemd NAT
+                 * table is torn down as well.
+                 *
+                 * This differs from 'iptables -t nat -F': that removes all rules added by the systemd
+                 * iptables backend, but iptables has builtin chains that cannot be deleted, so the next
+                 * add operation will 'just work'.
+                 *
+                 * In the nftables case everything gets removed, and the next add operation yields
+                 * -ENOENT. Hence, if we see -ENOENT, replay the initial table setup and, if that works,
+                 * redo the operation.
+                 *
+                 * Note that this does not protect against external sabotage such as
+                 * 'while true; nft flush ruleset; done'. Nothing can be done about that short of
+                 * extending the kernel to allow tables to be owned by systemd-networkd and making them
+                 * non-deletable except by the 'owning process'. */
+
+                r = fw_nftables_init_family(ctx->nfnl, af);
+                if (r >= 0)
+                        r = nft_set_element_modify_iprange(
+                                        ctx, add, nfproto, af,
+                                        NFT_SYSTEMD_TABLE_NAME, NFT_SYSTEMD_MASQ_SET_NAME,
+                                        source, source_prefixlen);
+        }
+
+        return r;
+}
+
+/* Adds (add=true) or removes (add=false) one entry of the DNAT map NFT_SYSTEMD_DNAT_MAP_NAME in table
+ * NFT_SYSTEMD_TABLE_NAME.
+ *
+ * The map key is two 32-bit words: key[0] holds the protocol, key[1] the local port as produced by
+ * htobe16(). The map data is the remote address followed by one 32-bit word holding htobe16(remote_port):
+ * 8 bytes for AF_INET, sizeof(data) (five 32-bit words) for AF_INET6.
+ *
+ * When adding and previous_remote is set and differs from remote, a deletion of the old entry is queued in
+ * the same batch ahead of the new entry.
+ *
+ * Returns 0 on success, -EPROTONOSUPPORT for protocols other than TCP/UDP, -EINVAL for a zero port,
+ * -EOPNOTSUPP if remote is NULL or if IPv6 DNAT was found unsupported (the batch failed with -EOVERFLOW;
+ * this is remembered for later calls), or another negative error from the netlink helpers. */
+static int fw_nftables_add_local_dnat_internal(
+                sd_netlink *nfnl,
+                bool add,
+                int af,
+                int protocol,
+                uint16_t local_port,
+                const union in_addr_union *remote,
+                uint16_t remote_port,
+                const union in_addr_union *previous_remote) {
+
+        sd_netlink_message *messages[3] = {};
+        _unused_ _cleanup_(netlink_message_unref_manyp) sd_netlink_message **unref = messages;
+        static bool ipv6_supported = true;
+        uint32_t data[5], key[2], dlen;
+        size_t msgcnt = 0;
+        int nfproto, r;
+
+        assert(nfnl);
+        assert(add || !previous_remote);
+        assert(IN_SET(af, AF_INET, AF_INET6));
+
+        if (!ipv6_supported && af == AF_INET6)
+                return -EOPNOTSUPP;
+
+        if (!IN_SET(protocol, IPPROTO_TCP, IPPROTO_UDP))
+                return -EPROTONOSUPPORT;
+
+        if (local_port <= 0)
+                return -EINVAL;
+
+        key[0] = protocol;
+        key[1] = htobe16(local_port);
+
+        if (!remote)
+                return -EOPNOTSUPP;
+
+        if (remote_port <= 0)
+                return -EINVAL;
+
+        /* The same netfilter family is used for every message of the batch. */
+        nfproto = af_to_nfproto(af);
+
+        if (af == AF_INET) {
+                dlen = 8;
+                data[1] = htobe16(remote_port);
+        } else {
+                assert(af == AF_INET6);
+                dlen = sizeof(data);
+                data[4] = htobe16(remote_port);
+        }
+
+        /* If a previous remote is set, remove its entry */
+        if (add && previous_remote && !in_addr_equal(af, previous_remote, remote)) {
+                if (af == AF_INET)
+                        data[0] = previous_remote->in.s_addr;
+                else
+                        memcpy(data, &previous_remote->in6, sizeof(previous_remote->in6));
+
+                r = nft_del_element(nfnl, &messages[msgcnt++], nfproto, NFT_SYSTEMD_TABLE_NAME, NFT_SYSTEMD_DNAT_MAP_NAME,
+                                    key, sizeof(key), data, dlen);
+                if (r < 0)
+                        return r;
+        }
+
+        /* Only the address part of data[] is overwritten; the port word set above is shared by both
+         * messages. */
+        if (af == AF_INET)
+                data[0] = remote->in.s_addr;
+        else
+                memcpy(data, &remote->in6, sizeof(remote->in6));
+
+        if (add)
+                r = nft_add_element(nfnl, &messages[msgcnt++], nfproto, NFT_SYSTEMD_TABLE_NAME, NFT_SYSTEMD_DNAT_MAP_NAME,
+                                    key, sizeof(key), data, dlen);
+        else
+                r = nft_del_element(nfnl, &messages[msgcnt++], nfproto, NFT_SYSTEMD_TABLE_NAME, NFT_SYSTEMD_DNAT_MAP_NAME,
+                                    key, sizeof(key), data, dlen);
+        if (r < 0)
+                return r;
+
+        assert(msgcnt < ELEMENTSOF(messages));
+        r = sd_nfnl_call_batch(nfnl, messages, msgcnt, NFNL_DEFAULT_TIMEOUT_USECS, NULL);
+        if (r == -EOVERFLOW && af == AF_INET6) {
+                /* The current implementation of DNAT in systemd requires kernel's
+                 * fdb9c405e35bdc6e305b9b4e20ebc141ed14fc81 (v5.8), and the older kernel returns
+                 * -EOVERFLOW. Let's treat the error as -EOPNOTSUPP. */
+                log_debug_errno(r, "The current implementation of IPv6 DNAT in systemd requires kernel 5.8 or newer, ignoring: %m");
+                ipv6_supported = false;
+                return -EOPNOTSUPP;
+        }
+        if (r < 0)
+                return r;
+
+        return 0;
+}
+
+/* Adds or removes a local DNAT entry through the nftables backend. Returns -EOPNOTSUPP for AF_INET6 when
+ * socket_ipv6_is_supported() reports no IPv6 support. If the first attempt fails with -ENOENT, the
+ * per-family setup is replayed and the operation is retried once, this time without previous_remote. */
+int fw_nftables_add_local_dnat(
+                FirewallContext *ctx,
+                bool add,
+                int af,
+                int protocol,
+                uint16_t local_port,
+                const union in_addr_union *remote,
+                uint16_t remote_port,
+                const union in_addr_union *previous_remote) {
+
+        int r;
+
+        assert(ctx);
+        assert(ctx->nfnl);
+        assert(IN_SET(af, AF_INET, AF_INET6));
+
+        if (!socket_ipv6_is_supported() && af == AF_INET6)
+                return -EOPNOTSUPP;
+
+        r = fw_nftables_add_local_dnat_internal(
+                        ctx->nfnl, add, af, protocol,
+                        local_port, remote, remote_port,
+                        previous_remote);
+        if (r == -ENOENT) {
+                /* See comment in fw_nftables_add_masquerade(). */
+                r = fw_nftables_init_family(ctx->nfnl, af);
+                if (r >= 0)
+                        /* table created anew; previous address already gone */
+                        r = fw_nftables_add_local_dnat_internal(
+                                        ctx->nfnl, add, af, protocol,
+                                        local_port, remote, remote_port,
+                                        NULL);
+        }
+
+        return r;
+}
+
+/* Names of the netfilter protocol families, indexed by NFPROTO_* value. The macro invocation below
+ * generates the string lookup helpers for this table. */
+static const char *const nfproto_table[] = {
+ [NFPROTO_ARP] = "arp",
+ [NFPROTO_BRIDGE] = "bridge",
+ [NFPROTO_INET] = "inet",
+ [NFPROTO_IPV4] = "ip",
+ [NFPROTO_IPV6] = "ip6",
+ [NFPROTO_NETDEV] = "netdev",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(nfproto, int);
+
+/* Configuration-file spellings of the NFT set sources, indexed by NFT_SET_SOURCE_* value. The macro
+ * invocation below generates the string lookup helpers for this table. */
+static const char *const nft_set_source_table[] = {
+ [NFT_SET_SOURCE_ADDRESS] = "address",
+ [NFT_SET_SOURCE_PREFIX] = "prefix",
+ [NFT_SET_SOURCE_IFINDEX] = "ifindex",
+ [NFT_SET_SOURCE_CGROUP] = "cgroup",
+ [NFT_SET_SOURCE_USER] = "user",
+ [NFT_SET_SOURCE_GROUP] = "group",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(nft_set_source, int);
+
+/* Frees the table and set names of every entry, then the array itself, leaving the context empty
+ * (n_sets == 0, sets reset by mfree()). The context object itself is not freed. */
+void nft_set_context_clear(NFTSetContext *s) {
+        assert(s);
+
+        for (size_t i = 0; i < s->n_sets; i++) {
+                NFTSet *entry = s->sets + i;
+
+                free(entry->table);
+                free(entry->set);
+        }
+
+        s->n_sets = 0;
+        s->sets = mfree(s->sets);
+}
+
+/* Appends a new entry to the context. The table and set names are duplicated, so the caller keeps
+ * ownership of its strings. Returns 0 on success or -ENOMEM; on failure the context is unchanged apart
+ * from a possibly grown allocation. */
+int nft_set_add(NFTSetContext *s, NFTSetSource source, int nfproto, const char *table, const char *set) {
+        _cleanup_free_ char *table_copy = NULL, *set_copy = NULL;
+        NFTSet *slot;
+
+        assert(s);
+        assert(IN_SET(source, NFT_SET_SOURCE_ADDRESS, NFT_SET_SOURCE_PREFIX, NFT_SET_SOURCE_IFINDEX, NFT_SET_SOURCE_CGROUP, NFT_SET_SOURCE_USER, NFT_SET_SOURCE_GROUP));
+        assert(nfproto_is_valid(nfproto));
+        assert(table);
+        assert(set);
+
+        table_copy = strdup(table);
+        if (!table_copy)
+                return -ENOMEM;
+
+        set_copy = strdup(set);
+        if (!set_copy)
+                return -ENOMEM;
+
+        /* Make room for one more entry before handing over the strings. */
+        if (!GREEDY_REALLOC(s->sets, s->n_sets + 1))
+                return -ENOMEM;
+
+        slot = &s->sets[s->n_sets];
+        *slot = (NFTSet) {
+                .source = source,
+                .nfproto = nfproto,
+                .table = TAKE_PTR(table_copy),
+                .set = TAKE_PTR(set_copy),
+        };
+        s->n_sets++;
+
+        return 0;
+}
+
+/* Deep-copies all entries of src into a fresh context and moves the result into *dst. On failure the
+ * partial copy is released and *dst is not written. Returns 0 or the negative error from nft_set_add(). */
+int nft_set_context_dup(const NFTSetContext *src, NFTSetContext *dst) {
+        _cleanup_(nft_set_context_clear) NFTSetContext copy = (NFTSetContext) {};
+        int r;
+
+        assert(src);
+        assert(dst);
+
+        for (size_t i = 0; i < src->n_sets; i++) {
+                const NFTSet *entry = src->sets + i;
+
+                r = nft_set_add(&copy, entry->source, entry->nfproto, entry->table, entry->set);
+                if (r < 0)
+                        return r;
+        }
+
+        *dst = TAKE_STRUCT(copy);
+
+        return 0;
+}
+
+/* Config parser for NFT set assignments. 'data' points to the NFTSetContext to fill.
+ *
+ * rvalue is split into words with extract_first_word(); each word must consist of exactly four
+ * ':'-separated fields, "source:family:table:set". 'ltype' selects which sources are acceptable:
+ * NFT_SET_PARSE_NETWORK allows address/prefix/ifindex, NFT_SET_PARSE_CGROUP allows cgroup/user/group.
+ *
+ * An empty rvalue clears the context. A malformed word is logged as a warning and ends parsing of this
+ * assignment; entries added from earlier words are kept. Out-of-memory conditions are logged via
+ * log_oom(). */
+int config_parse_nft_set(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        NFTSetContext *nft_set_context = ASSERT_PTR(data);
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+        assert(nft_set_context);
+        assert(IN_SET(ltype, NFT_SET_PARSE_NETWORK, NFT_SET_PARSE_CGROUP));
+
+        /* An empty assignment resets the list. */
+        if (isempty(rvalue)) {
+                nft_set_context_clear(nft_set_context);
+
+                return 0;
+        }
+
+        for (const char *p = rvalue;;) {
+                _cleanup_free_ char *tuple = NULL, *source_str = NULL, *family_str = NULL, *table = NULL, *set = NULL;
+                const char *q = NULL;
+                int nfproto;
+                NFTSetSource source;
+
+                r = extract_first_word(&p, &tuple, NULL, EXTRACT_UNQUOTE|EXTRACT_RETAIN_ESCAPE);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        _cleanup_free_ char *esc = NULL;
+
+                        esc = cescape(rvalue);
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax %s=%s, ignoring: %m", lvalue, strna(esc));
+                        return 0;
+                }
+                if (r == 0)
+                        /* No more words: done. */
+                        return 0;
+
+                q = tuple;
+                r = extract_many_words(&q, ":", EXTRACT_CUNESCAPE, &source_str, &family_str, &table, &set, NULL);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r != 4 || !isempty(q)) {
+                        _cleanup_free_ char *esc = NULL;
+
+                        esc = cescape(tuple);
+                        return log_syntax(unit, LOG_WARNING, filename, line, 0, "Failed to parse NFT set %s, ignoring", strna(esc));
+                }
+
+                assert(source_str);
+                assert(family_str);
+                assert(table);
+                assert(set);
+
+                source = nft_set_source_from_string(source_str);
+                if (source < 0 ||
+                    (ltype == NFT_SET_PARSE_NETWORK && !IN_SET(source, NFT_SET_SOURCE_ADDRESS, NFT_SET_SOURCE_PREFIX, NFT_SET_SOURCE_IFINDEX)) ||
+                    (ltype == NFT_SET_PARSE_CGROUP && !IN_SET(source, NFT_SET_SOURCE_CGROUP, NFT_SET_SOURCE_USER, NFT_SET_SOURCE_GROUP))) {
+                        _cleanup_free_ char *esc = NULL;
+
+                        esc = cescape(source_str);
+                        return log_syntax(unit, LOG_WARNING, filename, line, 0, "Unknown NFT source %s, ignoring", strna(esc));
+                }
+
+                nfproto = nfproto_from_string(family_str);
+                if (nfproto < 0) {
+                        _cleanup_free_ char *esc = NULL;
+
+                        esc = cescape(family_str);
+                        return log_syntax(unit, LOG_WARNING, filename, line, 0, "Unknown NFT protocol family %s, ignoring", strna(esc));
+                }
+
+                if (!nft_identifier_valid(table)) {
+                        _cleanup_free_ char *esc = NULL;
+
+                        esc = cescape(table);
+                        return log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid table name %s, ignoring", strna(esc));
+                }
+
+                if (!nft_identifier_valid(set)) {
+                        _cleanup_free_ char *esc = NULL;
+
+                        esc = cescape(set);
+                        return log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid set name %s, ignoring", strna(esc));
+                }
+
+                r = nft_set_add(nft_set_context, source, nfproto, table, set);
+                if (r == -ENOMEM)
+                        /* Log allocation failures like the other out-of-memory paths of this parser. */
+                        return log_oom();
+                if (r < 0)
+                        return r;
+        }
+
+        assert_not_reached();
+}
diff --git a/src/shared/firewall-util-private.h b/src/shared/firewall-util-private.h
new file mode 100644
index 0000000..38c8dfc
--- /dev/null
+++ b/src/shared/firewall-util-private.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "sd-netlink.h"
+
+#include "firewall-util.h"
+#include "in-addr-util.h"
+
+/* Firewall backends a FirewallContext can use. FW_BACKEND_IPTABLES only exists in builds with
+ * HAVE_LIBIPTC enabled. */
+typedef enum FirewallBackend {
+ FW_BACKEND_NONE,
+#if HAVE_LIBIPTC
+ FW_BACKEND_IPTABLES,
+#endif
+ FW_BACKEND_NFTABLES,
+ _FW_BACKEND_MAX, /* number of backends; sizes the backend name table */
+ _FW_BACKEND_INVALID = -EINVAL, /* initial state of a context before a backend has been probed */
+} FirewallBackend;
+
+/* State shared by the firewall helpers. */
+struct FirewallContext {
+ FirewallBackend backend; /* backend in use, or _FW_BACKEND_INVALID while not probed yet */
+ sd_netlink *nfnl; /* netlink socket of the nftables backend; set by fw_nftables_init_full(), dropped by fw_nftables_exit() */
+};
+
+/* Returns the name of the given backend. */
+const char *firewall_backend_to_string(FirewallBackend b) _const_;
+
+/* nftables backend setup and teardown. fw_nftables_init() is fw_nftables_init_full() with
+ * init_tables=true; both expect ctx->nfnl to be unset. */
+int fw_nftables_init(FirewallContext *ctx);
+int fw_nftables_init_full(FirewallContext *ctx, bool init_tables);
+void fw_nftables_exit(FirewallContext *ctx);
+
+/* Adds (add=true) or removes (add=false) a masquerading range via the nftables backend. */
+int fw_nftables_add_masquerade(
+ FirewallContext *ctx,
+ bool add,
+ int af,
+ const union in_addr_union *source,
+ unsigned source_prefixlen);
+
+/* Adds (add=true) or removes (add=false) a local DNAT entry via the nftables backend. previous_remote
+ * may only be passed when adding. */
+int fw_nftables_add_local_dnat(
+ FirewallContext *ctx,
+ bool add,
+ int af,
+ int protocol,
+ uint16_t local_port,
+ const union in_addr_union *remote,
+ uint16_t remote_port,
+ const union in_addr_union *previous_remote);
+
+/* Entry points of the iptables backend, only declared in builds with HAVE_LIBIPTC enabled. Unlike the
+ * nftables variants they take no FirewallContext.
+ * NOTE(review): the implementations are not part of this header; parameter semantics presumably mirror
+ * the nftables variants — confirm against the iptables backend source. */
+#if HAVE_LIBIPTC
+struct xtc_handle;
+
+int fw_iptables_add_masquerade(
+ bool add,
+ int af,
+ const union in_addr_union *source,
+ unsigned source_prefixlen);
+
+int fw_iptables_add_local_dnat(
+ bool add,
+ int af,
+ int protocol,
+ uint16_t local_port,
+ const union in_addr_union *remote,
+ uint16_t remote_port,
+ const union in_addr_union *previous_remote);
+
+int fw_iptables_init_nat(struct xtc_handle **ret);
+#endif
diff --git a/src/shared/firewall-util.c b/src/shared/firewall-util.c
new file mode 100644
index 0000000..e96b24a
--- /dev/null
+++ b/src/shared/firewall-util.c
@@ -0,0 +1,160 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <stddef.h>
+#include <string.h>
+
+#include "alloc-util.h"
+#include "firewall-util.h"
+#include "firewall-util-private.h"
+#include "log.h"
+#include "netlink-util.h"
+#include "string-table.h"
+
+/* Names of the firewall backends, indexed by FirewallBackend value. The "iptables" entry only exists in
+ * builds with HAVE_LIBIPTC enabled. The macro invocation below generates firewall_backend_to_string(). */
+static const char * const firewall_backend_table[_FW_BACKEND_MAX] = {
+ [FW_BACKEND_NONE] = "none",
+#if HAVE_LIBIPTC
+ [FW_BACKEND_IPTABLES] = "iptables",
+#endif
+ [FW_BACKEND_NFTABLES] = "nftables",
+};
+
+DEFINE_STRING_TABLE_LOOKUP_TO_STRING(firewall_backend, FirewallBackend);
+
+/* Selects the firewall backend for ctx, unless one has been selected already.
+ *
+ * $SYSTEMD_FIREWALL_BACKEND ("nftables" or "iptables") takes precedence. If "nftables" is requested, the
+ * nftables backend is initialized right away: the nftables entry points require ctx->nfnl to be set, and
+ * merely recording the backend would leave it NULL. If the variable is unset, unrecognized, unsupported,
+ * or the requested nftables initialization fails, nftables is probed (unless already tried) and otherwise
+ * the function falls back to iptables (when built with libiptc) or to no backend at all.
+ *
+ * init_tables is passed on to fw_nftables_init_full(). */
+static void firewall_backend_probe(FirewallContext *ctx, bool init_tables) {
+        bool nft_tried = false;
+        const char *e;
+        int r;
+
+        assert(ctx);
+
+        if (ctx->backend != _FW_BACKEND_INVALID)
+                return;
+
+        e = secure_getenv("SYSTEMD_FIREWALL_BACKEND");
+        if (e) {
+                if (streq(e, "nftables")) {
+                        nft_tried = true;
+
+                        r = fw_nftables_init_full(ctx, init_tables);
+                        if (r < 0)
+                                log_debug_errno(r, "Failed to initialize requested nftables firewall backend, falling back: %m");
+                        else
+                                ctx->backend = FW_BACKEND_NFTABLES;
+                } else if (streq(e, "iptables"))
+#if HAVE_LIBIPTC
+                        ctx->backend = FW_BACKEND_IPTABLES;
+#else
+                        log_debug("Unsupported firewall backend requested, ignoring: %s", e);
+#endif
+                else
+                        log_debug("Unrecognized $SYSTEMD_FIREWALL_BACKEND value, ignoring: %s", e);
+        }
+
+        if (ctx->backend == _FW_BACKEND_INVALID) {
+
+                /* Do not retry nftables if the explicit request above already failed. */
+                if (!nft_tried && fw_nftables_init_full(ctx, init_tables) >= 0)
+                        ctx->backend = FW_BACKEND_NFTABLES;
+                else
+#if HAVE_LIBIPTC
+                        ctx->backend = FW_BACKEND_IPTABLES;
+#else
+                        ctx->backend = FW_BACKEND_NONE;
+#endif
+        }
+
+        if (ctx->backend != FW_BACKEND_NONE)
+                log_debug("Using %s as firewall backend.", firewall_backend_to_string(ctx->backend));
+        else
+                log_debug("No firewall backend found.");
+}
+
+/* Allocates a FirewallContext, probes for a backend (passing init_tables on to the probe) and returns the
+ * context in *ret. Returns 0 on success or -ENOMEM. */
+int fw_ctx_new_full(FirewallContext **ret, bool init_tables) {
+        static const FirewallContext unprobed = {
+                .backend = _FW_BACKEND_INVALID,
+        };
+        _cleanup_free_ FirewallContext *c = NULL;
+
+        c = new(FirewallContext, 1);
+        if (!c)
+                return -ENOMEM;
+
+        *c = unprobed;
+
+        firewall_backend_probe(c, init_tables);
+
+        *ret = TAKE_PTR(c);
+        return 0;
+}
+
+/* Convenience wrapper around fw_ctx_new_full() that always requests table setup. */
+int fw_ctx_new(FirewallContext **ret) {
+        const bool init_tables = true;
+
+        return fw_ctx_new_full(ret, init_tables);
+}
+
+/* Shuts down the nftables state of ctx and frees it. Accepts NULL, in which case NULL is returned; for a
+ * non-NULL ctx the result of mfree() is returned. */
+FirewallContext *fw_ctx_free(FirewallContext *ctx) {
+        if (ctx) {
+                fw_nftables_exit(ctx);
+                ctx = mfree(ctx);
+        }
+
+        return ctx;
+}
+
+/* Returns netlink_get_reply_callback_count() for the context's netlink socket, or 0 if there is no
+ * context or no socket. */
+size_t fw_ctx_get_reply_callback_count(FirewallContext *ctx) {
+        return ctx && ctx->nfnl ? netlink_get_reply_callback_count(ctx->nfnl) : 0;
+}
+
+/* Adds (add=true) or removes (add=false) a masquerading range through whichever backend the context
+ * uses. If *ctx is NULL a context is created first via fw_ctx_new() and stored in *ctx. Returns
+ * -EOPNOTSUPP if the context has no usable backend. */
+int fw_add_masquerade(
+                FirewallContext **ctx,
+                bool add,
+                int af,
+                const union in_addr_union *source,
+                unsigned source_prefixlen) {
+
+        FirewallContext *c;
+        int r;
+
+        assert(ctx);
+
+        if (!*ctx) {
+                r = fw_ctx_new(ctx);
+                if (r < 0)
+                        return r;
+        }
+
+        c = *ctx;
+
+#if HAVE_LIBIPTC
+        if (c->backend == FW_BACKEND_IPTABLES)
+                return fw_iptables_add_masquerade(add, af, source, source_prefixlen);
+#endif
+        if (c->backend == FW_BACKEND_NFTABLES)
+                return fw_nftables_add_masquerade(c, add, af, source, source_prefixlen);
+
+        return -EOPNOTSUPP;
+}
+
+/* Adds (add=true) or removes (add=false) a local DNAT entry through whichever backend the context uses.
+ * If *ctx is NULL a context is created first via fw_ctx_new() and stored in *ctx. Returns -EOPNOTSUPP if
+ * the context has no usable backend. */
+int fw_add_local_dnat(
+                FirewallContext **ctx,
+                bool add,
+                int af,
+                int protocol,
+                uint16_t local_port,
+                const union in_addr_union *remote,
+                uint16_t remote_port,
+                const union in_addr_union *previous_remote) {
+
+        FirewallContext *c;
+        int r;
+
+        assert(ctx);
+
+        if (!*ctx) {
+                r = fw_ctx_new(ctx);
+                if (r < 0)
+                        return r;
+        }
+
+        c = *ctx;
+
+#if HAVE_LIBIPTC
+        if (c->backend == FW_BACKEND_IPTABLES)
+                return fw_iptables_add_local_dnat(add, af, protocol, local_port, remote, remote_port, previous_remote);
+#endif
+        if (c->backend == FW_BACKEND_NFTABLES)
+                return fw_nftables_add_local_dnat(c, add, af, protocol, local_port, remote, remote_port, previous_remote);
+
+        return -EOPNOTSUPP;
+}
diff --git a/src/shared/firewall-util.h b/src/shared/firewall-util.h
new file mode 100644
index 0000000..14e35be
--- /dev/null
+++ b/src/shared/firewall-util.h
@@ -0,0 +1,104 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "conf-parser.h"
+#include "in-addr-util.h"
+
+/* Opaque handle carrying the selected firewall backend and its state. */
+typedef struct FirewallContext FirewallContext;
+
+/* Allocate and free a context. fw_ctx_new() is fw_ctx_new_full() with init_tables=true. fw_ctx_free()
+ * accepts NULL. */
+int fw_ctx_new(FirewallContext **ret);
+int fw_ctx_new_full(FirewallContext **ret, bool init_tables);
+FirewallContext *fw_ctx_free(FirewallContext *ctx);
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(FirewallContext *, fw_ctx_free);
+
+/* Returns 0 if ctx is NULL or has no netlink socket. */
+size_t fw_ctx_get_reply_callback_count(FirewallContext *ctx);
+
+/* Adds (add=true) or removes (add=false) a masquerading range. If *ctx is NULL a context is allocated
+ * and stored there. */
+int fw_add_masquerade(
+ FirewallContext **ctx,
+ bool add,
+ int af,
+ const union in_addr_union *source,
+ unsigned source_prefixlen);
+
+/* Adds (add=true) or removes (add=false) a local DNAT entry. If *ctx is NULL a context is allocated and
+ * stored there. */
+int fw_add_local_dnat(
+ FirewallContext **ctx,
+ bool add,
+ int af,
+ int protocol,
+ uint16_t local_port,
+ const union in_addr_union *remote,
+ uint16_t remote_port,
+ const union in_addr_union *previous_remote);
+
+/* Kind of value an NFT set entry is fed from. The first three are accepted by the parser in
+ * NFT_SET_PARSE_NETWORK mode, the last three in NFT_SET_PARSE_CGROUP mode. */
+typedef enum NFTSetSource {
+ NFT_SET_SOURCE_ADDRESS,
+ NFT_SET_SOURCE_PREFIX,
+ NFT_SET_SOURCE_IFINDEX,
+ NFT_SET_SOURCE_CGROUP,
+ NFT_SET_SOURCE_USER,
+ NFT_SET_SOURCE_GROUP,
+ _NFT_SET_SOURCE_MAX,
+ _NFT_SET_SOURCE_INVALID = -EINVAL,
+} NFTSetSource;
+
+/* One configured NFT set reference. */
+typedef struct NFTSet {
+ NFTSetSource source; /* what kind of value goes into the set */
+ int nfproto; /* netfilter protocol family (NFPROTO_*) of the table */
+ char *table; /* table name; owned, freed by nft_set_context_clear() */
+ char *set; /* set name; owned, freed by nft_set_context_clear() */
+} NFTSet;
+
+/* Growable list of NFTSet entries. */
+typedef struct NFTSetContext {
+ NFTSet *sets; /* array of n_sets entries, or NULL when empty */
+ size_t n_sets;
+} NFTSetContext;
+
+/* nft_set_context_clear() empties a context in place; nft_set_context_dup() deep-copies src into dst. */
+void nft_set_context_clear(NFTSetContext *s);
+int nft_set_context_dup(const NFTSetContext *src, NFTSetContext *dst);
+
+/* String conversions for NFPROTO_* values. */
+const char *nfproto_to_string(int i) _const_;
+int nfproto_from_string(const char *s) _pure_;
+
+/* String conversions for NFTSetSource values. */
+const char *nft_set_source_to_string(int i) _const_;
+int nft_set_source_from_string(const char *s) _pure_;
+
+/* Adds (add=true) or removes (add=false) the address range source/source_prefixlen to/from a set. */
+int nft_set_element_modify_iprange(
+ FirewallContext *ctx,
+ bool add,
+ int nfproto,
+ int af,
+ const char *table,
+ const char *set,
+ const union in_addr_union *source,
+ unsigned int source_prefixlen);
+
+/* Adds (add=true) or removes (add=false) a single address to/from a set. */
+int nft_set_element_modify_ip(
+ FirewallContext *ctx,
+ bool add,
+ int nfproto,
+ int af,
+ const char *table,
+ const char *set,
+ const union in_addr_union *source);
+
+/* Adds (add=true) or removes (add=false) an element given as element_size raw bytes to/from a set. */
+int nft_set_element_modify_any(
+ FirewallContext *ctx,
+ bool add,
+ int nfproto,
+ const char *table,
+ const char *set,
+ const void *element,
+ size_t element_size);
+
+/* Appends an entry to the context; table and set are copied. */
+int nft_set_add(NFTSetContext *s, NFTSetSource source, int nfproto, const char *table, const char *set);
+
+/* Passed as 'ltype' to config_parse_nft_set() to select which set sources are acceptable. */
+typedef enum NFTSetParseFlags {
+ NFT_SET_PARSE_NETWORK, /* address, prefix, ifindex */
+ NFT_SET_PARSE_CGROUP, /* cgroup, user, group */
+} NFTSetParseFlags;
+
+CONFIG_PARSER_PROTOTYPE(config_parse_nft_set);
diff --git a/src/shared/format-table.c b/src/shared/format-table.c
new file mode 100644
index 0000000..9a19177
--- /dev/null
+++ b/src/shared/format-table.c
@@ -0,0 +1,3061 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <ctype.h>
+#include <net/if.h>
+#include <unistd.h>
+
+#include "sd-id128.h"
+
+#include "alloc-util.h"
+#include "devnum-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "format-table.h"
+#include "format-util.h"
+#include "fs-util.h"
+#include "glyph-util.h"
+#include "gunicode.h"
+#include "id128-util.h"
+#include "in-addr-util.h"
+#include "memory-util.h"
+#include "memstream-util.h"
+#include "pager.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "pretty-print.h"
+#include "process-util.h"
+#include "signal-util.h"
+#include "sort-util.h"
+#include "stat-util.h"
+#include "string-util.h"
+#include "strxcpyx.h"
+#include "terminal-util.h"
+#include "time-util.h"
+#include "user-util.h"
+#include "utf8.h"
+
+#define DEFAULT_WEIGHT 100
+
+/*
+ A few notes on implementation details:
+
+ - TableCell is a 'fake' structure, it's just used as data type to pass references to specific cell positions in the
+ table. It can be easily converted to an index number and back.
+
+ - TableData is where the actual data is stored: it encapsulates the data and formatting for a specific cell. It's
+ 'pseudo-immutable' and ref-counted. When a cell's data's formatting is to be changed, we duplicate the object if the
+ ref-counting is larger than 1. Note that TableData and its ref-counting is mostly not visible to the outside. The
+ outside only sees Table and TableCell.
+
+ - The Table object stores a simple one-dimensional array of references to TableData objects, one row after the
+ previous one.
+
+ - There's no special concept of a "row" or "column" in the table, and no special concept of the "header" row. It's all
+ derived from the cell index: we know how many cells are to be stored in a row, and can determine the rest from
+ that. The first row is always the header row. If header display is turned off we simply skip outputting the first
+ row. Also, when sorting rows we always leave the first row where it is, as the header shouldn't move.
+
+ - Note that because there's no row and no column object, some properties that might be appropriate as row/column
+ properties are exposed as cell properties instead. For example, the "weight" of a column (which is used to determine
+ where to preferably add/remove space when expanding/compressing tables horizontally) is actually made the "weight"
+ of a cell. Given that we usually need it per-column though, we will calculate the average across every cell of the
+ column instead.
+
+ - To make things easy, when cells are added without any explicitly configured formatting, then we'll copy the
+ formatting from the cell in the same column of the previous row. This is particularly useful for the "weight" of the
+ cell (see above), as this means setting the weight of the cells of the header row will nicely propagate to all cells
+ in the other rows.
+*/
+
+/* Reference-counted storage for the data and formatting of one cell (see the implementation notes above).
+ * Which member of the trailing union is valid depends on 'type'. */
+typedef struct TableData {
+ unsigned n_ref;
+ TableDataType type;
+
+ size_t minimum_width; /* minimum width for the column */
+ size_t maximum_width; /* maximum width for the column */
+ size_t formatted_for_width; /* the width we tried to format for */
+ unsigned weight; /* the horizontal weight for this column, in case the table is expanded/compressed */
+ unsigned ellipsize_percent; /* 0 … 100, where to place the ellipsis when compression is needed */
+ unsigned align_percent; /* 0 … 100, where to pad with spaces when expanding is needed. 0: left-aligned, 100: right-aligned */
+
+ bool uppercase; /* Uppercase string on display */
+
+ const char *color; /* ANSI color string to use for this cell. When written to terminal should not move cursor. Will automatically be reset after the cell */
+ const char *rgap_color; /* The ANSI color to use for the gap right of this cell. Usually used to underline entire rows in a gapless fashion */
+ char *url; /* A URL to use for a clickable hyperlink */
+ char *formatted; /* A cached textual representation of the cell data, before ellipsation/alignment */
+
+ /* Payload. NOTE(review): the zero-length 'data'/'string' members suggest that variable-sized payloads
+ * are stored inline past the end of the struct — confirm against the allocation code. */
+ union {
+ uint8_t data[0]; /* data is generic array */
+ bool boolean;
+ usec_t timestamp;
+ usec_t timespan;
+ uint64_t size;
+ char string[0];
+ char **strv; /* owned; released with strv_free() for TABLE_STRV/TABLE_STRV_WRAPPED */
+ int int_val;
+ int8_t int8;
+ int16_t int16;
+ int32_t int32;
+ int64_t int64;
+ unsigned uint_val;
+ uint8_t uint8;
+ uint16_t uint16;
+ uint32_t uint32;
+ uint64_t uint64;
+ int percent; /* we use 'int' as datatype for percent values in order to match the result of parse_percent() */
+ int ifindex;
+ union in_addr_union address;
+ sd_id128_t id128;
+ uid_t uid;
+ gid_t gid;
+ pid_t pid;
+ mode_t mode;
+ dev_t devnum;
+ /* … add more here as we start supporting more cell data types … */
+ };
+} TableData;
+
+/* Decodes an opaque cell reference into its zero-based cell index. The reference stores the index plus
+ * one, so the decoded value must be positive. */
+static size_t TABLE_CELL_TO_INDEX(TableCell *cell) {
+        size_t encoded;
+
+        assert(cell);
+
+        encoded = PTR_TO_SIZE(cell);
+        assert(encoded > 0);
+
+        return encoded - 1;
+}
+
+/* Encodes a zero-based cell index as an opaque cell reference by storing index + 1 in the pointer value.
+ * SIZE_MAX is refused since adding one would wrap around. */
+static TableCell* TABLE_INDEX_TO_CELL(size_t index) {
+        size_t encoded;
+
+        assert(index != SIZE_MAX);
+
+        encoded = index + 1;
+        return SIZE_TO_PTR(encoded);
+}
+
+/* The table object: a flat array of cell data plus display settings. Rows are implied by n_columns. */
+struct Table {
+ size_t n_columns;
+ size_t n_cells;
+
+ bool header; /* Whether to show the header row? */
+ bool vertical; /* Whether the field names are shown on the left rather than in the first line */
+
+ TableErsatz ersatz; /* What to show when we have an empty cell or an invalid value that cannot be rendered. */
+
+ size_t width; /* If == 0 format this as wide as necessary. If SIZE_MAX format this to console
+ * width or less wide, but not wider. Otherwise the width to format this table in. */
+ size_t cell_height_max; /* Maximum number of lines per cell. (If there are more, ellipsis is shown. If SIZE_MAX then no limit is set, the default. == 0 is not allowed.) */
+
+ TableData **data; /* n_cells references, one row after the other */
+
+ size_t *display_map; /* List of columns to show (by their index). It's fine if columns are listed multiple times or not at all */
+ size_t n_display_map;
+
+ size_t *sort_map; /* The columns to order rows by, in order of preference. */
+ size_t n_sort_map;
+
+ char **json_fields; /* n_json_fields owned strings; presumably per-column JSON field names — confirm */
+ size_t n_json_fields;
+
+ bool *reverse_map; /* presumably per-column flags requesting reverse sort order — confirm */
+};
+
+/* Allocates an empty table with the given number of columns (must be > 0) and no cells. Defaults: header
+ * shown, width and cell height limit set to SIZE_MAX, empty-cell replacement TABLE_ERSATZ_EMPTY. Returns
+ * NULL if the allocation fails. */
+Table *table_new_raw(size_t n_columns) {
+        _cleanup_(table_unrefp) Table *table = NULL;
+
+        assert(n_columns > 0);
+
+        table = new(Table, 1);
+        if (!table)
+                return NULL;
+
+        /* Everything not listed here starts out zeroed. */
+        *table = (struct Table) {
+                .n_columns = n_columns,
+                .header = true,
+                .ersatz = TABLE_ERSATZ_EMPTY,
+                .width = SIZE_MAX,
+                .cell_height_max = SIZE_MAX,
+        };
+
+        return TAKE_PTR(table);
+}
+
+/* Creates a table with one column per header string. The variadic list of header strings must be
+ * terminated by NULL. Returns NULL if allocating the table or adding a header cell fails. */
+Table *table_new_internal(const char *first_header, ...) {
+        _cleanup_(table_unrefp) Table *table = NULL;
+        const char *header;
+        size_t count = 1;
+        va_list ap;
+
+        assert(first_header);
+
+        /* First pass: count the headers following first_header, up to the terminating NULL. */
+        va_start(ap, first_header);
+        while (va_arg(ap, const char*))
+                count++;
+        va_end(ap);
+
+        table = table_new_raw(count);
+        if (!table)
+                return NULL;
+
+        /* Second pass: add one header cell per string. */
+        header = first_header;
+        va_start(ap, first_header);
+        while (header) {
+                TableCell *cell;
+
+                if (table_add_cell(table, &cell, TABLE_HEADER, header) < 0) {
+                        va_end(ap);
+                        return NULL;
+                }
+
+                header = va_arg(ap, const char*);
+        }
+        va_end(ap);
+
+        assert(table->n_columns == table->n_cells);
+        return TAKE_PTR(table);
+}
+
+/* Creates a two-column table in vertical mode with the header row hidden. The two header cells are "key"
+ * (align percent 100) and "value" (align percent 0). Returns NULL on any failure. */
+Table *table_new_vertical(void) {
+        _cleanup_(table_unrefp) Table *table = NULL;
+        TableCell *cell;
+
+        table = table_new_raw(2);
+        if (!table)
+                return NULL;
+
+        table->vertical = true;
+        table->header = false;
+
+        if (table_add_cell(table, &cell, TABLE_HEADER, "key") < 0 ||
+            table_set_align_percent(table, cell, 100) < 0)
+                return NULL;
+
+        if (table_add_cell(table, &cell, TABLE_HEADER, "value") < 0 ||
+            table_set_align_percent(table, cell, 0) < 0)
+                return NULL;
+
+        return TAKE_PTR(table);
+}
+
+/* Releases a TableData object: its URL, its cached formatted text and, for the two string-vector types,
+ * the owned vector. Returns the result of mfree(). */
+static TableData *table_data_free(TableData *d) {
+        assert(d);
+
+        free(d->formatted);
+        free(d->url);
+
+        switch (d->type) {
+
+        case TABLE_STRV:
+        case TABLE_STRV_WRAPPED:
+                strv_free(d->strv);
+                break;
+
+        default:
+                break;
+        }
+
+        return mfree(d);
+}
+
+DEFINE_PRIVATE_TRIVIAL_REF_UNREF_FUNC(TableData, table_data, table_data_free);
+DEFINE_TRIVIAL_CLEANUP_FUNC(TableData*, table_data_unref);
+
+/* Destroys a table: drops the reference on every cell's data, frees the maps and the JSON field names,
+ * then the table itself. Accepts NULL. Returns NULL for a NULL table, otherwise the result of mfree(). */
+Table *table_unref(Table *t) {
+        size_t i;
+
+        if (!t)
+                return NULL;
+
+        i = 0;
+        while (i < t->n_cells)
+                table_data_unref(t->data[i++]);
+
+        free(t->data);
+        free(t->display_map);
+        free(t->sort_map);
+        free(t->reverse_map);
+
+        i = 0;
+        while (i < t->n_json_fields)
+                free(t->json_fields[i++]);
+
+        free(t->json_fields);
+
+        return mfree(t);
+}
+
+/* Returns the number of payload bytes a cell of the given type occupies. Only the string-like types look
+ * at 'data': their size is the string length including the terminating NUL byte. All other types have a
+ * fixed size determined by the type alone. An unknown type is a programming error. */
+static size_t table_data_size(TableDataType type, const void *data) {
+
+ switch (type) {
+
+ case TABLE_EMPTY:
+ return 0;
+
+ case TABLE_STRING:
+ case TABLE_PATH:
+ case TABLE_PATH_BASENAME:
+ case TABLE_FIELD:
+ case TABLE_HEADER:
+ return strlen(data) + 1;
+
+ /* String vectors are accounted for as the pointer only, not the strings it refers to. */
+ case TABLE_STRV:
+ case TABLE_STRV_WRAPPED:
+ return sizeof(char **);
+
+ case TABLE_BOOLEAN_CHECKMARK:
+ case TABLE_BOOLEAN:
+ return sizeof(bool);
+
+ case TABLE_TIMESTAMP:
+ case TABLE_TIMESTAMP_UTC:
+ case TABLE_TIMESTAMP_RELATIVE:
+ case TABLE_TIMESTAMP_RELATIVE_MONOTONIC:
+ case TABLE_TIMESTAMP_LEFT:
+ case TABLE_TIMESTAMP_DATE:
+ case TABLE_TIMESPAN:
+ case TABLE_TIMESPAN_MSEC:
+ case TABLE_TIMESPAN_DAY:
+ return sizeof(usec_t);
+
+ case TABLE_SIZE:
+ case TABLE_INT64:
+ case TABLE_UINT64:
+ case TABLE_UINT64_HEX:
+ case TABLE_BPS:
+ return sizeof(uint64_t);
+
+ case TABLE_INT32:
+ case TABLE_UINT32:
+ case TABLE_UINT32_HEX:
+ return sizeof(uint32_t);
+
+ case TABLE_INT16:
+ case TABLE_UINT16:
+ return sizeof(uint16_t);
+
+ case TABLE_INT8:
+ case TABLE_UINT8:
+ return sizeof(uint8_t);
+
+ case TABLE_INT:
+ case TABLE_UINT:
+ case TABLE_PERCENT:
+ case TABLE_IFINDEX:
+ case TABLE_SIGNAL:
+ return sizeof(int);
+
+ case TABLE_IN_ADDR:
+ return sizeof(struct in_addr);
+
+ case TABLE_IN6_ADDR:
+ return sizeof(struct in6_addr);
+
+ case TABLE_UUID:
+ case TABLE_ID128:
+ return sizeof(sd_id128_t);
+
+ case TABLE_UID:
+ return sizeof(uid_t);
+ case TABLE_GID:
+ return sizeof(gid_t);
+ case TABLE_PID:
+ return sizeof(pid_t);
+
+ case TABLE_MODE:
+ case TABLE_MODE_INODE_TYPE:
+ return sizeof(mode_t);
+
+ case TABLE_DEVNUM:
+ return sizeof(dev_t);
+
+ default:
+ assert_not_reached();
+ }
+}
+
+/* Checks whether the existing data object 'd' can be shared by a new cell with the given type, payload and
+ * formatting parameters. Returns true only if type, all formatting settings and the payload are identical,
+ * and 'd' carries neither a color nor a URL. String vector cells are never reported as matching. */
+static bool table_data_matches(
+                TableData *d,
+                TableDataType type,
+                const void *data,
+                size_t minimum_width,
+                size_t maximum_width,
+                unsigned weight,
+                unsigned align_percent,
+                unsigned ellipsize_percent,
+                bool uppercase) {
+
+        size_t k, l;
+
+        assert(d);
+
+        /* Type and every formatting parameter must agree */
+        if (d->type != type ||
+            d->minimum_width != minimum_width ||
+            d->maximum_width != maximum_width ||
+            d->weight != weight ||
+            d->align_percent != align_percent ||
+            d->ellipsize_percent != ellipsize_percent ||
+            d->uppercase != uppercase)
+                return false;
+
+        /* If a color/url is set, refuse to merge */
+        if (d->color || d->rgap_color || d->url)
+                return false;
+
+        /* For string vectors table_data_new() stores a private copy in d->strv, while 'data' points to the
+         * caller's vector. Comparing the two bytewise would compare unrelated pointers rather than
+         * contents, hence never merge these. */
+        if (IN_SET(type, TABLE_STRV, TABLE_STRV_WRAPPED))
+                return false;
+
+        k = table_data_size(type, data);
+        l = table_data_size(d->type, d->data);
+        if (k != l)
+                return false;
+
+        return memcmp_safe(data, d->data, l) == 0;
+}
+
+/* Allocates a new cell data object with a reference count of 1, initialized with the given type, payload
+ * and formatting parameters. For the string vector types the vector is duplicated and stored in ->strv;
+ * for all other types the payload bytes are copied inline behind the header. Returns NULL on allocation
+ * failure. */
+static TableData *table_data_new(
+                TableDataType type,
+                const void *data,
+                size_t minimum_width,
+                size_t maximum_width,
+                unsigned weight,
+                unsigned align_percent,
+                unsigned ellipsize_percent,
+                bool uppercase) {
+
+        _cleanup_free_ TableData *nd = NULL;
+        size_t payload = table_data_size(type, data);
+
+        nd = malloc0(offsetof(TableData, data) + payload);
+        if (!nd)
+                return NULL;
+
+        nd->n_ref = 1;
+        nd->type = type;
+        nd->uppercase = uppercase;
+        nd->weight = weight;
+        nd->minimum_width = minimum_width;
+        nd->maximum_width = maximum_width;
+        nd->align_percent = align_percent;
+        nd->ellipsize_percent = ellipsize_percent;
+
+        if (!IN_SET(type, TABLE_STRV, TABLE_STRV_WRAPPED)) {
+                memcpy_safe(nd->data, data, payload);
+                return TAKE_PTR(nd);
+        }
+
+        /* String vectors get a deep copy of their own */
+        nd->strv = strv_copy(data);
+        if (!nd->strv)
+                return NULL;
+
+        return TAKE_PTR(nd);
+}
+
+/* Appends one cell to the table. A NULL 'data' turns the cell into TABLE_EMPTY. Formatting parameters
+ * passed as SIZE_MAX/UINT_MAX (minimum width, weight, alignment, ellipsization) are inherited from the cell
+ * directly above, or set to built-in defaults if there is none. If the cell above is identical in data and
+ * formatting its data object is shared instead of allocating a new one. On success stores the new cell's
+ * handle in *ret_cell (if non-NULL) and returns 0; returns -ENOMEM on allocation failure. */
+int table_add_cell_full(
+                Table *t,
+                TableCell **ret_cell,
+                TableDataType type,
+                const void *data,
+                size_t minimum_width,
+                size_t maximum_width,
+                unsigned weight,
+                unsigned align_percent,
+                unsigned ellipsize_percent) {
+
+        _cleanup_(table_data_unrefp) TableData *d = NULL;
+        TableData *p = NULL;
+        bool uppercase;
+
+        assert(t);
+        assert(type >= 0);
+        assert(type < _TABLE_DATA_TYPE_MAX);
+
+        /* Special rule: patch NULL data fields to the empty field */
+        if (!data)
+                type = TABLE_EMPTY;
+
+        /* Determine the cell adjacent to the current one, but one row up */
+        if (t->n_cells >= t->n_columns)
+                assert_se(p = t->data[t->n_cells - t->n_columns]);
+
+        /* If formatting parameters are left unspecified, copy from the previous row */
+        if (p) {
+                if (minimum_width == SIZE_MAX)
+                        minimum_width = p->minimum_width;
+                if (weight == UINT_MAX)
+                        weight = p->weight;
+                if (align_percent == UINT_MAX)
+                        align_percent = p->align_percent;
+                if (ellipsize_percent == UINT_MAX)
+                        ellipsize_percent = p->ellipsize_percent;
+        } else {
+                if (minimum_width == SIZE_MAX)
+                        minimum_width = 1;
+                if (weight == UINT_MAX)
+                        weight = DEFAULT_WEIGHT;
+                if (align_percent == UINT_MAX)
+                        align_percent = 0;
+                if (ellipsize_percent == UINT_MAX)
+                        ellipsize_percent = 100;
+        }
+
+        assert(align_percent <= 100);
+        assert(ellipsize_percent <= 100);
+
+        /* Header cells are shown in upper case by default */
+        uppercase = type == TABLE_HEADER;
+
+        /* Small optimization: Pretty often adjacent cells in two subsequent lines have the same data and
+         * formatting. Let's see if we can reuse the cell data and ref it once more. */
+        if (!p || !table_data_matches(p, type, data, minimum_width, maximum_width, weight, align_percent, ellipsize_percent, uppercase)) {
+                d = table_data_new(type, data, minimum_width, maximum_width, weight, align_percent, ellipsize_percent, uppercase);
+                if (!d)
+                        return -ENOMEM;
+        } else
+                d = table_data_ref(p);
+
+        if (!GREEDY_REALLOC(t->data, MAX(t->n_cells + 1, t->n_columns)))
+                return -ENOMEM;
+
+        if (ret_cell)
+                *ret_cell = TABLE_INDEX_TO_CELL(t->n_cells);
+
+        t->data[t->n_cells++] = TAKE_PTR(d);
+
+        return 0;
+}
+
+/* Formats a string printf()-style and appends it as a new cell of the given string-like type 'dt'. Returns
+ * -ENOMEM if formatting fails, otherwise whatever table_add_cell() returns. */
+int table_add_cell_stringf_full(Table *t, TableCell **ret_cell, TableDataType dt, const char *format, ...) {
+        _cleanup_free_ char *text = NULL;
+        va_list args;
+        int n;
+
+        assert(t);
+        assert(IN_SET(dt, TABLE_STRING, TABLE_PATH, TABLE_PATH_BASENAME, TABLE_FIELD, TABLE_HEADER));
+
+        va_start(args, format);
+        n = vasprintf(&text, format, args);
+        va_end(args);
+
+        if (n >= 0)
+                return table_add_cell(t, ret_cell, dt, text);
+
+        return -ENOMEM;
+}
+
+/* Pads the current line with empty cells until the next cell to be added would land in column
+ * 'until_column'. At least one cell is always added, hence passing 0 either completes the current line or
+ * inserts a whole empty line. Returns -EINVAL if the column is out of range, or the error of the failed
+ * cell insertion. */
+int table_fill_empty(Table *t, size_t until_column) {
+        assert(t);
+
+        if (until_column >= t->n_columns)
+                return -EINVAL;
+
+        for (;;) {
+                int r;
+
+                r = table_add_cell(t, NULL, TABLE_EMPTY, NULL);
+                if (r < 0)
+                        return r;
+
+                if (t->n_cells % t->n_columns == until_column)
+                        break;
+        }
+
+        return 0;
+}
+
+/* Appends a new cell at the end of the table that shares the data object of an existing cell (taking one
+ * more reference on it). Returns -ENXIO if 'cell' does not refer to an existing cell, -ENOMEM if the cell
+ * array cannot be grown, 0 on success. */
+int table_dup_cell(Table *t, TableCell *cell) {
+        size_t src;
+
+        assert(t);
+
+        src = TABLE_CELL_TO_INDEX(cell);
+        if (src >= t->n_cells)
+                return -ENXIO;
+
+        if (!GREEDY_REALLOC(t->data, MAX(t->n_cells + 1, t->n_columns)))
+                return -ENOMEM;
+
+        t->data[t->n_cells] = table_data_ref(t->data[src]);
+        t->n_cells++;
+
+        return 0;
+}
+
+/* Helper call that ensures the specified cell's data object has a ref count of 1, which we can use before
+ * changing a cell's formatting without affecting every other cell's formatting that shares the same data.
+ *
+ * Returns 0 if the data object was unshared already, 1 if a private copy was created and installed,
+ * -ENXIO if the cell does not exist and -ENOMEM on allocation failure. */
+static int table_dedup_cell(Table *t, TableCell *cell) {
+        _cleanup_free_ char *curl = NULL;
+        TableData *nd, *od;
+        const void *payload;
+        size_t i;
+
+        assert(t);
+
+        i = TABLE_CELL_TO_INDEX(cell);
+        if (i >= t->n_cells)
+                return -ENXIO;
+
+        assert_se(od = t->data[i]);
+        if (od->n_ref == 1)
+                return 0;
+
+        assert(od->n_ref > 1);
+
+        /* Duplicate the URL first, so that nothing needs to be rolled back if this fails */
+        if (od->url) {
+                curl = strdup(od->url);
+                if (!curl)
+                        return -ENOMEM;
+        }
+
+        /* table_data_new() expects the string vector itself for the strv types (it runs strv_copy() on what
+         * it is given), not the address of the slot the vector pointer is kept in. Hence hand over
+         * od->strv for those, and the inline payload for everything else. */
+        if (IN_SET(od->type, TABLE_STRV, TABLE_STRV_WRAPPED))
+                payload = od->strv;
+        else
+                payload = od->data;
+
+        nd = table_data_new(
+                        od->type,
+                        payload,
+                        od->minimum_width,
+                        od->maximum_width,
+                        od->weight,
+                        od->align_percent,
+                        od->ellipsize_percent,
+                        od->uppercase);
+        if (!nd)
+                return -ENOMEM;
+
+        /* These are not covered by table_data_new(), carry them over by hand */
+        nd->color = od->color;
+        nd->rgap_color = od->rgap_color;
+        nd->url = TAKE_PTR(curl);
+
+        table_data_unref(od);
+        t->data[i] = nd;
+
+        assert(nd->n_ref == 1);
+
+        return 1;
+}
+
+/* Returns the data object behind the specified cell, or NULL if the cell does not exist. No reference is
+ * taken on the returned object. */
+static TableData *table_get_data(Table *t, TableCell *cell) {
+        size_t i;
+
+        assert(t);
+        assert(cell);
+
+        i = TABLE_CELL_TO_INDEX(cell);
+        if (i < t->n_cells) {
+                assert(t->data[i]);
+                assert(t->data[i]->n_ref > 0);
+
+                return t->data[i];
+        }
+
+        return NULL;
+}
+
+/* Sets the minimum width of a cell; SIZE_MAX selects the default of 1. The cell's data object is unshared
+ * first so that no other cell is affected. Returns 0 on success, negative errno on failure. */
+int table_set_minimum_width(Table *t, TableCell *cell, size_t minimum_width) {
+        size_t w;
+        int r;
+
+        assert(t);
+        assert(cell);
+
+        w = minimum_width == SIZE_MAX ? 1 : minimum_width;
+
+        r = table_dedup_cell(t, cell);
+        if (r < 0)
+                return r;
+
+        table_get_data(t, cell)->minimum_width = w;
+        return 0;
+}
+
+/* Sets the maximum width of a cell, after unsharing its data object so that no other cell is affected.
+ * The value is stored as passed. Returns 0 on success, negative errno on failure. */
+int table_set_maximum_width(Table *t, TableCell *cell, size_t maximum_width) {
+        int r;
+
+        assert(t);
+        assert(cell);
+
+        r = table_dedup_cell(t, cell);
+        if (r >= 0) {
+                table_get_data(t, cell)->maximum_width = maximum_width;
+                r = 0;
+        }
+
+        return r;
+}
+
+/* Sets the weight of a cell; UINT_MAX selects DEFAULT_WEIGHT. The cell's data object is unshared first so
+ * that no other cell is affected. Returns 0 on success, negative errno on failure. */
+int table_set_weight(Table *t, TableCell *cell, unsigned weight) {
+        unsigned w;
+        int r;
+
+        assert(t);
+        assert(cell);
+
+        w = weight == UINT_MAX ? DEFAULT_WEIGHT : weight;
+
+        r = table_dedup_cell(t, cell);
+        if (r < 0)
+                return r;
+
+        table_get_data(t, cell)->weight = w;
+        return 0;
+}
+
+/* Sets the alignment of a cell as a percentage in the range 0…100; UINT_MAX selects the default of 0.
+ * The cell's data object is unshared first so that no other cell is affected. Returns 0 on success,
+ * negative errno on failure. */
+int table_set_align_percent(Table *t, TableCell *cell, unsigned percent) {
+        int r;
+
+        assert(t);
+        assert(cell);
+
+        percent = percent == UINT_MAX ? 0 : percent;
+
+        assert(percent <= 100);
+
+        r = table_dedup_cell(t, cell);
+        if (r >= 0) {
+                table_get_data(t, cell)->align_percent = percent;
+                r = 0;
+        }
+
+        return r;
+}
+
+/* Sets the ellipsization position of a cell as a percentage in the range 0…100; UINT_MAX selects the
+ * default of 100. The cell's data object is unshared first so that no other cell is affected. Returns 0
+ * on success, negative errno on failure. */
+int table_set_ellipsize_percent(Table *t, TableCell *cell, unsigned percent) {
+        int r;
+
+        assert(t);
+        assert(cell);
+
+        percent = percent == UINT_MAX ? 100 : percent;
+
+        assert(percent <= 100);
+
+        r = table_dedup_cell(t, cell);
+        if (r >= 0) {
+                table_get_data(t, cell)->ellipsize_percent = percent;
+                r = 0;
+        }
+
+        return r;
+}
+
+/* Sets the color of a cell, after unsharing its data object. The pointer is stored as-is (not copied); an
+ * empty string is stored as NULL. Returns 0 on success, negative errno on failure. */
+int table_set_color(Table *t, TableCell *cell, const char *color) {
+        TableData *d;
+        int r;
+
+        assert(t);
+        assert(cell);
+
+        r = table_dedup_cell(t, cell);
+        if (r < 0)
+                return r;
+
+        d = table_get_data(t, cell);
+        d->color = empty_to_null(color);
+
+        return 0;
+}
+
+/* Sets the "rgap" color of a cell, after unsharing its data object. The pointer is stored as-is (not
+ * copied); an empty string is stored as NULL. Returns 0 on success, negative errno on failure. */
+int table_set_rgap_color(Table *t, TableCell *cell, const char *color) {
+        TableData *d;
+        int r;
+
+        assert(t);
+        assert(cell);
+
+        r = table_dedup_cell(t, cell);
+        if (r < 0)
+                return r;
+
+        d = table_get_data(t, cell);
+        d->rgap_color = empty_to_null(color);
+
+        return 0;
+}
+
+/* Attaches a URL to a cell, or removes it if 'url' is NULL. The string is duplicated, and the cell's data
+ * object is unshared first so that no other cell is affected. Returns 0 on success, negative errno on
+ * failure. */
+int table_set_url(Table *t, TableCell *cell, const char *url) {
+        _cleanup_free_ char *copy = NULL;
+        TableData *d;
+        int r;
+
+        assert(t);
+        assert(cell);
+
+        /* Allocate before touching the table, so that failure leaves it unmodified */
+        if (url) {
+                copy = strdup(url);
+                if (!copy)
+                        return -ENOMEM;
+        }
+
+        r = table_dedup_cell(t, cell);
+        if (r < 0)
+                return r;
+
+        d = table_get_data(t, cell);
+        return free_and_replace(d->url, copy);
+}
+
+/* Turns upper-casing of a cell on or off, after unsharing its data object. Returns 0 if the flag already
+ * had the requested value, 1 if it was changed (in which case the cached formatted string is dropped),
+ * negative errno on failure. */
+int table_set_uppercase(Table *t, TableCell *cell, bool b) {
+        TableData *d;
+        int r;
+
+        assert(t);
+        assert(cell);
+
+        r = table_dedup_cell(t, cell);
+        if (r < 0)
+                return r;
+
+        assert_se(d = table_get_data(t, cell));
+
+        if (d->uppercase != b) {
+                /* The cached text was rendered with the old setting, throw it away */
+                d->formatted = mfree(d->formatted);
+                d->uppercase = b;
+                return 1;
+        }
+
+        return 0;
+}
+
+/* Replaces type and payload of an existing cell, while retaining its formatting (widths, weight,
+ * alignment, ellipsization, uppercase flag, colors and URL). As in table_add_cell_full(), NULL data turns
+ * the cell into TABLE_EMPTY. Returns 0 on success, -ENXIO if the cell does not exist, -ENOMEM on
+ * allocation failure. */
+int table_update(Table *t, TableCell *cell, TableDataType type, const void *data) {
+        _cleanup_free_ char *curl = NULL;
+        TableData *nd, *od;
+        size_t i;
+
+        assert(t);
+        assert(cell);
+
+        i = TABLE_CELL_TO_INDEX(cell);
+        if (i >= t->n_cells)
+                return -ENXIO;
+
+        assert_se(od = t->data[i]);
+
+        /* Same special rule as when adding cells: patch NULL data fields to the empty field, instead of
+         * handing a NULL payload to table_data_new() */
+        if (!data)
+                type = TABLE_EMPTY;
+
+        if (od->url) {
+                curl = strdup(od->url);
+                if (!curl)
+                        return -ENOMEM;
+        }
+
+        nd = table_data_new(
+                        type,
+                        data,
+                        od->minimum_width,
+                        od->maximum_width,
+                        od->weight,
+                        od->align_percent,
+                        od->ellipsize_percent,
+                        od->uppercase);
+        if (!nd)
+                return -ENOMEM;
+
+        /* These are not covered by table_data_new(), carry them over by hand */
+        nd->color = od->color;
+        nd->rgap_color = od->rgap_color;
+        nd->url = TAKE_PTR(curl);
+
+        /* Drop our reference on the old object; other cells sharing it keep theirs */
+        table_data_unref(od);
+        t->data[i] = nd;
+
+        return 0;
+}
+
+int table_add_many_internal(Table *t, TableDataType first_type, ...) {
+ TableCell *last_cell = NULL;
+ va_list ap;
+ int r;
+ /* Adds a series of cells and/or applies formatting in one call. The variable arguments are a sequence
+ * of TableDataType values, each followed by the matching argument (TABLE_EMPTY takes none), and
+ * terminated by _TABLE_DATA_TYPE_MAX. The TABLE_SET_xyz pseudo types do not add a cell but modify the
+ * most recently added one (last_cell). Returns 0 once the end marker is reached, or the first
+ * negative error encountered. */
+ assert(t);
+ assert(first_type >= 0);
+ assert(first_type < _TABLE_DATA_TYPE_MAX);
+
+ va_start(ap, first_type);
+ /* Each iteration consumes one type tag plus its argument; every exit path below calls va_end() */
+ for (TableDataType type = first_type;; type = va_arg(ap, TableDataType)) {
+ const void *data;
+ union {
+ uint64_t size;
+ usec_t usec;
+ int int_val;
+ int8_t int8;
+ int16_t int16;
+ int32_t int32;
+ int64_t int64;
+ unsigned uint_val;
+ uint8_t uint8;
+ uint16_t uint16;
+ uint32_t uint32;
+ uint64_t uint64;
+ int percent;
+ int ifindex;
+ bool b;
+ union in_addr_union address;
+ sd_id128_t id128;
+ uid_t uid;
+ gid_t gid;
+ pid_t pid;
+ mode_t mode;
+ dev_t devnum;
+ } buffer; /* scratch space for by-value arguments, so that 'data' can point to them */
+
+ switch (type) {
+
+ case TABLE_EMPTY:
+ data = NULL;
+ break;
+
+ case TABLE_STRING:
+ case TABLE_PATH:
+ case TABLE_PATH_BASENAME:
+ case TABLE_FIELD:
+ case TABLE_HEADER:
+ data = va_arg(ap, const char *);
+ break;
+
+ case TABLE_STRV:
+ case TABLE_STRV_WRAPPED:
+ data = va_arg(ap, char * const *);
+ break;
+
+ case TABLE_BOOLEAN_CHECKMARK:
+ case TABLE_BOOLEAN:
+ buffer.b = va_arg(ap, int); /* bool is promoted to int when passed through "..." */
+ data = &buffer.b;
+ break;
+
+ case TABLE_TIMESTAMP:
+ case TABLE_TIMESTAMP_UTC:
+ case TABLE_TIMESTAMP_RELATIVE:
+ case TABLE_TIMESTAMP_RELATIVE_MONOTONIC:
+ case TABLE_TIMESTAMP_LEFT:
+ case TABLE_TIMESTAMP_DATE:
+ case TABLE_TIMESPAN:
+ case TABLE_TIMESPAN_MSEC:
+ case TABLE_TIMESPAN_DAY:
+ buffer.usec = va_arg(ap, usec_t);
+ data = &buffer.usec;
+ break;
+
+ case TABLE_SIZE:
+ case TABLE_BPS:
+ buffer.size = va_arg(ap, uint64_t);
+ data = &buffer.size;
+ break;
+
+ case TABLE_INT:
+ case TABLE_SIGNAL:
+ buffer.int_val = va_arg(ap, int);
+ data = &buffer.int_val;
+ break;
+
+ case TABLE_INT8: {
+ int x = va_arg(ap, int); /* read as int, then range-checked before narrowing */
+ assert(x >= INT8_MIN && x <= INT8_MAX);
+
+ buffer.int8 = x;
+ data = &buffer.int8;
+ break;
+ }
+
+ case TABLE_INT16: {
+ int x = va_arg(ap, int); /* read as int, then range-checked before narrowing */
+ assert(x >= INT16_MIN && x <= INT16_MAX);
+
+ buffer.int16 = x;
+ data = &buffer.int16;
+ break;
+ }
+
+ case TABLE_INT32:
+ buffer.int32 = va_arg(ap, int32_t);
+ data = &buffer.int32;
+ break;
+
+ case TABLE_INT64:
+ buffer.int64 = va_arg(ap, int64_t);
+ data = &buffer.int64;
+ break;
+
+ case TABLE_UINT:
+ buffer.uint_val = va_arg(ap, unsigned);
+ data = &buffer.uint_val;
+ break;
+
+ case TABLE_UINT8: {
+ unsigned x = va_arg(ap, unsigned); /* read as unsigned, then range-checked before narrowing */
+ assert(x <= UINT8_MAX);
+
+ buffer.uint8 = x;
+ data = &buffer.uint8;
+ break;
+ }
+
+ case TABLE_UINT16: {
+ unsigned x = va_arg(ap, unsigned); /* read as unsigned, then range-checked before narrowing */
+ assert(x <= UINT16_MAX);
+
+ buffer.uint16 = x;
+ data = &buffer.uint16;
+ break;
+ }
+
+ case TABLE_UINT32:
+ case TABLE_UINT32_HEX:
+ buffer.uint32 = va_arg(ap, uint32_t);
+ data = &buffer.uint32;
+ break;
+
+ case TABLE_UINT64:
+ case TABLE_UINT64_HEX:
+ buffer.uint64 = va_arg(ap, uint64_t);
+ data = &buffer.uint64;
+ break;
+
+ case TABLE_PERCENT:
+ buffer.percent = va_arg(ap, int);
+ data = &buffer.percent;
+ break;
+
+ case TABLE_IFINDEX:
+ buffer.ifindex = va_arg(ap, int);
+ data = &buffer.ifindex;
+ break;
+
+ case TABLE_IN_ADDR:
+ buffer.address = *va_arg(ap, union in_addr_union *); /* addresses are passed by pointer and copied */
+ data = &buffer.address.in;
+ break;
+
+ case TABLE_IN6_ADDR:
+ buffer.address = *va_arg(ap, union in_addr_union *); /* addresses are passed by pointer and copied */
+ data = &buffer.address.in6;
+ break;
+
+ case TABLE_UUID:
+ case TABLE_ID128:
+ buffer.id128 = va_arg(ap, sd_id128_t);
+ data = &buffer.id128;
+ break;
+
+ case TABLE_UID:
+ buffer.uid = va_arg(ap, uid_t);
+ data = &buffer.uid;
+ break;
+
+ case TABLE_GID:
+ buffer.gid = va_arg(ap, gid_t);
+ data = &buffer.gid;
+ break;
+
+ case TABLE_PID:
+ buffer.pid = va_arg(ap, pid_t);
+ data = &buffer.pid;
+ break;
+
+ case TABLE_MODE:
+ case TABLE_MODE_INODE_TYPE:
+ buffer.mode = va_arg(ap, mode_t);
+ data = &buffer.mode;
+ break;
+
+ case TABLE_DEVNUM:
+ buffer.devnum = va_arg(ap, dev_t);
+ data = &buffer.devnum;
+ break;
+
+ case TABLE_SET_MINIMUM_WIDTH: {
+ size_t w = va_arg(ap, size_t);
+
+ r = table_set_minimum_width(t, last_cell, w);
+ goto check;
+ }
+
+ case TABLE_SET_MAXIMUM_WIDTH: {
+ size_t w = va_arg(ap, size_t);
+ r = table_set_maximum_width(t, last_cell, w);
+ goto check;
+ }
+
+ case TABLE_SET_WEIGHT: {
+ unsigned w = va_arg(ap, unsigned);
+ r = table_set_weight(t, last_cell, w);
+ goto check;
+ }
+
+ case TABLE_SET_ALIGN_PERCENT: {
+ unsigned p = va_arg(ap, unsigned);
+ r = table_set_align_percent(t, last_cell, p);
+ goto check;
+ }
+
+ case TABLE_SET_ELLIPSIZE_PERCENT: {
+ unsigned p = va_arg(ap, unsigned);
+ r = table_set_ellipsize_percent(t, last_cell, p);
+ goto check;
+ }
+
+ case TABLE_SET_COLOR: {
+ const char *c = va_arg(ap, const char*);
+ r = table_set_color(t, last_cell, c);
+ goto check;
+ }
+
+ case TABLE_SET_RGAP_COLOR: {
+ const char *c = va_arg(ap, const char*);
+ r = table_set_rgap_color(t, last_cell, c);
+ goto check;
+ }
+
+ case TABLE_SET_BOTH_COLORS: {
+ const char *c = va_arg(ap, const char*);
+
+ r = table_set_color(t, last_cell, c);
+ if (r < 0) {
+ va_end(ap);
+ return r;
+ }
+
+ r = table_set_rgap_color(t, last_cell, c);
+ goto check;
+ }
+
+ case TABLE_SET_URL: {
+ const char *u = va_arg(ap, const char*);
+ r = table_set_url(t, last_cell, u);
+ goto check;
+ }
+
+ case TABLE_SET_UPPERCASE: {
+ int u = va_arg(ap, int);
+ r = table_set_uppercase(t, last_cell, u);
+ goto check;
+ }
+
+ case _TABLE_DATA_TYPE_MAX:
+ /* Used as end marker */
+ va_end(ap);
+ return 0;
+
+ default:
+ assert_not_reached();
+ }
+ /* Only real data types get here: the TABLE_SET_xyz cases jump straight to the error check below */
+ r = table_add_cell(t, &last_cell, type, data);
+ check:
+ if (r < 0) {
+ va_end(ap);
+ return r;
+ }
+ }
+}
+
+/* Stores the 'header' flag of the table. */
+void table_set_header(Table *t, bool b) {
+        assert(t);
+        t->header = b;
+}
+
+/* Stores the requested width of the table. The value is recorded as passed. */
+void table_set_width(Table *t, size_t width) {
+        assert(t);
+        t->width = width;
+}
+
+/* Stores the maximum cell height. Must be at least 1; SIZE_MAX is explicitly accepted as well. */
+void table_set_cell_height_max(Table *t, size_t height) {
+        assert(t);
+        assert(height >= 1 || height == SIZE_MAX);
+        t->cell_height_max = height;
+}
+
+/* Selects which placeholder string table_ersatz_string() returns for this table. */
+void table_set_ersatz_string(Table *t, TableErsatz ersatz) {
+        assert(t);
+        assert(ersatz >= 0 && ersatz < _TABLE_ERSATZ_MAX);
+        t->ersatz = ersatz;
+}
+
+/* Maps the table's ersatz setting to the constant placeholder string used where a cell has nothing to
+ * show. */
+static const char* table_ersatz_string(const Table *t) {
+        const char *s;
+
+        switch (t->ersatz) {
+
+        case TABLE_ERSATZ_EMPTY:
+                s = "";
+                break;
+
+        case TABLE_ERSATZ_DASH:
+                s = "-";
+                break;
+
+        case TABLE_ERSATZ_UNSET:
+                s = "(unset)";
+                break;
+
+        case TABLE_ERSATZ_NA:
+                s = "n/a";
+                break;
+
+        default:
+                assert_not_reached();
+        }
+
+        return s;
+}
+
+/* Resets the display map to the identity mapping, i.e. all columns in their natural order. Returns 0 on
+ * success, -ENOMEM if the map cannot be (re)allocated. */
+static int table_set_display_all(Table *t) {
+        size_t *map, n;
+
+        assert(t);
+
+        n = t->n_columns;
+
+        map = reallocarray(t->display_map, n, sizeof(size_t));
+        if (!map)
+                return -ENOMEM;
+
+        for (size_t column = 0; column < n; column++)
+                map[column] = column;
+
+        t->n_display_map = n;
+        t->display_map = map;
+
+        return 0;
+}
+
+/* Appends the listed columns to the table's display map. The variable argument list consists of size_t
+ * column indexes and is terminated by SIZE_MAX; at least one column (first_column) is always taken.
+ * Returns 0 on success, -ENOMEM if the map cannot be grown. */
+int table_set_display_internal(Table *t, size_t first_column, ...) {
+        size_t column = first_column;
+        va_list ap;
+
+        assert(t);
+
+        va_start(ap, first_column);
+        do {
+                assert(column < t->n_columns);
+
+                if (!GREEDY_REALLOC(t->display_map, MAX(t->n_columns, t->n_display_map+1))) {
+                        va_end(ap);
+                        return -ENOMEM;
+                }
+
+                t->display_map[t->n_display_map++] = column;
+
+                column = va_arg(ap, size_t);
+        } while (column != SIZE_MAX);
+        va_end(ap);
+
+        return 0;
+}
+
+/* Appends the listed columns to the table's sort map. The variable argument list consists of size_t
+ * column indexes and is terminated by SIZE_MAX; at least one column (first_column) is always taken.
+ * Returns 0 on success, -ENOMEM if the map cannot be grown. */
+int table_set_sort_internal(Table *t, size_t first_column, ...) {
+        size_t column = first_column;
+        va_list ap;
+
+        assert(t);
+
+        va_start(ap, first_column);
+        do {
+                assert(column < t->n_columns);
+
+                if (!GREEDY_REALLOC(t->sort_map, MAX(t->n_columns, t->n_sort_map+1))) {
+                        va_end(ap);
+                        return -ENOMEM;
+                }
+
+                t->sort_map[t->n_sort_map++] = column;
+
+                column = va_arg(ap, size_t);
+        } while (column != SIZE_MAX);
+        va_end(ap);
+
+        return 0;
+}
+
+/* Removes the listed columns from the display map, keeping the order of the remaining ones. The variable
+ * argument list consists of size_t column indexes and is terminated by SIZE_MAX. If no display map has
+ * been set up yet it is first initialized to show all columns. Returns 0 on success, negative errno on
+ * failure. */
+int table_hide_column_from_display_internal(Table *t, ...) {
+        size_t n_kept = 0;
+        int r;
+
+        assert(t);
+
+        /* If the display map is empty, initialize it with all available columns */
+        if (!t->display_map) {
+                r = table_set_display_all(t);
+                if (r < 0)
+                        return r;
+        }
+
+        for (size_t i = 0; i < t->n_display_map; i++) {
+                size_t shown = t->display_map[i], column;
+                va_list ap;
+
+                /* Rescan the argument list for every entry: stop at the end marker or at the first hit */
+                va_start(ap, t);
+                do
+                        column = va_arg(ap, size_t);
+                while (column != SIZE_MAX && column != shown);
+                va_end(ap);
+
+                /* Reached the end marker without a hit? Then this column stays; compact the map in place */
+                if (column == SIZE_MAX)
+                        t->display_map[n_kept++] = shown;
+        }
+
+        t->n_display_map = n_kept;
+
+        return 0;
+}
+
+/* Compares two cells for sorting. Returns a value less than, equal to or greater than zero if 'a' orders
+ * before, the same as or after 'b'. index_a and index_b are the positions used as tie-breaker whenever no
+ * ordering is defined for the pair of cells. */
+static int cell_data_compare(TableData *a, size_t index_a, TableData *b, size_t index_b) {
+        int r;
+
+        assert(a);
+        assert(b);
+
+        if (a->type == b->type) {
+
+                /* We only define ordering for cells of the same data type. If cells with different data types are
+                 * compared we follow the order the cells were originally added in */
+
+                switch (a->type) {
+
+                case TABLE_STRING:
+                case TABLE_FIELD:
+                case TABLE_HEADER:
+                        return strcmp(a->string, b->string);
+
+                case TABLE_PATH:
+                case TABLE_PATH_BASENAME:
+                        return path_compare(a->string, b->string);
+
+                case TABLE_STRV:
+                case TABLE_STRV_WRAPPED:
+                        return strv_compare(a->strv, b->strv);
+
+                case TABLE_BOOLEAN:
+                case TABLE_BOOLEAN_CHECKMARK:
+                        /* Both flavours carry the same bool and differ only in how they are rendered, hence
+                         * order them the same way: false before true */
+                        if (!a->boolean && b->boolean)
+                                return -1;
+                        if (a->boolean && !b->boolean)
+                                return 1;
+                        return 0;
+
+                case TABLE_TIMESTAMP:
+                case TABLE_TIMESTAMP_UTC:
+                case TABLE_TIMESTAMP_RELATIVE:
+                case TABLE_TIMESTAMP_RELATIVE_MONOTONIC:
+                case TABLE_TIMESTAMP_LEFT:
+                case TABLE_TIMESTAMP_DATE:
+                        return CMP(a->timestamp, b->timestamp);
+
+                case TABLE_TIMESPAN:
+                case TABLE_TIMESPAN_MSEC:
+                case TABLE_TIMESPAN_DAY:
+                        return CMP(a->timespan, b->timespan);
+
+                case TABLE_SIZE:
+                case TABLE_BPS:
+                        return CMP(a->size, b->size);
+
+                case TABLE_INT:
+                case TABLE_SIGNAL:
+                        return CMP(a->int_val, b->int_val);
+
+                case TABLE_INT8:
+                        return CMP(a->int8, b->int8);
+
+                case TABLE_INT16:
+                        return CMP(a->int16, b->int16);
+
+                case TABLE_INT32:
+                        return CMP(a->int32, b->int32);
+
+                case TABLE_INT64:
+                        return CMP(a->int64, b->int64);
+
+                case TABLE_UINT:
+                        return CMP(a->uint_val, b->uint_val);
+
+                case TABLE_UINT8:
+                        return CMP(a->uint8, b->uint8);
+
+                case TABLE_UINT16:
+                        return CMP(a->uint16, b->uint16);
+
+                case TABLE_UINT32:
+                case TABLE_UINT32_HEX:
+                        return CMP(a->uint32, b->uint32);
+
+                case TABLE_UINT64:
+                case TABLE_UINT64_HEX:
+                        return CMP(a->uint64, b->uint64);
+
+                case TABLE_PERCENT:
+                        return CMP(a->percent, b->percent);
+
+                case TABLE_IFINDEX:
+                        return CMP(a->ifindex, b->ifindex);
+
+                case TABLE_IN_ADDR:
+                        /* Compares the raw s_addr values as stored */
+                        return CMP(a->address.in.s_addr, b->address.in.s_addr);
+
+                case TABLE_IN6_ADDR:
+                        return memcmp(&a->address.in6, &b->address.in6, FAMILY_ADDRESS_SIZE(AF_INET6));
+
+                case TABLE_UUID:
+                case TABLE_ID128:
+                        return memcmp(&a->id128, &b->id128, sizeof(sd_id128_t));
+
+                case TABLE_UID:
+                        return CMP(a->uid, b->uid);
+
+                case TABLE_GID:
+                        return CMP(a->gid, b->gid);
+
+                case TABLE_PID:
+                        return CMP(a->pid, b->pid);
+
+                case TABLE_MODE:
+                case TABLE_MODE_INODE_TYPE:
+                        return CMP(a->mode, b->mode);
+
+                case TABLE_DEVNUM:
+                        /* Major number first, minor number as tie-breaker */
+                        r = CMP(major(a->devnum), major(b->devnum));
+                        if (r != 0)
+                                return r;
+
+                        return CMP(minor(a->devnum), minor(b->devnum));
+
+                default:
+                        ;
+                }
+        }
+
+        /* Generic fallback using the original order in which the cells were added. */
+        return CMP(index_a, index_b);
+}
+
+/* Comparison callback for sorting table rows. *a and *b are the indexes of the first cell of the two rows
+ * to compare. Any row starting within the first n_columns cells (i.e. the header row) sorts first; all
+ * other rows are ordered column by column as listed in the sort map, honouring the per-column reverse
+ * flag, and finally by their original position. */
+static int table_data_compare(const size_t *a, const size_t *b, Table *t) {
+        size_t row_a, row_b;
+        bool a_is_header, b_is_header;
+
+        assert(t);
+        assert(t->sort_map);
+
+        row_a = *a;
+        row_b = *b;
+
+        /* Make sure the header stays at the beginning */
+        a_is_header = row_a < t->n_columns;
+        b_is_header = row_b < t->n_columns;
+        if (a_is_header && b_is_header)
+                return 0;
+        if (a_is_header)
+                return -1;
+        if (b_is_header)
+                return 1;
+
+        /* Order other lines by the sorting map */
+        for (size_t i = 0; i < t->n_sort_map; i++) {
+                size_t column = t->sort_map[i];
+                int r;
+
+                r = cell_data_compare(t->data[row_a + column], row_a, t->data[row_b + column], row_b);
+                if (r == 0)
+                        continue;
+
+                if (t->reverse_map && t->reverse_map[column])
+                        return -r;
+
+                return r;
+        }
+
+        /* Order identical lines by the order they were originally added in */
+        return CMP(row_a, row_b);
+}
+
+/* Joins the items of a string vector into a single newly allocated string, separating them with spaces
+ * and starting a new line whenever the next item would not fit into column_width anymore. Returns NULL on
+ * failure; the caller owns the returned string. */
+static char* format_strv_width(char **strv, size_t column_width) {
+        _cleanup_(memstream_done) MemStream m = {};
+        size_t position = 0;
+        char *buf;
+        FILE *f;
+
+        f = memstream_init(&m);
+        if (!f)
+                return NULL;
+
+        STRV_FOREACH(p, strv) {
+                size_t our_len = utf8_console_width(*p); /* This returns -1 on invalid utf-8 (which shouldn't happen).
+                                                          * If that happens, we'll just print one item per line. */
+                size_t extended = size_add(size_add(position, 1), our_len);
+
+                if (position != 0 && extended <= column_width) {
+                        /* Still fits on the current line */
+                        fprintf(f, " %s", *p);
+                        position = extended;
+                        continue;
+                }
+
+                /* Either nothing has been written on this line yet, or the item needs a line of its own */
+                if (position != 0)
+                        fprintf(f, "\n%s", *p);
+                else
+                        fputs(*p, f);
+
+                position = our_len;
+        }
+
+        if (memstream_finalize(&m, &buf, NULL) < 0)
+                return NULL;
+
+        return buf;
+}
+
+static const char *table_data_format(Table *t, TableData *d, bool avoid_uppercasing, size_t column_width, bool *have_soft) {
+ assert(d);
+ /* Returns the textual form of a cell. Text that has to be generated is cached in d->formatted (owned by
+ * the data object); constant strings, the table's ersatz string and the inline string payload are
+ * returned directly without caching. Returns NULL if generating the text fails. If have_soft is
+ * non-NULL it is set to true when a TABLE_STRV_WRAPPED cell was (re)formatted for column_width. */
+ if (d->formatted &&
+ /* Only TABLE_STRV_WRAPPED adjust based on column_width so far… */
+ (d->type != TABLE_STRV_WRAPPED || d->formatted_for_width == column_width))
+ return d->formatted;
+ /* No usable cached text: format according to the cell's type */
+ switch (d->type) {
+ case TABLE_EMPTY:
+ return table_ersatz_string(t);
+
+ case TABLE_STRING:
+ case TABLE_PATH:
+ case TABLE_PATH_BASENAME:
+ case TABLE_FIELD:
+ case TABLE_HEADER: {
+ _cleanup_free_ char *bn = NULL;
+ const char *s;
+
+ if (d->type == TABLE_PATH_BASENAME)
+ s = path_extract_filename(d->string, &bn) < 0 ? d->string : bn; /* fall back to the full path if no filename can be extracted */
+ else
+ s = d->string;
+
+ if (d->uppercase && !avoid_uppercasing) {
+ d->formatted = new(char, strlen(s) + (d->type == TABLE_FIELD) + 1); /* one extra byte for the ':' suffix of fields */
+ if (!d->formatted)
+ return NULL;
+
+ char *q = d->formatted;
+ for (const char *p = s; *p; p++)
+ *(q++) = (char) toupper((unsigned char) *p);
+
+ if (d->type == TABLE_FIELD)
+ *(q++) = ':';
+
+ *q = 0;
+ return d->formatted;
+ } else if (d->type == TABLE_FIELD) {
+ d->formatted = strjoin(s, ":");
+ if (!d->formatted)
+ return NULL;
+
+ return d->formatted;
+ }
+
+ if (bn) {
+ d->formatted = TAKE_PTR(bn); /* hand the extracted filename over to the cache */
+ return d->formatted;
+ }
+
+ return d->string;
+ }
+
+ case TABLE_STRV:
+ if (strv_isempty(d->strv))
+ return table_ersatz_string(t);
+
+ d->formatted = strv_join(d->strv, "\n");
+ if (!d->formatted)
+ return NULL;
+ break;
+
+ case TABLE_STRV_WRAPPED: {
+ if (strv_isempty(d->strv))
+ return table_ersatz_string(t);
+
+ char *buf = format_strv_width(d->strv, column_width);
+ if (!buf)
+ return NULL;
+
+ free_and_replace(d->formatted, buf); /* replaces text cached for a different width, if any */
+ d->formatted_for_width = column_width;
+ if (have_soft)
+ *have_soft = true;
+
+ break;
+ }
+
+ case TABLE_BOOLEAN:
+ return yes_no(d->boolean);
+
+ case TABLE_BOOLEAN_CHECKMARK:
+ return special_glyph(d->boolean ? SPECIAL_GLYPH_CHECK_MARK : SPECIAL_GLYPH_CROSS_MARK);
+
+ case TABLE_TIMESTAMP:
+ case TABLE_TIMESTAMP_UTC:
+ case TABLE_TIMESTAMP_RELATIVE:
+ case TABLE_TIMESTAMP_RELATIVE_MONOTONIC:
+ case TABLE_TIMESTAMP_LEFT:
+ case TABLE_TIMESTAMP_DATE: {
+ _cleanup_free_ char *p = NULL;
+ char *ret;
+
+ p = new(char,
+ IN_SET(d->type, TABLE_TIMESTAMP_RELATIVE, TABLE_TIMESTAMP_RELATIVE_MONOTONIC, TABLE_TIMESTAMP_LEFT) ?
+ FORMAT_TIMESTAMP_RELATIVE_MAX : FORMAT_TIMESTAMP_MAX);
+ if (!p)
+ return NULL;
+
+ if (d->type == TABLE_TIMESTAMP)
+ ret = format_timestamp(p, FORMAT_TIMESTAMP_MAX, d->timestamp);
+ else if (d->type == TABLE_TIMESTAMP_UTC)
+ ret = format_timestamp_style(p, FORMAT_TIMESTAMP_MAX, d->timestamp, TIMESTAMP_UTC);
+ else if (d->type == TABLE_TIMESTAMP_DATE)
+ ret = format_timestamp_style(p, FORMAT_TIMESTAMP_MAX, d->timestamp, TIMESTAMP_DATE);
+ else if (d->type == TABLE_TIMESTAMP_RELATIVE_MONOTONIC)
+ ret = format_timestamp_relative_monotonic(p, FORMAT_TIMESTAMP_RELATIVE_MAX, d->timestamp);
+ else
+ ret = format_timestamp_relative_full(p, FORMAT_TIMESTAMP_RELATIVE_MAX,
+ d->timestamp, CLOCK_REALTIME,
+ /* implicit_left = */ d->type == TABLE_TIMESTAMP_LEFT);
+ if (!ret)
+ return "-"; /* NOTE(review): fixed dash rather than the table's ersatz string, unlike most other types */
+
+ d->formatted = TAKE_PTR(p);
+ break;
+ }
+
+ case TABLE_TIMESPAN:
+ case TABLE_TIMESPAN_MSEC:
+ case TABLE_TIMESPAN_DAY: {
+ _cleanup_free_ char *p = NULL;
+
+ p = new(char, FORMAT_TIMESPAN_MAX);
+ if (!p)
+ return NULL;
+
+ if (!format_timespan(p, FORMAT_TIMESPAN_MAX, d->timespan,
+ d->type == TABLE_TIMESPAN ? 0 :
+ d->type == TABLE_TIMESPAN_MSEC ? USEC_PER_MSEC : USEC_PER_DAY))
+ return "-"; /* NOTE(review): fixed dash rather than the table's ersatz string, unlike most other types */
+
+ d->formatted = TAKE_PTR(p);
+ break;
+ }
+
+ case TABLE_SIZE: {
+ _cleanup_free_ char *p = NULL;
+
+ p = new(char, FORMAT_BYTES_MAX);
+ if (!p)
+ return NULL;
+
+ if (!format_bytes(p, FORMAT_BYTES_MAX, d->size))
+ return table_ersatz_string(t);
+
+ d->formatted = TAKE_PTR(p);
+ break;
+ }
+
+ case TABLE_BPS: {
+ _cleanup_free_ char *p = NULL;
+ size_t n;
+
+ p = new(char, FORMAT_BYTES_MAX+2);
+ if (!p)
+ return NULL;
+
+ if (!format_bytes_full(p, FORMAT_BYTES_MAX, d->size, 0))
+ return table_ersatz_string(t);
+
+ n = strlen(p);
+ strscpy(p + n, FORMAT_BYTES_MAX + 2 - n, "bps"); /* append the unit within the remaining space of the buffer */
+
+ d->formatted = TAKE_PTR(p);
+ break;
+ }
+
+ case TABLE_INT: {
+ _cleanup_free_ char *p = NULL;
+
+ p = new(char, DECIMAL_STR_WIDTH(d->int_val) + 1);
+ if (!p)
+ return NULL;
+
+ sprintf(p, "%i", d->int_val);
+ d->formatted = TAKE_PTR(p);
+ break;
+ }
+
+ case TABLE_INT8: {
+ _cleanup_free_ char *p = NULL;
+
+ p = new(char, DECIMAL_STR_WIDTH(d->int8) + 1);
+ if (!p)
+ return NULL;
+
+ sprintf(p, "%" PRIi8, d->int8);
+ d->formatted = TAKE_PTR(p);
+ break;
+ }
+
+ case TABLE_INT16: {
+ _cleanup_free_ char *p = NULL;
+
+ p = new(char, DECIMAL_STR_WIDTH(d->int16) + 1);
+ if (!p)
+ return NULL;
+
+ sprintf(p, "%" PRIi16, d->int16);
+ d->formatted = TAKE_PTR(p);
+ break;
+ }
+
+ case TABLE_INT32: {
+ _cleanup_free_ char *p = NULL;
+
+ p = new(char, DECIMAL_STR_WIDTH(d->int32) + 1);
+ if (!p)
+ return NULL;
+
+ sprintf(p, "%" PRIi32, d->int32);
+ d->formatted = TAKE_PTR(p);
+ break;
+ }
+
+ case TABLE_INT64: {
+ _cleanup_free_ char *p = NULL;
+
+ p = new(char, DECIMAL_STR_WIDTH(d->int64) + 1);
+ if (!p)
+ return NULL;
+
+ sprintf(p, "%" PRIi64, d->int64);
+ d->formatted = TAKE_PTR(p);
+ break;
+ }
+
+ case TABLE_UINT: {
+ _cleanup_free_ char *p = NULL;
+
+ p = new(char, DECIMAL_STR_WIDTH(d->uint_val) + 1);
+ if (!p)
+ return NULL;
+
+ sprintf(p, "%u", d->uint_val);
+ d->formatted = TAKE_PTR(p);
+ break;
+ }
+
+ case TABLE_UINT8: {
+ _cleanup_free_ char *p = NULL;
+
+ p = new(char, DECIMAL_STR_WIDTH(d->uint8) + 1);
+ if (!p)
+ return NULL;
+
+ sprintf(p, "%" PRIu8, d->uint8);
+ d->formatted = TAKE_PTR(p);
+ break;
+ }
+
+ case TABLE_UINT16: {
+ _cleanup_free_ char *p = NULL;
+
+ p = new(char, DECIMAL_STR_WIDTH(d->uint16) + 1);
+ if (!p)
+ return NULL;
+
+ sprintf(p, "%" PRIu16, d->uint16);
+ d->formatted = TAKE_PTR(p);
+ break;
+ }
+
+ case TABLE_UINT32: {
+ _cleanup_free_ char *p = NULL;
+
+ p = new(char, DECIMAL_STR_WIDTH(d->uint32) + 1);
+ if (!p)
+ return NULL;
+
+ sprintf(p, "%" PRIu32, d->uint32);
+ d->formatted = TAKE_PTR(p);
+ break;
+ }
+
+ case TABLE_UINT32_HEX: {
+ _cleanup_free_ char *p = NULL;
+
+ p = new(char, 8 + 1); /* at most 8 hex digits plus NUL */
+ if (!p)
+ return NULL;
+
+ sprintf(p, "%" PRIx32, d->uint32);
+ d->formatted = TAKE_PTR(p);
+ break;
+ }
+
+ case TABLE_UINT64: {
+ _cleanup_free_ char *p = NULL;
+
+ p = new(char, DECIMAL_STR_WIDTH(d->uint64) + 1);
+ if (!p)
+ return NULL;
+
+ sprintf(p, "%" PRIu64, d->uint64);
+ d->formatted = TAKE_PTR(p);
+ break;
+ }
+
+ case TABLE_UINT64_HEX: {
+ _cleanup_free_ char *p = NULL;
+
+ p = new(char, 16 + 1); /* at most 16 hex digits plus NUL */
+ if (!p)
+ return NULL;
+
+ sprintf(p, "%" PRIx64, d->uint64);
+ d->formatted = TAKE_PTR(p);
+ break;
+ }
+
+ case TABLE_PERCENT: {
+ _cleanup_free_ char *p = NULL;
+
+ p = new(char, DECIMAL_STR_WIDTH(d->percent) + 2); /* room for the '%' suffix and NUL */
+ if (!p)
+ return NULL;
+
+ sprintf(p, "%i%%" , d->percent);
+ d->formatted = TAKE_PTR(p);
+ break;
+ }
+
+ case TABLE_IFINDEX: {
+ _cleanup_free_ char *p = NULL;
+
+ if (format_ifname_full_alloc(d->ifindex, FORMAT_IFNAME_IFINDEX, &p) < 0)
+ return NULL;
+
+ d->formatted = TAKE_PTR(p);
+ break;
+ }
+
+ case TABLE_IN_ADDR:
+ case TABLE_IN6_ADDR: {
+ _cleanup_free_ char *p = NULL;
+
+ if (in_addr_to_string(d->type == TABLE_IN_ADDR ? AF_INET : AF_INET6,
+ &d->address, &p) < 0)
+ return NULL;
+
+ d->formatted = TAKE_PTR(p);
+ break;
+ }
+
+ case TABLE_ID128: {
+ char *p;
+
+ p = new(char, SD_ID128_STRING_MAX);
+ if (!p)
+ return NULL;
+
+ d->formatted = sd_id128_to_string(d->id128, p);
+ break;
+ }
+
+ case TABLE_UUID: {
+ char *p;
+
+ p = new(char, SD_ID128_UUID_STRING_MAX);
+ if (!p)
+ return NULL;
+
+ d->formatted = sd_id128_to_uuid_string(d->id128, p);
+ break;
+ }
+
+ case TABLE_UID: {
+ char *p;
+
+ if (!uid_is_valid(d->uid))
+ return table_ersatz_string(t);
+
+ p = new(char, DECIMAL_STR_WIDTH(d->uid) + 1);
+ if (!p)
+ return NULL;
+ sprintf(p, UID_FMT, d->uid);
+
+ d->formatted = p;
+ break;
+ }
+
+ case TABLE_GID: {
+ char *p;
+
+ if (!gid_is_valid(d->gid))
+ return table_ersatz_string(t);
+
+ p = new(char, DECIMAL_STR_WIDTH(d->gid) + 1);
+ if (!p)
+ return NULL;
+ sprintf(p, GID_FMT, d->gid);
+
+ d->formatted = p;
+ break;
+ }
+
+ case TABLE_PID: {
+ char *p;
+
+ if (!pid_is_valid(d->pid))
+ return table_ersatz_string(t);
+
+ p = new(char, DECIMAL_STR_WIDTH(d->pid) + 1);
+ if (!p)
+ return NULL;
+ sprintf(p, PID_FMT, d->pid);
+
+ d->formatted = p;
+ break;
+ }
+
+ case TABLE_SIGNAL: {
+ const char *suffix;
+ char *p;
+
+ suffix = signal_to_string(d->int_val);
+ if (!suffix)
+ return table_ersatz_string(t);
+
+ p = strjoin("SIG", suffix);
+ if (!p)
+ return NULL;
+
+ d->formatted = p;
+ break;
+ }
+
+ case TABLE_MODE: {
+ char *p;
+
+ if (d->mode == MODE_INVALID)
+ return table_ersatz_string(t);
+
+ p = new(char, 4 + 1); /* four octal digits plus NUL */
+ if (!p)
+ return NULL;
+
+ sprintf(p, "%04o", d->mode & 07777);
+ d->formatted = p;
+ break;
+ }
+
+ case TABLE_MODE_INODE_TYPE:
+
+ if (d->mode == MODE_INVALID)
+ return table_ersatz_string(t);
+
+ return inode_type_to_string(d->mode);
+
+ case TABLE_DEVNUM:
+ if (devnum_is_zero(d->devnum))
+ return table_ersatz_string(t);
+
+ if (asprintf(&d->formatted, DEVNUM_FORMAT_STR, DEVNUM_FORMAT_VAL(d->devnum)) < 0)
+ return NULL;
+
+ break;
+
+ default:
+ assert_not_reached();
+ }
+ /* All cases that leave the switch via 'break' have stored their result in d->formatted */
+ return d->formatted;
+}
+
+ static int console_width_height(
+                 const char *s,
+                 size_t *ret_width,
+                 size_t *ret_height) {
+
+         size_t widest = 0, n_lines = 0;
+         /* Measures a possibly multi-line string: reports the width of its widest line in console cells
+          * and its number of lines. Either return parameter may be NULL. Returns 0 on success, -ENOMEM
+          * if copying a line fails, -EINVAL if utf8_console_width() yields SIZE_MAX for a line. */
+
+         assert(s);
+
+         for (;;) {
+                 const char *nl = strchr(s, '\n');
+                 size_t cells;
+                 if (!nl) {
+                         /* Last (or only) line, measure it in place */
+                         cells = utf8_console_width(s);
+                         s = NULL;
+                 } else {
+                         /* Copy out just this line, so that it can be measured on its own */
+                         _cleanup_free_ char *line = strndup(s, nl - s);
+                         if (!line)
+                                 return -ENOMEM;
+
+                         cells = utf8_console_width(line);
+                         s = nl + 1;
+                 }
+                 if (cells == SIZE_MAX)
+                         return -EINVAL;
+                 if (cells > widest)
+                         widest = cells;
+                 n_lines++;
+
+                 if (isempty(s)) /* Nothing follows (a trailing newline opens no further line) */
+                         break;
+         }
+
+         if (ret_width)
+                 *ret_width = widest;
+         if (ret_height)
+                 *ret_height = n_lines;
+
+         return 0;
+ }
+
+ static int table_data_requested_width_height(
+                 Table *table,
+                 TableData *d,
+                 size_t available_width,
+                 size_t *ret_width,
+                 size_t *ret_height,
+                 bool *have_soft) {
+
+         _cleanup_free_ char *shortened = NULL;
+         size_t w, h;
+         bool is_soft = false;
+         const char *text;
+         int r, cut = 0;
+
+         /* Formats the cell and works out how many console cells (width) and lines (height) it asks
+          * for. The width is clamped to the cell's own minimum/maximum. Returns > 0 if
+          * string_truncate_lines() cut lines because of the table's cell_height_max, 0 if not, and a
+          * negative errno on failure. *have_soft is only ever set to true here, never reset. */
+         text = table_data_format(table, d, false, available_width, &is_soft);
+         if (!text)
+                 return -ENOMEM;
+
+         if (table->cell_height_max != SIZE_MAX) {
+                 cut = string_truncate_lines(text, table->cell_height_max, &shortened);
+                 if (cut < 0)
+                         return cut;
+                 text = shortened;
+         }
+
+         r = console_width_height(text, &w, &h);
+         if (r < 0)
+                 return r;
+
+         /* Apply the upper bound first, then the lower one, so that the minimum wins on conflict */
+         if (d->maximum_width != SIZE_MAX && w > d->maximum_width)
+                 w = d->maximum_width;
+         if (w < d->minimum_width)
+                 w = d->minimum_width;
+
+         if (ret_width)
+                 *ret_width = w;
+         if (ret_height)
+                 *ret_height = h;
+         if (is_soft && have_soft)
+                 *have_soft = true;
+
+         return cut > 0;
+ }
+
+ static char *align_string_mem(const char *str, const char *url, size_t new_length, unsigned percent) {
+         _cleanup_free_ char *linked = NULL;
+         size_t cells = 0, n_bytes, payload_size, pad, pad_left;
+         const char *payload, *end;
+         char *buf;
+
+         /* Pads 'str' with spaces so that it occupies 'new_length' character cells on screen; 'percent'
+          * (0..100) is the share of the padding that goes to the left. If 'url' is set, the text is passed
+          * through terminal_urlify() first; the width is still computed from the plain 'str'. Returns a
+          * newly allocated string, or NULL on failure. As with ellipsize_mem(), byte sizes and widths in
+          * character cells are different things here. */
+
+         assert(str);
+         assert(percent <= 100);
+
+         n_bytes = strlen(str);
+         if (url) {
+                 if (terminal_urlify(url, str, &linked) < 0)
+                         return NULL;
+                 payload = linked;
+                 payload_size = strlen(linked);
+         } else {
+                 payload = str;
+                 payload_size = n_bytes;
+         }
+
+         /* Work out how wide the plain text currently is on screen */
+         end = str + n_bytes;
+         for (const char *q = str; q < end;) {
+                 char32_t ch;
+                 if (utf8_encoded_to_unichar(q, &ch) < 0) {
+                         /* Invalid sequence: step over a single byte and count it as one cell */
+                         q++;
+                         cells++;
+                 } else {
+                         q = utf8_next_char(q);
+                         cells += unichar_iswide(ch) ? 2 : 1;
+                 }
+         }
+
+         /* Nothing to pad if the text already fills (or exceeds) the target width */
+         if (cells >= new_length)
+                 return linked ? TAKE_PTR(linked) : strdup(str);
+
+         /* Total padding, and the part of it that goes to the left of the text */
+         pad = new_length - cells;
+         pad_left = pad * percent / 100U;
+
+         buf = new(char, pad + payload_size + 1);
+         if (!buf)
+                 return NULL;
+
+         memset(buf, ' ', pad_left);
+         memcpy(buf + pad_left, payload, payload_size);
+         memset(buf + pad_left + payload_size, ' ', pad - pad_left);
+         buf[pad + payload_size] = 0;
+         return buf;
+ }
+
+ static bool table_data_isempty(TableData *d) {
+         assert(d);
+         /* Tells whether a cell carries no data. Note that an empty string does not count as empty! */
+         switch (d->type) {
+         case TABLE_EMPTY:
+                 return true;
+         case TABLE_STRV:
+         case TABLE_STRV_WRAPPED:
+                 return strv_isempty(d->strv); /* a string list without entries is truly empty, too */
+         default:
+                 return false;
+         }
+ }
+
+ static const char* table_data_color(TableData *d) {
+         assert(d);
+
+         /* Picks the color sequence for a cell's text: an explicitly configured color always wins,
+          * otherwise a default is derived from the cell's content and type. NULL means no color. */
+
+         if (d->color)
+                 return d->color;
+
+         /* Cells without data are shown in grey, in case an "empty_string" is set that is not empty */
+         if (table_data_isempty(d))
+                 return ansi_grey();
+
+         /* Field names and column headers get a default highlight of their own */
+         return d->type == TABLE_FIELD  ? ansi_bright_blue() :
+                d->type == TABLE_HEADER ? ansi_underline() : NULL;
+ }
+
+ static const char* table_data_rgap_color(TableData *d) {
+         assert(d);
+
+         /* Color for the gap that follows the cell: an explicit setting wins, header cells default
+          * to underline, everything else gets none (NULL). */
+
+         if (!d->rgap_color)
+                 return d->type == TABLE_HEADER ? ansi_underline() : NULL;
+
+         return d->rgap_color;
+ }
+
+ int table_print(Table *t, FILE *f) {
+ size_t n_rows, *minimum_width, *maximum_width, display_columns, *requested_width,
+ table_minimum_width, table_maximum_width, table_requested_width, table_effective_width,
+ *width = NULL;
+ _cleanup_free_ size_t *sorted = NULL;
+ uint64_t *column_weight, weight_sum;
+ int r;
+ /* Renders the table as text to 'f' (stdout if NULL): column widths are computed first, then all rows are written. Returns a negative errno on failure, otherwise the result of fflush_and_check(). */
+ assert(t);
+
+ if (!f)
+ f = stdout;
+
+ /* Ensure we have no incomplete rows */
+ assert(t->n_cells % t->n_columns == 0);
+
+ n_rows = t->n_cells / t->n_columns;
+ assert(n_rows > 0); /* at least the header row must be complete */
+
+ if (t->sort_map) {
+ /* If sorting is requested, let's calculate an index table we use to lookup the actual index to display with. */
+
+ sorted = new(size_t, n_rows);
+ if (!sorted)
+ return -ENOMEM;
+
+ for (size_t i = 0; i < n_rows; i++)
+ sorted[i] = i * t->n_columns;
+
+ typesafe_qsort_r(sorted, n_rows, table_data_compare, t);
+ }
+ /* The display map, if set, selects which columns are shown and in which order (display_map[j] is the table column shown at position j) */
+ if (t->display_map)
+ display_columns = t->n_display_map;
+ else
+ display_columns = t->n_columns;
+
+ assert(display_columns > 0);
+
+ minimum_width = newa(size_t, display_columns);
+ maximum_width = newa(size_t, display_columns);
+ requested_width = newa(size_t, display_columns);
+ column_weight = newa0(uint64_t, display_columns);
+
+ for (size_t j = 0; j < display_columns; j++) {
+ minimum_width[j] = 1;
+ maximum_width[j] = SIZE_MAX;
+ }
+ /* Up to two sizing passes: a second one only takes place if the table had to be compressed and some cell reported a "soft" width (see the two 'break' statements at the end of the loop body) */
+ for (unsigned pass = 0; pass < 2; pass++) {
+ /* First pass: determine column sizes */
+
+ for (size_t j = 0; j < display_columns; j++)
+ requested_width[j] = SIZE_MAX;
+
+ bool any_soft = false;
+
+ for (size_t i = t->header ? 0 : 1; i < n_rows; i++) {
+ TableData **row;
+
+ /* Note that we don't care about ordering at this time, as we just want to determine column sizes,
+ * hence we don't care for sorted[] during the first pass. */
+ row = t->data + i * t->n_columns;
+
+ for (size_t j = 0; j < display_columns; j++) {
+ TableData *d;
+ size_t req_width, req_height;
+
+ assert_se(d = row[t->display_map ? t->display_map[j] : j]);
+
+ r = table_data_requested_width_height(t, d,
+ width ? width[j] : SIZE_MAX,
+ &req_width, &req_height, &any_soft);
+ if (r < 0)
+ return r;
+ if (r > 0) { /* Truncated because too many lines? */
+ _cleanup_free_ char *last = NULL;
+ const char *field;
+
+ /* If we are going to show only the first few lines of a cell that has
+ * multiple make sure that we have enough space horizontally to show an
+ * ellipsis. Hence, let's figure out the last line, and account for its
+ * length plus ellipsis. */
+
+ field = table_data_format(t, d, false,
+ width ? width[j] : SIZE_MAX,
+ &any_soft);
+ if (!field)
+ return -ENOMEM;
+
+ assert_se(t->cell_height_max > 0);
+ r = string_extract_line(field, t->cell_height_max-1, &last);
+ if (r < 0)
+ return r;
+
+ req_width = MAX(req_width,
+ utf8_console_width(last) +
+ utf8_console_width(special_glyph(SPECIAL_GLYPH_ELLIPSIS)));
+ }
+
+ /* Determine the biggest width that any cell in this column would like to have */
+ if (requested_width[j] == SIZE_MAX ||
+ requested_width[j] < req_width)
+ requested_width[j] = req_width;
+
+ /* Determine the minimum width any cell in this column needs */
+ if (minimum_width[j] < d->minimum_width)
+ minimum_width[j] = d->minimum_width;
+
+ /* Determine the maximum width any cell in this column needs */
+ if (d->maximum_width != SIZE_MAX &&
+ (maximum_width[j] == SIZE_MAX ||
+ maximum_width[j] > d->maximum_width))
+ maximum_width[j] = d->maximum_width;
+
+ /* Determine the full columns weight */
+ column_weight[j] += d->weight;
+ }
+ }
+
+ /* One space between each column */
+ table_requested_width = table_minimum_width = table_maximum_width = display_columns - 1;
+
+ /* Calculate the total weight for all columns, plus the minimum, maximum and requested width for the table. */
+ weight_sum = 0;
+ for (size_t j = 0; j < display_columns; j++) {
+ weight_sum += column_weight[j];
+
+ table_minimum_width += minimum_width[j];
+
+ if (maximum_width[j] == SIZE_MAX)
+ table_maximum_width = SIZE_MAX;
+ else
+ table_maximum_width += maximum_width[j];
+
+ table_requested_width += requested_width[j];
+ }
+
+ /* Calculate effective table width */
+ if (t->width != 0 && t->width != SIZE_MAX)
+ table_effective_width = t->width;
+ else if (t->width == 0 ||
+ ((pass > 0 || !any_soft) && (pager_have() || !isatty(STDOUT_FILENO))))
+ table_effective_width = table_requested_width;
+ else
+ table_effective_width = MIN(table_requested_width, columns());
+
+ if (table_maximum_width != SIZE_MAX && table_effective_width > table_maximum_width)
+ table_effective_width = table_maximum_width;
+
+ if (table_effective_width < table_minimum_width)
+ table_effective_width = table_minimum_width;
+
+ if (!width)
+ width = newa(size_t, display_columns);
+
+ if (table_effective_width >= table_requested_width) {
+ size_t extra;
+
+ /* We have extra room, let's distribute it among columns according to their weights. We first provide
+ * each column with what it asked for and the distribute the rest. */
+
+ extra = table_effective_width - table_requested_width;
+
+ for (size_t j = 0; j < display_columns; j++) {
+ size_t delta;
+
+ if (weight_sum == 0)
+ width[j] = requested_width[j] + extra / (display_columns - j); /* Avoid division by zero */
+ else
+ width[j] = requested_width[j] + (extra * column_weight[j]) / weight_sum;
+
+ if (maximum_width[j] != SIZE_MAX && width[j] > maximum_width[j])
+ width[j] = maximum_width[j];
+
+ if (width[j] < minimum_width[j])
+ width[j] = minimum_width[j];
+
+ delta = LESS_BY(width[j], requested_width[j]);
+
+ /* Subtract what we just added from the rest */
+ if (extra > delta)
+ extra -= delta;
+ else
+ extra = 0;
+
+ assert(weight_sum >= column_weight[j]);
+ weight_sum -= column_weight[j];
+ }
+
+ break; /* Every column should be happy, no need to repeat calculations. */
+ } else {
+ /* We need to compress the table, columns can't get what they asked for. We first provide each column
+ * with the minimum they need, and then distribute anything left. */
+ bool finalize = false;
+ size_t extra;
+
+ extra = table_effective_width - table_minimum_width;
+
+ for (size_t j = 0; j < display_columns; j++)
+ width[j] = SIZE_MAX;
+
+ for (;;) {
+ bool restart = false;
+
+ for (size_t j = 0; j < display_columns; j++) {
+ size_t delta, w;
+
+ /* Did this column already get something assigned? If so, let's skip to the next */
+ if (width[j] != SIZE_MAX)
+ continue;
+
+ if (weight_sum == 0)
+ w = minimum_width[j] + extra / (display_columns - j); /* avoid division by zero */
+ else
+ w = minimum_width[j] + (extra * column_weight[j]) / weight_sum;
+
+ if (w >= requested_width[j]) {
+ /* Never give more than requested. If we hit a column like this, there's more
+ * space to allocate to other columns which means we need to restart the
+ * iteration. However, if we hit a column like this, let's assign it the space
+ * it wanted for good early. */
+
+ w = requested_width[j];
+ restart = true;
+
+ } else if (!finalize)
+ continue;
+
+ width[j] = w;
+
+ assert(w >= minimum_width[j]);
+ delta = w - minimum_width[j];
+
+ assert(delta <= extra);
+ extra -= delta;
+
+ assert(weight_sum >= column_weight[j]);
+ weight_sum -= column_weight[j];
+
+ if (restart && !finalize)
+ break;
+ }
+
+ if (finalize)
+ break;
+
+ if (!restart)
+ finalize = true;
+ }
+
+ if (!any_soft) /* Some columns got less than requested. If some cells were "soft",
+ * let's try to reformat them with the new widths. Otherwise, let's
+ * move on. */
+ break;
+ }
+ }
+
+ /* Second pass: show output */
+ for (size_t i = t->header ? 0 : 1; i < n_rows; i++) {
+ size_t n_subline = 0;
+ bool more_sublines;
+ TableData **row;
+
+ if (sorted)
+ row = t->data + sorted[i];
+ else
+ row = t->data + i * t->n_columns;
+
+ do {
+ const char *gap_color = NULL;
+ more_sublines = false;
+
+ for (size_t j = 0; j < display_columns; j++) {
+ _cleanup_free_ char *buffer = NULL, *extracted = NULL;
+ bool lines_truncated = false;
+ const char *field, *color = NULL;
+ TableData *d;
+ size_t l;
+
+ assert_se(d = row[t->display_map ? t->display_map[j] : j]);
+
+ field = table_data_format(t, d, false, width[j], NULL);
+ if (!field)
+ return -ENOMEM;
+
+ r = string_extract_line(field, n_subline, &extracted);
+ if (r < 0)
+ return r;
+ if (r > 0) {
+ /* There are more lines to come */
+ if ((t->cell_height_max == SIZE_MAX || n_subline + 1 < t->cell_height_max))
+ more_sublines = true; /* There are more lines to come */
+ else
+ lines_truncated = true;
+ }
+ if (extracted)
+ field = extracted;
+
+ l = utf8_console_width(field);
+ if (l > width[j]) {
+ /* Field is wider than allocated space. Let's ellipsize */
+
+ buffer = ellipsize(field, width[j], /* ellipsize at the end if we truncated coming lines, otherwise honour configuration */
+ lines_truncated ? 100 : d->ellipsize_percent);
+ if (!buffer)
+ return -ENOMEM;
+
+ field = buffer;
+ } else {
+ if (lines_truncated) {
+ _cleanup_free_ char *padded = NULL;
+
+ /* We truncated more lines of this cell, let's add an
+ * ellipsis. We first append it, but that might make our
+ * string grow above what we have space for, hence ellipsize
+ * right after. This will truncate the ellipsis and add a new
+ * one. */
+
+ padded = strjoin(field, special_glyph(SPECIAL_GLYPH_ELLIPSIS));
+ if (!padded)
+ return -ENOMEM;
+
+ buffer = ellipsize(padded, width[j], 100);
+ if (!buffer)
+ return -ENOMEM;
+
+ field = buffer;
+ l = utf8_console_width(field);
+ }
+
+ if (l < width[j]) {
+ _cleanup_free_ char *aligned = NULL;
+ /* Field is shorter than allocated space. Let's align with spaces */
+
+ aligned = align_string_mem(field, d->url, width[j], d->align_percent);
+ if (!aligned)
+ return -ENOMEM;
+
+ /* Drop trailing white spaces of last column when no cosmetics is set. */
+ if (j == display_columns - 1 &&
+ (!colors_enabled() || !table_data_color(d)) &&
+ (!urlify_enabled() || !d->url))
+ delete_trailing_chars(aligned, NULL);
+
+ free_and_replace(buffer, aligned);
+ field = buffer;
+ }
+ }
+
+ if (l >= width[j] && d->url) {
+ _cleanup_free_ char *clickable = NULL;
+
+ r = terminal_urlify(d->url, field, &clickable);
+ if (r < 0)
+ return r;
+
+ free_and_replace(buffer, clickable);
+ field = buffer;
+ }
+ /* The column separator is printed in the right-gap color of the preceding cell of this line, if it has one */
+ if (colors_enabled() && gap_color)
+ fputs(gap_color, f);
+
+ if (j > 0)
+ fputc(' ', f); /* column separator left of cell */
+
+ if (colors_enabled()) {
+ color = table_data_color(d);
+
+ /* Undo gap color */
+ if (gap_color)
+ fputs(ANSI_NORMAL, f);
+
+ if (color)
+ fputs(color, f);
+ }
+
+ fputs(field, f);
+
+ if (colors_enabled() && color)
+ fputs(ANSI_NORMAL, f);
+
+ gap_color = table_data_rgap_color(d);
+ }
+
+ fputc('\n', f);
+ n_subline ++;
+ } while (more_sublines);
+ }
+
+ return fflush_and_check(f);
+ }
+
+ int table_format(Table *t, char **ret) {
+         _cleanup_(memstream_done) MemStream stream = {};
+         FILE *out;
+         int r;
+
+         /* Renders the table into a string: table_print() writes into an in-memory stream, whose
+          * contents memstream_finalize() then hands out in *ret. Returns negative errno on failure. */
+
+         assert(t);
+         assert(ret);
+
+         out = memstream_init(&stream);
+         if (!out)
+                 return -ENOMEM;
+
+         r = table_print(t, out);
+         return r < 0 ? r : memstream_finalize(&stream, ret, NULL);
+ }
+
+ size_t table_get_rows(Table *t) {
+         if (t) { /* a NULL table counts as having no rows */
+                 assert(t->n_columns > 0);
+                 /* Integer division: cells of an incomplete last row are not counted */
+                 return t->n_cells / t->n_columns;
+         }
+         return 0;
+ }
+
+ size_t table_get_columns(Table *t) {
+         if (t) { /* a NULL table counts as having no columns */
+                 assert(t->n_columns > 0);
+                 /* The column count is stored directly in the table object */
+                 return t->n_columns;
+         }
+         return 0;
+ }
+
+ size_t table_get_current_column(Table *t) {
+         if (t) { /* a NULL table reports column 0 */
+                 assert(t->n_columns > 0);
+                 /* Number of cells already present in the last, not yet complete row */
+                 return t->n_cells % t->n_columns;
+         }
+         return 0;
+ }
+
+ int table_set_reverse(Table *t, size_t column, bool b) {
+         assert(t);
+         assert(column < t->n_columns);
+
+         /* Sets the per-column "reverse" flag. The flag array is allocated lazily by new0(), i.e. with
+          * all flags false; as long as no array exists, clearing a flag is a NOP. */
+         if (!t->reverse_map && !b)
+                 return 0;
+         if (!t->reverse_map) {
+                 t->reverse_map = new0(bool, t->n_columns);
+                 if (!t->reverse_map)
+                         return -ENOMEM;
+         }
+         t->reverse_map[column] = b;
+         return 0;
+ }
+
+ TableCell *table_get_cell(Table *t, size_t row, size_t column) {
+         size_t i;
+
+         assert(t);
+
+         /* Returns the handle of the cell at row/column, or NULL if the position is out of range. The row is
+          * bounded before multiplying, so that a huge value cannot wrap around to a valid cell index. The
+          * first check implies n_columns > 0, hence the division is safe. */
+         if (column >= t->n_columns || row > (SIZE_MAX - column) / t->n_columns)
+                 return NULL;
+         i = row * t->n_columns + column;
+         if (i >= t->n_cells)
+                 return NULL;
+         return TABLE_INDEX_TO_CELL(i);
+ }
+
+ const void *table_get(Table *t, TableCell *cell) {
+         TableData *payload;
+
+         /* Returns a pointer to the data stored in 'cell', or NULL if table_get_data() finds no data
+          * object for it. */
+
+         assert(t);
+
+         payload = table_get_data(t, cell);
+         return payload ? payload->data : NULL;
+ }
+
+ const void* table_get_at(Table *t, size_t row, size_t column) {
+         /* Convenience combination of table_get_cell() and table_get(): looks up the cell at the
+          * given position and returns a pointer to its data, or NULL if there is no such cell. */
+         TableCell *c = table_get_cell(t, row, column);
+
+         if (c)
+                 return table_get(t, c);
+         return NULL;
+ }
+
+ static int table_data_to_json(TableData *d, JsonVariant **ret) {
+ /* Converts the payload of one cell into a JSON variant, returned in *ret. Values that mark "unset" (USEC_INFINITY, UINT64_MAX sizes, invalid UID/GID/PID/signal/mode, non-positive ifindex, zero device number) become JSON null. Returns -EINVAL for types not handled here. */
+ switch (d->type) {
+
+ case TABLE_EMPTY:
+ return json_variant_new_null(ret);
+
+ case TABLE_STRING:
+ case TABLE_PATH:
+ case TABLE_PATH_BASENAME:
+ case TABLE_FIELD:
+ case TABLE_HEADER:
+ return json_variant_new_string(ret, d->string);
+
+ case TABLE_STRV:
+ case TABLE_STRV_WRAPPED:
+ return json_variant_new_array_strv(ret, d->strv);
+
+ case TABLE_BOOLEAN_CHECKMARK:
+ case TABLE_BOOLEAN:
+ return json_variant_new_boolean(ret, d->boolean);
+
+ case TABLE_TIMESTAMP:
+ case TABLE_TIMESTAMP_UTC:
+ case TABLE_TIMESTAMP_RELATIVE:
+ case TABLE_TIMESTAMP_RELATIVE_MONOTONIC:
+ case TABLE_TIMESTAMP_LEFT:
+ case TABLE_TIMESTAMP_DATE:
+ if (d->timestamp == USEC_INFINITY)
+ return json_variant_new_null(ret);
+ /* All timestamp flavours are emitted as the same raw unsigned value, the display variant plays no role here */
+ return json_variant_new_unsigned(ret, d->timestamp);
+
+ case TABLE_TIMESPAN:
+ case TABLE_TIMESPAN_MSEC:
+ case TABLE_TIMESPAN_DAY:
+ if (d->timespan == USEC_INFINITY)
+ return json_variant_new_null(ret);
+
+ return json_variant_new_unsigned(ret, d->timespan);
+
+ case TABLE_SIZE:
+ case TABLE_BPS:
+ if (d->size == UINT64_MAX)
+ return json_variant_new_null(ret);
+
+ return json_variant_new_unsigned(ret, d->size);
+
+ case TABLE_INT:
+ return json_variant_new_integer(ret, d->int_val);
+
+ case TABLE_INT8:
+ return json_variant_new_integer(ret, d->int8);
+
+ case TABLE_INT16:
+ return json_variant_new_integer(ret, d->int16);
+
+ case TABLE_INT32:
+ return json_variant_new_integer(ret, d->int32);
+
+ case TABLE_INT64:
+ return json_variant_new_integer(ret, d->int64);
+
+ case TABLE_UINT:
+ return json_variant_new_unsigned(ret, d->uint_val);
+
+ case TABLE_UINT8:
+ return json_variant_new_unsigned(ret, d->uint8);
+
+ case TABLE_UINT16:
+ return json_variant_new_unsigned(ret, d->uint16);
+
+ case TABLE_UINT32:
+ case TABLE_UINT32_HEX:
+ return json_variant_new_unsigned(ret, d->uint32);
+
+ case TABLE_UINT64:
+ case TABLE_UINT64_HEX:
+ return json_variant_new_unsigned(ret, d->uint64);
+
+ case TABLE_PERCENT:
+ return json_variant_new_integer(ret, d->percent);
+
+ case TABLE_IFINDEX:
+ if (d->ifindex <= 0)
+ return json_variant_new_null(ret);
+
+ return json_variant_new_integer(ret, d->ifindex);
+
+ case TABLE_IN_ADDR:
+ return json_variant_new_array_bytes(ret, &d->address, FAMILY_ADDRESS_SIZE(AF_INET));
+
+ case TABLE_IN6_ADDR:
+ return json_variant_new_array_bytes(ret, &d->address, FAMILY_ADDRESS_SIZE(AF_INET6));
+
+ case TABLE_ID128:
+ return json_variant_new_id128(ret, d->id128);
+
+ case TABLE_UUID:
+ return json_variant_new_uuid(ret, d->id128);
+
+ case TABLE_UID:
+ if (!uid_is_valid(d->uid))
+ return json_variant_new_null(ret);
+
+ return json_variant_new_integer(ret, d->uid);
+
+ case TABLE_GID:
+ if (!gid_is_valid(d->gid))
+ return json_variant_new_null(ret);
+
+ return json_variant_new_integer(ret, d->gid);
+
+ case TABLE_PID:
+ if (!pid_is_valid(d->pid))
+ return json_variant_new_null(ret);
+
+ return json_variant_new_integer(ret, d->pid);
+
+ case TABLE_SIGNAL:
+ if (!SIGNAL_VALID(d->int_val))
+ return json_variant_new_null(ret);
+
+ return json_variant_new_integer(ret, d->int_val);
+
+ case TABLE_MODE:
+ case TABLE_MODE_INODE_TYPE:
+ if (d->mode == MODE_INVALID)
+ return json_variant_new_null(ret);
+
+ return json_variant_new_unsigned(ret, d->mode);
+
+ case TABLE_DEVNUM:
+ if (devnum_is_zero(d->devnum))
+ return json_variant_new_null(ret);
+ /* Emitted as a two-element array: [ major, minor ] */
+ return json_build(ret, JSON_BUILD_ARRAY(
+ JSON_BUILD_UNSIGNED(major(d->devnum)),
+ JSON_BUILD_UNSIGNED(minor(d->devnum))));
+
+ default:
+ return -EINVAL;
+ }
+ }
+
+ static char* string_to_json_field_name(const char *f) {
+         /* Tries to make a string more suitable as JSON field name. There are no strict rules defined what a
+          * field name can be hence this is a bit vague and black magic. Right now we only convert whitespace
+          * (as per isspace()) to underscores and leave everything else as is. Returns a newly allocated string, or NULL on OOM. */
+
+         char *c = strdup(f);
+         if (!c)
+                 return NULL;
+         /* isspace() is only defined for EOF and values representable as unsigned char, hence cast: plain char may be negative */
+         for (char *x = c; *x; x++)
+                 if (isspace((unsigned char) *x))
+                         *x = '_';
+
+         return c;
+ }
+
+ static int table_make_json_field_name(Table *t, TableData *d, char **ret) {
+         const char *text;
+         char *name;
+
+         assert(t);
+         assert(d);
+         assert(ret);
+
+         /* Derives a JSON field name from a cell: header and field cells contribute their raw string, any
+          * other cell its formatted text (uppercasing avoided). The mangled copy is returned in *ret. */
+         if (!IN_SET(d->type, TABLE_HEADER, TABLE_FIELD)) {
+                 text = table_data_format(t, d, /* avoid_uppercasing= */ true, SIZE_MAX, NULL);
+                 if (!text)
+                         return -ENOMEM;
+         } else
+                 text = d->string;
+
+         name = string_to_json_field_name(text);
+         if (!name)
+                 return -ENOMEM;
+         *ret = name;
+         return 0;
+ }
+
+ static const char *table_get_json_field_name(Table *t, size_t idx) {
+         assert(t);
+         if (idx >= t->n_json_fields) /* beyond the range explicit names were ever stored for */
+                 return NULL;
+         return t->json_fields[idx]; /* may still be NULL, e.g. after the name was reset again */
+ }
+
+ static int table_to_json_regular(Table *t, JsonVariant **ret) {
+ JsonVariant **rows = NULL, **elements = NULL;
+ _cleanup_free_ size_t *sorted = NULL;
+ size_t n_rows, display_columns;
+ int r;
+ /* Converts a regular (non-vertical) table into a JSON array with one object per data row; the first row only supplies the field names. Honours the sort and display maps. Returns negative errno on failure. */
+ assert(t);
+ assert(!t->vertical);
+
+ /* Ensure we have no incomplete rows */
+ assert(t->n_columns > 0);
+ assert(t->n_cells % t->n_columns == 0);
+
+ n_rows = t->n_cells / t->n_columns;
+ assert(n_rows > 0); /* at least the header row must be complete */
+
+ if (t->sort_map) {
+ /* If sorting is requested, let's calculate an index table we use to lookup the actual index to display with. */
+
+ sorted = new(size_t, n_rows);
+ if (!sorted)
+ return -ENOMEM;
+
+ for (size_t i = 0; i < n_rows; i++)
+ sorted[i] = i * t->n_columns;
+
+ typesafe_qsort_r(sorted, n_rows, table_data_compare, t);
+ }
+
+ if (t->display_map)
+ display_columns = t->n_display_map;
+ else
+ display_columns = t->n_columns;
+ assert(display_columns > 0);
+ /* elements[] holds name/value pairs for one row: even slots carry the field names, odd slots the values */
+ elements = new0(JsonVariant*, display_columns * 2);
+ if (!elements)
+ return -ENOMEM;
+
+ CLEANUP_ARRAY(elements, (size_t) { display_columns * 2 }, json_variant_unref_many);
+ /* Fill in the field names (even slots) once, they are reused for every row below */
+ for (size_t j = 0; j < display_columns; j++) {
+ _cleanup_free_ char *mangled = NULL;
+ const char *n;
+ size_t c;
+
+ c = t->display_map ? t->display_map[j] : j;
+
+ /* Use explicitly set JSON field name, if we have one. Otherwise mangle the column field value. */
+ n = table_get_json_field_name(t, c);
+ if (!n) {
+ r = table_make_json_field_name(t, ASSERT_PTR(t->data[c]), &mangled);
+ if (r < 0)
+ return r;
+
+ n = mangled;
+ }
+
+ r = json_variant_new_string(elements + j*2, n);
+ if (r < 0)
+ return r;
+ }
+ /* One result object per row, not counting the first row (hence n_rows-1 entries) */
+ rows = new0(JsonVariant*, n_rows-1);
+ if (!rows)
+ return -ENOMEM;
+
+ CLEANUP_ARRAY(rows, (size_t) { n_rows - 1 }, json_variant_unref_many);
+
+ for (size_t i = 1; i < n_rows; i++) {
+ TableData **row;
+
+ if (sorted)
+ row = t->data + sorted[i];
+ else
+ row = t->data + i * t->n_columns;
+
+ for (size_t j = 0; j < display_columns; j++) {
+ TableData *d;
+ size_t k;
+
+ assert_se(d = row[t->display_map ? t->display_map[j] : j]);
+ /* Odd slot: drop the value left over from the previous row before storing the new one */
+ k = j*2+1;
+ elements[k] = json_variant_unref(elements[k]);
+
+ r = table_data_to_json(d, elements + k);
+ if (r < 0)
+ return r;
+ }
+
+ r = json_variant_new_object(rows + i - 1, elements, display_columns * 2);
+ if (r < 0)
+ return r;
+ }
+
+ return json_variant_new_array(ret, rows, n_rows - 1);
+ }
+
+ static int table_to_json_vertical(Table *t, JsonVariant **ret) {
+ JsonVariant **elements = NULL;
+ size_t n_elements = 0;
+ int r;
+ /* Converts a vertical table into a single JSON object: the first cell of each row supplies the field name, the second one the value. The first row is skipped. Returns -EINVAL if the table does not have exactly two columns. */
+ assert(t);
+ assert(t->vertical);
+
+ if (t->n_columns != 2)
+ return -EINVAL;
+
+ /* Ensure we have no incomplete rows */
+ assert(t->n_cells % t->n_columns == 0);
+
+ elements = new0(JsonVariant *, t->n_cells);
+ if (!elements)
+ return -ENOMEM;
+
+ CLEANUP_ARRAY(elements, n_elements, json_variant_unref_many);
+
+ for (size_t i = t->n_columns; i < t->n_cells; i++) {
+ /* First cell of a row: the field name. Explicitly set names are looked up by row index, not counting the skipped first row */
+ if (i % t->n_columns == 0) {
+ _cleanup_free_ char *mangled = NULL;
+ const char *n;
+
+ n = table_get_json_field_name(t, i / t->n_columns - 1);
+ if (!n) {
+ r = table_make_json_field_name(t, ASSERT_PTR(t->data[i]), &mangled);
+ if (r < 0)
+ return r;
+
+ n = mangled;
+ }
+
+ r = json_variant_new_string(elements + n_elements, n);
+ } else
+ r = table_data_to_json(t->data[i], elements + n_elements);
+ if (r < 0)
+ return r;
+
+ n_elements++;
+ }
+
+ return json_variant_new_object(ret, elements, n_elements);
+ }
+
+ int table_to_json(Table *t, JsonVariant **ret) {
+         assert(t);
+
+         /* Vertical tables become a single JSON object, regular ones an array with one object per row */
+         return t->vertical ?
+                 table_to_json_vertical(t, ret) :
+                 table_to_json_regular(t, ret);
+ }
+
+ int table_print_json(Table *t, FILE *f, JsonFormatFlags flags) {
+         _cleanup_(json_variant_unrefp) JsonVariant *json = NULL;
+         FILE *out = f ?: stdout;
+         int r;
+
+         assert(t);
+
+         /* Prints the table as JSON to 'f' (stdout if NULL). If JSON output is turned off via
+          * JSON_FORMAT_OFF, the regular table_print() output is used instead. */
+         if ((flags & JSON_FORMAT_OFF) != 0)
+                 return table_print(t, f);
+
+         r = table_to_json(t, &json);
+         if (r < 0)
+                 return r;
+
+         json_variant_dump(json, flags, out, NULL);
+
+         return fflush_and_check(out);
+ }
+
+ int table_print_with_pager(
+                 Table *t,
+                 JsonFormatFlags json_format_flags,
+                 PagerFlags pager_flags,
+                 bool show_header) {
+
+         const JsonFormatFlags paged_modes = JSON_FORMAT_OFF|JSON_FORMAT_PRETTY|JSON_FORMAT_PRETTY_AUTO;
+         bool previous;
+         int r;
+
+         assert(t);
+
+         /* An all-in-one helper for showing tables: opens the pager if any of the paged output modes
+          * is selected, prints the table to stdout with the header setting temporarily forced to
+          * 'show_header', and logs about any error. */
+
+         if ((json_format_flags & paged_modes) != 0)
+                 pager_open(pager_flags);
+
+         previous = t->header;
+         t->header = show_header;
+         r = table_print_json(t, stdout, json_format_flags);
+         t->header = previous; /* restore the caller's setting, regardless of the outcome */
+
+         return r < 0 ? table_log_print_error(r) : 0;
+ }
+
+ int table_set_json_field_name(Table *t, size_t idx, const char *name) {
+         size_t n_needed;
+         int r;
+
+         assert(t);
+
+         /* Sets (or, if 'name' is NULL, drops) the explicit JSON field name stored at index 'idx'. */
+         if (!name) {
+                 /* Dropping: nothing to do (0) if the index lies beyond the stored names, 1 otherwise */
+                 if (idx >= t->n_json_fields)
+                         return 0;
+                 t->json_fields[idx] = mfree(t->json_fields[idx]);
+                 return 1;
+         }
+
+         /* Setting: grow the array so that it covers 'idx', then replace the entry with a copy */
+         n_needed = MAX(idx + 1, t->n_json_fields);
+         if (!GREEDY_REALLOC0(t->json_fields, n_needed))
+                 return -ENOMEM;
+
+         r = free_and_strdup(t->json_fields + idx, name);
+         if (r < 0)
+                 return r;
+         t->n_json_fields = n_needed;
+         return r;
+ }
diff --git a/src/shared/format-table.h b/src/shared/format-table.h
new file mode 100644
index 0000000..37bfbca
--- /dev/null
+++ b/src/shared/format-table.h
@@ -0,0 +1,165 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+#include <stdio.h>
+#include <sys/types.h>
+
+#include "json.h"
+#include "macro.h"
+#include "pager.h"
+
+ typedef enum TableDataType {
+ TABLE_EMPTY, /* a cell without data */
+ TABLE_STRING,
+ TABLE_HEADER, /* in regular mode: the cells in the first row, that carry the column names */
+ TABLE_FIELD, /* in vertical mode: the cells in the first column, that carry the field names */
+ TABLE_STRV,
+ TABLE_STRV_WRAPPED,
+ TABLE_PATH,
+ TABLE_PATH_BASENAME, /* like TABLE_PATH, but display only last path element (i.e. the "basename") in regular output */
+ TABLE_BOOLEAN,
+ TABLE_BOOLEAN_CHECKMARK,
+ TABLE_TIMESTAMP,
+ TABLE_TIMESTAMP_UTC,
+ TABLE_TIMESTAMP_RELATIVE,
+ TABLE_TIMESTAMP_RELATIVE_MONOTONIC,
+ TABLE_TIMESTAMP_LEFT,
+ TABLE_TIMESTAMP_DATE,
+ TABLE_TIMESPAN,
+ TABLE_TIMESPAN_MSEC,
+ TABLE_TIMESPAN_DAY,
+ TABLE_SIZE,
+ TABLE_BPS,
+ TABLE_INT,
+ TABLE_INT8,
+ TABLE_INT16,
+ TABLE_INT32,
+ TABLE_INT64,
+ TABLE_UINT,
+ TABLE_UINT8,
+ TABLE_UINT16,
+ TABLE_UINT32,
+ TABLE_UINT32_HEX,
+ TABLE_UINT64,
+ TABLE_UINT64_HEX, /* a uint64_t, displayed in hexadecimal */
+ TABLE_PERCENT, /* an int, displayed with a trailing '%' */
+ TABLE_IFINDEX,
+ TABLE_IN_ADDR, /* Takes a union in_addr_union (or a struct in_addr) */
+ TABLE_IN6_ADDR, /* Takes a union in_addr_union (or a struct in6_addr) */
+ TABLE_ID128,
+ TABLE_UUID,
+ TABLE_UID,
+ TABLE_GID,
+ TABLE_PID,
+ TABLE_SIGNAL, /* a signal number, displayed as its name with a "SIG" prefix */
+ TABLE_MODE, /* as in UNIX file mode (mode_t), in typical octal output */
+ TABLE_MODE_INODE_TYPE, /* also mode_t, but displays only the inode type as string */
+ TABLE_DEVNUM, /* a dev_t, displayed in the usual major:minor way */
+ _TABLE_DATA_TYPE_MAX, /* also the end marker the table_add_many() macro appends to its argument list */
+
+ /* The following are not really data types, but commands for table_add_many() to make changes to
+ * a cell just added. */
+ TABLE_SET_MINIMUM_WIDTH,
+ TABLE_SET_MAXIMUM_WIDTH,
+ TABLE_SET_WEIGHT,
+ TABLE_SET_ALIGN_PERCENT,
+ TABLE_SET_ELLIPSIZE_PERCENT,
+ TABLE_SET_COLOR,
+ TABLE_SET_RGAP_COLOR,
+ TABLE_SET_BOTH_COLORS,
+ TABLE_SET_URL,
+ TABLE_SET_UPPERCASE,
+
+ _TABLE_DATA_TYPE_INVALID = -EINVAL,
+ } TableDataType;
+
+ typedef enum TableErsatz { /* selects the placeholder ("ersatz") string of a table, see table_set_ersatz_string(); presumably what is shown for cells without a valid value -- confirm in format-table.c */
+ TABLE_ERSATZ_EMPTY,
+ TABLE_ERSATZ_DASH,
+ TABLE_ERSATZ_UNSET,
+ TABLE_ERSATZ_NA,
+ _TABLE_ERSATZ_MAX,
+ } TableErsatz;
+
+typedef struct Table Table;
+typedef struct TableCell TableCell;
+
+Table *table_new_internal(const char *first_header, ...) _sentinel_;
+#define table_new(...) table_new_internal(__VA_ARGS__, NULL)
+Table *table_new_raw(size_t n_columns);
+Table *table_new_vertical(void);
+Table *table_unref(Table *t);
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(Table*, table_unref);
+
+int table_add_cell_full(Table *t, TableCell **ret_cell, TableDataType type, const void *data, size_t minimum_width, size_t maximum_width, unsigned weight, unsigned align_percent, unsigned ellipsize_percent);
+ static inline int table_add_cell(Table *t, TableCell **ret_cell, TableDataType type, const void *data) { /* shorthand for table_add_cell_full() that passes SIZE_MAX/UINT_MAX for the width, weight, alignment and ellipsizing parameters (presumably meaning "use the defaults" -- confirm in table_add_cell_full()) */
+ return table_add_cell_full(t, ret_cell, type, data, SIZE_MAX, SIZE_MAX, UINT_MAX, UINT_MAX, UINT_MAX);
+ }
+int table_add_cell_stringf_full(Table *t, TableCell **ret_cell, TableDataType type, const char *format, ...) _printf_(4, 5);
+#define table_add_cell_stringf(t, ret_cell, format, ...) table_add_cell_stringf_full(t, ret_cell, TABLE_STRING, format, __VA_ARGS__)
+
+int table_fill_empty(Table *t, size_t until_column);
+
+int table_dup_cell(Table *t, TableCell *cell);
+
+int table_set_minimum_width(Table *t, TableCell *cell, size_t minimum_width);
+int table_set_maximum_width(Table *t, TableCell *cell, size_t maximum_width);
+int table_set_weight(Table *t, TableCell *cell, unsigned weight);
+int table_set_align_percent(Table *t, TableCell *cell, unsigned percent);
+int table_set_ellipsize_percent(Table *t, TableCell *cell, unsigned percent);
+int table_set_color(Table *t, TableCell *cell, const char *color);
+int table_set_rgap_color(Table *t, TableCell *cell, const char *color);
+int table_set_url(Table *t, TableCell *cell, const char *url);
+int table_set_uppercase(Table *t, TableCell *cell, bool b);
+
+int table_update(Table *t, TableCell *cell, TableDataType type, const void *data);
+
+int table_add_many_internal(Table *t, TableDataType first_type, ...);
+#define table_add_many(t, ...) table_add_many_internal(t, __VA_ARGS__, _TABLE_DATA_TYPE_MAX)
+
+void table_set_header(Table *table, bool b);
+void table_set_width(Table *t, size_t width);
+void table_set_cell_height_max(Table *t, size_t height);
+void table_set_ersatz_string(Table *t, TableErsatz ersatz);
+int table_set_display_internal(Table *t, size_t first_column, ...);
+#define table_set_display(...) table_set_display_internal(__VA_ARGS__, SIZE_MAX)
+int table_set_sort_internal(Table *t, size_t first_column, ...);
+#define table_set_sort(...) table_set_sort_internal(__VA_ARGS__, SIZE_MAX)
+int table_set_reverse(Table *t, size_t column, bool b);
+int table_hide_column_from_display_internal(Table *t, ...);
+#define table_hide_column_from_display(t, ...) table_hide_column_from_display_internal(t, __VA_ARGS__, (size_t) -1)
+
+int table_print(Table *t, FILE *f);
+int table_format(Table *t, char **ret);
+
+ static inline TableCell* TABLE_HEADER_CELL(size_t i) { /* cell handle for index 'i', i.e. the header cell of column 'i'; the handle is the index plus one, converted to a pointer (presumably so that NULL never denotes a valid cell -- TODO confirm) */
+ return SIZE_TO_PTR(i + 1);
+ }
+
+size_t table_get_rows(Table *t);
+size_t table_get_columns(Table *t);
+
+size_t table_get_current_column(Table *t);
+
+TableCell *table_get_cell(Table *t, size_t row, size_t column);
+
+const void *table_get(Table *t, TableCell *cell);
+const void *table_get_at(Table *t, size_t row, size_t column);
+
+int table_to_json(Table *t, JsonVariant **ret);
+int table_print_json(Table *t, FILE *f, JsonFormatFlags json_flags);
+
+int table_print_with_pager(Table *t, JsonFormatFlags json_format_flags, PagerFlags pager_flags, bool show_header);
+
+int table_set_json_field_name(Table *t, size_t idx, const char *name);
+
+ #define table_log_add_error(r) /* logs 'r' as failure to add cells; evaluates to what log_error_errno() returns */ \
+ log_error_errno(r, "Failed to add cells to table: %m")
+
+ #define table_log_print_error(r) /* same, for failures to print a table */ \
+ log_error_errno(r, "Failed to print table: %m")
+
+ #define table_log_sort_error(r) /* same, for failures to sort a table */ \
+ log_error_errno(r, "Failed to sort table: %m")
diff --git a/src/shared/fsck-util.h b/src/shared/fsck-util.h
new file mode 100644
index 0000000..855137c
--- /dev/null
+++ b/src/shared/fsck-util.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+/* Exit codes as defined in fsck(8). Apart from FSCK_SUCCESS every constant is a distinct single bit;
+ * per fsck(8) the exit status is the sum of the conditions that apply, hence values may be combined.
+ * Note that bit 6 (value 64) is not assigned here. */
+enum {
+ FSCK_SUCCESS = 0, /* 0 */
+ FSCK_ERROR_CORRECTED = 1 << 0, /* 1 */
+ FSCK_SYSTEM_SHOULD_REBOOT = 1 << 1, /* 2 */
+ FSCK_ERRORS_LEFT_UNCORRECTED = 1 << 2, /* 4 */
+ FSCK_OPERATIONAL_ERROR = 1 << 3, /* 8 */
+ FSCK_USAGE_OR_SYNTAX_ERROR = 1 << 4, /* 16 */
+ FSCK_USER_CANCELLED = 1 << 5, /* 32 */
+ FSCK_SHARED_LIB_ERROR = 1 << 7, /* 128 */
+};
diff --git a/src/shared/fstab-util.c b/src/shared/fstab-util.c
new file mode 100644
index 0000000..55e76b6
--- /dev/null
+++ b/src/shared/fstab-util.c
@@ -0,0 +1,366 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "alloc-util.h"
+#include "device-nodes.h"
+#include "fstab-util.h"
+#include "initrd-util.h"
+#include "macro.h"
+#include "mount-util.h"
+#include "nulstr-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "proc-cmdline.h"
+#include "string-util.h"
+#include "strv.h"
+
+/* Tells whether fstab handling is enabled. The answer is cached in a function-local static:
+ *  - enabled >= 0 overrides the cache with the given value,
+ *  - enabled <  0 only queries; on the first query the "fstab" kernel command line switch is read.
+ * If the switch is missing or cannot be parsed, the result defaults to true. */
+bool fstab_enabled_full(int enabled) {
+        static int cached = -1;
+        bool from_cmdline = true; /* Stays true when nothing is specified or the lookup fails. */
+        int r;
+
+        /* An explicit setting from the caller always wins over what we remembered. */
+        if (enabled >= 0)
+                cached = enabled;
+
+        if (cached < 0) {
+                r = proc_cmdline_get_bool("fstab", PROC_CMDLINE_STRIP_RD_PREFIX|PROC_CMDLINE_TRUE_WHEN_MISSING, &from_cmdline);
+                if (r < 0)
+                        log_debug_errno(r, "Failed to parse fstab= kernel command line option, ignoring: %m");
+
+                cached = from_cmdline;
+        }
+
+        return cached;
+}
+
+/* Checks whether any entry of the fstab file uses the file system type 'fstype'.
+ *
+ * Returns true if such an entry exists, false if not (also when fstab handling is disabled or the
+ * file does not exist), and a negative errno-style value if the file cannot be opened or read. */
+int fstab_has_fstype(const char *fstype) {
+        _cleanup_endmntent_ FILE *f = NULL;
+        struct mntent *entry;
+
+        assert(fstype);
+
+        if (!fstab_enabled())
+                return false;
+
+        f = setmntent(fstab_path(), "re");
+        if (!f) {
+                if (errno == ENOENT)
+                        return false;
+
+                return -errno;
+        }
+
+        for (;;) {
+                /* getmntent() returns NULL both at end of file and on error; clear errno first so the
+                 * two cases can be told apart afterwards. */
+                errno = 0;
+                entry = getmntent(f);
+                if (!entry)
+                        break;
+
+                if (streq(entry->mnt_type, fstype))
+                        return true;
+        }
+
+        return errno != 0 ? -errno : false;
+}
+
+/* Returns true for the mounts this function classifies as "extrinsic". Three groups qualify:
+ *  1. the OS trees themselves,
+ *  2. anything at or below the listed /run/ and API VFS prefixes,
+ *  3. entries marked x-initrd.mount while we are not running in the initrd. */
+bool fstab_is_extrinsic(const char *mount, const char *opts) {
+
+        return
+                /* The OS data itself is not worth bothering with */
+                PATH_IN_SET(mount,
+                            "/",
+                            "/usr",
+                            "/etc") ||
+
+                PATH_STARTSWITH_SET(mount,
+                                    "/run/initramfs",    /* Stays around from before boot until after shutdown */
+                                    "/run/nextroot",     /* Similar (though might be updated from the host) */
+                                    "/proc",             /* API VFS */
+                                    "/sys",              /* API VFS as well */
+                                    "/dev") ||           /* API VFS as well */
+
+                /* An initrd mount seen from outside the initrd is left around forever, too. */
+                (fstab_test_option(opts, "x-initrd.mount\0") && !in_initrd());
+}
+
+/* Compares the first field of an fstab entry ('what_fstab', possibly a LABEL=/UUID=/… tag) with
+ * 'path'. The fstab value is first translated with fstab_node_to_udev_node().
+ *
+ * Returns true if the paths are equal, otherwise the result of devnode_same() if both are device
+ * paths, otherwise false. Returns -ENOMEM if the translation fails. */
+static int fstab_is_same_node(const char *what_fstab, const char *path) {
+        _cleanup_free_ char *resolved = NULL;
+
+        assert(what_fstab);
+        assert(path);
+
+        resolved = fstab_node_to_udev_node(what_fstab);
+        if (!resolved)
+                return -ENOMEM;
+
+        if (path_equal(resolved, path))
+                return true;
+
+        /* Only two device paths can still refer to the same thing under different names. */
+        if (!is_device_path(path) || !is_device_path(resolved))
+                return false;
+
+        return devnode_same(resolved, path);
+}
+
+/* Searches the fstab file for an entry matching 'where' (mount directory) and/or 'path' (backing
+ * node). At least one of the two must be given; a NULL argument matches anything.
+ *
+ * Returns > 0 if a matching entry exists, false if not (also when fstab handling is disabled or the
+ * file is missing), negative errno-style value on failure. Errors from the node comparison for which
+ * ERRNO_IS_DEVICE_ABSENT() holds are ignored and the search continues. */
+int fstab_is_mount_point_full(const char *where, const char *path) {
+        _cleanup_endmntent_ FILE *f = NULL;
+        struct mntent *entry;
+        int r;
+
+        assert(where || path);
+
+        if (!fstab_enabled())
+                return false;
+
+        f = setmntent(fstab_path(), "re");
+        if (!f) {
+                if (errno == ENOENT)
+                        return false;
+
+                return -errno;
+        }
+
+        for (;;) {
+                /* Clear errno so that end-of-file and a read error can be distinguished below. */
+                errno = 0;
+                entry = getmntent(f);
+                if (!entry)
+                        break;
+
+                if (where && !path_equal(where, entry->mnt_dir))
+                        continue;
+
+                /* The directory matched and no node was requested: that is a hit. */
+                if (!path)
+                        return true;
+
+                r = fstab_is_same_node(entry->mnt_fsname, path);
+                if (r > 0)
+                        return r;
+                if (r < 0 && !ERRNO_IS_DEVICE_ABSENT(r))
+                        return r;
+        }
+
+        return errno != 0 ? -errno : false;
+}
+
+/* Finds any options in the comma-separated list 'opts' that match one of 'names' (a NULSTR, i.e.
+ * NUL-separated and double-NUL-terminated), and returns:
+ * - the last matching option name in ret_namefound (points into 'names', NULL if nothing matched),
+ * - the last matching value in ret_value (NULL if the last match carried no "=value"),
+ * - all matching values in ret_values (may be NULL if there were none),
+ * - the rest of the option string in ret_filtered.
+ *
+ * ret_value and ret_values are mutually exclusive. All ret_* arguments are optional. opts may be
+ * NULL, which is treated as "nothing found".
+ *
+ * If !ret_value and !ret_values and !ret_filtered, this function is not allowed to fail.
+ *
+ * Returns negative on error, true if any matching options were found, false otherwise. On error no
+ * output parameter is modified. */
+int fstab_filter_options(
+                const char *opts,
+                const char *names,
+                const char **ret_namefound,
+                char **ret_value,
+                char ***ret_values,
+                char **ret_filtered) {
+
+        const char *namefound = NULL, *x;
+        _cleanup_strv_free_ char **stor = NULL, **values = NULL;
+        /* 'filtered' is a shallow copy of 'stor': the strings are owned by 'stor', only the array
+         * itself is freed here. */
+        _cleanup_free_ char *value = NULL, **filtered = NULL;
+        char *joined = NULL;
+        int r;
+
+        assert(names && *names);
+        assert(!(ret_value && ret_values));
+
+        if (!opts)
+                goto answer;
+
+        if (ret_filtered || ret_value || ret_values) {
+                /* For backwards compatibility, we need to pass-through escape characters.
+                 * The only ones we "consume" are the ones used as "\," or "\\". */
+                r = strv_split_full(&stor, opts, ",", EXTRACT_UNESCAPE_SEPARATORS | EXTRACT_UNESCAPE_RELAX);
+                if (r < 0)
+                        return r;
+
+                filtered = memdup(stor, sizeof(char*) * (strv_length(stor) + 1));
+                if (!filtered)
+                        return -ENOMEM;
+
+                /* Compact 'filtered' in place: 's' reads, 't' writes the non-matching entries. */
+                char **t = filtered;
+                for (char **s = t; *s; s++) {
+                        NULSTR_FOREACH(name, names) {
+                                x = startswith(*s, name);
+                                if (!x)
+                                        continue;
+                                /* Match name, but when ret_values, only when followed by assignment. */
+                                if (*x == '=' || (!ret_values && *x == '\0')) {
+                                        /* Keep the last occurrence found */
+                                        namefound = name;
+                                        goto found;
+                                }
+                        }
+
+                        *t = *s;
+                        t++;
+                        continue;
+                found:
+                        if (ret_value || ret_values) {
+                                assert(IN_SET(*x, '=', '\0'));
+
+                                if (ret_value) {
+                                        r = free_and_strdup(&value, *x == '=' ? x + 1 : NULL);
+                                        if (r < 0)
+                                                return r;
+                                } else if (*x) {
+                                        r = strv_extend(&values, x + 1);
+                                        if (r < 0)
+                                                return r;
+                                }
+                        }
+                }
+                *t = NULL;
+        } else
+                /* Allocation-free path: only the name of the last match is wanted. */
+                for (const char *word = opts;;) {
+                        const char *end = word;
+
+                        /* Look for a *non-escaped* comma separator. Only commas and backslashes can be
+                         * escaped, so "\," and "\\" are the only valid escape sequences, and we can do a
+                         * very simple test here. */
+                        for (;;) {
+                                end += strcspn(end, ",\\");
+
+                                if (IN_SET(*end, ',', '\0'))
+                                        break;
+                                assert(*end == '\\');
+                                end ++;                 /* Skip the backslash */
+                                if (*end != '\0')
+                                        end ++;         /* Skip the escaped char, but watch out for a trailing comma */
+                        }
+
+                        NULSTR_FOREACH(name, names) {
+                                if (end < word + strlen(name))
+                                        continue;
+                                if (!strneq(word, name, strlen(name)))
+                                        continue;
+
+                                /* We know that the string is NUL terminated, so *x is valid */
+                                x = word + strlen(name);
+                                if (IN_SET(*x, '\0', '=', ',')) {
+                                        namefound = name;
+                                        break;
+                                }
+                        }
+
+                        if (*end)
+                                word = end + 1;
+                        else
+                                break;
+                }
+
+answer:
+        /* Do the one remaining step that can fail first, so that no output parameter has been written
+         * to yet if we have to bail out. */
+        if (ret_filtered) {
+                joined = strv_join_full(filtered, ",", NULL, true);
+                if (!joined)
+                        return -ENOMEM;
+        }
+
+        if (ret_namefound)
+                *ret_namefound = namefound;
+        if (ret_filtered)
+                *ret_filtered = joined;
+        if (ret_value)
+                *ret_value = TAKE_PTR(value);
+        if (ret_values)
+                *ret_values = TAKE_PTR(values);
+
+        return !!namefound;
+}
+
+/* Extracts the numeric value of the "pri=" option from an fstab option string.
+ *
+ * Returns 1 and stores the value in *ret if the option is present with a value, 0 if it is absent or
+ * has no value (*ret is left untouched), negative errno-style value if filtering or parsing fails. */
+int fstab_find_pri(const char *options, int *ret) {
+        _cleanup_free_ char *value = NULL;
+        int r, parsed;
+
+        assert(ret);
+
+        r = fstab_filter_options(options, "pri\0", NULL, &value, NULL, NULL);
+        if (r < 0)
+                return r;
+
+        if (r > 0 && value) {
+                r = safe_atoi(value, &parsed);
+                if (r < 0)
+                        return r;
+
+                *ret = parsed;
+                return 1;
+        }
+
+        return 0;
+}
+
+/* Returns a newly allocated copy of 's' with one pair of enclosing quotes removed, provided the
+ * first character is listed in 'quotes' and the last character is the same one. Anything else,
+ * including strings shorter than two characters, is copied verbatim. Returns NULL on allocation
+ * failure.
+ *
+ * This is deliberately simplistic and knows nothing about escaping.
+ *
+ * DON'T USE THIS FOR NEW CODE ANYMORE! */
+static char *unquote(const char *s, const char* quotes) {
+        size_t len;
+
+        assert(s);
+
+        len = strlen(s);
+
+        if (len >= 2 && strchr(quotes, s[0]) && s[len-1] == s[0])
+                return strndup(s + 1, len - 2);
+
+        return strdup(s);
+}
+
+/* Turns the value of a LABEL=/UUID=/… tag into the matching "/dev/disk/by-<by>/<encoded value>"
+ * path. The value is unquoted and then passed through encode_devnode_name(). Returns a newly
+ * allocated string, or NULL if allocation or encoding fails. */
+static char *tag_to_udev_node(const char *tagvalue, const char *by) {
+        _cleanup_free_ char *encoded = NULL, *bare = NULL;
+        size_t bufsz;
+
+        bare = unquote(tagvalue, QUOTES);
+        if (!bare)
+                return NULL;
+
+        /* Reserve four output bytes per input byte, plus the trailing NUL. */
+        bufsz = strlen(bare) * 4 + 1;
+        encoded = new(char, bufsz);
+        if (!encoded)
+                return NULL;
+
+        if (encode_devnode_name(bare, encoded, bufsz) < 0)
+                return NULL;
+
+        return strjoin("/dev/disk/by-", by, "/", encoded);
+}
+
+/* Translates the first field of an fstab entry into a device node path: values starting with
+ * LABEL=, UUID=, PARTUUID= or PARTLABEL= become the corresponding /dev/disk/by-…/ path, everything
+ * else is duplicated unchanged. Returns a newly allocated string or NULL on failure. */
+char *fstab_node_to_udev_node(const char *p) {
+        static const struct {
+                const char *prefix;
+                const char *by;
+        } tag_table[] = {
+                { "LABEL=",     "label"     },
+                { "UUID=",      "uuid"      },
+                { "PARTUUID=",  "partuuid"  },
+                { "PARTLABEL=", "partlabel" },
+        };
+
+        assert(p);
+
+        for (size_t i = 0; i < ELEMENTSOF(tag_table); i++) {
+                const char *rest = startswith(p, tag_table[i].prefix);
+
+                if (rest)
+                        return tag_to_udev_node(rest, tag_table[i].by);
+        }
+
+        return strdup(p);
+}
+
+/* Returns true if the entry describes a bind mount: either the options contain "bind" or "rbind",
+ * or the file system type (which may be NULL) is literally "bind" or "rbind". */
+bool fstab_is_bind(const char *options, const char *fstype) {
+
+        return fstab_test_option(options, "bind\0" "rbind\0") ||
+                (fstype && STR_IN_SET(fstype, "bind", "rbind"));
+}
diff --git a/src/shared/fstab-util.h b/src/shared/fstab-util.h
new file mode 100644
index 0000000..9cf34f0
--- /dev/null
+++ b/src/shared/fstab-util.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+#include <stddef.h>
+
+#include "macro.h"
+
+/* fstab_enabled_full() takes a tri-state: a negative value only queries, a non-negative value is
+ * passed on as the new setting. The two wrappers below cover both uses. */
+bool fstab_enabled_full(int enabled);
+/* Query only: passes -1. */
+static inline bool fstab_enabled(void) {
+ return fstab_enabled_full(-1);
+}
+/* Passes the given boolean (i.e. 0 or 1) on as the new setting and returns the call's result. */
+static inline bool fstab_set_enabled(bool enabled) {
+ return fstab_enabled_full(enabled);
+}
+
+bool fstab_is_extrinsic(const char *mount, const char *opts);
+int fstab_has_fstype(const char *fstype);
+
+/* Looks up an fstab entry by mount directory ('where') and/or backing node ('path'). The wrappers
+ * below fix one of the two arguments to NULL. */
+int fstab_is_mount_point_full(const char *where, const char *path);
+/* Lookup by mount directory only. */
+static inline int fstab_is_mount_point(const char *where) {
+ return fstab_is_mount_point_full(where, NULL);
+}
+/* Lookup by backing node only. */
+static inline int fstab_has_node(const char *path) {
+ return fstab_is_mount_point_full(NULL, path);
+}
+
+int fstab_filter_options(
+ const char *opts,
+ const char *names,
+ const char **ret_namefound,
+ char **ret_value,
+ char ***ret_values,
+ char **ret_filtered);
+
+/* Returns true if any of the option names in the NULSTR 'names' occurs in 'opts'. No output
+ * parameters are requested from fstab_filter_options(); any non-zero result counts as true. */
+static inline bool fstab_test_option(const char *opts, const char *names) {
+        return fstab_filter_options(opts, names, NULL, NULL, NULL, NULL) != 0;
+}
+
+int fstab_find_pri(const char *options, int *ret);
+
+/* 'yes_no' is a NULSTR holding two option names, the "yes" name first.
+ *
+ * Returns true if the first name is the last of the two to appear in 'opts'. Returns false if the
+ * second name appears last or neither name is found. */
+static inline bool fstab_test_yes_no_option(const char *opts, const char *yes_no) {
+        const char *last_match;
+        int r;
+
+        r = fstab_filter_options(opts, yes_no, &last_match, NULL, NULL, NULL);
+        assert_se(r >= 0);
+
+        /* The returned name points into 'yes_no'; only the first entry starts at its beginning. */
+        return last_match == yes_no;
+}
+
+char *fstab_node_to_udev_node(const char *p);
+
+/* Returns the fstab file to use: the value of $SYSTEMD_FSTAB as seen by secure_getenv(), or
+ * "/etc/fstab" if that yields nothing. */
+static inline const char* fstab_path(void) {
+        const char *override = secure_getenv("SYSTEMD_FSTAB");
+
+        return override ?: "/etc/fstab";
+}
+
+bool fstab_is_bind(const char *options, const char *fstype);
diff --git a/src/shared/generate-ip-protocol-list.sh b/src/shared/generate-ip-protocol-list.sh
new file mode 100755
index 0000000..ff898a9
--- /dev/null
+++ b/src/shared/generate-ip-protocol-list.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+set -eu
+set -o pipefail
+
+${1:?} -dM -include netinet/in.h - </dev/null | \
+ awk '/^#define[ \t]+IPPROTO_[^ \t]+[ \t]+[^ \t]/ { print $2; }' | \
+ sed -e 's/IPPROTO_//'
diff --git a/src/shared/generate-syscall-list.py b/src/shared/generate-syscall-list.py
new file mode 100755
index 0000000..c0975a0
--- /dev/null
+++ b/src/shared/generate-syscall-list.py
@@ -0,0 +1,7 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+import sys
+
+for line in open(sys.argv[1]):
+ print('"{}\\0"'.format(line.strip()))
diff --git a/src/shared/generator.c b/src/shared/generator.c
new file mode 100644
index 0000000..5626587
--- /dev/null
+++ b/src/shared/generator.c
@@ -0,0 +1,888 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "cgroup-util.h"
+#include "dropin.h"
+#include "escape.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "fstab-util.h"
+#include "generator.h"
+#include "initrd-util.h"
+#include "log.h"
+#include "macro.h"
+#include "mkdir-label.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "special.h"
+#include "specifier.h"
+#include "string-util.h"
+#include "time-util.h"
+#include "tmpfile-util.h"
+#include "unit-name.h"
+
+/* Opens a unit file for writing and emits the "# Automatically generated by …" header line.
+ *
+ * Two modes:
+ *  - ret_temp_path == NULL: creates <dir>/<fn> exclusively (it must not exist yet). 'source', if
+ *    given, is only used to produce a more helpful message when the file already exists.
+ *  - ret_temp_path != NULL: creates a temporary file in <dir>, sets its mode to 0644 (best effort)
+ *    and returns its path; 'fn' is not used.
+ *
+ * On success returns 0 and hands the open stream to the caller via *ret_file. On failure logs and
+ * returns a negative errno-style value. */
+int generator_open_unit_file_full(
+                const char *dir,
+                const char *source,
+                const char *fn,
+                FILE **ret_file,
+                char **ret_temp_path) {
+
+        _cleanup_free_ char *path = NULL;
+        FILE *f;
+        int r;
+
+        assert(dir);
+        assert(ret_file);
+
+        if (!ret_temp_path) {
+                assert(fn);
+
+                path = path_join(dir, fn);
+                if (!path)
+                        return log_oom();
+
+                r = fopen_unlocked(path, "wxe", &f);
+                if (r == -EEXIST && source)
+                        return log_error_errno(r,
+                                               "Failed to create unit file '%s', as it already exists. Duplicate entry in '%s'?",
+                                               path, source);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to create unit file '%s': %m", path);
+        } else {
+                r = fopen_temporary(dir, &f, &path);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to create temporary unit file in '%s': %m", dir);
+
+                (void) fchmod(fileno(f), 0644);
+
+                *ret_temp_path = TAKE_PTR(path);
+        }
+
+        fprintf(f,
+                "# Automatically generated by %s\n\n",
+                program_invocation_short_name);
+
+        *ret_file = f;
+        return 0;
+}
+
+
+/* Creates the symlink <dir>/<dst>.<dep_type>/<name>, where <name> is the file name of 'src' (with
+ * 'instance' filled in if one is given, in which case 'src' must name a template unit). The link
+ * points to 'src' itself if it has a directory part, otherwise to ../<src>.
+ *
+ * Parent directories are created as needed (best effort), and an already existing link is not an
+ * error. Returns 0 on success, logs and returns a negative errno-style value otherwise. */
+int generator_add_symlink_full(
+                const char *dir,
+                const char *dst,
+                const char *dep_type,
+                const char *src,
+                const char *instance) {
+
+        _cleanup_free_ char *src_dir = NULL, *src_name = NULL, *instantiated = NULL, *link_path = NULL, *target = NULL;
+        int r;
+
+        assert(dir);
+        assert(dst);
+        assert(dep_type);
+        assert(src);
+
+        r = path_extract_directory(src, &src_dir);
+        if (r < 0 && r != -EDESTADDRREQ) /* EDESTADDRREQ → just a file name was passed */
+                return log_error_errno(r, "Failed to extract directory name from '%s': %m", src);
+
+        r = path_extract_filename(src, &src_name);
+        if (r < 0)
+                return log_error_errno(r, "Failed to extract file name from '%s': %m", src);
+        if (r == O_DIRECTORY)
+                return log_error_errno(SYNTHETIC_ERRNO(EISDIR), "Expected path to regular file name, but got '%s', refusing.", src);
+
+        if (instance) {
+                r = unit_name_replace_instance(src_name, instance, &instantiated);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to instantiate '%s' for '%s': %m", src_name, instance);
+        }
+
+        /* What the link points at: the original (non-instantiated) name, below the source's own
+         * directory or, lacking one, one level up. */
+        target = path_join(src_dir ?: "..", src_name);
+        if (!target)
+                return log_oom();
+
+        /* Where the link lives. */
+        link_path = strjoin(dir, "/", dst, ".", dep_type, "/", instantiated ?: src_name);
+        if (!link_path)
+                return log_oom();
+
+        (void) mkdir_parents_label(link_path, 0755);
+
+        if (symlink(target, link_path) < 0 && errno != EEXIST)
+                return log_error_errno(errno, "Failed to create symlink \"%s\": %m", link_path);
+
+        return 0;
+}
+
+/* Writes the drop-in <dir>/<src>.d/50-order-<dst>.conf which adds the line "<order>=<dst>" to the
+ * [Unit] section of <src>. If 'instance' is given, it is inserted into the template name 'dst'
+ * first. The drop-in is created exclusively, i.e. it must not exist yet.
+ *
+ * Returns 0 on success, logs and returns a negative errno-style value otherwise. */
+static int generator_add_ordering(
+                const char *dir,
+                const char *src,
+                const char *order,
+                const char *dst,
+                const char *instance) {
+
+        _cleanup_free_ char *instantiated = NULL, *dropin_path = NULL, *dropin_name = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        const char *to = dst;
+        int r;
+
+        assert(dir);
+        assert(src);
+        assert(order);
+        assert(dst);
+
+        if (instance) {
+                r = unit_name_replace_instance(dst, instance, &instantiated);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to instantiate '%s' for '%s': %m", dst, instance);
+
+                to = instantiated;
+        }
+
+        dropin_name = strjoin(src, ".d/50-order-", to, ".conf");
+        if (!dropin_name)
+                return log_oom();
+
+        dropin_path = path_join(dir, dropin_name);
+        if (!dropin_path)
+                return log_oom();
+
+        (void) mkdir_parents_label(dropin_path, 0755);
+
+        r = fopen_unlocked(dropin_path, "wxe", &f);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create '%s': %m", dropin_path);
+
+        fprintf(f,
+                "# Automatically generated by %s\n\n"
+                "[Unit]\n"
+                "%s=%s\n",
+                program_invocation_short_name,
+                order,
+                to);
+
+        r = fflush_and_check(f);
+        if (r < 0)
+                return log_error_errno(r, "Failed to write drop-in '%s': %m", dropin_path);
+
+        return 0;
+}
+
+/* Generates an initrd-specific version of systemd-fsck-root.service or systemd-fsck-usr.service
+ * in 'dir'. The statically shipped units refer to / and /usr, which is not what the initrd needs
+ * (there the trees live at /sysroot and /sysusr/usr), hence they are overridden by the unit written
+ * here, which checks the device 'what' and binds to its .device unit. 'extra_after' may be NULL or
+ * name further units to order after.
+ *
+ * Returns 0 on success, logs and returns a negative errno-style value otherwise. */
+static int write_fsck_sysroot_service(
+                const char *unit,        /* Either SPECIAL_FSCK_ROOT_SERVICE or SPECIAL_FSCK_USR_SERVICE */
+                const char *dir,
+                const char *what,
+                const char *extra_after) {
+
+        _cleanup_free_ char *device_unit = NULL, *what_for_text = NULL, *what_for_exec = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        int r;
+
+        assert(unit);
+        assert(dir);
+        assert(what);
+
+        /* For Description=: only '%' needs protecting. */
+        what_for_text = specifier_escape(what);
+        if (!what_for_text)
+                return log_oom();
+
+        /* For ExecStart=: additionally C-escape the already specifier-escaped string. */
+        what_for_exec = cescape(what_for_text);
+        if (!what_for_exec)
+                return log_oom();
+
+        r = unit_name_from_path(what, ".device", &device_unit);
+        if (r < 0)
+                return log_error_errno(r, "Failed to convert device \"%s\" to unit name: %m", what);
+
+        r = generator_open_unit_file(dir, /* source = */ NULL, unit, &f);
+        if (r < 0)
+                return r;
+
+        /* Positional arguments: 1=device text, 2=unit name, 3=device unit, 4=extra After= units,
+         * 5=separator after them (empty if there are none), 6=device for the command line. */
+        fprintf(f,
+                "[Unit]\n"
+                "Description=File System Check on %1$s\n"
+                "Documentation=man:%2$s(8)\n"
+                "\n"
+                "DefaultDependencies=no\n"
+                "BindsTo=%3$s\n"
+                "Conflicts=shutdown.target\n"
+                "After=%4$s%5$slocal-fs-pre.target %3$s\n"
+                "Before=shutdown.target\n"
+                "\n"
+                "[Service]\n"
+                "Type=oneshot\n"
+                "RemainAfterExit=yes\n"
+                "ExecStart=" SYSTEMD_FSCK_PATH " %6$s\n"
+                "TimeoutSec=infinity\n",
+                what_for_text,
+                unit,
+                device_unit,
+                strempty(extra_after),
+                isempty(extra_after) ? "" : " ",
+                what_for_exec);
+
+        r = fflush_and_check(f);
+        if (r < 0)
+                return log_error_errno(r, "Failed to write unit %s: %m", unit);
+
+        return 0;
+}
+
+/* Hooks up a file system check for the device 'what' that is to be mounted on 'where'.
+ *
+ * f      – the unit file being generated; for everything but "/" a "<dep>=<fsck unit>" and an
+ *          "After=<fsck unit>" line are appended to it
+ * dir    – generator output directory; receives a symlink (for "/") or a generated fsck unit (for
+ *          /sysroot and /sysusr/usr in the initrd)
+ * fstype – file system type; may be NULL, empty or "auto", in which case only the generic fsck
+ *          command is looked for
+ *
+ * Returns 0 both when dependencies were written and when checking was skipped (initrd root/usr,
+ * 'what' is not a device, no suitable fsck found). Returns a negative errno-style value on failure. */
+int generator_write_fsck_deps(
+ FILE *f,
+ const char *dir,
+ const char *what,
+ const char *where,
+ const char *fstype) {
+
+ int r;
+
+ assert(f);
+ assert(dir);
+ assert(what);
+ assert(where);
+
+ /* Let's do an early exit if we are invoked for the root and /usr/ trees in the initrd, to avoid
+ * generating confusing log messages */
+ if (in_initrd() && PATH_IN_SET(where, "/", "/usr")) {
+ log_debug("Skipping fsck for %s in initrd.", where);
+ return 0;
+ }
+
+ if (!is_device_path(what)) {
+ log_warning("Checking was requested for \"%s\", but it is not a device.", what);
+ return 0;
+ }
+
+ /* A failure to detect the checker is only warned about and we proceed; a checker that is known
+ * to be missing makes us skip the check. */
+ if (!isempty(fstype) && !streq(fstype, "auto")) {
+ r = fsck_exists_for_fstype(fstype);
+ if (r < 0)
+ log_warning_errno(r, "Checking was requested for %s, but couldn't detect if fsck.%s may be used, proceeding: %m", what, fstype);
+ else if (r == 0) {
+ /* treat missing check as essentially OK */
+ log_debug("Checking was requested for %s, but fsck.%s does not exist.", what, fstype);
+ return 0;
+ }
+ } else {
+ r = fsck_exists();
+ if (r < 0)
+ log_warning_errno(r, "Checking was requested for %s, but couldn't detect if the fsck command may be used, proceeding: %m", what);
+ else if (r == 0) {
+ /* treat missing fsck as essentially OK */
+ log_debug("Checking was requested for %s, but the fsck command does not exist.", what);
+ return 0;
+ }
+ }
+
+ if (path_equal(where, "/")) {
+ const char *lnk;
+
+ /* We support running the fsck instance for the root fs while it is already mounted, for
+ * compatibility with non-initrd boots. It's ugly, but it is how it is. Since – unlike for
+ * regular file systems – this means the ordering is reversed (i.e. mount *before* fsck) we
+ * have a separate fsck unit for this, independent of systemd-fsck@.service. */
+
+ lnk = strjoina(dir, "/" SPECIAL_LOCAL_FS_TARGET ".wants/" SPECIAL_FSCK_ROOT_SERVICE);
+
+ /* Note that, unlike in generator_add_symlink_full(), an already existing link is an error here. */
+ (void) mkdir_parents(lnk, 0755);
+ if (symlink(SYSTEM_DATA_UNIT_DIR "/" SPECIAL_FSCK_ROOT_SERVICE, lnk) < 0)
+ return log_error_errno(errno, "Failed to create symlink %s: %m", lnk);
+
+ } else {
+ _cleanup_free_ char *_fsck = NULL;
+ const char *fsck, *dep;
+
+ if (in_initrd() && path_equal(where, "/sysroot")) {
+ r = write_fsck_sysroot_service(SPECIAL_FSCK_ROOT_SERVICE, dir, what, SPECIAL_INITRD_ROOT_DEVICE_TARGET);
+ if (r < 0)
+ return r;
+
+ fsck = SPECIAL_FSCK_ROOT_SERVICE;
+ dep = "Requires";
+
+ } else if (in_initrd() && path_equal(where, "/sysusr/usr")) {
+ r = write_fsck_sysroot_service(SPECIAL_FSCK_USR_SERVICE, dir, what, NULL);
+ if (r < 0)
+ return r;
+
+ fsck = SPECIAL_FSCK_USR_SERVICE;
+ dep = "Requires";
+ } else {
+ /* When this is /usr, then let's add a Wants= dependency, otherwise a Requires=
+ * dependency. Why? We can't possibly unmount /usr during shutdown, but if we have a
+ * Requires= from /usr onto a fsck@.service unit and that unit is shut down, then
+ * we'd have to unmount /usr too. */
+
+ dep = path_equal(where, "/usr") ? "Wants" : "Requires";
+
+ r = unit_name_from_path_instance("systemd-fsck", what, ".service", &_fsck);
+ if (r < 0)
+ return log_error_errno(r, "Failed to create fsck service name: %m");
+
+ fsck = _fsck;
+ }
+
+ /* Pull in the fsck unit and order the unit being written after it. */
+ fprintf(f,
+ "%1$s=%2$s\n"
+ "After=%2$s\n",
+ dep, fsck);
+ }
+
+ return 0;
+}
+
+/* Configure how long we wait for a device that backs a mount point or a swap partition to show up.
+ * This is useful to support endless device timeouts for devices that show up only after user input,
+ * like crypto devices.
+ *
+ * Looks for "x-systemd.device-timeout=" (or the legacy "comment=systemd.device-timeout=") in 'opts'
+ * and, if present, writes a drop-in for the .device unit of 'what' setting JobRunningTimeoutSec=.
+ * 'where' is only used in log messages. If 'filtered' is non-NULL it receives 'opts' with the
+ * timeout options removed.
+ *
+ * Returns 0 if there is nothing to do or the option had to be ignored (missing or unparsable value,
+ * 'what' is not a device), otherwise the result of write_drop_in_format(), or a negative errno-style
+ * value on failure. */
+int generator_write_timeouts(
+                const char *dir,
+                const char *what,
+                const char *where,
+                const char *opts,
+                char **filtered) {
+
+        _cleanup_free_ char *node = NULL, *unit = NULL, *timeout = NULL;
+        usec_t u;
+        int r;
+
+        r = fstab_filter_options(opts, "comment=systemd.device-timeout\0"
+                                       "x-systemd.device-timeout\0",
+                                 NULL, &timeout, NULL, filtered);
+        if (r < 0) {
+                log_warning_errno(r, "Failed to parse fstab options, ignoring: %m");
+                return 0;
+        }
+        if (r == 0)
+                return 0;
+
+        /* The option may have been specified without "=<value>". In that case the filter reports a
+         * match but returns no value, and there is nothing we could parse or write out. */
+        if (!timeout) {
+                log_warning("Device timeout option specified without a value for %s, ignoring.", where);
+                return 0;
+        }
+
+        /* Only used for validation; the original string is what gets written to the drop-in. */
+        r = parse_sec_fix_0(timeout, &u);
+        if (r < 0) {
+                log_warning("Failed to parse timeout for %s, ignoring: %s", where, timeout);
+                return 0;
+        }
+
+        node = fstab_node_to_udev_node(what);
+        if (!node)
+                return log_oom();
+        if (!is_device_path(node)) {
+                log_warning("x-systemd.device-timeout ignored for %s", what);
+                return 0;
+        }
+
+        r = unit_name_from_path(node, ".device", &unit);
+        if (r < 0)
+                return log_error_errno(r, "Failed to make unit name from path: %m");
+
+        return write_drop_in_format(dir, unit, 50, "device-timeout",
+                                    "# Automatically generated by %s\n"
+                                    "# from supplied options \"%s\"\n\n"
+                                    "[Unit]\n"
+                                    "JobRunningTimeoutSec=%s",
+                                    program_invocation_short_name,
+                                    opts,
+                                    timeout);
+}
+
+/* For fstab entries carrying the _netdev option, orders the backing *device* after the network.
+ * This matters when the device itself only appears once the network is up; for mounts without a
+ * real device (NFS, CIFS) the effect of _netdev on the mount unit itself is relied upon instead.
+ *
+ * Nothing is written (and 0 returned) for extrinsic mounts, entries without _netdev, and entries
+ * whose 'what' is not a device path. Otherwise returns the result of write_drop_in_format(), or a
+ * negative errno-style value on failure. */
+int generator_write_device_deps(
+                const char *dir,
+                const char *what,
+                const char *where,
+                const char *opts) {
+
+        _cleanup_free_ char *device_node = NULL, *device_unit = NULL;
+        int r;
+
+        if (fstab_is_extrinsic(where, opts) || !fstab_test_option(opts, "_netdev\0"))
+                return 0;
+
+        device_node = fstab_node_to_udev_node(what);
+        if (!device_node)
+                return log_oom();
+
+        /* Without a device there is nothing the dependencies could be attached to. */
+        if (!is_device_path(device_node))
+                return 0;
+
+        r = unit_name_from_path(device_node, ".device", &device_unit);
+        if (r < 0)
+                return log_error_errno(r, "Failed to make unit name from path \"%s\": %m",
+                                       device_node);
+
+        /* See mount_add_default_dependencies for an explanation of why these dependencies are
+         * created. */
+        return write_drop_in_format(dir, device_unit, 50, "netdev-dependencies",
+                                    "# Automatically generated by %s\n\n"
+                                    "[Unit]\n"
+                                    "After=" SPECIAL_NETWORK_ONLINE_TARGET " " SPECIAL_NETWORK_TARGET "\n"
+                                    "Wants=" SPECIAL_NETWORK_ONLINE_TARGET "\n",
+                                    program_invocation_short_name);
+}
+
+/* Writes a drop-in for SPECIAL_INITRD_ROOT_DEVICE_TARGET that makes the target require, and be
+ * ordered after, the .device unit of 'what'. Returns the result of write_drop_in_format(), or a
+ * negative errno-style value if the unit name cannot be derived. */
+int generator_write_initrd_root_device_deps(const char *dir, const char *what) {
+        _cleanup_free_ char *device_unit = NULL;
+        int r;
+
+        r = unit_name_from_path(what, ".device", &device_unit);
+        if (r < 0)
+                return log_error_errno(r, "Failed to make unit name from path \"%s\": %m",
+                                       what);
+
+        return write_drop_in_format(dir, SPECIAL_INITRD_ROOT_DEVICE_TARGET, 50, "root-device",
+                                    "# Automatically generated by %s\n\n"
+                                    "[Unit]\n"
+                                    "Requires=%s\n"
+                                    "After=%s",
+                                    program_invocation_short_name,
+                                    device_unit,
+                                    device_unit);
+}
+
+/* Generates a systemd-mkswap@<device>.service unit in 'dir' that formats the device 'what' as swap,
+ * and pulls it in from the corresponding .swap unit via a "requires" symlink. The generated unit is
+ * ordered before the .swap unit.
+ *
+ * 'what' may be a LABEL=/UUID=/… tag for the purpose of finding the device node, but note that the
+ * name of the .swap unit is derived from 'what' as passed in.
+ *
+ * Returns 0 on success, logs and returns a negative errno-style value otherwise (-EINVAL if 'what'
+ * does not resolve to a device node). */
+int generator_hook_up_mkswap(
+                const char *dir,
+                const char *what) {
+
+        _cleanup_free_ char *node = NULL, *unit = NULL, *escaped = NULL, *escaped2 = NULL, *where_unit = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        int r;
+
+        assert(dir);
+        assert(what);
+
+        node = fstab_node_to_udev_node(what);
+        if (!node)
+                return log_oom();
+
+        /* Nothing to work on. */
+        if (!is_device_path(node))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Cannot format something that is not a device node: %s",
+                                       node);
+
+        r = unit_name_from_path_instance("systemd-mkswap", node, ".service", &unit);
+        if (r < 0)
+                return log_error_errno(r, "Failed to make unit instance name from path \"%s\": %m",
+                                       node);
+
+        /* The node path ends up in ExecStart=, where '%' introduces a specifier. Hence escape
+         * specifiers first and C-escape the result, the same way write_fsck_sysroot_service() does. */
+        escaped = specifier_escape(node);
+        if (!escaped)
+                return log_oom();
+
+        escaped2 = cescape(escaped);
+        if (!escaped2)
+                return log_oom();
+
+        r = unit_name_from_path(what, ".swap", &where_unit);
+        if (r < 0)
+                return log_error_errno(r, "Failed to make unit name from path \"%s\": %m",
+                                       what);
+
+        r = generator_open_unit_file(dir, /* source = */ NULL, unit, &f);
+        if (r < 0)
+                return r;
+
+        fprintf(f,
+                "[Unit]\n"
+                "Description=Make Swap on %%f\n"
+                "Documentation=man:systemd-mkswap@.service(8)\n"
+                "\n"
+                "DefaultDependencies=no\n"
+                "BindsTo=%%i.device\n"
+                "After=%%i.device\n"
+                "Before=%s\n"
+                "Conflicts=shutdown.target\n"
+                "Before=shutdown.target\n"
+                "\n"
+                "[Service]\n"
+                "Type=oneshot\n"
+                "RemainAfterExit=yes\n"
+                "ExecStart="SYSTEMD_MAKEFS_PATH " swap %s\n"
+                "TimeoutSec=infinity\n",
+                where_unit,
+                escaped2);
+
+        r = fflush_and_check(f);
+        if (r < 0)
+                return log_error_errno(r, "Failed to write unit %s: %m", unit);
+
+        return generator_add_symlink(dir, where_unit, "requires", unit);
+}
+
+/* Generates a systemd-makefs@<device>.service unit in 'dir' that creates a file system of type
+ * 'type' on the device 'what', and pulls it in from the .mount unit of 'where' via a "requires"
+ * symlink. The generated unit is ordered before both the applicable fsck unit and the mount unit.
+ *
+ * Returns 0 on success, logs and returns a negative errno-style value otherwise (-EINVAL if 'what'
+ * does not resolve to a device node, or if 'type' is NULL or "auto"). */
+int generator_hook_up_mkfs(
+                const char *dir,
+                const char *what,
+                const char *where,
+                const char *type) {
+
+        _cleanup_free_ char *node = NULL, *unit = NULL, *escaped = NULL, *escaped2 = NULL, *where_unit = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        const char *fsck_unit;
+        int r;
+
+        assert(dir);
+        assert(what);
+        assert(where);
+
+        node = fstab_node_to_udev_node(what);
+        if (!node)
+                return log_oom();
+
+        /* Nothing to work on. */
+        if (!is_device_path(node))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Cannot format something that is not a device node: %s",
+                                       node);
+
+        if (!type || streq(type, "auto"))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Cannot format partition %s, filesystem type is not specified",
+                                       node);
+
+        r = unit_name_from_path_instance("systemd-makefs", node, ".service", &unit);
+        if (r < 0)
+                return log_error_errno(r, "Failed to make unit instance name from path \"%s\": %m",
+                                       node);
+
+        /* In the initrd the root and /usr trees use dedicated fsck units; everything else uses the
+         * template instance (the "%i" is expanded when the generated unit is loaded). */
+        if (in_initrd() && path_equal(where, "/sysroot"))
+                fsck_unit = SPECIAL_FSCK_ROOT_SERVICE;
+        else if (in_initrd() && path_equal(where, "/sysusr/usr"))
+                fsck_unit = SPECIAL_FSCK_USR_SERVICE;
+        else
+                fsck_unit = "systemd-fsck@%i.service";
+
+        /* The node path ends up in ExecStart=, where '%' introduces a specifier. Hence escape
+         * specifiers first and C-escape the result, the same way write_fsck_sysroot_service() does. */
+        escaped = specifier_escape(node);
+        if (!escaped)
+                return log_oom();
+
+        escaped2 = cescape(escaped);
+        if (!escaped2)
+                return log_oom();
+
+        r = unit_name_from_path(where, ".mount", &where_unit);
+        if (r < 0)
+                return log_error_errno(r, "Failed to make unit name from path \"%s\": %m",
+                                       where);
+
+        r = generator_open_unit_file(dir, /* source = */ NULL, unit, &f);
+        if (r < 0)
+                return r;
+
+        fprintf(f,
+                "[Unit]\n"
+                "Description=Make File System on %%f\n"
+                "Documentation=man:systemd-makefs@.service(8)\n"
+                "\n"
+                "DefaultDependencies=no\n"
+                "BindsTo=%%i.device\n"
+                "After=%%i.device\n"
+                /* fsck might or might not be used, so let's be safe and order
+                 * ourselves before both systemd-fsck@.service and the mount unit. */
+                "Before=%s %s\n"
+                "Conflicts=shutdown.target\n"
+                "Before=shutdown.target\n"
+                "\n"
+                "[Service]\n"
+                "Type=oneshot\n"
+                "RemainAfterExit=yes\n"
+                "ExecStart="SYSTEMD_MAKEFS_PATH " %s %s\n"
+                "TimeoutSec=infinity\n",
+                fsck_unit,
+                where_unit,
+                type,
+                escaped2);
+        // XXX: what about local-fs-pre.target?
+
+        r = fflush_and_check(f);
+        if (r < 0)
+                return log_error_errno(r, "Failed to write unit %s: %m", unit);
+
+        return generator_add_symlink(dir, where_unit, "requires", unit);
+}
+
+/* Pulls in the grow-file-system service for the mount point 'where': a "wants" symlink is added
+ * to the .mount unit of 'where'. For the root directory the dedicated root service is used,
+ * otherwise the template service instantiated with the escaped path. If 'target' is given, it is
+ * additionally ordered After= the service.
+ *
+ * Returns 0 on success, logs and returns a negative errno-style value otherwise. */
+int generator_hook_up_growfs(
+                const char *dir,
+                const char *where,
+                const char *target) {
+
+        _cleanup_free_ char *mount_unit = NULL, *escaped_where = NULL;
+        const char *service, *service_path;
+        int r;
+
+        assert(dir);
+        assert(where);
+
+        r = unit_name_from_path(where, ".mount", &mount_unit);
+        if (r < 0)
+                return log_error_errno(r, "Failed to make unit name from path '%s': %m", where);
+
+        if (!empty_or_root(where)) {
+                service = SPECIAL_GROWFS_SERVICE;
+                service_path = SYSTEM_DATA_UNIT_DIR "/" SPECIAL_GROWFS_SERVICE;
+
+                r = unit_name_path_escape(where, &escaped_where);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to escape path '%s': %m", where);
+        } else {
+                /* The root service is not a template, hence no instance string is needed. */
+                service = SPECIAL_GROWFS_ROOT_SERVICE;
+                service_path = SYSTEM_DATA_UNIT_DIR "/" SPECIAL_GROWFS_ROOT_SERVICE;
+        }
+
+        if (target) {
+                r = generator_add_ordering(dir, target, "After", service, escaped_where);
+                if (r < 0)
+                        return r;
+        }
+
+        return generator_add_symlink_full(dir, mount_unit, "wants", service_path, escaped_where);
+}
+
+/* Pulls in the pcrfs service for the mount point 'where': a "wants" symlink is added to the
+ * .mount unit of 'where'. For the root directory the dedicated root service is used, otherwise the
+ * template service instantiated with the escaped path. If 'target' is given, it is additionally
+ * ordered After= the service.
+ *
+ * Returns 0 on success, logs and returns a negative errno-style value otherwise. */
+int generator_hook_up_pcrfs(
+                const char *dir,
+                const char *where,
+                const char *target) {
+
+        _cleanup_free_ char *mount_unit = NULL, *escaped_where = NULL;
+        const char *service, *service_path;
+        int r;
+
+        assert(dir);
+        assert(where);
+
+        r = unit_name_from_path(where, ".mount", &mount_unit);
+        if (r < 0)
+                return log_error_errno(r, "Failed to make unit name from path '%s': %m", where);
+
+        if (!empty_or_root(where)) {
+                service = SPECIAL_PCRFS_SERVICE;
+                service_path = SYSTEM_DATA_UNIT_DIR "/" SPECIAL_PCRFS_SERVICE;
+
+                r = unit_name_path_escape(where, &escaped_where);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to escape path '%s': %m", where);
+        } else {
+                /* The root service is not a template, hence no instance string is needed. */
+                service = SPECIAL_PCRFS_ROOT_SERVICE;
+                service_path = SYSTEM_DATA_UNIT_DIR "/" SPECIAL_PCRFS_ROOT_SERVICE;
+        }
+
+        if (target) {
+                r = generator_add_ordering(dir, target, "After", service, escaped_where);
+                if (r < 0)
+                        return r;
+        }
+
+        return generator_add_symlink_full(dir, mount_unit, "wants", service_path, escaped_where);
+}
+
+/* Makes SPECIAL_LOCAL_FS_TARGET want systemd-remount-fs.service by adding the matching symlink
+ * below 'dir'. Returns the result of generator_add_symlink_full(). */
+int generator_enable_remount_fs_service(const char *dir) {
+        static const char remount_unit_path[] = SYSTEM_DATA_UNIT_DIR "/" SPECIAL_REMOUNT_FS_SERVICE;
+
+        return generator_add_symlink_full(dir, SPECIAL_LOCAL_FS_TARGET, "wants", remount_unit_path, /* instance = */ NULL);
+}
+
+/* If 'what' lives below /dev/, appends an "After=blockdev@<escaped path>.target" line to the unit
+ * file 'f'. For any other 'what' nothing is written.
+ *
+ * Returns 0 on success (including the nothing-to-do case), logs and returns a negative errno-style
+ * value if the path cannot be escaped. */
+int generator_write_blockdev_dependency(
+                FILE *f,
+                const char *what) {
+
+        _cleanup_free_ char *instance = NULL;
+        int r;
+
+        assert(f);
+        assert(what);
+
+        if (path_startswith(what, "/dev/")) {
+                r = unit_name_path_escape(what, &instance);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to escape device node path %s: %m", what);
+
+                fprintf(f,
+                        "After=blockdev@%s.target\n",
+                        instance);
+        }
+
+        return 0;
+}
+
+/* Writes the [Unit] section of a generated cryptsetup service to 'f'. The text uses the %I/%i
+ * specifiers (written as %%I/%%i in the format strings), i.e. it relies on the unit being an
+ * instance whose instance name is the volume name. 'source' is optional; if given, a SourcePath=
+ * line is emitted for it. Always returns 0; the fprintf() results are not checked here. */
+int generator_write_cryptsetup_unit_section(
+ FILE *f,
+ const char *source) {
+
+ assert(f);
+
+ fprintf(f,
+ "[Unit]\n"
+ "Description=Cryptography Setup for %%I\n"
+ "Documentation=man:crypttab(5) man:systemd-cryptsetup-generator(8) man:systemd-cryptsetup@.service(8)\n");
+
+ /* NOTE(review): 'source' is written verbatim, without specifier escaping. */
+ if (source)
+ fprintf(f, "SourcePath=%s\n", source);
+
+ fprintf(f,
+ "\n"
+ "DefaultDependencies=no\n"
+ "After=cryptsetup-pre.target systemd-udevd-kernel.socket systemd-tpm2-setup-early.service\n"
+ "Before=blockdev@dev-mapper-%%i.target\n"
+ "Wants=blockdev@dev-mapper-%%i.target\n"
+ "IgnoreOnIsolate=true\n");
+
+ return 0;
+}
+
+/* Writes the [Service] section of a generated cryptsetup service to 'f': an "attach" command line
+ * built from name, device, key file and options, plus the matching "detach" command. All strings
+ * are specifier-escaped before being inserted; key_file and options may be NULL, in which case an
+ * empty argument is written in their place.
+ *
+ * Returns 0 on success, -ENOMEM (logged) if escaping fails. */
+int generator_write_cryptsetup_service_section(
+                FILE *f,
+                const char *name,
+                const char *what,
+                const char *key_file,
+                const char *options) {
+
+        _cleanup_free_ char *esc_name = NULL, *esc_what = NULL, *esc_key_file = NULL, *esc_options = NULL;
+
+        assert(f);
+        assert(name);
+        assert(what);
+
+        esc_name = specifier_escape(name);
+        if (!esc_name)
+                return log_oom();
+
+        esc_what = specifier_escape(what);
+        if (!esc_what)
+                return log_oom();
+
+        /* The two optional arguments are only escaped when present. */
+        if (key_file && !(esc_key_file = specifier_escape(key_file)))
+                return log_oom();
+
+        if (options && !(esc_options = specifier_escape(options)))
+                return log_oom();
+
+        fprintf(f,
+                "\n"
+                "[Service]\n"
+                "Type=oneshot\n"
+                "RemainAfterExit=yes\n"
+                "TimeoutSec=infinity\n"   /* The binary handles timeouts on its own */
+                "KeyringMode=shared\n"    /* Make sure we can share cached keys among instances */
+                "OOMScoreAdjust=500\n"    /* Unlocking can allocate a lot of memory if Argon2 is used */
+                "ExecStart=" SYSTEMD_CRYPTSETUP_PATH " attach '%s' '%s' '%s' '%s'\n"
+                "ExecStop=" SYSTEMD_CRYPTSETUP_PATH " detach '%s'\n",
+                esc_name, esc_what, strempty(esc_key_file), strempty(esc_options),
+                esc_name);
+
+        return 0;
+}
+
+/* Writes the [Unit] section of a generated veritysetup service to 'f'. The text uses the %I/%i
+ * specifiers (written as %%I/%%i in the format strings), i.e. it relies on the unit being an
+ * instance whose instance name is the volume name. 'source' is optional; if given, a SourcePath=
+ * line is emitted for it. Always returns 0; the fprintf() results are not checked here. */
+int generator_write_veritysetup_unit_section(
+ FILE *f,
+ const char *source) {
+
+ assert(f);
+
+ fprintf(f,
+ "[Unit]\n"
+ "Description=Integrity Protection Setup for %%I\n"
+ "Documentation=man:veritytab(5) man:systemd-veritysetup-generator(8) man:systemd-veritysetup@.service(8)\n");
+
+ /* NOTE(review): 'source' is written verbatim, without specifier escaping. */
+ if (source)
+ fprintf(f, "SourcePath=%s\n", source);
+
+ fprintf(f,
+ "DefaultDependencies=no\n"
+ "IgnoreOnIsolate=true\n"
+ "After=veritysetup-pre.target systemd-udevd-kernel.socket\n"
+ "Before=blockdev@dev-mapper-%%i.target\n"
+ "Wants=blockdev@dev-mapper-%%i.target\n");
+
+ return 0;
+}
+
+/* Writes the [Service] section of a generated veritysetup service to 'f': an "attach" command line
+ * built from name, data device, hash device, root hash and options, plus the matching "detach"
+ * command. All strings are specifier-escaped before being inserted; only 'options' is treated as
+ * optional, in which case an empty argument is written in its place.
+ *
+ * Returns 0 on success, -ENOMEM (logged) if escaping yields no string. */
+int generator_write_veritysetup_service_section(
+                FILE *f,
+                const char *name,
+                const char *data_what,
+                const char *hash_what,
+                const char *roothash,
+                const char *options) {
+
+        _cleanup_free_ char *esc_name = NULL, *esc_data = NULL, *esc_hash = NULL,
+                            *esc_roothash = NULL, *esc_options = NULL;
+
+        assert(f);
+        assert(name);
+        assert(data_what);
+        assert(hash_what);
+
+        esc_name = specifier_escape(name);
+        if (!esc_name)
+                return log_oom();
+
+        esc_data = specifier_escape(data_what);
+        if (!esc_data)
+                return log_oom();
+
+        esc_hash = specifier_escape(hash_what);
+        if (!esc_hash)
+                return log_oom();
+
+        esc_roothash = specifier_escape(roothash);
+        if (!esc_roothash)
+                return log_oom();
+
+        if (options && !(esc_options = specifier_escape(options)))
+                return log_oom();
+
+        fprintf(f,
+                "\n"
+                "[Service]\n"
+                "Type=oneshot\n"
+                "RemainAfterExit=yes\n"
+                "ExecStart=" SYSTEMD_VERITYSETUP_PATH " attach '%s' '%s' '%s' '%s' '%s'\n"
+                "ExecStop=" SYSTEMD_VERITYSETUP_PATH " detach '%s'\n",
+                esc_name, esc_data, esc_hash, esc_roothash, strempty(esc_options),
+                esc_name);
+
+        return 0;
+}
+
+/* Sets up logging for a generator. When invoked by systemd, the log target is switched to
+ * journal-or-kmsg, and the IPC-based loggers (syslog/journal) are prohibited if we are not running
+ * in a per-user slice, i.e. in system context. Afterwards the environment is parsed and the log is
+ * opened, regardless of who invoked us. */
+void log_setup_generator(void) {
+        if (invoked_by_systemd()) {
+                /* -ENXIO from the owner UID lookup means: not running in a per-user slice. */
+                bool system_context = cg_pid_get_owner_uid(0, NULL) == -ENXIO;
+
+                if (system_context)
+                        log_set_prohibit_ipc(true);
+
+                /* This effectively means: journal for per-user generators, kmsg otherwise */
+                log_set_target(LOG_TARGET_JOURNAL_OR_KMSG);
+        }
+
+        log_parse_environment();
+        (void) log_open();
+}
diff --git a/src/shared/generator.h b/src/shared/generator.h
new file mode 100644
index 0000000..d97d6ed
--- /dev/null
+++ b/src/shared/generator.h
@@ -0,0 +1,105 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdio.h>
+
+#include "macro.h"
+#include "main-func.h"
+
+int generator_open_unit_file_full(const char *dest, const char *source, const char *name, FILE **ret_file, char **ret_temp_path);
+
+/* Convenience wrapper around generator_open_unit_file_full() that passes NULL for ret_temp_path. */
+static inline int generator_open_unit_file(const char *dest, const char *source, const char *name, FILE **ret_file) {
+ return generator_open_unit_file_full(dest, source, name, ret_file, NULL);
+}
+
+int generator_add_symlink_full(const char *dir, const char *dst, const char *dep_type, const char *src, const char *instance);
+
+/* Convenience wrapper around generator_add_symlink_full() that passes NULL for the instance argument. */
+static inline int generator_add_symlink(const char *dir, const char *dst, const char *dep_type, const char *src) {
+ return generator_add_symlink_full(dir, dst, dep_type, src, NULL);
+}
+
+int generator_write_fsck_deps(
+ FILE *f,
+ const char *dir,
+ const char *what,
+ const char *where,
+ const char *type);
+
+int generator_write_timeouts(
+ const char *dir,
+ const char *what,
+ const char *where,
+ const char *opts,
+ char **filtered);
+
+int generator_write_blockdev_dependency(
+ FILE *f,
+ const char *what);
+
+int generator_write_cryptsetup_unit_section(
+ FILE *f,
+ const char *source);
+
+int generator_write_cryptsetup_service_section(
+ FILE *f,
+ const char *name,
+ const char *what,
+ const char *password,
+ const char *options);
+
+int generator_write_veritysetup_unit_section(
+ FILE *f,
+ const char *source);
+
+int generator_write_veritysetup_service_section(
+ FILE *f,
+ const char *name,
+ const char *data_what,
+ const char *hash_what,
+ const char *roothash,
+ const char *options);
+
+int generator_write_device_deps(
+ const char *dir,
+ const char *what,
+ const char *where,
+ const char *opts);
+
+int generator_write_initrd_root_device_deps(
+ const char *dir,
+ const char *what);
+
+int generator_hook_up_mkswap(
+ const char *dir,
+ const char *what);
+int generator_hook_up_mkfs(
+ const char *dir,
+ const char *what,
+ const char *where,
+ const char *type);
+int generator_hook_up_growfs(
+ const char *dir,
+ const char *where,
+ const char *target);
+int generator_hook_up_pcrfs(
+ const char *dir,
+ const char *where,
+ const char *target);
+
+int generator_enable_remount_fs_service(const char *dir);
+
+void log_setup_generator(void);
+
+/* Similar to DEFINE_MAIN_FUNCTION, but initializes logging and assigns positional arguments.
+ *
+ * The prologue calls log_setup_generator() and then insists on either one or three command line arguments
+ * (i.e. argc being 2 or 4), failing with EINVAL otherwise. impl() is always called with three arguments: if
+ * three were specified they are passed on in order, if only one was specified argv[1] is passed for all
+ * three parameters. The result expression maps r < 0 to EXIT_FAILURE and everything else to EXIT_SUCCESS
+ * (r presumably being assigned impl()'s return value by _DEFINE_MAIN_FUNCTION, which is not defined here). */
+#define DEFINE_MAIN_GENERATOR_FUNCTION(impl) \
+ _DEFINE_MAIN_FUNCTION( \
+ ({ \
+ log_setup_generator(); \
+ if (!IN_SET(argc, 2, 4)) \
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), \
+ "This program takes one or three arguments."); \
+ }), \
+ impl(argv[1], \
+ argv[argc == 4 ? 2 : 1], \
+ argv[argc == 4 ? 3 : 1]), \
+ r < 0 ? EXIT_FAILURE : EXIT_SUCCESS)
diff --git a/src/shared/geneve-util.c b/src/shared/geneve-util.c
new file mode 100644
index 0000000..36ef9c8
--- /dev/null
+++ b/src/shared/geneve-util.c
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "geneve-util.h"
+#include "string-table.h"
+
+/* String names of the GeneveDF enum values, indexed by enum value. */
+static const char* const geneve_df_table[_NETDEV_GENEVE_DF_MAX] = {
+ [NETDEV_GENEVE_DF_UNSET] = "unset",
+ [NETDEV_GENEVE_DF_SET] = "set",
+ [NETDEV_GENEVE_DF_INHERIT] = "inherit",
+};
+
+/* Generates the lookup functions for the table above (geneve-util.h declares geneve_df_to_string() and
+ * geneve_df_from_string()). */
+DEFINE_STRING_TABLE_LOOKUP(geneve_df, GeneveDF);
diff --git a/src/shared/geneve-util.h b/src/shared/geneve-util.h
new file mode 100644
index 0000000..acd0e1a
--- /dev/null
+++ b/src/shared/geneve-util.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <linux/if_link.h>
+
+#include "conf-parser.h"
+
+/* Our own enum for the Geneve "DF" setting. The regular values are defined in terms of the GENEVE_DF_*
+ * constants, i.e. they are numerically identical to them. */
+typedef enum GeneveDF {
+ NETDEV_GENEVE_DF_UNSET = GENEVE_DF_UNSET,
+ NETDEV_GENEVE_DF_SET = GENEVE_DF_SET,
+ NETDEV_GENEVE_DF_INHERIT = GENEVE_DF_INHERIT,
+ _NETDEV_GENEVE_DF_MAX, /* one past the last regular value; used to size the string table */
+ _NETDEV_GENEVE_DF_INVALID = -EINVAL, /* negative marker for "no valid value" */
+} GeneveDF;
+
+const char *geneve_df_to_string(GeneveDF d) _const_;
+GeneveDF geneve_df_from_string(const char *d) _pure_;
diff --git a/src/shared/gpt.c b/src/shared/gpt.c
new file mode 100644
index 0000000..d639463
--- /dev/null
+++ b/src/shared/gpt.c
@@ -0,0 +1,361 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "gpt.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "utf8.h"
+
+/* Gently push people towards defining GPT type UUIDs for all architectures we know */
+#if !defined(SD_GPT_ROOT_NATIVE) || \
+ !defined(SD_GPT_ROOT_NATIVE_VERITY) || \
+ !defined(SD_GPT_ROOT_NATIVE_VERITY_SIG) || \
+ !defined(SD_GPT_USR_NATIVE) || \
+ !defined(SD_GPT_USR_NATIVE_VERITY) || \
+ !defined(SD_GPT_USR_NATIVE_VERITY_SIG)
+#pragma message "Please define GPT partition types for your architecture."
+#endif
+
+bool partition_designator_is_versioned(PartitionDesignator d) {
+        /* Tells whether we want to support a concept of "versioning" for the specified designator. This is
+         * the case for everything that likely contains software binaries (or hashes thereof) that make
+         * sense to be versioned as a whole. The check is used to automatically pick the newest version of
+         * these partitions, by version comparing the partition labels. */
+
+        switch (d) {
+
+        case PARTITION_ROOT:
+        case PARTITION_USR:
+        case PARTITION_ROOT_VERITY:
+        case PARTITION_USR_VERITY:
+        case PARTITION_ROOT_VERITY_SIG:
+        case PARTITION_USR_VERITY_SIG:
+                return true;
+
+        default:
+                return false;
+        }
+}
+
+/* Maps the designator of a data partition to the designator of the matching verity partition. Only the root
+ * and /usr partitions have one, everything else yields _PARTITION_DESIGNATOR_INVALID. */
+PartitionDesignator partition_verity_of(PartitionDesignator p) {
+        if (p == PARTITION_ROOT)
+                return PARTITION_ROOT_VERITY;
+
+        if (p == PARTITION_USR)
+                return PARTITION_USR_VERITY;
+
+        return _PARTITION_DESIGNATOR_INVALID;
+}
+
+/* Maps the designator of a data partition to the designator of the matching verity signature partition.
+ * Only the root and /usr partitions have one, everything else yields _PARTITION_DESIGNATOR_INVALID. */
+PartitionDesignator partition_verity_sig_of(PartitionDesignator p) {
+        if (p == PARTITION_ROOT)
+                return PARTITION_ROOT_VERITY_SIG;
+
+        if (p == PARTITION_USR)
+                return PARTITION_USR_VERITY_SIG;
+
+        return _PARTITION_DESIGNATOR_INVALID;
+}
+
+/* Reverse of partition_verity_of(): maps the designator of a verity partition back to the designator of the
+ * data partition it belongs to. Yields _PARTITION_DESIGNATOR_INVALID for anything that is not a verity
+ * partition. */
+PartitionDesignator partition_verity_to_data(PartitionDesignator d) {
+        if (d == PARTITION_ROOT_VERITY)
+                return PARTITION_ROOT;
+
+        if (d == PARTITION_USR_VERITY)
+                return PARTITION_USR;
+
+        return _PARTITION_DESIGNATOR_INVALID;
+}
+
+/* Reverse of partition_verity_sig_of(): maps the designator of a verity signature partition back to the
+ * designator of the data partition it belongs to. Yields _PARTITION_DESIGNATOR_INVALID for anything that is
+ * not a verity signature partition. */
+PartitionDesignator partition_verity_sig_to_data(PartitionDesignator d) {
+        if (d == PARTITION_ROOT_VERITY_SIG)
+                return PARTITION_ROOT;
+
+        if (d == PARTITION_USR_VERITY_SIG)
+                return PARTITION_USR;
+
+        return _PARTITION_DESIGNATOR_INVALID;
+}
+
+/* String names of the PartitionDesignator enum values, indexed by enum value. */
+static const char *const partition_designator_table[_PARTITION_DESIGNATOR_MAX] = {
+ [PARTITION_ROOT] = "root",
+ [PARTITION_USR] = "usr",
+ [PARTITION_HOME] = "home",
+ [PARTITION_SRV] = "srv",
+ [PARTITION_ESP] = "esp",
+ [PARTITION_XBOOTLDR] = "xbootldr",
+ [PARTITION_SWAP] = "swap",
+ [PARTITION_ROOT_VERITY] = "root-verity",
+ [PARTITION_USR_VERITY] = "usr-verity",
+ [PARTITION_ROOT_VERITY_SIG] = "root-verity-sig",
+ [PARTITION_USR_VERITY_SIG] = "usr-verity-sig",
+ [PARTITION_TMP] = "tmp",
+ [PARTITION_VAR] = "var",
+};
+
+/* Generates the lookup functions for the table above (gpt.h declares partition_designator_to_string() and
+ * partition_designator_from_string()). */
+DEFINE_STRING_TABLE_LOOKUP(partition_designator, PartitionDesignator);
+
+/* Mount point candidates per partition designator. Each entry carries an explicit trailing "\0" in addition
+ * to the literal's implicit terminator, and the ESP entry lists two paths separated by "\0", i.e. each entry
+ * is a list of NUL-separated strings. Designators without an entry here (swap, the verity ones) are left
+ * NULL. */
+static const char *const partition_mountpoint_table[_PARTITION_DESIGNATOR_MAX] = {
+ [PARTITION_ROOT] = "/\0",
+ [PARTITION_USR] = "/usr\0",
+ [PARTITION_HOME] = "/home\0",
+ [PARTITION_SRV] = "/srv\0",
+ [PARTITION_ESP] = "/efi\0/boot\0",
+ [PARTITION_XBOOTLDR] = "/boot\0",
+ [PARTITION_TMP] = "/var/tmp\0",
+ [PARTITION_VAR] = "/var\0",
+};
+
+/* Only the to-string direction is generated, and only for use within this file. */
+DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(partition_mountpoint, PartitionDesignator);
+
+/* Expands to six GptPartitionType initializers for one architecture: root, root verity, root verity
+ * signature, /usr, /usr verity and /usr verity signature. 'arch' is pasted into the SD_GPT_*_<arch>* and
+ * ARCHITECTURE_<arch> identifiers, 'name' is a string literal inserted into the entry names. Note that the
+ * expansion has no trailing comma, the user has to add it. */
+#define _GPT_ARCH_SEXTET(arch, name) \
+ { SD_GPT_ROOT_##arch, "root-" name, ARCHITECTURE_##arch, .designator = PARTITION_ROOT }, \
+ { SD_GPT_ROOT_##arch##_VERITY, "root-" name "-verity", ARCHITECTURE_##arch, .designator = PARTITION_ROOT_VERITY }, \
+ { SD_GPT_ROOT_##arch##_VERITY_SIG, "root-" name "-verity-sig", ARCHITECTURE_##arch, .designator = PARTITION_ROOT_VERITY_SIG }, \
+ { SD_GPT_USR_##arch, "usr-" name, ARCHITECTURE_##arch, .designator = PARTITION_USR }, \
+ { SD_GPT_USR_##arch##_VERITY, "usr-" name "-verity", ARCHITECTURE_##arch, .designator = PARTITION_USR_VERITY }, \
+ { SD_GPT_USR_##arch##_VERITY_SIG, "usr-" name "-verity-sig", ARCHITECTURE_##arch, .designator = PARTITION_USR_VERITY_SIG }
+
+/* Two special cases: alias aarch64 to arm64, and amd64 to x86-64. The DSP mixes debianisms and CPUisms: for
+ * x86, it uses x86 and x86_64, but for aarch64 it uses arm64. This is confusing, and leads to issues for
+ * callers that have to know which -ism to use for which architecture. But we also don't really want to
+ * change the spec and add new partition labels, so add a user-friendly aliasing here, so that both are
+ * accepted but the end result on disk (i.e. the partition label) stays the same.
+ * So always list the canonical name FIRST, and then any aliases later, so that we can match on aliases,
+ * but always return the canonical name. And never return directly a match on the name, always re-resolve
+ * by UUID so that the canonical entry is always found. */
+
+/* Table of all partition types we know. Lookups by name and by UUID in this file walk it front to back and
+ * stop at the first hit, hence the order of entries matters. */
+const GptPartitionType gpt_partition_type_table[] = {
+ _GPT_ARCH_SEXTET(ALPHA, "alpha"),
+ _GPT_ARCH_SEXTET(ARC, "arc"),
+ _GPT_ARCH_SEXTET(ARM, "arm"),
+ _GPT_ARCH_SEXTET(ARM, "armv7l"), /* Alias: must be listed after arm */
+ _GPT_ARCH_SEXTET(ARM64, "arm64"),
+ _GPT_ARCH_SEXTET(ARM64, "aarch64"), /* Alias: must be listed after arm64 */
+ _GPT_ARCH_SEXTET(IA64, "ia64"),
+ _GPT_ARCH_SEXTET(LOONGARCH64, "loongarch64"),
+ _GPT_ARCH_SEXTET(MIPS, "mips"),
+ _GPT_ARCH_SEXTET(MIPS64, "mips64"),
+ _GPT_ARCH_SEXTET(MIPS_LE, "mips-le"),
+ _GPT_ARCH_SEXTET(MIPS64_LE, "mips64-le"),
+ _GPT_ARCH_SEXTET(PARISC, "parisc"),
+ _GPT_ARCH_SEXTET(PPC, "ppc"),
+ _GPT_ARCH_SEXTET(PPC64, "ppc64"),
+ _GPT_ARCH_SEXTET(PPC64_LE, "ppc64-le"),
+ _GPT_ARCH_SEXTET(PPC64_LE, "ppc64le"), /* Alias: must be listed after ppc64-le */
+ _GPT_ARCH_SEXTET(RISCV32, "riscv32"),
+ _GPT_ARCH_SEXTET(RISCV64, "riscv64"),
+ _GPT_ARCH_SEXTET(S390, "s390"),
+ _GPT_ARCH_SEXTET(S390X, "s390x"),
+ _GPT_ARCH_SEXTET(TILEGX, "tilegx"),
+ _GPT_ARCH_SEXTET(X86, "x86"),
+ _GPT_ARCH_SEXTET(X86_64, "x86-64"),
+ _GPT_ARCH_SEXTET(X86_64, "x86_64"), /* Alias: must be listed after x86-64 */
+ _GPT_ARCH_SEXTET(X86_64, "amd64"), /* Alias: must be listed after x86-64 */
+ /* The names without architecture suffix refer to the native architecture, if type UUIDs are defined for it */
+#ifdef SD_GPT_ROOT_NATIVE
+ { SD_GPT_ROOT_NATIVE, "root", native_architecture(), .designator = PARTITION_ROOT },
+ { SD_GPT_ROOT_NATIVE_VERITY, "root-verity", native_architecture(), .designator = PARTITION_ROOT_VERITY },
+ { SD_GPT_ROOT_NATIVE_VERITY_SIG, "root-verity-sig", native_architecture(), .designator = PARTITION_ROOT_VERITY_SIG },
+ { SD_GPT_USR_NATIVE, "usr", native_architecture(), .designator = PARTITION_USR },
+ { SD_GPT_USR_NATIVE_VERITY, "usr-verity", native_architecture(), .designator = PARTITION_USR_VERITY },
+ { SD_GPT_USR_NATIVE_VERITY_SIG, "usr-verity-sig", native_architecture(), .designator = PARTITION_USR_VERITY_SIG },
+#endif
+ /* Same for the secondary architecture, if there is one */
+#ifdef SD_GPT_ROOT_SECONDARY
+ { SD_GPT_ROOT_SECONDARY, "root-secondary", ARCHITECTURE_SECONDARY, .designator = PARTITION_ROOT },
+ { SD_GPT_ROOT_SECONDARY_VERITY, "root-secondary-verity", ARCHITECTURE_SECONDARY, .designator = PARTITION_ROOT_VERITY },
+ { SD_GPT_ROOT_SECONDARY_VERITY_SIG, "root-secondary-verity-sig", ARCHITECTURE_SECONDARY, .designator = PARTITION_ROOT_VERITY_SIG },
+ { SD_GPT_USR_SECONDARY, "usr-secondary", ARCHITECTURE_SECONDARY, .designator = PARTITION_USR },
+ { SD_GPT_USR_SECONDARY_VERITY, "usr-secondary-verity", ARCHITECTURE_SECONDARY, .designator = PARTITION_USR_VERITY },
+ { SD_GPT_USR_SECONDARY_VERITY_SIG, "usr-secondary-verity-sig", ARCHITECTURE_SECONDARY, .designator = PARTITION_USR_VERITY_SIG },
+#endif
+
+ /* Types that are not specific to an architecture */
+ { SD_GPT_ESP, "esp", _ARCHITECTURE_INVALID, .designator = PARTITION_ESP },
+ { SD_GPT_XBOOTLDR, "xbootldr", _ARCHITECTURE_INVALID, .designator = PARTITION_XBOOTLDR },
+ { SD_GPT_SWAP, "swap", _ARCHITECTURE_INVALID, .designator = PARTITION_SWAP },
+ { SD_GPT_HOME, "home", _ARCHITECTURE_INVALID, .designator = PARTITION_HOME },
+ { SD_GPT_SRV, "srv", _ARCHITECTURE_INVALID, .designator = PARTITION_SRV },
+ { SD_GPT_VAR, "var", _ARCHITECTURE_INVALID, .designator = PARTITION_VAR },
+ { SD_GPT_TMP, "tmp", _ARCHITECTURE_INVALID, .designator = PARTITION_TMP },
+ { SD_GPT_USER_HOME, "user-home", _ARCHITECTURE_INVALID, .designator = _PARTITION_DESIGNATOR_INVALID },
+ { SD_GPT_LINUX_GENERIC, "linux-generic", _ARCHITECTURE_INVALID, .designator = _PARTITION_DESIGNATOR_INVALID },
+ /* Empty terminating entry; the loops in this file exclude it via ELEMENTSOF() - 1 */
+ {}
+};
+
+/* Returns the first entry of gpt_partition_type_table whose type UUID equals 'id', or NULL if there is
+ * none. */
+static const GptPartitionType *gpt_partition_type_find_by_uuid(sd_id128_t id) {
+        /* The last array element is the empty terminator entry, don't look at it */
+        const GptPartitionType *end = gpt_partition_type_table + ELEMENTSOF(gpt_partition_type_table) - 1;
+
+        for (const GptPartitionType *entry = gpt_partition_type_table; entry < end; entry++) {
+                if (!sd_id128_equal(id, entry->uuid))
+                        continue;
+
+                return entry;
+        }
+
+        return NULL;
+}
+
+/* Returns the name of the partition type with the specified type UUID as listed in our table, or NULL if
+ * the UUID is not known. */
+const char *gpt_partition_type_uuid_to_string(sd_id128_t id) {
+        const GptPartitionType *known = gpt_partition_type_find_by_uuid(id);
+
+        return known ? known->name : NULL;
+}
+
+/* Like gpt_partition_type_uuid_to_string(), but never gives up: if the type UUID is not known the UUID is
+ * formatted into the caller-provided buffer via sd_id128_to_uuid_string() and that result is returned
+ * instead. */
+const char *gpt_partition_type_uuid_to_string_harder(
+                sd_id128_t id,
+                char buffer[static SD_ID128_UUID_STRING_MAX]) {
+
+        const char *name;
+
+        assert(buffer);
+
+        name = gpt_partition_type_uuid_to_string(id);
+
+        /* The buffer is only touched if we have no name for the UUID */
+        return name ? name : sd_id128_to_uuid_string(id, buffer);
+}
+
+/* Parses a partition type specification: either one of the names listed in gpt_partition_type_table or
+ * anything sd_id128_from_string() accepts. On success returns 0 and, if 'ret' is non-NULL, stores the
+ * resulting partition type there. If the string is neither a known name nor parsable as ID, the error of
+ * sd_id128_from_string() is returned. */
+int gpt_partition_type_from_string(const char *s, GptPartitionType *ret) {
+        /* The last array element is the empty terminator entry, don't look at it */
+        const GptPartitionType *end = gpt_partition_type_table + ELEMENTSOF(gpt_partition_type_table) - 1;
+        sd_id128_t uuid = SD_ID128_NULL;
+        int r;
+
+        assert(s);
+
+        for (const GptPartitionType *entry = gpt_partition_type_table; entry < end; entry++) {
+                if (!streq(s, entry->name))
+                        continue;
+
+                /* Only remember the UUID of the matching entry, and re-resolve by UUID below instead of
+                 * returning the entry itself, so that aliases like aarch64 -> arm64 are supported
+                 * transparently. */
+                uuid = entry->uuid;
+                break;
+        }
+
+        /* Not a name we know? Then try to parse the string as ID. */
+        if (sd_id128_is_null(uuid)) {
+                r = sd_id128_from_string(s, &uuid);
+                if (r < 0)
+                        return r;
+        }
+
+        if (ret)
+                *ret = gpt_partition_type_from_uuid(uuid);
+
+        return 0;
+}
+
+/* Returns the first table entry that has the same designator as 'type' but the architecture 'arch'. If the
+ * table has no such entry, 'type' is returned unmodified. */
+GptPartitionType gpt_partition_type_override_architecture(GptPartitionType type, Architecture arch) {
+        /* The last array element is the empty terminator entry, don't look at it */
+        const GptPartitionType *end = gpt_partition_type_table + ELEMENTSOF(gpt_partition_type_table) - 1;
+
+        assert(arch >= 0);
+
+        for (const GptPartitionType *entry = gpt_partition_type_table; entry < end; entry++) {
+                if (entry->designator != type.designator)
+                        continue;
+                if (entry->arch != arch)
+                        continue;
+
+                return *entry;
+        }
+
+        /* No entry with this designator for the requested architecture: stick to the original type. */
+        return type;
+}
+
+/* Returns the architecture our table lists for the specified type UUID, or _ARCHITECTURE_INVALID if the
+ * UUID is not in the table. */
+Architecture gpt_partition_type_uuid_to_arch(sd_id128_t id) {
+        const GptPartitionType *known = gpt_partition_type_find_by_uuid(id);
+
+        return known ? known->arch : _ARCHITECTURE_INVALID;
+}
+
+/* Checks whether the string is usable as GPT partition label, i.e. whether its UTF-16 form is at most
+ * GPT_LABEL_MAX units long as measured by char16_strlen(). Returns > 0 if so, 0 if not, and -ENOMEM if the
+ * conversion to UTF-16 fails. */
+int gpt_partition_label_valid(const char *s) {
+        _cleanup_free_ char16_t *utf16 = utf8_to_utf16(s, SIZE_MAX);
+
+        if (!utf16)
+                return -ENOMEM;
+
+        return char16_strlen(utf16) <= GPT_LABEL_MAX;
+}
+
+/* Returns a copy of the table entry for the specified type UUID. For UUIDs that are not in the table a
+ * placeholder is synthesized that carries the UUID, but has no name, architecture or designator. */
+GptPartitionType gpt_partition_type_from_uuid(sd_id128_t id) {
+        const GptPartitionType *known = gpt_partition_type_find_by_uuid(id);
+
+        if (!known)
+                return (GptPartitionType) {
+                        .uuid = id,
+                        .arch = _ARCHITECTURE_INVALID,
+                        .designator = _PARTITION_DESIGNATOR_INVALID,
+                };
+
+        return *known;
+}
+
+/* Looks up the type's designator in partition_mountpoint_table via the generated to-string function. The
+ * entries of that table are lists of NUL-separated paths. NOTE(review): presumably NULL is returned for
+ * designators without an entry or out of range; confirm against DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING. */
+const char *gpt_partition_type_mountpoint_nulstr(GptPartitionType type) {
+ return partition_mountpoint_to_string(type.designator);
+}
+
+/* Returns true for the partition types (identified by their designator) listed below, i.e. the ones that
+ * "know" the read-only setting. */
+bool gpt_partition_type_knows_read_only(GptPartitionType type) {
+        switch (type.designator) {
+
+        case PARTITION_ROOT:
+        case PARTITION_USR:
+        /* pretty much implied, but let's set the bit to make things really clear */
+        case PARTITION_ROOT_VERITY:
+        case PARTITION_USR_VERITY:
+        case PARTITION_HOME:
+        case PARTITION_SRV:
+        case PARTITION_VAR:
+        case PARTITION_TMP:
+        case PARTITION_XBOOTLDR:
+                return true;
+
+        default:
+                return false;
+        }
+}
+
+/* Returns true for the partition types (identified by their designator) listed below, i.e. the ones that
+ * "know" the growfs setting. */
+bool gpt_partition_type_knows_growfs(GptPartitionType type) {
+        switch (type.designator) {
+
+        case PARTITION_ROOT:
+        case PARTITION_USR:
+        case PARTITION_HOME:
+        case PARTITION_SRV:
+        case PARTITION_VAR:
+        case PARTITION_TMP:
+        case PARTITION_XBOOTLDR:
+                return true;
+
+        default:
+                return false;
+        }
+}
+
+/* Returns true for the partition types (identified by their designator) listed below, i.e. the ones that
+ * "know" the no-auto setting. */
+bool gpt_partition_type_knows_no_auto(GptPartitionType type) {
+        switch (type.designator) {
+
+        case PARTITION_ROOT:
+        case PARTITION_ROOT_VERITY:
+        case PARTITION_USR:
+        case PARTITION_USR_VERITY:
+        case PARTITION_HOME:
+        case PARTITION_SRV:
+        case PARTITION_VAR:
+        case PARTITION_TMP:
+        case PARTITION_XBOOTLDR:
+        case PARTITION_SWAP:
+                return true;
+
+        default:
+                return false;
+        }
+}
+
+/* Checks whether the specified buffer plausibly is a GPT header: besides the "EFI PART" signature this
+ * also validates the revision, the header size and the LBA the header claims to be located at. */
+bool gpt_header_has_signature(const GptHeader *p) {
+        static const char magic[8] = { 'E', 'F', 'I', ' ', 'P', 'A', 'R', 'T' };
+        uint32_t header_size;
+
+        assert(p);
+
+        if (memcmp(p->signature, magic, sizeof(magic)) != 0)
+                return false;
+
+        /* the only known revision of the spec: 1.0 */
+        if (le32toh(p->revision) != UINT32_C(0x00010000))
+                return false;
+
+        /* Must cover at least our structure, and if it's larger than a sector something is off… */
+        header_size = le32toh(p->header_size);
+        if (header_size < sizeof(GptHeader) || header_size > 4096)
+                return false;
+
+        /* this sector must claim to be at sector offset 1 */
+        return le64toh(p->my_lba) == 1;
+}
diff --git a/src/shared/gpt.h b/src/shared/gpt.h
new file mode 100644
index 0000000..21976e5
--- /dev/null
+++ b/src/shared/gpt.h
@@ -0,0 +1,102 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <uchar.h>
+
+#include "sd-gpt.h"
+#include "sd-id128.h"
+
+#include "architecture.h"
+#include "id128-util.h"
+#include "sparse-endian.h"
+
+/* maximum length of gpt label */
+#define GPT_LABEL_MAX 36
+
+/* The role a partition plays. The regular values are used as indexes into the string tables in gpt.c, which
+ * are sized by _PARTITION_DESIGNATOR_MAX. */
+typedef enum PartitionDesignator {
+ PARTITION_ROOT, /* Primary architecture */
+ PARTITION_USR,
+ PARTITION_HOME,
+ PARTITION_SRV,
+ PARTITION_ESP,
+ PARTITION_XBOOTLDR,
+ PARTITION_SWAP,
+ PARTITION_ROOT_VERITY, /* verity data for the PARTITION_ROOT partition */
+ PARTITION_USR_VERITY,
+ PARTITION_ROOT_VERITY_SIG, /* PKCS#7 signature for root hash for the PARTITION_ROOT partition */
+ PARTITION_USR_VERITY_SIG,
+ PARTITION_TMP,
+ PARTITION_VAR,
+ _PARTITION_DESIGNATOR_MAX, /* one past the last regular value */
+ _PARTITION_DESIGNATOR_INVALID = -EINVAL, /* negative marker for "no designator" */
+} PartitionDesignator;
+
+bool partition_designator_is_versioned(PartitionDesignator d);
+
+PartitionDesignator partition_verity_of(PartitionDesignator p);
+PartitionDesignator partition_verity_sig_of(PartitionDesignator p);
+PartitionDesignator partition_verity_to_data(PartitionDesignator d);
+PartitionDesignator partition_verity_sig_to_data(PartitionDesignator d);
+
+const char* partition_designator_to_string(PartitionDesignator d) _const_;
+PartitionDesignator partition_designator_from_string(const char *name) _pure_;
+
+const char *gpt_partition_type_uuid_to_string(sd_id128_t id);
+const char *gpt_partition_type_uuid_to_string_harder(
+ sd_id128_t id,
+ char buffer[static SD_ID128_UUID_STRING_MAX]);
+
+#define GPT_PARTITION_TYPE_UUID_TO_STRING_HARDER(id) \
+ gpt_partition_type_uuid_to_string_harder((id), (char[SD_ID128_UUID_STRING_MAX]) {})
+
+Architecture gpt_partition_type_uuid_to_arch(sd_id128_t id);
+
+/* Describes one GPT partition type, see gpt_partition_type_table[] in gpt.c for the list of known ones. */
+typedef struct GptPartitionType {
+ sd_id128_t uuid; /* the partition type UUID */
+ const char *name; /* name of the type; left unset for types synthesized from unknown UUIDs */
+ Architecture arch; /* _ARCHITECTURE_INVALID for types not bound to an architecture */
+ PartitionDesignator designator; /* _PARTITION_DESIGNATOR_INVALID for types without designator */
+} GptPartitionType;
+
+extern const GptPartitionType gpt_partition_type_table[];
+
+int gpt_partition_label_valid(const char *s);
+
+GptPartitionType gpt_partition_type_from_uuid(sd_id128_t id);
+int gpt_partition_type_from_string(const char *s, GptPartitionType *ret);
+
+GptPartitionType gpt_partition_type_override_architecture(GptPartitionType type, Architecture arch);
+
+const char *gpt_partition_type_mountpoint_nulstr(GptPartitionType type);
+
+bool gpt_partition_type_knows_read_only(GptPartitionType type);
+bool gpt_partition_type_knows_growfs(GptPartitionType type);
+bool gpt_partition_type_knows_no_auto(GptPartitionType type);
+
+/* Packed layout of a single GPT partition table entry. The integer fields use the little-endian le64_t
+ * type. */
+typedef struct {
+ uint8_t partition_type_guid[16];
+ uint8_t unique_partition_guid[16];
+ le64_t starting_lba;
+ le64_t ending_lba;
+ le64_t attributes;
+ char16_t partition_name[36]; /* 36 UTF-16 units, same value as GPT_LABEL_MAX */
+} _packed_ GptPartitionEntry;
+
+/* Packed layout of the GPT header. The integer fields use the little-endian le32_t/le64_t types, convert
+ * them with le32toh()/le64toh() before use, as gpt_header_has_signature() does. */
+typedef struct {
+ char signature[8]; /* compared against "EFI PART" by gpt_header_has_signature() */
+ le32_t revision;
+ le32_t header_size;
+ le32_t crc32;
+ le32_t reserved;
+ le64_t my_lba; /* expected to be 1 by gpt_header_has_signature() */
+ le64_t alternate_lba;
+ le64_t first_usable_lba;
+ le64_t last_usable_lba;
+ uint8_t disk_guid[16];
+ le64_t partition_entry_lba;
+ le32_t number_of_partition_entries;
+ le32_t size_of_partition_entry;
+ le32_t partition_entry_array_crc32;
+} _packed_ GptHeader;
+
+bool gpt_header_has_signature(const GptHeader *p);
diff --git a/src/shared/group-record.c b/src/shared/group-record.c
new file mode 100644
index 0000000..1e33bdf
--- /dev/null
+++ b/src/shared/group-record.c
@@ -0,0 +1,347 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "group-record.h"
+#include "strv.h"
+#include "uid-alloc-range.h"
+#include "user-util.h"
+
+/* Allocates a new, empty group record with a reference count of 1. All fields that have a dedicated
+ * "unset" value are initialized to it. Returns NULL if the allocation fails. */
+GroupRecord* group_record_new(void) {
+        GroupRecord *g = new(GroupRecord, 1);
+
+        if (!g)
+                return NULL;
+
+        *g = (GroupRecord) {
+                .n_ref = 1,
+                .gid = GID_INVALID,
+                .disposition = _USER_DISPOSITION_INVALID,
+                .last_change_usec = UINT64_MAX,
+        };
+
+        return g;
+}
+
+/* Releases a group record and everything it owns. NULL is accepted and ignored. Always returns NULL. */
+static GroupRecord *group_record_free(GroupRecord *g) {
+        if (!g)
+                return NULL;
+
+        /* Plain strings */
+        free(g->group_name);
+        free(g->realm);
+        free(g->group_name_and_realm_auto);
+        free(g->description);
+        free(g->service);
+
+        /* String lists; the hashed passwords are released with the erasing variant */
+        strv_free(g->members);
+        strv_free(g->administrators);
+        strv_free_erase(g->hashed_password);
+
+        json_variant_unref(g->json);
+
+        return mfree(g);
+}
+
+DEFINE_TRIVIAL_REF_UNREF_FUNC(GroupRecord, group_record, group_record_free);
+
+/* JSON dispatch callback for the "privileged" section of a group record: parses the "hashedPassword" field
+ * into GroupRecord.hashed_password. 'name' is not used. */
+static int dispatch_privileged(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+
+ static const JsonDispatch privileged_dispatch_table[] = {
+ { "hashedPassword", _JSON_VARIANT_TYPE_INVALID, json_dispatch_strv, offsetof(GroupRecord, hashed_password), JSON_SAFE },
+ {},
+ };
+
+ return json_dispatch(variant, privileged_dispatch_table, flags, userdata);
+}
+
+/* Processes the "binding" section of a group record. The section is an object keyed by machine ID, only
+ * the sub-object for the local machine (if there is one) is parsed. A missing section or a section without
+ * entry for the local machine is not an error. */
+static int dispatch_binding(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+
+        static const JsonDispatch binding_dispatch_table[] = {
+                { "gid", JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid, offsetof(GroupRecord, gid), 0 },
+                {},
+        };
+
+        sd_id128_t machine_id;
+        JsonVariant *section;
+        int r;
+
+        if (!variant)
+                return 0;
+
+        if (!json_variant_is_object(variant))
+                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an object.", strna(name));
+
+        r = sd_id128_get_machine(&machine_id);
+        if (r < 0)
+                return json_log(variant, flags, r, "Failed to determine machine ID: %m");
+
+        /* Pick the sub-object that is keyed by our own machine ID */
+        section = json_variant_by_key(variant, SD_ID128_TO_STRING(machine_id));
+        if (section)
+                return json_dispatch(section, binding_dispatch_table, flags, userdata);
+
+        return 0;
+}
+
+/* Processes the "perMachine" section of a group record. The section is an array of objects, each of which
+ * may carry "matchMachineId" and/or "matchHostname" fields. Objects for which one of the two matches the
+ * local system are parsed, all others are skipped. A missing section is not an error. */
+static int dispatch_per_machine(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+
+        static const JsonDispatch per_machine_dispatch_table[] = {
+                { "matchMachineId", _JSON_VARIANT_TYPE_INVALID, NULL, 0, 0 },
+                { "matchHostname", _JSON_VARIANT_TYPE_INVALID, NULL, 0, 0 },
+                { "gid", JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid, offsetof(GroupRecord, gid), 0 },
+                { "members", JSON_VARIANT_ARRAY, json_dispatch_user_group_list, offsetof(GroupRecord, members), JSON_RELAX},
+                { "administrators", JSON_VARIANT_ARRAY, json_dispatch_user_group_list, offsetof(GroupRecord, administrators), JSON_RELAX},
+                {},
+        };
+
+        JsonVariant *entry;
+        int r;
+
+        if (!variant)
+                return 0;
+
+        if (!json_variant_is_array(variant))
+                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an array.", strna(name));
+
+        JSON_VARIANT_ARRAY_FOREACH(entry, variant) {
+                JsonVariant *match;
+                bool applies = false;
+
+                if (!json_variant_is_object(entry))
+                        return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an array of objects.", strna(name));
+
+                /* First, check the machine ID match, if there is one */
+                match = json_variant_by_key(entry, "matchMachineId");
+                if (match) {
+                        r = per_machine_id_match(match, flags);
+                        if (r < 0)
+                                return r;
+
+                        applies = r > 0;
+                }
+
+                /* The hostname match is only consulted if the machine ID did not already match */
+                if (!applies) {
+                        match = json_variant_by_key(entry, "matchHostname");
+                        if (match) {
+                                r = per_machine_hostname_match(match, flags);
+                                if (r < 0)
+                                        return r;
+
+                                applies = r > 0;
+                        }
+                }
+
+                if (applies) {
+                        r = json_dispatch(entry, per_machine_dispatch_table, flags, userdata);
+                        if (r < 0)
+                                return r;
+                }
+        }
+
+        return 0;
+}
+
+/* Processes the "status" section of a group record. The section is an object keyed by machine ID, only the
+ * sub-object for the local machine (if there is one) is parsed. A missing section or a section without
+ * entry for the local machine is not an error. */
+static int dispatch_status(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+
+        static const JsonDispatch status_dispatch_table[] = {
+                { "service", JSON_VARIANT_STRING, json_dispatch_string, offsetof(GroupRecord, service), JSON_SAFE },
+                {},
+        };
+
+        sd_id128_t machine_id;
+        JsonVariant *section;
+        int r;
+
+        if (!variant)
+                return 0;
+
+        if (!json_variant_is_object(variant))
+                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an object.", strna(name));
+
+        r = sd_id128_get_machine(&machine_id);
+        if (r < 0)
+                return json_log(variant, flags, r, "Failed to determine machine ID: %m");
+
+        /* Pick the sub-object that is keyed by our own machine ID */
+        section = json_variant_by_key(variant, SD_ID128_TO_STRING(machine_id));
+        if (section)
+                return json_dispatch(section, status_dispatch_table, flags, userdata);
+
+        return 0;
+}
+
+/* Fills in fields that are derived from others: for records that carry the regular section and have a
+ * realm set, the combined "<groupName>@<realm>" string is generated, unless it is set already. */
+static int group_record_augment(GroupRecord *h, JsonDispatchFlags json_flags) {
+        assert(h);
+
+        if (!FLAGS_SET(h->mask, USER_RECORD_REGULAR))
+                return 0;
+
+        assert(h->group_name);
+
+        /* Nothing to do if the string exists already, or if there's no realm to append */
+        if (h->group_name_and_realm_auto || !h->realm)
+                return 0;
+
+        h->group_name_and_realm_auto = strjoin(h->group_name, "@", h->realm);
+        if (!h->group_name_and_realm_auto)
+                return json_log_oom(h->json, json_flags);
+
+        return 0;
+}
+
+/* Initializes the (so far unloaded) group record 'h' from the JSON object 'v'.
+ *
+ * The JSON data is first passed through user_group_record_mangle(), which fills in h->json and h->mask. The
+ * top-level fields are then parsed, followed by the "perMachine", "binding" and "status" sections, in that
+ * order, so that the later ones override what the earlier ones set. Finally, derived fields are filled in
+ * via group_record_augment().
+ *
+ * Returns 0 on success, a negative errno-style error otherwise. */
+int group_record_load(
+ GroupRecord *h,
+ JsonVariant *v,
+ UserRecordLoadFlags load_flags) {
+
+ static const JsonDispatch group_dispatch_table[] = {
+ { "groupName", JSON_VARIANT_STRING, json_dispatch_user_group_name, offsetof(GroupRecord, group_name), JSON_RELAX},
+ { "realm", JSON_VARIANT_STRING, json_dispatch_realm, offsetof(GroupRecord, realm), 0 },
+ { "description", JSON_VARIANT_STRING, json_dispatch_gecos, offsetof(GroupRecord, description), 0 },
+ { "disposition", JSON_VARIANT_STRING, json_dispatch_user_disposition, offsetof(GroupRecord, disposition), 0 },
+ { "service", JSON_VARIANT_STRING, json_dispatch_string, offsetof(GroupRecord, service), JSON_SAFE },
+ { "lastChangeUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(GroupRecord, last_change_usec), 0 },
+ { "gid", JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid, offsetof(GroupRecord, gid), 0 },
+ { "members", JSON_VARIANT_ARRAY, json_dispatch_user_group_list, offsetof(GroupRecord, members), JSON_RELAX},
+ { "administrators", JSON_VARIANT_ARRAY, json_dispatch_user_group_list, offsetof(GroupRecord, administrators), JSON_RELAX},
+
+ { "privileged", JSON_VARIANT_OBJECT, dispatch_privileged, 0, 0 },
+
+ /* Not defined for now, for groups, but let's at least generate sensible errors about it */
+ { "secret", JSON_VARIANT_OBJECT, json_dispatch_unsupported, 0, 0 },
+
+ /* Ignore the perMachine, binding and status stuff here, and process it later, so that it overrides whatever is set above */
+ { "perMachine", JSON_VARIANT_ARRAY, NULL, 0, 0 },
+ { "binding", JSON_VARIANT_OBJECT, NULL, 0, 0 },
+ { "status", JSON_VARIANT_OBJECT, NULL, 0, 0 },
+
+ /* Ignore 'signature', we check it with explicit accessors instead */
+ { "signature", JSON_VARIANT_ARRAY, NULL, 0, 0 },
+ {},
+ };
+
+ JsonDispatchFlags json_flags = USER_RECORD_LOAD_FLAGS_TO_JSON_DISPATCH_FLAGS(load_flags);
+ int r;
+
+ assert(h);
+ /* The record must not have been loaded before */
+ assert(!h->json);
+
+ /* Note that this call will leave a half-initialized record around on failure! */
+
+ /* Refuse if the caller insists on the secret or privileged sections being present */
+ if ((USER_RECORD_REQUIRE_MASK(load_flags) & (USER_RECORD_SECRET|USER_RECORD_PRIVILEGED)))
+ return json_log(v, json_flags, SYNTHETIC_ERRNO(EINVAL), "Secret and privileged section currently not available for groups, refusing.");
+
+ r = user_group_record_mangle(v, load_flags, &h->json, &h->mask);
+ if (r < 0)
+ return r;
+
+ /* From here on we operate on the mangled JSON object, not on the one passed in */
+ r = json_dispatch(h->json, group_dispatch_table, json_flags, h);
+ if (r < 0)
+ return r;
+
+ /* During the parsing operation above we ignored the 'perMachine', 'binding' and 'status' fields, since we want
+ * them to override the global options. Let's process them now. */
+
+ r = dispatch_per_machine("perMachine", json_variant_by_key(h->json, "perMachine"), json_flags, h);
+ if (r < 0)
+ return r;
+
+ r = dispatch_binding("binding", json_variant_by_key(h->json, "binding"), json_flags, h);
+ if (r < 0)
+ return r;
+
+ r = dispatch_status("status", json_variant_by_key(h->json, "status"), json_flags, h);
+ if (r < 0)
+ return r;
+
+ /* A record with a regular section must carry a group name; group_record_augment() relies on that */
+ if (FLAGS_SET(h->mask, USER_RECORD_REGULAR) && !h->group_name)
+ return json_log(h->json, json_flags, SYNTHETIC_ERRNO(EINVAL), "Group name field missing, refusing.");
+
+ r = group_record_augment(h, json_flags);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+/* Builds a JSON object from the variadic arguments via json_buildv() and loads it into a newly allocated
+ * group record with USER_RECORD_LOAD_FULL. On success the record is returned in *ret and 0 is returned,
+ * otherwise a negative errno-style error is returned and *ret is left untouched. */
+int group_record_build(GroupRecord **ret, ...) {
+        _cleanup_(json_variant_unrefp) JsonVariant *json = NULL;
+        _cleanup_(group_record_unrefp) GroupRecord *record = NULL;
+        va_list args;
+        int r;
+
+        assert(ret);
+
+        /* Make sure va_end() is invoked before we look at the result */
+        va_start(args, ret);
+        r = json_buildv(&json, args);
+        va_end(args);
+        if (r < 0)
+                return r;
+
+        record = group_record_new();
+        if (!record)
+                return -ENOMEM;
+
+        r = group_record_load(record, json, USER_RECORD_LOAD_FULL);
+        if (r < 0)
+                return r;
+
+        *ret = TAKE_PTR(record);
+        return 0;
+}
+
+/* Returns the combined "<groupName>@<realm>" string of the record if there is one, and the plain group
+ * name otherwise. */
+const char *group_record_group_name_and_realm(GroupRecord *h) {
+        assert(h);
+
+        if (!h->group_name_and_realm_auto) {
+                /* No pre-initialized joined string? Then we cannot have a realm. */
+                assert(!h->realm);
+                return h->group_name;
+        }
+
+        return h->group_name_and_realm_auto;
+}
+
+/* Returns the disposition of the group. An explicitly declared disposition always wins; otherwise it is
+ * derived from the GID range the group is in. If the record has no valid GID either,
+ * _USER_DISPOSITION_INVALID is returned. */
+UserDisposition group_record_disposition(GroupRecord *h) {
+        gid_t gid;
+
+        assert(h);
+
+        if (h->disposition >= 0)
+                return h->disposition;
+
+        /* Nothing declared, hence derive from the GID */
+        gid = h->gid;
+
+        if (!gid_is_valid(gid))
+                return _USER_DISPOSITION_INVALID;
+
+        if (gid == 0 || gid == GID_NOBODY)
+                return USER_INTRINSIC;
+
+        if (gid_is_system(gid))
+                return USER_SYSTEM;
+
+        if (gid_is_dynamic(gid))
+                return USER_DYNAMIC;
+
+        if (gid_is_container(gid))
+                return USER_CONTAINER;
+
+        if (gid > INT32_MAX)
+                return USER_RESERVED;
+
+        return USER_REGULAR;
+}
+
+/* Creates a new group record by loading the JSON data of 'h' a second time, with the specified load flags.
+ * On success the new record is returned in *ret and 0 is returned, otherwise a negative errno-style error
+ * is returned and *ret is left untouched. */
+int group_record_clone(GroupRecord *h, UserRecordLoadFlags flags, GroupRecord **ret) {
+        _cleanup_(group_record_unrefp) GroupRecord *copy = NULL;
+        int r;
+
+        assert(h);
+        assert(ret);
+
+        copy = group_record_new();
+        if (!copy)
+                return -ENOMEM;
+
+        r = group_record_load(copy, h->json, flags);
+        if (r < 0)
+                return r;
+
+        *ret = TAKE_PTR(copy);
+        return 0;
+}
diff --git a/src/shared/group-record.h b/src/shared/group-record.h
new file mode 100644
index 0000000..f810204
--- /dev/null
+++ b/src/shared/group-record.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "json.h"
+#include "user-record.h"
+
+/* In-memory representation of a group record, loaded from JSON by group_record_load(). */
+typedef struct GroupRecord {
+ unsigned n_ref; /* reference counter, starts at 1 in group_record_new() */
+ UserRecordMask mask; /* filled in by user_group_record_mangle() during loading */
+ bool incomplete;
+
+ char *group_name;
+ char *realm;
+ char *group_name_and_realm_auto; /* "<group_name>@<realm>", generated during loading if a realm is set */
+
+ char *description;
+
+ UserDisposition disposition; /* _USER_DISPOSITION_INVALID if not declared, see group_record_disposition() */
+ uint64_t last_change_usec; /* UINT64_MAX if unset */
+
+ gid_t gid; /* GID_INVALID if unset */
+
+ char **members;
+
+ char *service;
+
+ /* The following exist mostly so that we can cover the full /etc/gshadow set of fields, we currently
+ * do not actually make use of these */
+ char **administrators; /* maps to 'struct sgrp' .sg_adm field */
+ char **hashed_password; /* maps to 'struct sgrp' .sg_passwd field */
+
+ JsonVariant *json; /* the JSON object the record was loaded from, as returned by user_group_record_mangle() */
+} GroupRecord;
+
+GroupRecord* group_record_new(void);
+GroupRecord* group_record_ref(GroupRecord *g);
+GroupRecord* group_record_unref(GroupRecord *g);
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(GroupRecord*, group_record_unref);
+
+int group_record_load(GroupRecord *h, JsonVariant *v, UserRecordLoadFlags flags);
+int group_record_build(GroupRecord **ret, ...);
+int group_record_clone(GroupRecord *g, UserRecordLoadFlags flags, GroupRecord **ret);
+
+const char *group_record_group_name_and_realm(GroupRecord *h);
+UserDisposition group_record_disposition(GroupRecord *h);
diff --git a/src/shared/hibernate-util.c b/src/shared/hibernate-util.c
new file mode 100644
index 0000000..0d215e8
--- /dev/null
+++ b/src/shared/hibernate-util.c
@@ -0,0 +1,520 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+/***
+ Copyright © 2018 Dell Inc.
+***/
+
+#include <linux/fs.h>
+#include <linux/magic.h>
+#include <stddef.h>
+#include <sys/ioctl.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "blockdev-util.h"
+#include "btrfs-util.h"
+#include "device-util.h"
+#include "devnum-util.h"
+#include "efivars.h"
+#include "env-util.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "hibernate-util.h"
+#include "log.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "stat-util.h"
+#include "string-util.h"
+#include "strv.h"
+
+#define HIBERNATION_SWAP_THRESHOLD 0.98
+
+/* Releases the resources owned by a HibernationDevice, i.e. its heap-allocated path string. The
+ * structure itself is not freed. The pointer is reset to NULL afterwards, so calling this function a
+ * second time on the same object (or inspecting ->path later) is harmless. */
+void hibernation_device_done(HibernationDevice *device) {
+        assert(device);
+
+        device->path = mfree(device->path);
+}
+
+/* Reads the complete extent map of the regular file referenced by 'fd' via the FS_IOC_FIEMAP ioctl.
+ *
+ * On success 0 is returned and *ret points to a newly allocated 'struct fiemap' whose fm_extents[] array
+ * holds the extents collected over all ioctl rounds and whose fm_mapped_extents is their total number.
+ * The caller takes ownership of *ret and has to free() it.
+ *
+ * Returns -ENOTTY if 'fd' is not a regular file, -ENOMEM on allocation failure, and the value produced
+ * by log_debug_errno() if fstat() or the ioctl fails.
+ *
+ * NOTE(review): if the file has no extents the loop ends before anything is copied into
+ * result_fiemap->fm_extents; callers should check fm_mapped_extents before touching fm_extents[0]. */
+int read_fiemap(int fd, struct fiemap **ret) {
+ _cleanup_free_ struct fiemap *fiemap = NULL, *result_fiemap = NULL;
+ struct stat statinfo;
+ uint32_t result_extents = 0;
+ uint64_t fiemap_start = 0, fiemap_length;
+ /* Number of 'struct fiemap_extent' sized slots needed to hold the 'struct fiemap' header itself;
+ * all buffer sizes below are expressed in units of extents. */
+ const size_t n_extra = DIV_ROUND_UP(sizeof(struct fiemap), sizeof(struct fiemap_extent));
+
+ assert(fd >= 0);
+ assert(ret);
+
+ if (fstat(fd, &statinfo) < 0)
+ return log_debug_errno(errno, "Cannot determine file size: %m");
+ if (!S_ISREG(statinfo.st_mode))
+ return -ENOTTY;
+ fiemap_length = statinfo.st_size;
+
+ /* Zero this out in case we run on a file with no extents */
+ fiemap = calloc(n_extra, sizeof(struct fiemap_extent));
+ if (!fiemap)
+ return -ENOMEM;
+
+ result_fiemap = malloc_multiply(n_extra, sizeof(struct fiemap_extent));
+ if (!result_fiemap)
+ return -ENOMEM;
+
+ /* XFS filesystem has incorrect implementation of fiemap ioctl and
+ * returns extents for only one block-group at a time, so we need
+ * to handle it manually, starting the next fiemap call from the end
+ * of the last extent
+ */
+ while (fiemap_start < fiemap_length) {
+ /* Reset the request header; fm_extent_count is 0 here, so the first ioctl only counts. */
+ *fiemap = (struct fiemap) {
+ .fm_start = fiemap_start,
+ .fm_length = fiemap_length,
+ .fm_flags = FIEMAP_FLAG_SYNC,
+ };
+
+ /* Find out how many extents there are */
+ if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0)
+ return log_debug_errno(errno, "Failed to read extents: %m");
+
+ /* Nothing to process */
+ if (fiemap->fm_mapped_extents == 0)
+ break;
+
+ /* Resize fiemap to allow us to read in the extents, result fiemap has to hold all
+ * the extents for the whole file. Add space for the initial struct fiemap. */
+ if (!greedy_realloc0((void**) &fiemap, n_extra + fiemap->fm_mapped_extents, sizeof(struct fiemap_extent)))
+ return -ENOMEM;
+
+ /* Second ioctl: same range, but now with room for the extents reported above. */
+ fiemap->fm_extent_count = fiemap->fm_mapped_extents;
+ fiemap->fm_mapped_extents = 0;
+
+ if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0)
+ return log_debug_errno(errno, "Failed to read extents: %m");
+
+ /* Resize result_fiemap to allow us to copy in the extents */
+ if (!greedy_realloc((void**) &result_fiemap,
+ n_extra + result_extents + fiemap->fm_mapped_extents, sizeof(struct fiemap_extent)))
+ return -ENOMEM;
+
+ /* Append this round's extents behind the ones gathered so far. */
+ memcpy(result_fiemap->fm_extents + result_extents,
+ fiemap->fm_extents,
+ sizeof(struct fiemap_extent) * fiemap->fm_mapped_extents);
+
+ result_extents += fiemap->fm_mapped_extents;
+
+ /* Highly unlikely that it is zero */
+ if (_likely_(fiemap->fm_mapped_extents > 0)) {
+ uint32_t i = fiemap->fm_mapped_extents - 1;
+
+ /* Continue right after the last extent returned in this round. */
+ fiemap_start = fiemap->fm_extents[i].fe_logical +
+ fiemap->fm_extents[i].fe_length;
+
+ if (fiemap->fm_extents[i].fe_flags & FIEMAP_EXTENT_LAST)
+ break;
+ }
+ }
+
+ /* Copy the header of the last request, then overwrite the count with the accumulated total. */
+ memcpy(result_fiemap, fiemap, sizeof(struct fiemap));
+ result_fiemap->fm_mapped_extents = result_extents;
+ *ret = TAKE_PTR(result_fiemap);
+ return 0;
+}
+
+/* Reads the kernel's current resume configuration from sysfs.
+ *
+ * *ret_devno receives the device number parsed from /sys/power/resume. *ret_offset receives the value
+ * parsed from /sys/power/resume_offset, or UINT64_MAX if that file does not exist. A non-zero offset
+ * combined with an unset (zero) device number is refused with -EINVAL. Returns 0 on success, negative
+ * on failure. */
+static int read_resume_config(dev_t *ret_devno, uint64_t *ret_offset) {
+        _cleanup_free_ char *resume_line = NULL, *offset_line = NULL;
+        uint64_t resume_offset;
+        dev_t resume_devno;
+        int r;
+
+        assert(ret_devno);
+        assert(ret_offset);
+
+        r = read_one_line_file("/sys/power/resume", &resume_line);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to read /sys/power/resume: %m");
+
+        r = parse_devnum(resume_line, &resume_devno);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to parse /sys/power/resume devno '%s': %m", resume_line);
+
+        r = read_one_line_file("/sys/power/resume_offset", &offset_line);
+        if (r >= 0) {
+                r = safe_atou64(offset_line, &resume_offset);
+                if (r < 0)
+                        return log_debug_errno(r,
+                                               "Failed to parse /sys/power/resume_offset '%s': %m", offset_line);
+        } else if (r == -ENOENT) {
+                /* The attribute is missing: mark the offset as "unknown" rather than failing. */
+                log_debug_errno(r, "Kernel does not expose resume_offset, skipping.");
+                resume_offset = UINT64_MAX;
+        } else
+                return log_debug_errno(r, "Failed to read /sys/power/resume_offset: %m");
+
+        if (resume_offset != 0 && resume_offset != UINT64_MAX && resume_devno == 0)
+                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Found resume_offset=%" PRIu64 " but resume= is unset, refusing.", resume_offset);
+
+        *ret_devno = resume_devno;
+        *ret_offset = resume_offset;
+
+        return 0;
+}
+
+/* entry in /proc/swaps */
+typedef struct SwapEntry {
+ char *path; /* first column of the /proc/swaps line; heap-allocated, freed by swap_entry_done() */
+ bool swapfile; /* true if the type column is "file", false if it is "partition" */
+
+ uint64_t size; /* "size" column as parsed by read_swap_entries() */
+ uint64_t used; /* "used" column as parsed by read_swap_entries() */
+ int priority; /* "priority" column as parsed by read_swap_entries() */
+
+ /* Not present in original entry */
+ dev_t devno; /* filled in by swap_entry_get_resume_config() */
+ uint64_t offset; /* filled in by swap_entry_get_resume_config(): physical offset divided by page_size() */
+} SwapEntry;
+
+/* Dynamically sized array of SwapEntry objects, filled by read_swap_entries() and released with
+ * swap_entries_done(). */
+typedef struct SwapEntries {
+ SwapEntry *swaps; /* heap-allocated array, may be NULL when n_swaps is 0 */
+ size_t n_swaps; /* number of valid elements in 'swaps' */
+} SwapEntries;
+
+/* Frees the path string owned by a SwapEntry. The entry object itself is not freed. */
+static void swap_entry_done(SwapEntry *entry) {
+        free(ASSERT_PTR(entry)->path);
+}
+
+/* Releases every element of a SwapEntries array and then the array itself. The SwapEntries object is
+ * not freed, it is expected to live on the stack or inside another object. */
+static void swap_entries_done(SwapEntries *entries) {
+        assert(entries);
+
+        for (size_t idx = 0; idx < entries->n_swaps; idx++)
+                swap_entry_done(entries->swaps + idx);
+
+        free(entries->swaps);
+}
+
+/* Determines the values that would have to be written to /sys/power/resume and
+ * /sys/power/resume_offset in order to resume from the given swap entry, and stores them in
+ * swap->devno and swap->offset.
+ *
+ * For swap partitions the device number of the block device is used and the offset is 0. For swap
+ * files the backing block device is looked up, and the offset is the physical offset of the file's
+ * first extent, expressed in memory pages.
+ *
+ * Returns 0 on success (swap->devno may still be 0 if get_block_device_fd() found no backing
+ * device), -ENOTBLK if a "partition" entry is not a block device, -ENODATA if a swap file has no
+ * mapped extents, or another negative errno-style value on failure. */
+static int swap_entry_get_resume_config(SwapEntry *swap) {
+        _cleanup_close_ int fd = -EBADF;
+        uint64_t offset_raw;
+        struct stat st;
+        int r;
+
+        assert(swap);
+        assert(swap->path);
+
+        fd = open(swap->path, O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
+        if (fd < 0)
+                return -errno;
+
+        if (fstat(fd, &st) < 0)
+                return -errno;
+
+        if (!swap->swapfile) {
+                if (!S_ISBLK(st.st_mode))
+                        return -ENOTBLK;
+
+                swap->devno = st.st_rdev;
+                swap->offset = 0;
+                return 0;
+        }
+
+        r = stat_verify_regular(&st);
+        if (r < 0)
+                return r;
+
+        r = get_block_device_fd(fd, &swap->devno);
+        if (r < 0)
+                return r;
+
+        r = fd_is_fs_type(fd, BTRFS_SUPER_MAGIC);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to check if swap file '%s' is on Btrfs: %m", swap->path);
+        if (r > 0) {
+                r = btrfs_get_file_physical_offset_fd(fd, &offset_raw);
+                if (r < 0)
+                        return r;
+        } else {
+                _cleanup_free_ struct fiemap *fiemap = NULL;
+
+                r = read_fiemap(fd, &fiemap);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to read extent map for swap file '%s': %m", swap->path);
+
+                /* read_fiemap() succeeds for files without any extent, but then fm_extents[] carries
+                 * no valid data. Refuse instead of deriving a resume offset from garbage. */
+                if (fiemap->fm_mapped_extents == 0)
+                        return log_debug_errno(SYNTHETIC_ERRNO(ENODATA),
+                                               "Swap file '%s' has no mapped extents, cannot determine its offset.",
+                                               swap->path);
+
+                offset_raw = fiemap->fm_extents[0].fe_physical;
+        }
+
+        /* The kernel expects the offset in units of memory pages, not bytes. */
+        swap->offset = offset_raw / page_size();
+        return 0;
+}
+
+/* Parses /proc/swaps into a SwapEntries array.
+ *
+ * Only entries of type "file" and "partition" are kept; deleted swap files, zram partitions and all
+ * other swap types are logged and skipped. The devno/offset members of the returned entries are left
+ * zero-initialized here. On success 0 is returned and ownership of the array moves to *ret, which the
+ * caller releases with swap_entries_done(). Returns a negative value if the file cannot be opened, a
+ * line cannot be parsed, or memory runs out. */
+static int read_swap_entries(SwapEntries *ret) {
+ _cleanup_(swap_entries_done) SwapEntries entries = {};
+ _cleanup_fclose_ FILE *f = NULL;
+
+ assert(ret);
+
+ f = fopen("/proc/swaps", "re");
+ if (!f)
+ return log_debug_errno(errno, "Failed to open /proc/swaps: %m");
+
+ /* Remove header */
+ (void) fscanf(f, "%*s %*s %*s %*s %*s\n");
+
+ /* 'i' only serves as line number for the error message below. */
+ for (unsigned i = 1;; i++) {
+ _cleanup_(swap_entry_done) SwapEntry swap = {};
+ _cleanup_free_ char *type = NULL;
+ int k;
+
+ /* %ms lets fscanf() allocate the strings; they are freed by the cleanup handlers above. */
+ k = fscanf(f,
+ "%ms " /* device/file path */
+ "%ms " /* type of swap */
+ "%" PRIu64 /* swap size */
+ "%" PRIu64 /* used */
+ "%i" /* priority */
+ "\n",
+ &swap.path, &type, &swap.size, &swap.used, &swap.priority);
+ if (k == EOF)
+ break;
+ if (k != 5)
+ return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Failed to parse /proc/swaps line %u.", i);
+
+ if (streq(type, "file")) {
+ /* The suffix is matched in its escaped form, i.e. with the space written as \040. */
+ if (endswith(swap.path, "\\040(deleted)")) {
+ log_debug("Swap file '%s' has been deleted, ignoring.", swap.path);
+ continue;
+ }
+
+ swap.swapfile = true;
+
+ } else if (streq(type, "partition")) {
+ const char *node;
+
+ node = path_startswith(swap.path, "/dev/");
+ if (node && startswith(node, "zram")) {
+ log_debug("Swap partition '%s' is a zram device, ignoring.", swap.path);
+ continue;
+ }
+
+ swap.swapfile = false;
+
+ } else {
+ log_debug("Swap type %s is not supported for hibernation, ignoring device: %s",
+ type, swap.path);
+ continue;
+ }
+
+ if (!GREEDY_REALLOC(entries.swaps, entries.n_swaps + 1))
+ return log_oom_debug();
+
+ /* Move the entry into the array; 'swap' is taken over so its cleanup handler has nothing left to free. */
+ entries.swaps[entries.n_swaps++] = TAKE_STRUCT(swap);
+ }
+
+ *ret = TAKE_STRUCT(entries);
+ return 0;
+}
+
+/* Attempt to find a suitable device for hibernation by parsing /proc/swaps, /sys/power/resume, and
+ * /sys/power/resume_offset.
+ *
+ * Beware:
+ *  Never use a device or file that hasn't been somehow specified by a user who would also be entrusted
+ *  with full system memory access (for example via /sys/power/resume) or that isn't an already active
+ *  swap area! Otherwise various security attacks might become possible, for example an attacker could
+ *  silently attach such a device and circumvent full disk encryption when it would be automatically used
+ *  for hibernation. Also, having a swap area on top of encryption is not per se enough to protect from all
+ *  such attacks.
+ *
+ * Parameters:
+ *  ret_device - optional; receives devno, offset (in pages) and a newly allocated path of the chosen
+ *               swap. Release with hibernation_device_done().
+ *  ret_size, ret_used - optional, but must be either both set or both NULL; receive the size/used
+ *               columns of the chosen /proc/swaps entry.
+ *
+ * Returns:
+ *  1 - Values are set in /sys/power/resume and /sys/power/resume_offset, and a matching swap was found.
+ *
+ *  0 - No values are set in /sys/power/resume and /sys/power/resume_offset.
+ *      ret will represent the highest priority swap discovered in /proc/swaps; among swaps of equal
+ *      priority the one with the most remaining space is picked.
+ *
+ *  -ENOSPC - No (usable) swap is available, or none matches /sys/power/resume.
+ *
+ *  Other negative value in the case of error */
+int find_suitable_hibernation_device_full(HibernationDevice *ret_device, uint64_t *ret_size, uint64_t *ret_used) {
+        _cleanup_(swap_entries_done) SwapEntries entries = {};
+        SwapEntry *entry = NULL;
+        uint64_t resume_config_offset;
+        dev_t resume_config_devno;
+        int r;
+
+        assert(!ret_size == !ret_used);
+
+        r = read_resume_config(&resume_config_devno, &resume_config_offset);
+        if (r < 0)
+                return r;
+
+        r = read_swap_entries(&entries);
+        if (r < 0)
+                return r;
+        if (entries.n_swaps == 0)
+                return log_debug_errno(SYNTHETIC_ERRNO(ENOSPC), "No swap space available for hibernation.");
+
+        FOREACH_ARRAY(swap, entries.swaps, entries.n_swaps) {
+                r = swap_entry_get_resume_config(swap);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to get devno and offset for swap '%s': %m", swap->path);
+                if (swap->devno == 0) {
+                        assert(swap->swapfile);
+
+                        /* This is not an error path, hence errno is meaningless here: no %m. */
+                        log_debug("Swap file '%s' is not backed by block device, ignoring.", swap->path);
+                        continue;
+                }
+
+                if (resume_config_devno > 0) {
+                        if (swap->devno == resume_config_devno &&
+                            (!swap->swapfile || resume_config_offset == UINT64_MAX || swap->offset == resume_config_offset)) {
+                                /* /sys/power/resume (resume=) is set, and the calculated swap file offset
+                                 * matches with /sys/power/resume_offset. If /sys/power/resume_offset is not
+                                 * exposed, we can't do proper check anyway, so use the found swap file too. */
+                                entry = swap;
+                                break;
+                        }
+
+                        /* If resume= is set, don't try to use other swap spaces. */
+                        continue;
+                }
+
+                /* Prefer the highest priority; the amount of free space only breaks ties between swaps
+                 * of the same priority, so that a low priority swap never displaces a higher one. */
+                if (!entry ||
+                    swap->priority > entry->priority ||
+                    (swap->priority == entry->priority &&
+                     swap->size - swap->used > entry->size - entry->used))
+                        entry = swap;
+        }
+
+        if (!entry) {
+                /* n_swaps == 0 was rejected early, so we get here either because nothing matched
+                 * resume=, or because every entry was skipped above for lacking a backing block device. */
+                if (resume_config_devno > 0)
+                        return log_debug_errno(SYNTHETIC_ERRNO(ENOSPC), "Cannot find swap entry corresponding to /sys/power/resume.");
+
+                return log_debug_errno(SYNTHETIC_ERRNO(ENOSPC), "No usable swap space available for hibernation.");
+        }
+
+        if (ret_device) {
+                char *path;
+
+                if (entry->swapfile) {
+                        /* For swap files report the backing block device, not the file itself. */
+                        r = device_path_make_canonical(S_IFBLK, entry->devno, &path);
+                        if (r < 0)
+                                return log_debug_errno(r,
+                                                       "Failed to format canonical device path for devno '" DEVNUM_FORMAT_STR "': %m",
+                                                       DEVNUM_FORMAT_VAL(entry->devno));
+                } else
+                        path = TAKE_PTR(entry->path);
+
+                *ret_device = (HibernationDevice) {
+                        .devno = entry->devno,
+                        .offset = entry->offset,
+                        .path = path,
+                };
+        }
+
+        if (ret_size) {
+                *ret_size = entry->size;
+                *ret_used = entry->used;
+        }
+
+        return resume_config_devno > 0;
+}
+
+/* Looks up the "Active(anon)" field of /proc/meminfo and stores its numeric value in *ret. Returns 0
+ * on success, or the negative value reported for a failed lookup or parse. *ret is left untouched on
+ * failure. */
+static int get_proc_meminfo_active(unsigned long long *ret) {
+        _cleanup_free_ char *field = NULL;
+        unsigned long long value;
+        int r;
+
+        assert(ret);
+
+        r = get_proc_field("/proc/meminfo", "Active(anon)", WHITESPACE, &field);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to retrieve Active(anon) from /proc/meminfo: %m");
+
+        r = safe_atollu(field, &value);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to parse Active(anon) '%s' from /proc/meminfo: %m", field);
+
+        *ret = value;
+        return 0;
+}
+
+/* Checks whether hibernating right now looks safe.
+ *
+ * The checks are: a suitable swap device can be found, either resume= is configured or the system was
+ * booted via EFI, and Active(anon) from /proc/meminfo does not exceed HIBERNATION_SWAP_THRESHOLD times
+ * the free space of the chosen swap. The last check is skipped if the environment variable
+ * $SYSTEMD_BYPASS_HIBERNATION_MEMORY_CHECK is true.
+ *
+ * Returns:
+ *  0 or 1 - checks passed. With the bypass variable set this is 0 if no swap was found at all and 1
+ *           otherwise; without it the value tells whether resume= is set.
+ *  -ENOTRECOVERABLE - resume= is not set and the system is not booted via EFI.
+ *  -ENOSPC - not enough free swap (or no swap and the bypass variable is not set).
+ *  Other negative values - errors from the helpers called. */
+int hibernation_is_safe(void) {
+ unsigned long long active;
+ uint64_t size, used;
+ bool resume_set, bypass_space_check;
+ int r;
+
+ /* Parse errors and an unset variable both count as "do not bypass". */
+ bypass_space_check = getenv_bool("SYSTEMD_BYPASS_HIBERNATION_MEMORY_CHECK") > 0;
+
+ r = find_suitable_hibernation_device_full(NULL, &size, &used);
+ if (r == -ENOSPC && bypass_space_check)
+ /* If we don't have any available swap space at all, and SYSTEMD_BYPASS_HIBERNATION_MEMORY_CHECK
+ * is set, skip all remaining checks since we can't do that properly anyway. It is quite
+ * possible that the user is using a setup similar to #30083. When we actually perform
+ * hibernation in sleep.c we'll check everything again. */
+ return 0;
+ if (r < 0)
+ return r;
+ resume_set = r > 0;
+
+ if (!resume_set && !is_efi_boot())
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+ "Not running on EFI and resume= is not set. Hibernation is not safe.");
+
+ if (bypass_space_check)
+ return true;
+
+ r = get_proc_meminfo_active(&active);
+ if (r < 0)
+ return r;
+
+ /* From here on 'r' is used as a boolean: does the active anonymous memory fit into the free swap? */
+ r = active <= (size - used) * HIBERNATION_SWAP_THRESHOLD;
+ log_debug("Detected %s swap for hibernation: Active(anon)=%llu kB, size=%" PRIu64 " kB, used=%" PRIu64 " kB, threshold=%.2g%%",
+ r ? "enough" : "not enough", active, size, used, 100 * HIBERNATION_SWAP_THRESHOLD);
+ if (!r)
+ return -ENOSPC;
+
+ return resume_set;
+}
+
+/* Writes the given resume location to /sys/power/resume_offset and /sys/power/resume.
+ *
+ *  devno  - device number written to /sys/power/resume.
+ *  offset - value written to /sys/power/resume_offset.
+ *  device - optional device path, only used in log messages; if NULL a canonical path is derived from
+ *           'devno'.
+ *
+ * Returns 0 on success. Returns -EOPNOTSUPP if /sys/power/resume_offset does not exist and 'offset' is
+ * non-zero, or another negative value if deriving the path or writing either file fails. */
+int write_resume_config(dev_t devno, uint64_t offset, const char *device) {
+ char offset_str[DECIMAL_STR_MAX(uint64_t)];
+ _cleanup_free_ char *path = NULL;
+ const char *devno_str;
+ int r;
+
+ devno_str = FORMAT_DEVNUM(devno);
+ xsprintf(offset_str, "%" PRIu64, offset);
+
+ if (!device) {
+ r = device_path_make_canonical(S_IFBLK, devno, &path);
+ if (r < 0)
+ return log_error_errno(r,
+ "Failed to format canonical device path for devno '" DEVNUM_FORMAT_STR "': %m",
+ DEVNUM_FORMAT_VAL(devno));
+ /* 'path' stays owned by the cleanup handler; 'device' merely borrows it. */
+ device = path;
+ }
+
+ /* We write the offset first since it's safer. Note that this file is only available in 4.17+, so
+ * fail gracefully if it doesn't exist and we're only overwriting it with 0. */
+ r = write_string_file("/sys/power/resume_offset", offset_str, WRITE_STRING_FILE_DISABLE_BUFFER);
+ if (r == -ENOENT) {
+ if (offset != 0)
+ return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "Can't configure hibernation offset %" PRIu64 ", kernel does not support /sys/power/resume_offset. Refusing.",
+ offset);
+
+ log_warning_errno(r, "/sys/power/resume_offset is unavailable, skipping writing swap file offset.");
+ } else if (r < 0)
+ return log_error_errno(r,
+ "Failed to write swap file offset %s to /sys/power/resume_offset for device '%s': %m",
+ offset_str, device);
+ else
+ log_debug("Wrote resume_offset=%s for device '%s' to /sys/power/resume_offset.",
+ offset_str, device);
+
+ r = write_string_file("/sys/power/resume", devno_str, WRITE_STRING_FILE_DISABLE_BUFFER);
+ if (r < 0)
+ return log_error_errno(r,
+ "Failed to write device '%s' (%s) to /sys/power/resume: %m",
+ device, devno_str);
+ log_debug("Wrote resume=%s for device '%s' to /sys/power/resume.", devno_str, device);
+
+ return 0;
+}
diff --git a/src/shared/hibernate-util.h b/src/shared/hibernate-util.h
new file mode 100644
index 0000000..2ae10fb
--- /dev/null
+++ b/src/shared/hibernate-util.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <linux/fiemap.h>
+#include <sys/types.h>
+
+/* represents values for /sys/power/resume & /sys/power/resume_offset and the corresponding path */
+typedef struct HibernationDevice {
+ dev_t devno; /* device number, as written to /sys/power/resume */
+ uint64_t offset; /* in memory pages */
+ char *path; /* heap-allocated; freed by hibernation_device_done() */
+} HibernationDevice;
+
+void hibernation_device_done(HibernationDevice *hibernation_device);
+
+int find_suitable_hibernation_device_full(HibernationDevice *ret_device, uint64_t *ret_size, uint64_t *ret_used);
+/* Convenience wrapper around find_suitable_hibernation_device_full() for callers that only need the
+ * device and not the swap size/usage figures. 'ret' must not be NULL. */
+static inline int find_suitable_hibernation_device(HibernationDevice *ret) {
+        assert(ret);
+
+        return find_suitable_hibernation_device_full(ret, NULL, NULL);
+}
+
+int hibernation_is_safe(void);
+
+int write_resume_config(dev_t devno, uint64_t offset, const char *device);
+
+/* Only for test-fiemap */
+int read_fiemap(int fd, struct fiemap **ret);
diff --git a/src/shared/hostname-setup.c b/src/shared/hostname-setup.c
new file mode 100644
index 0000000..137c29a
--- /dev/null
+++ b/src/shared/hostname-setup.c
@@ -0,0 +1,213 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/utsname.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "fs-util.h"
+#include "hostname-setup.h"
+#include "hostname-util.h"
+#include "log.h"
+#include "macro.h"
+#include "proc-cmdline.h"
+#include "string-table.h"
+#include "string-util.h"
+
+/* Sets the kernel hostname to 's' unless it already has that value.
+ *
+ * Returns 0 if the current nodename already equals 's', 1 if it differs (and, when 'really' is true,
+ * was changed via sethostname()), or -errno if sethostname() failed. With 'really' false nothing is
+ * modified; the return value then tells what would have happened. */
+static int sethostname_idempotent_full(const char *s, bool really) {
+        struct utsname uts;
+
+        assert(s);
+
+        assert_se(uname(&uts) >= 0);
+
+        if (streq_ptr(s, uts.nodename))
+                return 0;
+
+        if (really) {
+                if (sethostname(s, strlen(s)) < 0)
+                        return -errno;
+        }
+
+        return 1;
+}
+
+/* Sets the kernel hostname to 's' if it differs from the current one. Returns 0 if it was already set
+ * to 's', 1 if it was changed, -errno if sethostname() failed. */
+int sethostname_idempotent(const char *s) {
+ return sethostname_idempotent_full(s, true);
+}
+
+/* Shorten an overlong name to HOST_NAME_MAX or to the first dot, whatever comes earlier.
+ *
+ * On success *ret receives a newly allocated string the caller has to free(). Returns 0 if 's' was a
+ * valid hostname already and was copied verbatim, 1 if it had to be shortened, -ENOMEM if out of
+ * memory, and -EDOM if the result is not a valid hostname even after shortening. *ret is not touched
+ * on failure. */
+int shorten_overlong(const char *s, char **ret) {
+        _cleanup_free_ char *h = NULL;
+        char *p;
+
+        assert(s);
+        assert(ret);
+
+        h = strdup(s);
+        if (!h)
+                return -ENOMEM;
+
+        if (hostname_is_valid(h, 0)) {
+                *ret = TAKE_PTR(h);
+                return 0;
+        }
+
+        /* Cut at the first dot, i.e. keep only the first label ... */
+        p = strchr(h, '.');
+        if (p)
+                *p = 0;
+
+        /* ... and then enforce the length limit. */
+        strshorten(h, HOST_NAME_MAX);
+
+        if (!hostname_is_valid(h, 0))
+                return -EDOM;
+
+        *ret = TAKE_PTR(h);
+        return 1;
+}
+
+/* Reads a hostname from an already opened /etc/hostname style stream.
+ *
+ * Empty lines and lines starting with '#' are skipped; the first remaining line is passed through
+ * hostname_cleanup() and validated. On success 0 is returned and *ret receives the newly allocated
+ * hostname. Returns -ENOENT if the stream ends before any candidate line was found, -EBADMSG if the
+ * first candidate line is not a valid hostname, or the negative value reported by
+ * read_stripped_line(). */
+int read_etc_hostname_stream(FILE *f, char **ret) {
+ int r;
+
+ assert(f);
+ assert(ret);
+
+ for (;;) {
+ _cleanup_free_ char *line = NULL;
+
+ r = read_stripped_line(f, LONG_LINE_MAX, &line);
+ if (r < 0)
+ return r;
+ if (r == 0) /* EOF without any hostname? the file is empty, let's treat that exactly like no file at all: ENOENT */
+ return -ENOENT;
+
+ /* File may have empty lines or comments, ignore them */
+ if (IN_SET(line[0], '\0', '#'))
+ continue;
+
+ hostname_cleanup(line); /* normalize the hostname */
+
+ if (!hostname_is_valid(line, VALID_HOSTNAME_TRAILING_DOT)) /* check that the hostname we return is valid */
+ return -EBADMSG;
+
+ /* Only the first candidate line is considered; anything after it is never read. */
+ *ret = TAKE_PTR(line);
+ return 0;
+ }
+}
+
+/* Opens 'path' (or /etc/hostname if 'path' is NULL) and reads the hostname stored in it via
+ * read_etc_hostname_stream(). Returns -errno if the file cannot be opened, otherwise whatever
+ * read_etc_hostname_stream() returns. */
+int read_etc_hostname(const char *path, char **ret) {
+        _cleanup_fclose_ FILE *stream = NULL;
+
+        assert(ret);
+
+        stream = fopen(path ?: "/etc/hostname", "re");
+        if (!stream)
+                return -errno;
+
+        return read_etc_hostname_stream(stream, ret);
+}
+
+/* Keeps /run/systemd/default-hostname in sync with the origin of the current hostname: the file is
+ * written (containing 'hostname') when the default hostname is in use, and removed otherwise. Failures
+ * are only logged.
+ *
+ * Why save the value and not just create a flag file? This way we will notice if somebody sets the
+ * hostname directly (not going through hostnamed). */
+void hostname_update_source_hint(const char *hostname, HostnameSource source) {
+        int r;
+
+        if (source != HOSTNAME_DEFAULT) {
+                unlink_or_warn("/run/systemd/default-hostname");
+                return;
+        }
+
+        r = write_string_file("/run/systemd/default-hostname", hostname,
+                              WRITE_STRING_FILE_CREATE | WRITE_STRING_FILE_ATOMIC);
+        if (r < 0)
+                log_warning_errno(r, "Failed to create \"/run/systemd/default-hostname\": %m");
+}
+
+/* Determines the hostname to use and applies it.
+ *
+ * Sources are tried in this order:
+ *  1. systemd.hostname= on the kernel command line (HOSTNAME_TRANSIENT),
+ *  2. /etc/hostname (HOSTNAME_STATIC),
+ *  3. if neither yields a name and gethostname_full() succeeds, the existing hostname is left in
+ *     place and 0 is returned,
+ *  4. the default hostname from get_default_hostname() (HOSTNAME_DEFAULT).
+ *
+ * If 'really' is false this is a dry run: sethostname() is not called and the source hint file is not
+ * updated, only the log messages are generated.
+ *
+ * Returns 0 if the hostname already had the desired value (or was left untouched), 1 if it was (or
+ * would have been) changed, and a negative value on failure. */
+int hostname_setup(bool really) {
+ _cleanup_free_ char *b = NULL;
+ const char *hn = NULL;
+ HostnameSource source;
+ bool enoent = false;
+ int r;
+
+ r = proc_cmdline_get_key("systemd.hostname", 0, &b);
+ if (r < 0)
+ log_warning_errno(r, "Failed to retrieve system hostname from kernel command line, ignoring: %m");
+ else if (r > 0) {
+ if (hostname_is_valid(b, VALID_HOSTNAME_TRAILING_DOT)) {
+ hn = b;
+ source = HOSTNAME_TRANSIENT;
+ } else {
+ log_warning("Hostname specified on kernel command line is invalid, ignoring: %s", b);
+ /* Drop the invalid value so that 'b' can be reused for the next source. */
+ b = mfree(b);
+ }
+ }
+
+ if (!hn) {
+ r = read_etc_hostname(NULL, &b);
+ if (r < 0) {
+ /* A missing or empty file is not worth a warning; remember it for the info message below. */
+ if (r == -ENOENT)
+ enoent = true;
+ else
+ log_warning_errno(r, "Failed to read configured hostname: %m");
+ } else {
+ hn = b;
+ source = HOSTNAME_STATIC;
+ }
+ }
+
+ if (!hn) {
+ _cleanup_free_ char *buf = NULL;
+
+ /* Don't override the hostname if it is already set and not explicitly configured */
+
+ r = gethostname_full(GET_HOSTNAME_ALLOW_LOCALHOST, &buf);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r >= 0) {
+ log_debug("No hostname configured, leaving existing hostname <%s> in place.", buf);
+ return 0;
+ }
+
+ if (enoent)
+ log_info("No hostname configured, using default hostname.");
+
+ /* 'b' is NULL at this point, so it can take ownership of the default hostname. */
+ hn = b = get_default_hostname();
+ if (!hn)
+ return log_oom();
+
+ source = HOSTNAME_DEFAULT;
+
+ }
+
+ r = sethostname_idempotent_full(hn, really);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to set hostname to <%s>: %m", hn);
+ if (r == 0)
+ log_debug("Hostname was already set to <%s>.", hn);
+ else
+ log_info("Hostname %s to <%s>.",
+ really ? "set" : "would have been set",
+ hn);
+
+ if (really)
+ hostname_update_source_hint(hn, source);
+
+ return r;
+}
+
+/* String names of the HostnameSource values. The macro below generates the lookup functions for this
+ * table; hostname-setup.h declares them as hostname_source_to_string() and
+ * hostname_source_from_string(). */
+static const char* const hostname_source_table[] = {
+ [HOSTNAME_STATIC] = "static",
+ [HOSTNAME_TRANSIENT] = "transient",
+ [HOSTNAME_DEFAULT] = "default",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(hostname_source, HostnameSource);
diff --git a/src/shared/hostname-setup.h b/src/shared/hostname-setup.h
new file mode 100644
index 0000000..6def36c
--- /dev/null
+++ b/src/shared/hostname-setup.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+#include <stdio.h>
+
+/* Describes where the currently effective hostname originates from. */
+typedef enum HostnameSource {
+ HOSTNAME_STATIC, /* from /etc/hostname */
+ HOSTNAME_TRANSIENT, /* a transient hostname set through systemd, hostnamed, the container manager, or otherwise */
+ HOSTNAME_DEFAULT, /* the os-release default or the compiled-in fallback were used */
+ _HOSTNAME_INVALID = -EINVAL, /* negative sentinel, not a valid source */
+} HostnameSource;
+
+const char* hostname_source_to_string(HostnameSource source) _const_;
+HostnameSource hostname_source_from_string(const char *str) _pure_;
+
+int sethostname_idempotent(const char *s);
+
+int shorten_overlong(const char *s, char **ret);
+
+int read_etc_hostname_stream(FILE *f, char **ret);
+int read_etc_hostname(const char *path, char **ret);
+
+void hostname_update_source_hint(const char *hostname, HostnameSource source);
+int hostname_setup(bool really);
diff --git a/src/shared/hwdb-util.c b/src/shared/hwdb-util.c
new file mode 100644
index 0000000..f67e917
--- /dev/null
+++ b/src/shared/hwdb-util.c
@@ -0,0 +1,712 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <ctype.h>
+#include <stdio.h>
+#include <sys/stat.h>
+
+#include "alloc-util.h"
+#include "conf-files.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "fs-util.h"
+#include "hwdb-internal.h"
+#include "hwdb-util.h"
+#include "label-util.h"
+#include "mkdir-label.h"
+#include "nulstr-util.h"
+#include "path-util.h"
+#include "sort-util.h"
+#include "strbuf.h"
+#include "string-util.h"
+#include "strv.h"
+#include "tmpfile-util.h"
+
+/* NULL-terminated list of hwdb.d directories: the one below /etc/udev comes first, followed by the
+ * one below UDEVLIBEXECDIR. */
+static const char* const conf_file_dirs[] = {
+ "/etc/udev/hwdb.d",
+ UDEVLIBEXECDIR "/hwdb.d",
+ NULL
+};
+
+/*
+ * Generic udev properties, key-value database based on modalias strings.
+ * Uses a Patricia/radix trie to index all matches for efficient lookup.
+ */
+
+/* in-memory trie objects */
+struct trie {
+ struct trie_node *root; /* top of the node tree, released recursively by trie_free() */
+ struct strbuf *strings; /* string store; all *_off members of nodes and values are offsets into strings->buf */
+
+ /* Statistics, updated while the trie is built */
+ size_t nodes_count; /* incremented by node_add_child() */
+ size_t children_count; /* incremented by node_add_child() */
+ size_t values_count; /* incremented by trie_node_add_value() for each newly added entry */
+};
+
+/* A single node of the in-memory trie. */
+struct trie_node {
+ /* prefix, common part for all children of this node */
+ size_t prefix_off; /* offset into trie->strings->buf */
+
+ /* sorted array of pointers to children nodes */
+ struct trie_child_entry *children;
+ uint8_t children_count; /* number of elements in 'children' */
+
+ /* sorted array of key-value pairs */
+ struct trie_value_entry *values;
+ size_t values_count; /* number of elements in 'values' */
+};
+
+/* children array item with char (0-255) index */
+struct trie_child_entry {
+ uint8_t c; /* sort and lookup key, see trie_children_cmp() */
+ struct trie_node *child; /* sub-tree reached via 'c' */
+};
+
+/* value array item with key-value pairs */
+struct trie_value_entry {
+ size_t key_off; /* offset of the key string in trie->strings->buf; sort key, see trie_values_cmp() */
+ size_t value_off; /* offset of the value string in trie->strings->buf */
+ size_t filename_off; /* offset of the file name string; left at 0 in compat mode */
+ uint32_t line_number; /* as passed to trie_node_add_value() */
+ uint16_t file_priority; /* as passed to trie_node_add_value() */
+};
+
+/* Orders child entries by their index character; used for sorting and bisecting the children array. */
+static int trie_children_cmp(const struct trie_child_entry *a, const struct trie_child_entry *b) {
+        const uint8_t lhs = a->c, rhs = b->c;
+
+        return CMP(lhs, rhs);
+}
+
+/* Attaches 'node_child' to 'node' under the index character 'c'. The children array is grown by one
+ * element and re-sorted so that node_lookup() can bisect it. Also updates the statistics counters of
+ * 'trie'. Returns 0 on success, -ENOMEM if the array cannot be grown (in which case nothing is
+ * modified). */
+static int node_add_child(struct trie *trie, struct trie_node *node, struct trie_node *node_child, uint8_t c) {
+        struct trie_child_entry *grown;
+
+        /* extend array, add new entry, sort for bisection */
+        grown = reallocarray(node->children, node->children_count + 1, sizeof(struct trie_child_entry));
+        if (!grown)
+                return -ENOMEM;
+
+        grown[node->children_count] = (struct trie_child_entry) {
+                .c = c,
+                .child = node_child,
+        };
+
+        node->children = grown;
+        node->children_count++;
+        trie->children_count++;
+
+        typesafe_qsort(node->children, node->children_count, trie_children_cmp);
+        trie->nodes_count++;
+
+        return 0;
+}
+
+/* Bisects the sorted children array of 'node' for the index character 'c'. Returns the matching
+ * child node, or NULL if there is none. */
+static struct trie_node *node_lookup(const struct trie_node *node, uint8_t c) {
+        struct trie_child_entry key = { .c = c };
+        struct trie_child_entry *hit;
+
+        hit = typesafe_bsearch(&key, node->children, node->children_count, trie_children_cmp);
+
+        return hit ? hit->child : NULL;
+}
+
+/* Recursively frees 'node', all nodes below it, and their children and values arrays. The strings
+ * referenced via offsets live in the trie's string buffer and are not touched. NULL is accepted. */
+static void trie_node_cleanup(struct trie_node *node) {
+        size_t n_children;
+
+        if (!node)
+                return;
+
+        n_children = node->children_count;
+        for (size_t idx = 0; idx != n_children; idx++)
+                trie_node_cleanup(node->children[idx].child);
+
+        free(node->children);
+        free(node->values);
+        free(node);
+}
+
+/* Frees a trie including all of its nodes and its string buffer. Accepts NULL. Always returns NULL so
+ * that the result can be assigned back to the pointer being freed. */
+static struct trie* trie_free(struct trie *trie) {
+        if (trie) {
+                trie_node_cleanup(trie->root);
+                strbuf_free(trie->strings);
+                free(trie);
+        }
+
+        return NULL;
+}
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(struct trie*, trie_free);
+
+/* Orders value entries by their key string. Entries only store offsets, hence the trie is needed to
+ * resolve them against its string buffer. */
+static int trie_values_cmp(const struct trie_value_entry *a, const struct trie_value_entry *b, struct trie *trie) {
+        const char *base = trie->strings->buf;
+
+        return strcmp(base + a->key_off, base + b->key_off);
+}
+
+/* Adds the property key=value to 'node', or replaces the value if the node already carries an entry
+ * with the same key.
+ *
+ * key, value and (unless 'compat' is set) filename are added to the trie's string buffer; the entry
+ * only stores their offsets. file_priority and line_number are stored verbatim. Returns 0 on success,
+ * -ENOMEM if the values array cannot be grown, or the negative value reported by
+ * strbuf_add_string(). */
+static int trie_node_add_value(struct trie *trie, struct trie_node *node,
+ const char *key, const char *value,
+ const char *filename, uint16_t file_priority, uint32_t line_number, bool compat) {
+ ssize_t k, v, fn = 0;
+ struct trie_value_entry *val;
+
+ k = strbuf_add_string(trie->strings, key, strlen(key));
+ if (k < 0)
+ return k;
+ v = strbuf_add_string(trie->strings, value, strlen(value));
+ if (v < 0)
+ return v;
+
+ /* In compat mode the file name is not recorded and filename_off stays 0. */
+ if (!compat) {
+ fn = strbuf_add_string(trie->strings, filename, strlen(filename));
+ if (fn < 0)
+ return fn;
+ }
+
+ if (node->values_count) {
+ /* Only key_off is looked at by trie_values_cmp(). */
+ struct trie_value_entry search = {
+ .key_off = k,
+ .value_off = v,
+ };
+
+ val = typesafe_bsearch_r(&search, node->values, node->values_count, trie_values_cmp, trie);
+ if (val) {
+ /* At this point we have 2 identical properties on the same match-string.
+ * Since we process files in order, we just replace the previous value. */
+ val->value_off = v;
+ val->filename_off = fn;
+ val->file_priority = file_priority;
+ val->line_number = line_number;
+ return 0;
+ }
+ }
+
+ /* extend array, add new entry, sort for bisection */
+ val = reallocarray(node->values, node->values_count + 1, sizeof(struct trie_value_entry));
+ if (!val)
+ return -ENOMEM;
+ trie->values_count++;
+ node->values = val;
+ node->values[node->values_count] = (struct trie_value_entry) {
+ .key_off = k,
+ .value_off = v,
+ .filename_off = fn,
+ .file_priority = file_priority,
+ .line_number = line_number,
+ };
+ node->values_count++;
+ typesafe_qsort_r(node->values, node->values_count, trie_values_cmp, trie);
+ return 0;
+}
+
+/* Inserts the property key=value for the match string 'search' into the trie, starting at 'node'.
+ *
+ * The function walks down the trie along 'search'. If the prefix stored in a node diverges from
+ * 'search', the node is split at that position. If no child exists for the next character, a new
+ * child holding the rest of 'search' as prefix is created. The value is finally attached via
+ * trie_node_add_value(), to which filename, file_priority, line_number and compat are passed
+ * through. Returns 0 on success, negative on failure. */
+static int trie_insert(struct trie *trie, struct trie_node *node, const char *search,
+ const char *key, const char *value,
+ const char *filename, uint16_t file_priority, uint32_t line_number, bool compat) {
+ int r = 0;
+
+ /* 'i' is the position in 'search' that corresponds to the start of the current node's prefix. */
+ for (size_t i = 0;; i++) {
+ size_t p;
+ char c;
+ struct trie_node *child;
+
+ /* Compare the node's prefix with 'search' character by character. */
+ for (p = 0; (c = trie->strings->buf[node->prefix_off + p]); p++) {
+ _cleanup_free_ struct trie_node *new_child = NULL;
+ _cleanup_free_ char *s = NULL;
+ ssize_t off;
+
+ if (c == search[i + p])
+ continue;
+
+ /* split node */
+ new_child = new(struct trie_node, 1);
+ if (!new_child)
+ return -ENOMEM;
+
+ /* move values from parent to child */
+ /* The child's prefix is the remainder after the diverging character 'c'; 'c' itself
+ * becomes the index under which the child is registered below. */
+ *new_child = (struct trie_node) {
+ .prefix_off = node->prefix_off + p+1,
+ .children = node->children,
+ .children_count = node->children_count,
+ .values = node->values,
+ .values_count = node->values_count,
+ };
+
+ /* update parent; use strdup() because the source gets realloc()d */
+ s = strndup(trie->strings->buf + node->prefix_off, p);
+ if (!s)
+ return -ENOMEM;
+
+ off = strbuf_add_string(trie->strings, s, p);
+ if (off < 0)
+ return off;
+
+ /* The parent keeps only the common part of the prefix and starts out with no children or values. */
+ *node = (struct trie_node) {
+ .prefix_off = off,
+ };
+ r = node_add_child(trie, node, new_child, c);
+ if (r < 0)
+ return r;
+
+ new_child = NULL; /* avoid cleanup */
+ break;
+ }
+ /* Skip the part of 'search' that matched this node's prefix. */
+ i += p;
+
+ c = search[i];
+ if (c == '\0')
+ return trie_node_add_value(trie, node, key, value, filename, file_priority, line_number, compat);
+
+ child = node_lookup(node, c);
+ if (!child) {
+ _cleanup_free_ struct trie_node *new_child = NULL;
+ ssize_t off;
+
+ /* new child */
+ new_child = new(struct trie_node, 1);
+ if (!new_child)
+ return -ENOMEM;
+
+ /* Everything after the index character becomes the new child's prefix. */
+ off = strbuf_add_string(trie->strings, search + i+1, strlen(search + i+1));
+ if (off < 0)
+ return off;
+
+ *new_child = (struct trie_node) {
+ .prefix_off = off,
+ };
+
+ r = node_add_child(trie, node, new_child, c);
+ if (r < 0)
+ return r;
+
+ child = TAKE_PTR(new_child);
+ return trie_node_add_value(trie, child, key, value, filename, file_priority, line_number, compat);
+ }
+
+ /* Descend; the loop's i++ steps over the index character 'c'. */
+ node = child;
+ }
+}
+
+/* State used while serializing a trie to a file. */
+struct trie_f {
+ struct trie *trie; /* the in-memory trie being written */
+ uint64_t strings_off; /* file offset of the string buffer: header size plus the sizes summed up by trie_store_nodes_size() */
+
+ /* Statistics on what has been written, updated by trie_store_nodes() */
+ uint64_t nodes_count;
+ uint64_t children_count;
+ uint64_t values_count;
+};
+
+/* calculate the storage space for the nodes, children arrays, value arrays
+ *
+ * The sizes of 'node' and of everything below it are added to trie->strings_off. 'compat' selects
+ * which on-disk value entry layout is accounted for. */
+static void trie_store_nodes_size(struct trie_f *trie, struct trie_node *node, bool compat) {
+        const uint64_t value_entry_size = compat ? sizeof(struct trie_value_entry_f) : sizeof(struct trie_value_entry2_f);
+
+        for (uint64_t idx = 0; idx < node->children_count; idx++)
+                trie_store_nodes_size(trie, node->children[idx].child, compat);
+
+        trie->strings_off += sizeof(struct trie_node_f);
+        trie->strings_off += (uint64_t) node->children_count * sizeof(struct trie_child_entry_f);
+        trie->strings_off += (uint64_t) node->values_count * value_entry_size;
+}
+
+/* Writes 'node' and, before it, all nodes below it to 'f' at the current file position.
+ *
+ * Each node is written as a trie_node_f record followed by its children array and its values array.
+ * All string offsets are shifted by trie->strings_off, and multi-byte fields are converted to little
+ * endian. Returns the file offset at which 'node' itself was written, or -ENOMEM.
+ *
+ * NOTE(review): the results of fwrite() and ftello() are not checked here; presumably the caller
+ * detects stream errors afterwards — confirm in trie_store(). */
+static int64_t trie_store_nodes(struct trie_f *trie, FILE *f, struct trie_node *node, bool compat) {
+ _cleanup_free_ struct trie_child_entry_f *children = NULL;
+ int64_t node_off;
+
+ assert(trie);
+ assert(f);
+ assert(node);
+
+ if (node->children_count) {
+ children = new(struct trie_child_entry_f, node->children_count);
+ if (!children)
+ return -ENOMEM;
+ }
+
+ /* post-order recursion */
+ /* Children have to be written first, since this node's record stores their file offsets. */
+ for (uint64_t i = 0; i < node->children_count; i++) {
+ int64_t child_off;
+
+ child_off = trie_store_nodes(trie, f, node->children[i].child, compat);
+ if (child_off < 0)
+ return child_off;
+
+ children[i] = (struct trie_child_entry_f) {
+ .c = node->children[i].c,
+ .child_off = htole64(child_off),
+ };
+ }
+
+ struct trie_node_f n = {
+ .prefix_off = htole64(trie->strings_off + node->prefix_off),
+ .children_count = node->children_count,
+ .values_count = htole64(node->values_count),
+ };
+
+ /* write node */
+ node_off = ftello(f);
+ fwrite(&n, sizeof(struct trie_node_f), 1, f);
+ trie->nodes_count++;
+
+ /* append children array */
+ if (node->children_count) {
+ fwrite(children, sizeof(struct trie_child_entry_f), node->children_count, f);
+ trie->children_count += node->children_count;
+ }
+
+ /* append values array */
+ for (uint64_t i = 0; i < node->values_count; i++) {
+ struct trie_value_entry2_f v = {
+ .key_off = htole64(trie->strings_off + node->values[i].key_off),
+ .value_off = htole64(trie->strings_off + node->values[i].value_off),
+ .filename_off = htole64(trie->strings_off + node->values[i].filename_off),
+ .line_number = htole32(node->values[i].line_number),
+ .file_priority = htole16(node->values[i].file_priority),
+ };
+
+ /* In compat mode only the first sizeof(struct trie_value_entry_f) bytes of the record are
+ * written; presumably that layout is a prefix of trie_value_entry2_f — verify in hwdb-internal.h. */
+ fwrite(&v, compat ? sizeof(struct trie_value_entry_f) : sizeof(struct trie_value_entry2_f), 1, f);
+ }
+ trie->values_count += node->values_count;
+
+ return node_off;
+}
+
+/* Serializes 'trie' into the database file 'filename'. The data is written to a temporary file created
+ * with fopen_tmpfile_linkable() and only moved into place by flink_tmpfile() once everything was written.
+ * Layout of the file: header, then all nodes (with their child and value entries), then the string buffer.
+ * The header is written last, after seeking back to offset 0, because it records the sizes and offsets
+ * determined while writing the rest. 'compat' selects the shorter trie_value_entry_f value layout.
+ *
+ * Returns 0 on success, or a negative errno-style value on failure. */
+static int trie_store(struct trie *trie, const char *filename, bool compat) {
+        struct trie_f t = {
+                .trie = trie,
+                .strings_off = sizeof(struct trie_header_f),
+        };
+        _cleanup_(unlink_and_freep) char *filename_tmp = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        int64_t pos, root_off, size;
+        int r;
+
+        assert(trie);
+        assert(filename);
+
+        /* calculate size of header, nodes, children entries, value entries */
+        trie_store_nodes_size(&t, trie->root, compat);
+
+        r = fopen_tmpfile_linkable(filename, O_WRONLY|O_CLOEXEC, &filename_tmp, &f);
+        if (r < 0)
+                return r;
+
+        if (fchmod(fileno(f), 0444) < 0)
+                return -errno;
+
+        struct trie_header_f h = {
+                .signature = HWDB_SIG,
+                .tool_version = htole64(PROJECT_VERSION),
+                .header_size = htole64(sizeof(struct trie_header_f)),
+                .node_size = htole64(sizeof(struct trie_node_f)),
+                .child_entry_size = htole64(sizeof(struct trie_child_entry_f)),
+                .value_entry_size = htole64(compat ? sizeof(struct trie_value_entry_f) : sizeof(struct trie_value_entry2_f)),
+        };
+
+        /* write nodes */
+        if (fseeko(f, sizeof(struct trie_header_f), SEEK_SET) < 0)
+                return -errno;
+
+        /* trie_store_nodes() returns a negative value on failure (e.g. -ENOMEM); that must not end up in
+         * the header as the root node offset */
+        root_off = trie_store_nodes(&t, f, trie->root, compat);
+        if (root_off < 0)
+                return (int) root_off;
+        h.nodes_root_off = htole64(root_off);
+
+        pos = ftello(f);
+        if (pos < 0)
+                return -errno;
+        h.nodes_len = htole64(pos - sizeof(struct trie_header_f));
+
+        /* write string buffer */
+        fwrite(trie->strings->buf, trie->strings->len, 1, f);
+        h.strings_len = htole64(trie->strings->len);
+
+        /* write header */
+        size = ftello(f);
+        if (size < 0)
+                return -errno;
+        h.file_size = htole64(size);
+        if (fseeko(f, 0, SEEK_SET) < 0)
+                return -errno;
+        fwrite(&h, sizeof(struct trie_header_f), 1, f);
+
+        r = flink_tmpfile(f, filename_tmp, filename, LINK_TMPFILE_REPLACE|LINK_TMPFILE_SYNC);
+        if (r < 0)
+                return r;
+
+        /* write succeeded */
+
+        log_debug("=== trie on-disk ===");
+        log_debug("size: %8"PRIi64" bytes", size);
+        log_debug("header: %8zu bytes", sizeof(struct trie_header_f));
+        log_debug("nodes: %8"PRIu64" bytes (%8"PRIu64")",
+                  t.nodes_count * sizeof(struct trie_node_f), t.nodes_count);
+        log_debug("child pointers: %8"PRIu64" bytes (%8"PRIu64")",
+                  t.children_count * sizeof(struct trie_child_entry_f), t.children_count);
+        log_debug("value pointers: %8"PRIu64" bytes (%8"PRIu64")",
+                  t.values_count * (compat ? sizeof(struct trie_value_entry_f) : sizeof(struct trie_value_entry2_f)), t.values_count);
+        log_debug("string store: %8zu bytes", trie->strings->len);
+        log_debug("strings start: %8"PRIu64, t.strings_off);
+        return 0;
+}
+
+/* Handles one indented "KEY=VALUE" property line: splits it in place at the first '=' and inserts the pair
+ * into the trie once for every match pattern in 'match_list'. The key keeps exactly one leading blank.
+ * Returns 0 on success, or the result of log_syntax() if the line has no '=' or an empty key. */
+static int insert_data(struct trie *trie, char **match_list, char *line, const char *filename,
+                       uint16_t file_priority, uint32_t line_number, bool compat) {
+        char *sep, *val;
+
+        assert(line[0] == ' ');
+
+        sep = strchr(line, '=');
+        if (!sep)
+                return log_syntax(NULL, LOG_WARNING, filename, line_number, SYNTHETIC_ERRNO(EINVAL),
+                                  "Key-value pair expected but got \"%s\", ignoring.", line);
+
+        /* Terminate the key where the '=' was; the value starts right behind it */
+        *sep = '\0';
+        val = sep + 1;
+
+        /* Collapse a run of leading blanks into a single one by skipping all but the last */
+        for (; isblank(line[0]) && isblank(line[1]); line++)
+                ;
+
+        if (isempty(line + 1))
+                return log_syntax(NULL, LOG_WARNING, filename, line_number, SYNTHETIC_ERRNO(EINVAL),
+                                  "Empty key in \"%s=%s\", ignoring.",
+                                  line, val);
+
+        STRV_FOREACH(entry, match_list)
+                trie_insert(trie, trie->root, *entry, line, val, filename, file_priority, line_number, compat);
+
+        return 0;
+}
+
+/* Parses one hwdb source file and adds its records to 'trie'.
+ *
+ * A record consists of one or more unindented match lines, followed by one or more property lines that
+ * start with a space, and is terminated by an empty line. Lines starting with '#' are skipped, trailing
+ * '#' comments and trailing whitespace are stripped. The parser is a small state machine:
+ *
+ *   HW_NONE  → outside of a record, waiting for the first match line
+ *   HW_MATCH → collecting match lines, waiting for the first property
+ *   HW_DATA  → collecting properties until the empty line that ends the record
+ *
+ * Syntax problems are logged and the offending line or record is ignored, but the most recent such error
+ * is remembered and returned at the end, so that the caller can fail in strict mode. Failure to open or
+ * read the file, or to extend the match list, is returned immediately. Returns 0 if the file was parsed
+ * without any problem. */
+static int import_file(struct trie *trie, const char *filename, uint16_t file_priority, bool compat) {
+        enum {
+                HW_NONE,
+                HW_MATCH,
+                HW_DATA,
+        } state = HW_NONE;
+        _cleanup_fclose_ FILE *f = NULL;
+        _cleanup_strv_free_ char **match_list = NULL;
+        uint32_t line_number = 0;
+        int r = 0, err;
+
+        f = fopen(filename, "re");
+        if (!f)
+                return -errno;
+
+        for (;;) {
+                _cleanup_free_ char *line = NULL;
+                size_t len;
+                char *pos;
+
+                /* Use 'err' rather than 'r' here: 'r' accumulates the syntax errors seen so far and must
+                 * survive until the end of the file */
+                err = read_line_full(f, LONG_LINE_MAX, READ_LINE_NOT_A_TTY, &line);
+                if (err < 0)
+                        return err;
+                if (err == 0)
+                        break;
+
+                line_number ++;
+
+                /* comment line */
+                if (line[0] == '#')
+                        continue;
+
+                /* strip trailing comment */
+                pos = strchr(line, '#');
+                if (pos)
+                        pos[0] = '\0';
+
+                /* strip trailing whitespace */
+                len = strlen(line);
+                while (len > 0 && isspace((unsigned char) line[len-1]))
+                        len--;
+                line[len] = '\0';
+
+                switch (state) {
+                case HW_NONE:
+                        if (len == 0)
+                                break;
+
+                        if (line[0] == ' ') {
+                                r = log_syntax(NULL, LOG_WARNING, filename, line_number, SYNTHETIC_ERRNO(EINVAL),
+                                               "Match expected but got indented property \"%s\", ignoring line.", line);
+                                break;
+                        }
+
+                        /* start of record, first match */
+                        state = HW_MATCH;
+
+                        err = strv_extend(&match_list, line);
+                        if (err < 0)
+                                return err;
+
+                        break;
+
+                case HW_MATCH:
+                        if (len == 0) {
+                                r = log_syntax(NULL, LOG_WARNING, filename, line_number, SYNTHETIC_ERRNO(EINVAL),
+                                               "Property expected, ignoring record with no properties.");
+                                state = HW_NONE;
+                                match_list = strv_free(match_list);
+                                break;
+                        }
+
+                        if (line[0] != ' ') {
+                                /* another match */
+                                err = strv_extend(&match_list, line);
+                                if (err < 0)
+                                        return err;
+
+                                break;
+                        }
+
+                        /* first data */
+                        state = HW_DATA;
+                        err = insert_data(trie, match_list, line, filename, file_priority, line_number, compat);
+                        if (err < 0)
+                                r = err;
+                        break;
+
+                case HW_DATA:
+                        if (len == 0) {
+                                /* end of record */
+                                state = HW_NONE;
+                                match_list = strv_free(match_list);
+                                break;
+                        }
+
+                        if (line[0] != ' ') {
+                                r = log_syntax(NULL, LOG_WARNING, filename, line_number, SYNTHETIC_ERRNO(EINVAL),
+                                               "Property or empty line expected, got \"%s\", ignoring record.", line);
+                                state = HW_NONE;
+                                match_list = strv_free(match_list);
+                                break;
+                        }
+
+                        err = insert_data(trie, match_list, line, filename, file_priority, line_number, compat);
+                        if (err < 0)
+                                r = err;
+                        break;
+                };
+        }
+
+        /* The file ended while we were still waiting for the first property of a record */
+        if (state == HW_MATCH)
+                log_syntax(NULL, LOG_WARNING, filename, line_number, 0,
+                           "Property expected, ignoring record with no properties.");
+
+        return r;
+}
+
+/* Compiles all ".hwdb" source files enumerated by conf_files_list_strv() into the binary database
+ * <root>/<hwdb_bin_dir>/hwdb.bin, where "/etc/udev" is used if 'hwdb_bin_dir' is NULL. If no source files
+ * are found, an existing compiled database is removed instead and 0 is returned. Errors returned by
+ * import_file() only influence the return value if 'strict' is true; all files are imported regardless.
+ * Returns 0 on success, or a negative errno-style value on failure. */
+int hwdb_update(const char *root, const char *hwdb_bin_dir, bool strict, bool compat) {
+ _cleanup_free_ char *hwdb_bin = NULL;
+ _cleanup_(trie_freep) struct trie *trie = NULL;
+ _cleanup_strv_free_ char **files = NULL;
+ uint16_t file_priority = 1;
+ int r = 0, err;
+
+ /* The argument 'compat' controls the format version of the database. If false, then hwdb.bin will be
+ * created with additional information such as priority, line number, and filename of the database
+ * source. If true, then hwdb.bin will be created without this information. The systemd-hwdb command
+ * should set the argument to false, and the 'udevadm hwdb' command should set it to true. */
+
+ hwdb_bin = path_join(root, hwdb_bin_dir ?: "/etc/udev", "hwdb.bin");
+ if (!hwdb_bin)
+ return -ENOMEM;
+
+ trie = new0(struct trie, 1);
+ if (!trie)
+ return -ENOMEM;
+
+ /* string store */
+ trie->strings = strbuf_new();
+ if (!trie->strings)
+ return -ENOMEM;
+
+ /* index */
+ trie->root = new0(struct trie_node, 1);
+ if (!trie->root)
+ return -ENOMEM;
+
+ /* account for the root node allocated above */
+ trie->nodes_count++;
+
+ err = conf_files_list_strv(&files, ".hwdb", root, 0, conf_file_dirs);
+ if (err < 0)
+ return log_error_errno(err, "Failed to enumerate hwdb files: %m");
+
+ /* Nothing to compile: make sure no stale database is left behind. A missing database (ENOENT) is fine. */
+ if (strv_isempty(files)) {
+ if (unlink(hwdb_bin) < 0) {
+ if (errno != ENOENT)
+ return log_error_errno(errno, "Failed to remove compiled hwdb database %s: %m", hwdb_bin);
+
+ log_info("No hwdb files found, skipping.");
+ } else
+ log_info("No hwdb files found, compiled hwdb database %s removed.", hwdb_bin);
+
+ return 0;
+ }
+
+ /* Each file gets the next priority value, in enumeration order. An import error is only remembered
+ * in strict mode, and never stops the remaining files from being imported. */
+ STRV_FOREACH(f, files) {
+ log_debug("Reading file \"%s\"", *f);
+ err = import_file(trie, *f, file_priority++, compat);
+ if (err < 0 && strict)
+ r = err;
+ }
+
+ strbuf_complete(trie->strings);
+
+ log_debug("=== trie in-memory ===");
+ log_debug("nodes: %8zu bytes (%8zu)",
+ trie->nodes_count * sizeof(struct trie_node), trie->nodes_count);
+ log_debug("children arrays: %8zu bytes (%8zu)",
+ trie->children_count * sizeof(struct trie_child_entry), trie->children_count);
+ log_debug("values arrays: %8zu bytes (%8zu)",
+ trie->values_count * sizeof(struct trie_value_entry), trie->values_count);
+ log_debug("strings: %8zu bytes",
+ trie->strings->len);
+ log_debug("strings incoming: %8zu bytes (%8zu)",
+ trie->strings->in_len, trie->strings->in_count);
+ log_debug("strings dedup'ed: %8zu bytes (%8zu)",
+ trie->strings->dedup_len, trie->strings->dedup_count);
+
+ /* Creating the parent directories is best-effort; a real problem surfaces in trie_store() below */
+ (void) mkdir_parents_label(hwdb_bin, 0755);
+ err = trie_store(trie, hwdb_bin, compat);
+ if (err < 0)
+ return log_error_errno(err, "Failed to write database %s: %m", hwdb_bin);
+
+ err = label_fix(hwdb_bin, 0);
+ if (err < 0)
+ return err;
+
+ /* 0, or in strict mode the last import error seen */
+ return r;
+}
+
+/* Prints all properties the hardware database has for 'modalias' to stdout, one "KEY=VALUE" per line. If
+ * 'root' is non-empty the database is looked up below that directory, trying each entry of hwdb_bin_paths
+ * in turn until one can be opened; otherwise sd_hwdb_new() is used. Returns 0 on success, or a negative
+ * errno-style value if no database could be opened. */
+int hwdb_query(const char *modalias, const char *root) {
+        _cleanup_(sd_hwdb_unrefp) sd_hwdb *hwdb = NULL;
+        const char *key, *value;
+        int r;
+
+        assert(modalias);
+
+        if (isempty(root))
+                r = sd_hwdb_new(&hwdb);
+        else
+                NULSTR_FOREACH(p, hwdb_bin_paths) {
+                        _cleanup_free_ char *candidate = path_join(root, p);
+
+                        if (!candidate)
+                                return -ENOMEM;
+
+                        /* The first location that opens successfully wins */
+                        r = sd_hwdb_new_from_path(candidate, &hwdb);
+                        if (r >= 0)
+                                break;
+                }
+        if (r < 0)
+                return r;
+
+        SD_HWDB_FOREACH_PROPERTY(hwdb, modalias, key, value)
+                printf("%s=%s\n", key, value);
+
+        return 0;
+}
+
+/* Tells whether the database behind 'hwdb' should be re-opened. Returns false if there is no hwdb object
+ * or it has no file open. Otherwise returns true if none of the hwdb_bin_paths can be stat()ed, or if the
+ * modification time of the first one that can differs from the one recorded in 'hwdb'. */
+bool hwdb_should_reload(sd_hwdb *hwdb) {
+        struct stat st;
+        bool exists = false;
+
+        if (!hwdb || !hwdb->f)
+                return false;
+
+        /* if hwdb.bin doesn't exist anywhere, we need to update */
+        NULSTR_FOREACH(p, hwdb_bin_paths) {
+                if (stat(p, &st) < 0)
+                        continue;
+
+                exists = true;
+                break;
+        }
+        if (!exists)
+                return true;
+
+        return timespec_load(&hwdb->st.st_mtim) != timespec_load(&st.st_mtim);
+}
diff --git a/src/shared/hwdb-util.h b/src/shared/hwdb-util.h
new file mode 100644
index 0000000..cb93690
--- /dev/null
+++ b/src/shared/hwdb-util.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "sd-hwdb.h"
+
+/* Returns true if the compiled database backing 'hwdb' is gone or has a different modification time than
+ * when it was opened; false otherwise, and also if 'hwdb' is NULL or has no file open. */
+bool hwdb_should_reload(sd_hwdb *hwdb);
+/* Compiles the ".hwdb" source files into <root>/<hwdb_bin_dir>/hwdb.bin. 'strict' makes import errors
+ * count as failure, 'compat' selects the database format without priority/line number/filename data. */
+int hwdb_update(const char *root, const char *hwdb_bin_dir, bool strict, bool compat);
+/* Prints the properties stored for 'modalias' as "KEY=VALUE" lines; 'root' optionally selects an
+ * alternative root directory to load the database from. */
+int hwdb_query(const char *modalias, const char *root);
diff --git a/src/shared/id128-print.c b/src/shared/id128-print.c
new file mode 100644
index 0000000..c9509b2
--- /dev/null
+++ b/src/shared/id128-print.c
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <stdio.h>
+
+#include "sd-id128.h"
+
+#include "alloc-util.h"
+#include "id128-print.h"
+#include "log.h"
+#include "pretty-print.h"
+#include "terminal-util.h"
+
+/* Prints 'id' to stdout in several ready-to-paste forms: as plain string, as UUID, as a C "#define" of
+ * 'name' using SD_ID128_MAKE(), and as a Python uuid.UUID constant. The ID itself is highlighted, and the
+ * references to the man page and the Python module are passed through terminal_urlify(). Returns 0 on
+ * success, or the result of log_oom() if building one of the links fails. */
+int id128_pretty_print_sample(const char *name, sd_id128_t id) {
+        _cleanup_free_ char *man_link = NULL, *mod_link = NULL;
+        const char *hl_on = ansi_highlight();
+        const char *hl_off = ansi_normal();
+        int r;
+
+        r = terminal_urlify("man:systemd-id128(1)", "systemd-id128(1)", &man_link);
+        if (r < 0)
+                return log_oom();
+
+        r = terminal_urlify("https://docs.python.org/3/library/uuid.html", "uuid", &mod_link);
+        if (r < 0)
+                return log_oom();
+
+        printf("As string:\n"
+               "%s" SD_ID128_FORMAT_STR "%s\n\n"
+               "As UUID:\n"
+               "%s" SD_ID128_UUID_FORMAT_STR "%s\n\n"
+               "As %s macro:\n"
+               "%s#define %s SD_ID128_MAKE(",
+               hl_on, SD_ID128_FORMAT_VAL(id), hl_off,
+               hl_on, SD_ID128_FORMAT_VAL(id), hl_off,
+               man_link,
+               hl_on, name);
+
+        /* The 16 bytes as comma separated hex pairs, without a comma after the last one */
+        for (size_t i = 0; i < 16; i++) {
+                const char *sep = i < 15 ? "," : "";
+
+                printf("%02x%s", id.bytes[i], sep);
+        }
+        printf(")%s\n\n", hl_off);
+
+        printf("As Python constant:\n"
+               ">>> import %s\n"
+               ">>> %s%s = uuid.UUID('" SD_ID128_FORMAT_STR "')%s\n",
+               mod_link,
+               hl_on, name, SD_ID128_FORMAT_VAL(id), hl_off);
+
+        return 0;
+}
+
+
+/* Prints 'id' to stdout in the representation selected by 'mode': the plain 128-bit ID string, the UUID
+ * form, or (for any other valid mode) the verbose sample output with the placeholder name "XYZ". */
+int id128_pretty_print(sd_id128_t id, Id128PrettyPrintMode mode) {
+        assert(mode >= 0);
+        assert(mode < _ID128_PRETTY_PRINT_MODE_MAX);
+
+        switch (mode) {
+
+        case ID128_PRINT_ID128:
+                printf(SD_ID128_FORMAT_STR "\n",
+                       SD_ID128_FORMAT_VAL(id));
+                return 0;
+
+        case ID128_PRINT_UUID:
+                printf(SD_ID128_UUID_FORMAT_STR "\n",
+                       SD_ID128_FORMAT_VAL(id));
+                return 0;
+
+        default:
+                return id128_pretty_print_sample("XYZ", id);
+        }
+}
+
+/* Generates a new ID with sd_id128_randomize() and prints it in the given 'mode'. Returns the result of
+ * id128_pretty_print(), or a negative errno-style value if generating the ID failed. */
+int id128_print_new(Id128PrettyPrintMode mode) {
+        sd_id128_t fresh;
+        int r = sd_id128_randomize(&fresh);
+
+        if (r < 0)
+                return log_error_errno(r, "Failed to generate ID: %m");
+
+        return id128_pretty_print(fresh, mode);
+}
diff --git a/src/shared/id128-print.h b/src/shared/id128-print.h
new file mode 100644
index 0000000..7b2e593
--- /dev/null
+++ b/src/shared/id128-print.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#pragma once
+
+#include <stdbool.h>
+
+#include "sd-id128.h"
+
+/* Output format selector for id128_pretty_print() and id128_print_new() */
+typedef enum Id128PrettyPrintMode {
+ ID128_PRINT_ID128, /* print using SD_ID128_FORMAT_STR */
+ ID128_PRINT_UUID, /* print using SD_ID128_UUID_FORMAT_STR */
+ ID128_PRINT_PRETTY, /* verbose multi-format output of id128_pretty_print_sample() */
+ _ID128_PRETTY_PRINT_MODE_MAX, /* number of valid modes, used for range checks */
+ _ID128_PRETTY_PRINT_MODE_INVALID = -EINVAL,
+} Id128PrettyPrintMode;
+
+int id128_pretty_print_sample(const char *name, sd_id128_t id);
+int id128_pretty_print(sd_id128_t id, Id128PrettyPrintMode mode);
+int id128_print_new(Id128PrettyPrintMode mode);
diff --git a/src/shared/idn-util.c b/src/shared/idn-util.c
new file mode 100644
index 0000000..6f36688
--- /dev/null
+++ b/src/shared/idn-util.c
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#if HAVE_LIBIDN2
+# include <idn2.h>
+#elif HAVE_LIBIDN
+# include <idna.h>
+# include <stringprep.h>
+#endif
+
+#include "alloc-util.h"
+#include "dlfcn-util.h"
+#include "idn-util.h"
+
+#if HAVE_LIBIDN || HAVE_LIBIDN2
+static void* idn_dl = NULL;
+#endif
+
+#if HAVE_LIBIDN2
+int (*sym_idn2_lookup_u8)(const uint8_t* src, uint8_t** lookupname, int flags) = NULL;
+const char *(*sym_idn2_strerror)(int rc) _const_ = NULL;
+int (*sym_idn2_to_unicode_8z8z)(const char * input, char ** output, int flags) = NULL;
+
+/* libidn2 variant: loads "libidn2.so.0" and resolves the three sym_idn2_*() function pointers defined
+ * above, all via dlopen_many_sym_or_warn(), whose result is returned as is. LOG_DEBUG is the log level
+ * handed to that helper. */
+int dlopen_idn(void) {
+ return dlopen_many_sym_or_warn(
+ &idn_dl, "libidn2.so.0", LOG_DEBUG,
+ DLSYM_ARG(idn2_lookup_u8),
+ DLSYM_ARG(idn2_strerror),
+ DLSYM_ARG(idn2_to_unicode_8z8z));
+}
+#endif
+
+#if HAVE_LIBIDN
+int (*sym_idna_to_ascii_4i)(const uint32_t * in, size_t inlen, char *out, int flags);
+int (*sym_idna_to_unicode_44i)(const uint32_t * in, size_t inlen, uint32_t * out, size_t * outlen, int flags);
+char* (*sym_stringprep_ucs4_to_utf8)(const uint32_t * str, ssize_t len, size_t * items_read, size_t * items_written);
+uint32_t* (*sym_stringprep_utf8_to_ucs4)(const char *str, ssize_t len, size_t *items_written);
+
+/* libidn variant: loads libidn at runtime and resolves the sym_idna_*() and sym_stringprep_*() function
+ * pointers. Returns 0 if the library was already loaded, 1 if it was loaded by this call, -EOPNOTSUPP if
+ * neither supported soname can be opened, or the error of dlsym_many_or_warn() if a symbol is missing. */
+int dlopen_idn(void) {
+        _cleanup_(dlclosep) void *handle = NULL;
+        int r;
+
+        if (idn_dl)
+                return 0; /* Already loaded */
+
+        /* libidn broke ABI in 1.34, but not in a way we care about (a new field got added to an
+         * open-coded struct we do not use), hence support both versions: try the newer soname first and
+         * fall back to the older one. */
+        handle = dlopen("libidn.so.12", RTLD_LAZY);
+        if (!handle)
+                handle = dlopen("libidn.so.11", RTLD_LAZY);
+        if (!handle)
+                return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                       "libidn support is not installed: %s", dlerror());
+
+        r = dlsym_many_or_warn(
+                        handle,
+                        LOG_DEBUG,
+                        DLSYM_ARG(idna_to_ascii_4i),
+                        DLSYM_ARG(idna_to_unicode_44i),
+                        DLSYM_ARG(stringprep_ucs4_to_utf8),
+                        DLSYM_ARG(stringprep_utf8_to_ucs4));
+        if (r < 0)
+                return r;
+
+        /* Only publish the handle once all symbols are resolved */
+        idn_dl = TAKE_PTR(handle);
+
+        return 1;
+}
+#endif
diff --git a/src/shared/idn-util.h b/src/shared/idn-util.h
new file mode 100644
index 0000000..e64bd99
--- /dev/null
+++ b/src/shared/idn-util.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#if HAVE_LIBIDN2
+# include <idn2.h>
+#elif HAVE_LIBIDN
+# include <idna.h>
+# include <stringprep.h>
+#endif
+
+#include <inttypes.h>
+
+#if HAVE_LIBIDN2 || HAVE_LIBIDN
+int dlopen_idn(void);
+#else
+/* Stub used when built with neither libidn2 nor libidn support: always fails with -EOPNOTSUPP */
+static inline int dlopen_idn(void) {
+ return -EOPNOTSUPP;
+}
+#endif
+
+#if HAVE_LIBIDN2
+extern int (*sym_idn2_lookup_u8)(const uint8_t* src, uint8_t** lookupname, int flags);
+extern const char *(*sym_idn2_strerror)(int rc) _const_;
+extern int (*sym_idn2_to_unicode_8z8z)(const char * input, char ** output, int flags);
+#endif
+
+#if HAVE_LIBIDN
+extern int (*sym_idna_to_ascii_4i)(const uint32_t * in, size_t inlen, char *out, int flags);
+extern int (*sym_idna_to_unicode_44i)(const uint32_t * in, size_t inlen,uint32_t * out, size_t * outlen, int flags);
+extern char* (*sym_stringprep_ucs4_to_utf8)(const uint32_t * str, ssize_t len, size_t * items_read, size_t * items_written);
+extern uint32_t* (*sym_stringprep_utf8_to_ucs4)(const char *str, ssize_t len, size_t *items_written);
+#endif
diff --git a/src/shared/ima-util.c b/src/shared/ima-util.c
new file mode 100644
index 0000000..e37c9ad
--- /dev/null
+++ b/src/shared/ima-util.c
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <unistd.h>
+
+#include "ima-util.h"
+
+/* Cached result of use_ima(): negative while not determined yet, otherwise 0 or 1 */
+static int use_ima_cached = -1;
+
+/* Returns true if "/sys/kernel/security/ima/" is accessible. The check is done on the first call only,
+ * later calls return the cached result. */
+bool use_ima(void) {
+        if (use_ima_cached >= 0)
+                return use_ima_cached;
+
+        use_ima_cached = access("/sys/kernel/security/ima/", F_OK) >= 0;
+        return use_ima_cached;
+}
diff --git a/src/shared/ima-util.h b/src/shared/ima-util.h
new file mode 100644
index 0000000..922db78
--- /dev/null
+++ b/src/shared/ima-util.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+bool use_ima(void);
diff --git a/src/shared/image-policy.c b/src/shared/image-policy.c
new file mode 100644
index 0000000..3c3de50
--- /dev/null
+++ b/src/shared/image-policy.c
@@ -0,0 +1,774 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "extract-word.h"
+#include "image-policy.h"
+#include "logarithm.h"
+#include "sort-util.h"
+#include "string-util.h"
+#include "strv.h"
+
+/* Rationale for the chosen syntax:
+ *
+ * → one line, so that it can be reasonably added to a shell command line, for example via `systemd-dissect
+ * --image-policy=…` or to the kernel command line via `systemd.image_policy=`.
+ *
+ * → no use of "," or ";" as separators, so that it can be included in mount/fstab-style option strings and
+ * doesn't require escaping. Instead, separators are ":", "=", "+" which should be fine both in shell
+ * command lines and in mount/fstab style option strings.
+ */
+
+/* Comparison function ordering PartitionPolicy entries by their designator; used for sorting and
+ * binary-searching the policy array */
+static int partition_policy_compare(const PartitionPolicy *a, const PartitionPolicy *b) {
+        const PartitionPolicy *x = ASSERT_PTR(a);
+        const PartitionPolicy *y = ASSERT_PTR(b);
+
+        return CMP(x->designator, y->designator);
+}
+
+/* Looks up the explicit entry for 'designator' in 'policy' by binary search. Returns NULL if 'policy' is
+ * NULL or the search finds no entry for that designator. */
+static const PartitionPolicy* image_policy_bsearch(const ImagePolicy *policy, PartitionDesignator designator) {
+        const PartitionPolicy key = {
+                .designator = designator,
+        };
+
+        if (!policy)
+                return NULL;
+
+        return typesafe_bsearch(&key, policy->policies, policy->n_policies, partition_policy_compare);
+}
+
+/* Fills in the parts of a flags field that were left unspecified: for each of the three groups (use,
+ * read-only, growfs) that has no bit set, all options of that group are enabled. */
+PartitionPolicyFlags partition_policy_flags_extend(PartitionPolicyFlags flags) {
+        PartitionPolicyFlags extended = flags;
+
+        /* No protection flag set means all of them are acceptable */
+        if (!(extended & _PARTITION_POLICY_USE_MASK))
+                extended |= PARTITION_POLICY_OPEN;
+
+        /* Same for the gpt flags: unspecified means both "on" and "off" are acceptable */
+        if (!(extended & _PARTITION_POLICY_READ_ONLY_MASK))
+                extended |= PARTITION_POLICY_READ_ONLY_ON|PARTITION_POLICY_READ_ONLY_OFF;
+
+        if (!(extended & _PARTITION_POLICY_GROWFS_MASK))
+                extended |= PARTITION_POLICY_GROWFS_ON|PARTITION_POLICY_GROWFS_OFF;
+
+        return extended;
+}
+
+/* Returns the flags of 'policy' in normalized form: whatever the user left unspecified is replaced by
+ * the matching "don't care" setting, and bits that make no sense for the partition type are masked out. */
+static PartitionPolicyFlags partition_policy_normalized_flags(const PartitionPolicy *policy) {
+        const PartitionPolicy *pp = ASSERT_PTR(policy);
+        PartitionPolicyFlags normalized = partition_policy_flags_extend(pp->flags);
+
+        /* Verity and verity signature partitions need no protection themselves, because they *are* the
+         * protection, hence drop all protection bits for them */
+        if (partition_verity_to_data(pp->designator) >= 0 ||
+            partition_verity_sig_to_data(pp->designator) >= 0)
+                normalized &= ~(PARTITION_POLICY_VERITY|PARTITION_POLICY_SIGNED|PARTITION_POLICY_ENCRYPTED);
+
+        /* Designators without a verity concept cannot be verity protected */
+        if (partition_verity_of(pp->designator) < 0)
+                normalized &= ~(PARTITION_POLICY_VERITY|PARTITION_POLICY_SIGNED);
+
+        /* For a partition that must be absent the gpt flags are irrelevant */
+        if ((normalized & _PARTITION_POLICY_USE_MASK) == PARTITION_POLICY_ABSENT)
+                normalized &= ~(_PARTITION_POLICY_READ_ONLY_MASK|_PARTITION_POLICY_GROWFS_MASK);
+
+        return normalized;
+}
+
+/* Synthesizes the normalized policy flags for a verity or verity signature partition 'designator' from
+ * the flags of its data partition: the partition itself is unprotected, it may be unused or absent if
+ * the data partition may, and the partition flags policy is inherited. */
+static PartitionPolicyFlags image_policy_synthesize_from_data(PartitionPolicyFlags data_flags, PartitionDesignator designator) {
+        const PartitionPolicy synthesized = {
+                .flags = PARTITION_POLICY_UNPROTECTED | (data_flags & (PARTITION_POLICY_UNUSED|PARTITION_POLICY_ABSENT)) |
+                         (data_flags & _PARTITION_POLICY_PFLAGS_MASK),
+                .designator = designator,
+        };
+
+        return partition_policy_normalized_flags(&synthesized);
+}
+
+/* Returns the normalized policy flags that 'policy' defines for 'designator'. Without a policy everything
+ * may be used in any mode. If there is no explicit entry for a verity or verity signature partition, a
+ * policy is derived from the entry of the underlying data partition. Returns
+ * _PARTITION_POLICY_FLAGS_INVALID if nothing applies. */
+PartitionPolicyFlags image_policy_get(const ImagePolicy *policy, PartitionDesignator designator) {
+        PartitionDesignator underlying = _PARTITION_DESIGNATOR_INVALID;
+        PartitionPolicyFlags underlying_flags;
+        const PartitionPolicy *entry;
+
+        /* No policy means: everything may be used in any mode */
+        if (!policy) {
+                const PartitionPolicy open = {
+                        .flags = PARTITION_POLICY_OPEN,
+                        .designator = designator,
+                };
+
+                return partition_policy_normalized_flags(&open);
+        }
+
+        entry = image_policy_bsearch(policy, designator);
+        if (entry)
+                return partition_policy_normalized_flags(entry);
+
+        /* No explicit entry. For verity partitions, derive the policy from the protection setting of the
+         * underlying data partition. */
+        underlying = partition_verity_to_data(designator);
+        if (underlying >= 0) {
+                underlying_flags = image_policy_get(policy, underlying);
+                if (underlying_flags < 0)
+                        return underlying_flags;
+
+                /* We need verity if verity or verity with sig is requested */
+                if ((underlying_flags & (PARTITION_POLICY_SIGNED|PARTITION_POLICY_VERITY)) == 0)
+                        return _PARTITION_POLICY_FLAGS_INVALID;
+
+                return image_policy_synthesize_from_data(underlying_flags, designator);
+        }
+
+        /* Similar for verity signature partitions, but with slightly stricter rules: the data partition
+         * must ask for a signature */
+        underlying = partition_verity_sig_to_data(designator);
+        if (underlying >= 0) {
+                underlying_flags = image_policy_get(policy, underlying);
+                if (underlying_flags < 0)
+                        return underlying_flags;
+
+                if ((underlying_flags & PARTITION_POLICY_SIGNED) == 0)
+                        return _PARTITION_POLICY_FLAGS_INVALID;
+
+                return image_policy_synthesize_from_data(underlying_flags, designator);
+        }
+
+        return _PARTITION_POLICY_FLAGS_INVALID; /* got nothing */
+}
+
+/* Like image_policy_get(), but if that yields no policy for 'designator' the normalized default policy
+ * of 'policy' is returned instead. */
+PartitionPolicyFlags image_policy_get_exhaustively(const ImagePolicy *policy, PartitionDesignator designator) {
+        PartitionPolicyFlags found = image_policy_get(policy, designator);
+
+        if (found >= 0)
+                return found;
+
+        const PartitionPolicy fallback = {
+                .flags = image_policy_default(policy),
+                .designator = designator,
+        };
+
+        return partition_policy_normalized_flags(&fallback);
+}
+
+/* Maps a single policy flag name to its flag value, or returns _PARTITION_POLICY_FLAGS_INVALID if the
+ * name is not known. */
+static PartitionPolicyFlags policy_flag_from_string_one(const char *s) {
+        /* This is a bitmask (i.e. not dense), hence we don't use the "string-table.h" stuff here. */
+        const struct {
+                const char *name;
+                PartitionPolicyFlags flag;
+        } known[] = {
+                { "verity",        PARTITION_POLICY_VERITY        },
+                { "signed",        PARTITION_POLICY_SIGNED        },
+                { "encrypted",     PARTITION_POLICY_ENCRYPTED     },
+                { "unprotected",   PARTITION_POLICY_UNPROTECTED   },
+                { "unused",        PARTITION_POLICY_UNUSED        },
+                { "absent",        PARTITION_POLICY_ABSENT        },
+                { "open",          PARTITION_POLICY_OPEN          }, /* shortcut alias */
+                { "ignore",        PARTITION_POLICY_IGNORE        }, /* ditto */
+                { "read-only-on",  PARTITION_POLICY_READ_ONLY_ON  },
+                { "read-only-off", PARTITION_POLICY_READ_ONLY_OFF },
+                { "growfs-on",     PARTITION_POLICY_GROWFS_ON     },
+                { "growfs-off",    PARTITION_POLICY_GROWFS_OFF    },
+        };
+
+        assert(s);
+
+        for (size_t i = 0; i < ELEMENTSOF(known); i++)
+                if (streq(s, known[i].name))
+                        return known[i].flag;
+
+        return _PARTITION_POLICY_FLAGS_INVALID;
+}
+
+/* Parses a "+"-separated list of policy flag names into the combined flags value. An empty string or
+ * "-" yields 0. Returns -EBADRQC if one of the names is not known, or the error of extract_first_word(). */
+PartitionPolicyFlags partition_policy_flags_from_string(const char *s) {
+        PartitionPolicyFlags combined = 0;
+
+        assert(s);
+
+        if (empty_or_dash(s))
+                return 0;
+
+        for (;;) {
+                _cleanup_free_ char *word = NULL;
+                PartitionPolicyFlags one;
+                int r;
+
+                r = extract_first_word(&s, &word, "+", EXTRACT_DONT_COALESCE_SEPARATORS);
+                if (r < 0)
+                        return r;
+                if (r == 0) /* input consumed */
+                        return combined;
+
+                one = policy_flag_from_string_one(strstrip(word));
+                if (one < 0)
+                        return -EBADRQC; /* recognizable error */
+
+                combined |= one;
+        }
+}
+
+/* Allocates an ImagePolicy with room for 'n_policies' entries in its trailing array. The fixed part is
+ * initialized with the "ignore" default policy and all other members zeroed; the array entries are left
+ * uninitialized. Returns NULL if the size would overflow or the allocation fails. */
+static ImagePolicy* image_policy_new(size_t n_policies) {
+        const size_t fixed = offsetof(ImagePolicy, policies);
+        ImagePolicy *fresh;
+
+        /* overflow check */
+        if (n_policies > (SIZE_MAX - fixed) / sizeof(PartitionPolicy))
+                return NULL;
+
+        fresh = malloc(fixed + sizeof(PartitionPolicy) * n_policies);
+        if (!fresh)
+                return NULL;
+
+        *fresh = (ImagePolicy) {
+                .default_flags = PARTITION_POLICY_IGNORE,
+        };
+
+        return fresh;
+}
+
+/* Parses the image policy string 's'. The string is either one of the symbolic policies ("" or "-", "*",
+ * "~") or a ":"-separated list of "<designator>=<flags>" rules, where an empty designator sets the
+ * default policy and <flags> is parsed by partition_policy_flags_from_string(). On success 0 is returned
+ * and, if 'ret' is non-NULL, a newly allocated policy with its entries sorted by designator is stored
+ * there. On failure a negative errno-style value is returned; see below for the recognizable ones. */
+int image_policy_from_string(const char *s, ImagePolicy **ret) {
+ _cleanup_free_ ImagePolicy *p = NULL;
+ uint64_t dmask = 0;
+ ImagePolicy *t;
+ PartitionPolicyFlags symbolic_policy;
+ int r;
+
+ assert(s);
+ /* dmask has one bit per designator, make sure they all fit */
+ assert_cc(sizeof(dmask) * 8 >= _PARTITION_DESIGNATOR_MAX);
+
+ /* Recognizable errors:
+ *
+ * ENOTUNIQ → Two or more rules for the same partition
+ * EBADSLT → Unknown partition designator
+ * EBADRQC → Unknown policy flags
+ */
+
+ /* First, let's handle "symbolic" policies, i.e. "-", "*", "~" */
+ if (empty_or_dash(s))
+ /* ignore policy: everything may exist, but nothing used */
+ symbolic_policy = PARTITION_POLICY_IGNORE;
+ else if (streq(s, "*"))
+ /* allow policy: everything is allowed */
+ symbolic_policy = PARTITION_POLICY_OPEN;
+ else if (streq(s, "~"))
+ /* deny policy: nothing may exist */
+ symbolic_policy = PARTITION_POLICY_ABSENT;
+ else
+ symbolic_policy = _PARTITION_POLICY_FLAGS_INVALID;
+
+ /* A symbolic policy has no per-partition entries, only the default flags */
+ if (symbolic_policy >= 0) {
+ if (!ret)
+ return 0;
+
+ p = image_policy_new(0);
+ if (!p)
+ return -ENOMEM;
+
+ p->default_flags = symbolic_policy;
+ *ret = TAKE_PTR(p);
+ return 0;
+ }
+
+ /* Allocate the policy at maximum size, i.e. for all designators. We might overshoot a bit, but the
+ * items are cheap, and we can return unused space to libc once we know we don't need it */
+ p = image_policy_new(_PARTITION_DESIGNATOR_MAX);
+ if (!p)
+ return -ENOMEM;
+
+ const char *q = s;
+ bool default_specified = false;
+ for (;;) {
+ _cleanup_free_ char *e = NULL, *d = NULL;
+ PartitionDesignator designator;
+ PartitionPolicyFlags flags;
+ char *f, *ds, *fs;
+
+ /* e: the next ":"-separated rule */
+ r = extract_first_word(&q, &e, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ break;
+
+ /* d: the part of the rule before "="; afterwards f points to the rest of the rule */
+ f = e;
+ r = extract_first_word((const char**) &f, &d, "=", EXTRACT_DONT_COALESCE_SEPARATORS);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Expected designator name followed by '='; got instead: %s", e);
+ if (!f) /* no separator? */
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Missing '=' in policy expression: %s", e);
+
+ ds = strstrip(d);
+ if (isempty(ds)) {
+ /* Not partition name? then it's the default policy */
+ if (default_specified)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTUNIQ), "Default partition policy flags specified more than once.");
+
+ designator = _PARTITION_DESIGNATOR_INVALID;
+ default_specified = true;
+ } else {
+ designator = partition_designator_from_string(ds);
+ if (designator < 0)
+ return log_debug_errno(SYNTHETIC_ERRNO(EBADSLT), "Unknown partition designator: %s", ds); /* recognizable error */
+ /* dmask tracks the designators seen so far, to catch duplicate rules */
+ if (dmask & (UINT64_C(1) << designator))
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTUNIQ), "Partition designator specified more than once: %s", ds);
+ dmask |= UINT64_C(1) << designator;
+ }
+
+ fs = strstrip(f);
+ flags = partition_policy_flags_from_string(fs);
+ if (flags == -EBADRQC)
+ return log_debug_errno(flags, "Unknown partition policy flag: %s", fs);
+ if (flags < 0)
+ return log_debug_errno(flags, "Failed to parse partition policy flags '%s': %m", fs);
+
+ /* A negative designator marks the rule for the default policy, see above */
+ if (designator < 0)
+ p->default_flags = flags;
+ else {
+ p->policies[p->n_policies++] = (PartitionPolicy) {
+ .designator = designator,
+ .flags = flags,
+ };
+ }
+ };
+
+ assert(p->n_policies <= _PARTITION_DESIGNATOR_MAX);
+
+ /* Return unused space to libc. If shrinking fails we simply keep the larger allocation. */
+ t = realloc(p, offsetof(ImagePolicy, policies) + sizeof(PartitionPolicy) * p->n_policies);
+ if (t)
+ p = t;
+
+ /* Sort by designator, which is the order image_policy_bsearch() searches by */
+ typesafe_qsort(p->policies, p->n_policies, partition_policy_compare);
+
+ if (ret)
+ *ret = TAKE_PTR(p);
+
+ return 0;
+}
+
+/* Formats 'flags' as a "+"-separated list of flag names, or as "-" if no name is emitted. On success
+ * stores the newly allocated string in 'ret' and returns 0. Returns -EINVAL if 'flags' is negative and
+ * -ENOMEM if allocation fails. See below for the effect of 'simplify'. */
+int partition_policy_flags_to_string(PartitionPolicyFlags flags, bool simplify, char **ret) {
+ _cleanup_free_ char *buf = NULL;
+ const char *l[CONST_LOG2U(_PARTITION_POLICY_MASK) + 1]; /* one string per known flag at most */
+ size_t m = 0;
+
+ assert(ret);
+
+ if (flags < 0)
+ return -EINVAL;
+
+ /* If 'simplify' is false we'll output the precise value of every single flag.
+ *
+ * If 'simplify' is true we'll try to make the output shorter, by doing the following:
+ *
+ * → we'll spell the long form "verity+signed+encrypted+unprotected+unused+absent" via its
+ * equivalent shortcut form "open" (which we happily parse btw, see above)
+ *
+ * → we'll spell the long form "unused+absent" via its shortcut "ignore" (which we are also happy
+ * to parse)
+ *
+ * → if the read-only/growfs policy flags are both set, we suppress them. this thus removes the
+ * distinction between "user explicitly declared don't care" and "we implied don't care because
+ * user didn't say anything".
+ *
+ * net result: the resulting string is shorter, but the effective policy declared that way will have
+ * the same results as the long form. */
+
+ if (simplify && (flags & _PARTITION_POLICY_USE_MASK) == PARTITION_POLICY_OPEN)
+ l[m++] = "open";
+ else if (simplify && (flags & _PARTITION_POLICY_USE_MASK) == PARTITION_POLICY_IGNORE)
+ l[m++] = "ignore";
+ else {
+ if (flags & PARTITION_POLICY_VERITY)
+ l[m++] = "verity";
+ if (flags & PARTITION_POLICY_SIGNED)
+ l[m++] = "signed";
+ if (flags & PARTITION_POLICY_ENCRYPTED)
+ l[m++] = "encrypted";
+ if (flags & PARTITION_POLICY_UNPROTECTED)
+ l[m++] = "unprotected";
+ if (flags & PARTITION_POLICY_UNUSED)
+ l[m++] = "unused";
+ if (flags & PARTITION_POLICY_ABSENT)
+ l[m++] = "absent";
+ }
+
+ /* When simplifying, the read-only flags are only printed if exactly one of the two is set */
+ if (!simplify || (!(flags & PARTITION_POLICY_READ_ONLY_ON) != !(flags & PARTITION_POLICY_READ_ONLY_OFF))) {
+ if (flags & PARTITION_POLICY_READ_ONLY_ON)
+ l[m++] = "read-only-on";
+ if (flags & PARTITION_POLICY_READ_ONLY_OFF)
+ l[m++] = "read-only-off";
+ }
+
+ /* Same for the growfs flags */
+ if (!simplify || (!(flags & PARTITION_POLICY_GROWFS_ON) != !(flags & PARTITION_POLICY_GROWFS_OFF))) {
+ if (flags & PARTITION_POLICY_GROWFS_OFF)
+ l[m++] = "growfs-off";
+ if (flags & PARTITION_POLICY_GROWFS_ON)
+ l[m++] = "growfs-on";
+ }
+
+ if (m == 0)
+ buf = strdup("-");
+ else {
+ /* NOTE(review): up to ten names can be stored above when simplify is false and all flags are
+ * set, plus the NULL terminator written below. Confirm that the CONST_LOG2U()-based size of
+ * l[] and this assertion really cover that case; _PARTITION_POLICY_MASK is defined elsewhere. */
+ assert(m+1 < ELEMENTSOF(l));
+ l[m] = NULL;
+
+ buf = strv_join((char**) l, "+");
+ }
+ if (!buf)
+ return -ENOMEM;
+
+ *ret = TAKE_PTR(buf);
+ return 0;
+}
+
+/* Compares two flags values after passing both through partition_policy_flags_extend(), so that an
+ * unspecified group and an explicit "all options" group count as equal */
+static bool partition_policy_flags_extended_equal(PartitionPolicyFlags a, PartitionPolicyFlags b) {
+        PartitionPolicyFlags ea = partition_policy_flags_extend(a);
+        PartitionPolicyFlags eb = partition_policy_flags_extend(b);
+
+        return ea == eb;
+}
+
+/* Checks whether 'policy' is equivalent to a policy that applies 'expected' to everything: the default
+ * flags must match, and for every designator the effective flags must equal the normalized form of
+ * 'expected' for that designator. Returns true or false, or a negative value on error (-EINVAL if
+ * 'expected' is negative). */
+static int image_policy_flags_all_match(const ImagePolicy *policy, PartitionPolicyFlags expected) {
+
+        if (expected < 0)
+                return -EINVAL;
+
+        if (!partition_policy_flags_extended_equal(image_policy_default(policy), expected))
+                return false;
+
+        for (PartitionDesignator d = 0; d < _PARTITION_DESIGNATOR_MAX; d++) {
+                PartitionPolicyFlags actual = image_policy_get_exhaustively(policy, d);
+
+                if (actual < 0)
+                        return actual;
+
+                const PartitionPolicy reference = {
+                        .flags = expected,
+                        .designator = d,
+                };
+                PartitionPolicyFlags wanted = partition_policy_normalized_flags(&reference);
+
+                if (wanted < 0)
+                        return wanted;
+
+                if (actual != wanted)
+                        return false;
+        }
+
+        return true;
+}
+
+/* Checks if this is the ignore policy (or equivalent to it), i.e. everything is ignored, aka '-', aka ''.
+ * image_policy_flags_all_match() returns a negative value on error; compare explicitly so that an error
+ * is reported as "not equivalent" instead of being converted to true. */
+bool image_policy_equiv_ignore(const ImagePolicy *policy) {
+        return image_policy_flags_all_match(policy, PARTITION_POLICY_IGNORE) > 0;
+}
+
+bool image_policy_equiv_allow(const ImagePolicy *policy) {
+        /* Checks if this is the allow policy (or equivalent to it), i.e. everything is allowed, aka '*'.
+         *
+         * image_policy_flags_all_match() returns an int that is negative on error. Returning that
+         * directly from a bool function would convert the error into "true", hence only a positive
+         * result is reported as a match. */
+        return image_policy_flags_all_match(policy, PARTITION_POLICY_OPEN) > 0;
+}
+
+bool image_policy_equiv_deny(const ImagePolicy *policy) {
+        /* Checks if this is the deny policy (or equivalent to it), i.e. everything must be absent, aka '~'.
+         *
+         * image_policy_flags_all_match() returns an int that is negative on error. Returning that
+         * directly from a bool function would convert the error into "true", hence only a positive
+         * result is reported as a match. */
+        return image_policy_flags_all_match(policy, PARTITION_POLICY_ABSENT) > 0;
+}
+
+int image_policy_to_string(const ImagePolicy *policy, bool simplify, char **ret) {
+        _cleanup_free_ char *s = NULL;
+        size_t n_entries;
+        int r;
+
+        assert(ret);
+
+        /* Formats the policy as a colon-separated list of "<designator>=<flags>" rules, followed by the
+         * default as "=<flags>". With 'simplify' set, the three well-known policies are reduced to their
+         * one-character shorthand, rules that merely repeat the default are left out, and a default that
+         * corresponds to "ignore" is not printed. Returns 0 and stores a newly allocated string in *ret,
+         * or a negative errno-style value on failure. */
+
+        if (simplify) {
+                const char *fixed =
+                        image_policy_equiv_allow(policy) ? "*" :
+                        image_policy_equiv_ignore(policy) ? "-" :
+                        image_policy_equiv_deny(policy) ? "~" : NULL;
+
+                if (fixed) {
+                        s = strdup(fixed);
+                        if (!s)
+                                return -ENOMEM;
+
+                        *ret = TAKE_PTR(s);
+                        return 0;
+                }
+        }
+
+        n_entries = image_policy_n_entries(policy);
+        for (size_t i = 0; i < n_entries; i++) {
+                const PartitionPolicy *entry = &policy->policies[i];
+                _cleanup_free_ char *flags_str = NULL;
+                const char *name;
+
+                /* Validate perfect ordering */
+                assert(i == 0 || entry->designator > policy->policies[i-1].designator);
+
+                assert_se(name = partition_designator_to_string(entry->designator));
+
+                if (simplify) {
+                        /* Leave out rules that say nothing beyond what the default already implies */
+                        PartitionPolicyFlags implied;
+
+                        implied = partition_policy_normalized_flags(
+                                        &(const PartitionPolicy) {
+                                                .flags = image_policy_default(policy),
+                                                .designator = entry->designator,
+                                        });
+                        if (implied < 0)
+                                return implied;
+
+                        if (implied == entry->flags)
+                                continue;
+                }
+
+                r = partition_policy_flags_to_string(entry->flags, simplify, &flags_str);
+                if (r < 0)
+                        return r;
+
+                if (!strextend(&s, isempty(s) ? "" : ":", name, "=", flags_str))
+                        return -ENOMEM;
+        }
+
+        /* The default is always printed in full mode; in simplified mode only if it differs from "ignore" */
+        if (!(simplify && partition_policy_flags_extended_equal(image_policy_default(policy), PARTITION_POLICY_IGNORE))) {
+                _cleanup_free_ char *default_str = NULL;
+
+                r = partition_policy_flags_to_string(image_policy_default(policy), simplify, &default_str);
+                if (r < 0)
+                        return r;
+
+                if (!strextend(&s, isempty(s) ? "" : ":", "=", default_str))
+                        return -ENOMEM;
+        }
+
+        if (isempty(s)) { /* nothing was emitted at all? then return "-" */
+                s = strdup("-");
+                if (!s)
+                        return -ENOMEM;
+        }
+
+        *ret = TAKE_PTR(s);
+        return 0;
+}
+
+bool image_policy_equal(const ImagePolicy *a, const ImagePolicy *b) {
+        size_t n;
+
+        /* Literal comparison: same default flags and the very same list of rules, entry by entry. */
+
+        if (a == b)
+                return true;
+
+        n = image_policy_n_entries(a);
+        if (n != image_policy_n_entries(b))
+                return false;
+        if (image_policy_default(a) != image_policy_default(b))
+                return false;
+
+        for (size_t i = 0; i < n; i++) {
+                const PartitionPolicy *x = a->policies + i, *y = b->policies + i;
+
+                if (x->designator != y->designator || x->flags != y->flags)
+                        return false;
+        }
+
+        return true;
+}
+
+int image_policy_equivalent(const ImagePolicy *a, const ImagePolicy *b) {
+
+        /* image_policy_equal() checks whether two policies are defined in the exact same way. This
+         * function looks at the outcome of the two policies instead. The results can differ because some
+         * logic is implied regarding Verity/Encryption: when no rule is defined for a verity partition it
+         * can be synthesized from the protection level of the data partition it protects. Likewise, any
+         * per-partition rule that is identical to the default rule is redundant, and is recognized as
+         * such here but not by image_policy_equal().
+         *
+         * Returns > 0 if equivalent, 0 if not, and a negative value if a lookup returns one. */
+
+        if (!partition_policy_flags_extended_equal(image_policy_default(a), image_policy_default(b)))
+                return false;
+
+        for (PartitionDesignator d = 0; d < _PARTITION_DESIGNATOR_MAX; d++) {
+                PartitionPolicyFlags flags_a, flags_b;
+
+                flags_a = image_policy_get_exhaustively(a, d);
+                if (flags_a < 0)
+                        return flags_a;
+
+                flags_b = image_policy_get_exhaustively(b, d);
+                if (flags_b < 0)
+                        return flags_b;
+
+                if (flags_a != flags_b)
+                        return false;
+        }
+
+        return true;
+}
+
+int config_parse_image_policy(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        _cleanup_(image_policy_freep) ImagePolicy *parsed = NULL;
+        ImagePolicy **target = ASSERT_PTR(data);
+        int r;
+
+        assert(rvalue);
+
+        /* Configuration file parser for image policy strings. An empty value resets the stored policy;
+         * otherwise the stored policy is replaced by the parsed one. Parse failures are logged and the
+         * value returned by log_syntax() is passed on. */
+
+        if (isempty(rvalue)) {
+                *target = image_policy_free(*target);
+                return 0;
+        }
+
+        r = image_policy_from_string(rvalue, &parsed);
+        if (r < 0)
+                switch (r) {
+
+                case -ENOTUNIQ:
+                        return log_syntax(unit, LOG_ERR, filename, line, r, "Duplicate rule in image policy, refusing: %s", rvalue);
+
+                case -EBADSLT:
+                        return log_syntax(unit, LOG_ERR, filename, line, r, "Unknown partition type in image policy, refusing: %s", rvalue);
+
+                case -EBADRQC:
+                        return log_syntax(unit, LOG_ERR, filename, line, r, "Unknown partition policy flag in image policy, refusing: %s", rvalue);
+
+                default:
+                        return log_syntax(unit, LOG_ERR, filename, line, r, "Failed to parse image policy, refusing: %s", rvalue);
+                }
+
+        return free_and_replace_full(*target, parsed, image_policy_free);
+}
+
+int parse_image_policy_argument(const char *s, ImagePolicy **policy) {
+        _cleanup_(image_policy_freep) ImagePolicy *parsed = NULL;
+        int r;
+
+        assert(s);
+        assert(policy);
+
+        /*
+         * Intended for use in command line parsers.
+         *
+         * NOTE: ON SUCCESS THE POLICY PREVIOUSLY STORED IN *policy IS FREED!
+         * Hence, never pass in an uninitialized pointer.
+         */
+
+        r = image_policy_from_string(s, &parsed);
+        if (r < 0)
+                switch (r) {
+
+                case -ENOTUNIQ:
+                        return log_error_errno(r, "Duplicate rule in image policy: %s", s);
+
+                case -EBADSLT:
+                        return log_error_errno(r, "Unknown partition type in image policy: %s", s);
+
+                case -EBADRQC:
+                        return log_error_errno(r, "Unknown partition policy flag in image policy: %s", s);
+
+                default:
+                        return log_error_errno(r, "Failed to parse image policy: %s", s);
+                }
+
+        return free_and_replace_full(*policy, parsed, image_policy_free);
+}
+
+const ImagePolicy image_policy_allow = {
+        /* Allow policy: no per-partition rules, the default carries every use flag */
+        .default_flags = PARTITION_POLICY_OPEN,
+        .n_policies = 0,
+};
+
+const ImagePolicy image_policy_deny = {
+        /* Deny policy: no per-partition rules, the default only permits absence */
+        .default_flags = PARTITION_POLICY_ABSENT,
+        .n_policies = 0,
+};
+
+const ImagePolicy image_policy_ignore = {
+        /* Ignore policy: no per-partition rules, the default is unused-or-absent */
+        .default_flags = PARTITION_POLICY_IGNORE,
+        .n_policies = 0,
+};
+
+const ImagePolicy image_policy_sysext = {
+        /* System extensions: honour the root file system and /usr/, ignore everything else. Only the
+         * /usr/ + /opt/ trees are of interest, and those are the only places they can be. */
+        .default_flags = PARTITION_POLICY_IGNORE,
+        .n_policies = 2,
+        .policies = {
+                {
+                        .designator = PARTITION_ROOT,
+                        .flags = PARTITION_POLICY_VERITY|PARTITION_POLICY_SIGNED|PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT,
+                },
+                {
+                        .designator = PARTITION_USR,
+                        .flags = PARTITION_POLICY_VERITY|PARTITION_POLICY_SIGNED|PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT,
+                },
+        },
+};
+
+const ImagePolicy image_policy_sysext_strict = {
+        /* System extensions, with signing required: root and /usr/ must be signed or absent */
+        .default_flags = PARTITION_POLICY_IGNORE,
+        .n_policies = 2,
+        .policies = {
+                {
+                        .designator = PARTITION_ROOT,
+                        .flags = PARTITION_POLICY_SIGNED|PARTITION_POLICY_ABSENT,
+                },
+                {
+                        .designator = PARTITION_USR,
+                        .flags = PARTITION_POLICY_SIGNED|PARTITION_POLICY_ABSENT,
+                },
+        },
+};
+
+const ImagePolicy image_policy_confext = {
+        /* Configuration extensions: honour the root file system, ignore everything else. Only the /etc/
+         * tree is of interest, and that is the only place it can be. */
+        .default_flags = PARTITION_POLICY_IGNORE,
+        .n_policies = 1,
+        .policies = {
+                {
+                        .designator = PARTITION_ROOT,
+                        .flags = PARTITION_POLICY_VERITY|PARTITION_POLICY_SIGNED|PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT,
+                },
+        },
+};
+
+const ImagePolicy image_policy_container = {
+        /* systemd-nspawn containers: all partitions are used, with the exception of swap */
+        .default_flags = PARTITION_POLICY_IGNORE,
+        .n_policies = 8,
+        .policies = {
+                {
+                        .designator = PARTITION_ROOT,
+                        .flags = PARTITION_POLICY_VERITY|PARTITION_POLICY_SIGNED|PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT,
+                },
+                {
+                        .designator = PARTITION_USR,
+                        .flags = PARTITION_POLICY_VERITY|PARTITION_POLICY_SIGNED|PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT,
+                },
+                {
+                        .designator = PARTITION_HOME,
+                        .flags = PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT,
+                },
+                {
+                        .designator = PARTITION_SRV,
+                        .flags = PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT,
+                },
+                {
+                        .designator = PARTITION_ESP,
+                        .flags = PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT,
+                },
+                {
+                        .designator = PARTITION_XBOOTLDR,
+                        .flags = PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT,
+                },
+                {
+                        .designator = PARTITION_TMP,
+                        .flags = PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT,
+                },
+                {
+                        .designator = PARTITION_VAR,
+                        .flags = PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT,
+                },
+        },
+};
+
+const ImagePolicy image_policy_host = {
+        /* Host policy: basically everything is used */
+        .default_flags = PARTITION_POLICY_IGNORE,
+        .n_policies = 9,
+        .policies = {
+                {
+                        .designator = PARTITION_ROOT,
+                        .flags = PARTITION_POLICY_VERITY|PARTITION_POLICY_SIGNED|PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT,
+                },
+                {
+                        .designator = PARTITION_USR,
+                        .flags = PARTITION_POLICY_VERITY|PARTITION_POLICY_SIGNED|PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT,
+                },
+                {
+                        .designator = PARTITION_HOME,
+                        .flags = PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT,
+                },
+                {
+                        .designator = PARTITION_SRV,
+                        .flags = PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT,
+                },
+                {
+                        .designator = PARTITION_ESP,
+                        .flags = PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT,
+                },
+                {
+                        .designator = PARTITION_XBOOTLDR,
+                        .flags = PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT,
+                },
+                {
+                        .designator = PARTITION_SWAP,
+                        .flags = PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT,
+                },
+                {
+                        .designator = PARTITION_TMP,
+                        .flags = PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT,
+                },
+                {
+                        .designator = PARTITION_VAR,
+                        .flags = PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT,
+                },
+        },
+};
+
+const ImagePolicy image_policy_service = {
+        /* RootImage= in services: ESP/XBOOTLDR and swap are skipped */
+        .default_flags = PARTITION_POLICY_IGNORE,
+        .n_policies = 6,
+        .policies = {
+                {
+                        .designator = PARTITION_ROOT,
+                        .flags = PARTITION_POLICY_VERITY|PARTITION_POLICY_SIGNED|PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT,
+                },
+                {
+                        .designator = PARTITION_USR,
+                        .flags = PARTITION_POLICY_VERITY|PARTITION_POLICY_SIGNED|PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT,
+                },
+                {
+                        .designator = PARTITION_HOME,
+                        .flags = PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT,
+                },
+                {
+                        .designator = PARTITION_SRV,
+                        .flags = PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT,
+                },
+                {
+                        .designator = PARTITION_TMP,
+                        .flags = PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT,
+                },
+                {
+                        .designator = PARTITION_VAR,
+                        .flags = PARTITION_POLICY_ENCRYPTED|PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_ABSENT,
+                },
+        },
+};
diff --git a/src/shared/image-policy.h b/src/shared/image-policy.h
new file mode 100644
index 0000000..f59c16e
--- /dev/null
+++ b/src/shared/image-policy.h
@@ -0,0 +1,104 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+typedef struct ImagePolicy ImagePolicy;
+
+#include "conf-parser.h"
+#include "dissect-image.h"
+#include "errno-list.h"
+
+typedef enum PartitionPolicyFlags {
+ /* Not all policy flags really make sense on all partition types, see comments. But even if they
+ * don't make sense we'll parse them anyway, because maybe one day we'll add them for more partition
+ * types, too. Moreover, we allow configuring a "default" policy for all partition types for which no
+ * explicit policy is specified. It's useful if we can use policy flags in there and apply this
+ * default policy gracefully even to partition types where they don't really make too much sense
+ * on. Example: a default policy of "verity+encrypted" certainly makes sense, but for /home/
+ * partitions this gracefully degrades to "encrypted" (as we do not have a concept of verity for
+ * /home/), and so on. */
+ PARTITION_POLICY_VERITY = 1 << 0, /* must exist, activate with verity (only applies to root/usr partitions) */
+ PARTITION_POLICY_SIGNED = 1 << 1, /* must exist, activate with signed verity (only applies to root/usr partitions) */
+ PARTITION_POLICY_ENCRYPTED = 1 << 2, /* must exist, activate with LUKS encryption (applies to any data partition, but not to verity/signature partitions) */
+ PARTITION_POLICY_UNPROTECTED = 1 << 3, /* must exist, activate without encryption/verity */
+ PARTITION_POLICY_UNUSED = 1 << 4, /* must exist, don't use */
+ PARTITION_POLICY_ABSENT = 1 << 5, /* must not exist */
+ PARTITION_POLICY_OPEN = PARTITION_POLICY_VERITY|PARTITION_POLICY_SIGNED|PARTITION_POLICY_ENCRYPTED|
+ PARTITION_POLICY_UNPROTECTED|PARTITION_POLICY_UNUSED|PARTITION_POLICY_ABSENT, /* all six use flags combined */
+ PARTITION_POLICY_IGNORE = PARTITION_POLICY_UNUSED|PARTITION_POLICY_ABSENT, /* may exist or not, but is not used */
+ _PARTITION_POLICY_USE_MASK = PARTITION_POLICY_OPEN, /* mask covering all use flags above */
+
+ PARTITION_POLICY_READ_ONLY_OFF = 1 << 6, /* State of GPT partition flag "read-only" must be off (NOTE(review): going by the flag name; the comment previously said "on" — confirm) */
+ PARTITION_POLICY_READ_ONLY_ON = 1 << 7, /* State of GPT partition flag "read-only" must be on */
+ _PARTITION_POLICY_READ_ONLY_MASK = PARTITION_POLICY_READ_ONLY_OFF|PARTITION_POLICY_READ_ONLY_ON,
+ PARTITION_POLICY_GROWFS_OFF = 1 << 8, /* State of GPT partition flag "growfs" must be off (NOTE(review): going by the flag name; the comment previously said "on" — confirm) */
+ PARTITION_POLICY_GROWFS_ON = 1 << 9, /* State of GPT partition flag "growfs" must be on */
+ _PARTITION_POLICY_GROWFS_MASK = PARTITION_POLICY_GROWFS_OFF|PARTITION_POLICY_GROWFS_ON,
+ _PARTITION_POLICY_PFLAGS_MASK = _PARTITION_POLICY_READ_ONLY_MASK|_PARTITION_POLICY_GROWFS_MASK, /* both GPT partition flag pairs */
+
+ _PARTITION_POLICY_MASK = _PARTITION_POLICY_USE_MASK|_PARTITION_POLICY_READ_ONLY_MASK|_PARTITION_POLICY_GROWFS_MASK, /* every flag bit defined in this enum */
+
+ _PARTITION_POLICY_FLAGS_INVALID = -EINVAL,
+ _PARTITION_POLICY_FLAGS_ERRNO_MAX = -ERRNO_MAX, /* Ensure the whole errno range fits into this enum */
+} PartitionPolicyFlags;
+
+assert_cc((_PARTITION_POLICY_USE_MASK | _PARTITION_POLICY_PFLAGS_MASK) >= 0); /* ensure flags don't collide with errno range */
+
+typedef struct PartitionPolicy { /* A single rule: the policy flags that apply to one partition designator */
+ PartitionDesignator designator; /* which partition this rule is about */
+ PartitionPolicyFlags flags; /* combination of PARTITION_POLICY_* flags for that partition */
+} PartitionPolicy;
+
+struct ImagePolicy { /* A default plus a list of per-partition rules; see image_policy_default() and image_policy_n_entries() for NULL-safe access */
+ PartitionPolicyFlags default_flags; /* for any designator not listed in the list below */
+ size_t n_policies; /* number of entries in policies[] */
+ PartitionPolicy policies[]; /* sorted by designator, hence suitable for binary search */
+};
+
+/* Default policies for various use cases */
+extern const ImagePolicy image_policy_allow;
+extern const ImagePolicy image_policy_deny;
+extern const ImagePolicy image_policy_ignore;
+extern const ImagePolicy image_policy_sysext; /* No verity required */
+extern const ImagePolicy image_policy_sysext_strict; /* Signed verity required */
+extern const ImagePolicy image_policy_confext; /* No verity required */
+extern const ImagePolicy image_policy_container;
+extern const ImagePolicy image_policy_service;
+extern const ImagePolicy image_policy_host;
+
+PartitionPolicyFlags image_policy_get(const ImagePolicy *policy, PartitionDesignator designator);
+PartitionPolicyFlags image_policy_get_exhaustively(const ImagePolicy *policy, PartitionDesignator designator);
+
+/* We want the NULL image policy to mean "everything allowed", hence use these simple accessors to make
+ * NULL policies work reasonably */
+static inline PartitionPolicyFlags image_policy_default(const ImagePolicy *policy) {
+        /* A NULL policy yields PARTITION_POLICY_OPEN as its default */
+        if (!policy)
+                return PARTITION_POLICY_OPEN;
+
+        return policy->default_flags;
+}
+
+static inline size_t image_policy_n_entries(const ImagePolicy *policy) {
+        /* A NULL policy has no per-partition rules */
+        if (!policy)
+                return 0;
+
+        return policy->n_policies;
+}
+
+PartitionPolicyFlags partition_policy_flags_extend(PartitionPolicyFlags flags);
+
+PartitionPolicyFlags partition_policy_flags_from_string(const char *s);
+int partition_policy_flags_to_string(PartitionPolicyFlags flags, bool simplify, char **ret);
+
+int image_policy_from_string(const char *s, ImagePolicy **ret);
+int image_policy_to_string(const ImagePolicy *policy, bool simplify, char **ret);
+
+/* Recognizes three special policies by equivalence */
+bool image_policy_equiv_ignore(const ImagePolicy *policy);
+bool image_policy_equiv_allow(const ImagePolicy *policy);
+bool image_policy_equiv_deny(const ImagePolicy *policy);
+
+bool image_policy_equal(const ImagePolicy *a, const ImagePolicy *b); /* checks if defined the same way, i.e. has literally the same ruleset */
+int image_policy_equivalent(const ImagePolicy *a, const ImagePolicy *b); /* checks if the outcome is the same, i.e. for all partitions results in the same decisions. */
+
+static inline ImagePolicy* image_policy_free(ImagePolicy *p) { /* Releases the policy and hands back mfree()'s result, so that callers can write "p = image_policy_free(p);" */
+ return mfree(p);
+}
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(ImagePolicy*, image_policy_free);
+
+CONFIG_PARSER_PROTOTYPE(config_parse_image_policy);
+int parse_image_policy_argument(const char *s, ImagePolicy **policy);
diff --git a/src/shared/import-util.c b/src/shared/import-util.c
new file mode 100644
index 0000000..9057b78
--- /dev/null
+++ b/src/shared/import-util.c
@@ -0,0 +1,233 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+
+#include "alloc-util.h"
+#include "btrfs-util.h"
+#include "chattr-util.h"
+#include "errno-util.h"
+#include "import-util.h"
+#include "log.h"
+#include "macro.h"
+#include "nulstr-util.h"
+#include "path-util.h"
+#include "string-table.h"
+#include "string-util.h"
+
+static const char *skip_protocol_and_hostname(const char *url) {
+        const char *colon, *host;
+        size_t host_len;
+
+        /* A very very lenient implementation of RFC3986 Section 3.2. Returns a pointer to the first
+         * character after the host part, or NULL if there is no protocol or no host. */
+
+        /* The protocol must be non-empty and terminated by a colon */
+        colon = strchr(url, ':');
+        if (!colon || colon == url)
+                return NULL;
+
+        /* Step over the colon and any number of slashes following it */
+        host = colon + 1;
+        while (*host == '/')
+                host++;
+
+        /* The host extends up to the next slash, query or fragment marker, or the end of the string */
+        host_len = strcspn(host, "/?#");
+
+        return host_len > 0 ? host + host_len : NULL;
+}
+
+int import_url_last_component(
+                const char *url,
+                char **ret) {
+
+        const char *path, *start, *end;
+        char *component;
+
+        /* Extracts the last path component of the specified URI, i.e. the last non-empty substring
+         * between two "/" characters. "Query" and "Fragment" suffixes (as per RFC3986) are ignored.
+         * 'ret' may be NULL, in which case the URL is only checked. */
+
+        path = skip_protocol_and_hostname(url);
+        if (!path)
+                return -EINVAL;
+
+        /* Cut off "Query" and "Fragment", then eat trailing slashes */
+        for (end = path + strcspn(path, "?#"); end > path && end[-1] == '/'; end--)
+                ;
+
+        /* Walk back to the beginning of the component before that */
+        for (start = end; start > path && start[-1] != '/'; start--)
+                ;
+
+        if (start >= end) /* Empty component? */
+                return -EADDRNOTAVAIL;
+
+        if (!ret)
+                return 0;
+
+        component = strndup(start, end - start);
+        if (!component)
+                return -ENOMEM;
+
+        *ret = component;
+        return 0;
+}
+
+int import_url_change_suffix(
+                const char *url,
+                size_t n_drop_components,
+                const char *suffix,
+                char **ret) {
+
+        const char *path, *end;
+        size_t keep;
+        char *result, *w;
+
+        assert(url);
+        assert(ret);
+
+        /* Drops the specified number of path components from the end of the URI (i.e. that many
+         * non-empty substrings between two "/" characters) and appends the specified suffix instead.
+         * The "Query" and "Fragment" suffixes are chopped off first and are *not* re-added to the
+         * result. n_drop_components may be 0, in which case the suffix is simply added to the end. The
+         * suffix may be NULL or empty, in which case nothing is appended and components are only
+         * chopped off. With n_drop_components == 0 and suffix == NULL, the "Query" and "Fragment" are
+         * removed and the URL is made to end in a single "/", and that's it. */
+
+        path = skip_protocol_and_hostname(url);
+        if (!path)
+                return -EINVAL;
+
+        end = path + strcspn(path, "?#"); /* Cut off "Query" and "Fragment" */
+
+        while (end > path && end[-1] == '/') /* Eat trailing slashes */
+                end--;
+
+        /* This is pretty lenient: if there are fewer components than requested we silently drop those
+         * that exist and then append the suffix to the top. */
+        for (; n_drop_components > 0; n_drop_components--) {
+                while (end > path && end[-1] != '/') /* Eat last word (we don't mind if empty) */
+                        end--;
+
+                while (end > path && end[-1] == '/') /* Eat slashes before the last word */
+                        end--;
+        }
+
+        /* Room for the retained prefix, one "/", the suffix and the trailing NUL */
+        keep = end - url;
+        result = new(char, keep + 1 + strlen_ptr(suffix) + 1);
+        if (!result)
+                return -ENOMEM;
+
+        w = mempcpy(result, url, keep);
+        w = stpcpy(w, "/");
+        strcpy(w, strempty(suffix));
+
+        *ret = result;
+        return 0;
+}
+
+static const char* const import_verify_table[_IMPORT_VERIFY_MAX] = { /* string name of each ImportVerify value, indexed by the enum value */
+ [IMPORT_VERIFY_NO] = "no",
+ [IMPORT_VERIFY_CHECKSUM] = "checksum",
+ [IMPORT_VERIFY_SIGNATURE] = "signature",
+};
+ /* Presumably expands to import_verify_to_string() and import_verify_from_string(), which import-util.h declares — confirm against string-table.h */
+DEFINE_STRING_TABLE_LOOKUP(import_verify, ImportVerify);
+
+int tar_strip_suffixes(const char *name, char **ret) {
+        static const char *const suffixes[] = {
+                ".tar",
+                ".tar.xz",
+                ".tar.gz",
+                ".tar.bz2",
+                ".tgz",
+        };
+        const char *end = NULL;
+        char *stripped;
+
+        /* Strips one of the known tarball suffixes from the name, if there is one. Fails with -EINVAL
+         * if nothing would remain. */
+
+        for (size_t i = 0; i < ELEMENTSOF(suffixes) && !end; i++)
+                end = endswith(name, suffixes[i]);
+
+        if (!end) /* No known suffix: keep the whole name */
+                end = strchr(name, 0);
+
+        if (end <= name)
+                return -EINVAL;
+
+        stripped = strndup(name, end - name);
+        if (!stripped)
+                return -ENOMEM;
+
+        *ret = stripped;
+        return 0;
+}
+
+int raw_strip_suffixes(const char *p, char **ret) {
+
+        static const char suffixes[] =
+                ".xz\0"
+                ".gz\0"
+                ".bz2\0"
+                ".sysext.raw\0"
+                ".confext.raw\0"
+                ".raw\0"
+                ".qcow2\0"
+                ".img\0"
+                ".bin\0";
+
+        _cleanup_free_ char *copy = NULL;
+        bool stripped;
+
+        /* Repeatedly strips the known compression and disk image suffixes from a copy of the name,
+         * until a full round over the suffix list removes nothing anymore. */
+
+        copy = strdup(p);
+        if (!copy)
+                return -ENOMEM;
+
+        do {
+                stripped = false;
+
+                NULSTR_FOREACH(sfx, suffixes) {
+                        char *tail = endswith(copy, sfx);
+
+                        if (!tail)
+                                continue;
+
+                        *tail = 0;
+                        stripped = true;
+                }
+        } while (stripped);
+
+        *ret = TAKE_PTR(copy);
+
+        return 0;
+}
+
+int import_assign_pool_quota_and_warn(const char *path) {
+        int r;
+
+        assert(path);
+
+        /* Sets up the default quota hierarchy for the path and logs the outcome. -ENOTTY is treated as
+         * "nothing to do" and only logged at debug level; any other failure is logged as an error and
+         * returned. */
+
+        r = btrfs_subvol_auto_qgroup(path, 0, true);
+        if (r > 0) {
+                log_debug("Set up default quota hierarchy for %s.", path);
+                return 0;
+        }
+        if (r == 0)
+                return 0;
+        if (r != -ENOTTY)
+                return log_error_errno(r, "Failed to set up default quota hierarchy for %s: %m", path);
+
+        log_debug_errno(r, "Failed to set up quota hierarchy for %s, as directory is not on btrfs or not a subvolume. Ignoring.", path);
+        return 0;
+}
+
+int import_set_nocow_and_log(int fd, const char *path) {
+        int r, level;
+
+        /* Tries to set FS_NOCOW_FL on the file descriptor. On failure the error is logged (at debug
+         * level if it is a "not supported" error, as a warning otherwise) and the value returned by the
+         * log call is passed on. */
+
+        r = chattr_fd(fd, FS_NOCOW_FL, FS_NOCOW_FL, NULL);
+        if (r >= 0)
+                return 0;
+
+        level = ERRNO_IS_NOT_SUPPORTED(r) ? LOG_DEBUG : LOG_WARNING;
+        return log_full_errno(level, r, "Failed to set file attributes on %s: %m", path);
+}
diff --git a/src/shared/import-util.h b/src/shared/import-util.h
new file mode 100644
index 0000000..3b2425b
--- /dev/null
+++ b/src/shared/import-util.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "macro.h"
+
+typedef enum ImportVerify { /* Verification mode; the string names live in import_verify_table in import-util.c */
+ IMPORT_VERIFY_NO, /* "no" */
+ IMPORT_VERIFY_CHECKSUM, /* "checksum" */
+ IMPORT_VERIFY_SIGNATURE, /* "signature" */
+ _IMPORT_VERIFY_MAX, /* number of valid values; used as the size of import_verify_table */
+ _IMPORT_VERIFY_INVALID = -EINVAL,
+} ImportVerify;
+
+int import_url_last_component(const char *url, char **ret);
+
+int import_url_change_suffix(const char *url, size_t n_drop_components, const char *suffix, char **ret);
+
+static inline int import_url_change_last_component(const char *url, const char *suffix, char **ret) { /* Replaces the last path component: drops one component, then appends the suffix */
+ return import_url_change_suffix(url, 1, suffix, ret);
+}
+
+static inline int import_url_append_component(const char *url, const char *suffix, char **ret) { /* Appends the suffix without dropping any path component */
+ return import_url_change_suffix(url, 0, suffix, ret);
+}
+
+const char* import_verify_to_string(ImportVerify v) _const_;
+ImportVerify import_verify_from_string(const char *s) _pure_;
+
+int tar_strip_suffixes(const char *name, char **ret);
+int raw_strip_suffixes(const char *name, char **ret);
+
+int import_assign_pool_quota_and_warn(const char *path);
+
+int import_set_nocow_and_log(int fd, const char *path);
diff --git a/src/shared/in-addr-prefix-util.c b/src/shared/in-addr-prefix-util.c
new file mode 100644
index 0000000..7c0033d
--- /dev/null
+++ b/src/shared/in-addr-prefix-util.c
@@ -0,0 +1,325 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "extract-word.h"
+#include "hostname-util.h"
+#include "in-addr-prefix-util.h"
+#include "string-util.h"
+
+/* 0.0.0.0/0, the IPv4 half of the "any" keyword (address and prefixlen are left zero-initialized) */
+#define IN_ADDR_PREFIX_IPV4_ANY ((struct in_addr_prefix) { .family = AF_INET })
+/* ::/0, the IPv6 half of the "any" keyword (address and prefixlen are left zero-initialized) */
+#define IN_ADDR_PREFIX_IPV6_ANY ((struct in_addr_prefix) { .family = AF_INET6 })
+/* 127.0.0.0/8, the IPv4 half of the "localhost" keyword */
+#define IN_ADDR_PREFIX_IPV4_LOCALHOST \
+ ((struct in_addr_prefix) { \
+ .family = AF_INET, \
+ .address.in.s_addr = htobe32(UINT32_C(127) << 24), \
+ .prefixlen = 8, \
+ })
+/* ::1/128, the IPv6 half of the "localhost" keyword */
+#define IN_ADDR_PREFIX_IPV6_LOCALHOST \
+ ((struct in_addr_prefix) { \
+ .family = AF_INET6, \
+ .address.in6 = IN6ADDR_LOOPBACK_INIT, \
+ .prefixlen = 128, \
+ })
+/* 169.254.0.0/16, the IPv4 half of the "link-local" keyword */
+#define IN_ADDR_PREFIX_IPV4_LINKLOCAL \
+ ((struct in_addr_prefix) { \
+ .family = AF_INET, \
+ .address.in.s_addr = htobe32((UINT32_C(169) << 24) | \
+ (UINT32_C(254) << 16)), \
+ .prefixlen = 16, \
+ })
+/* fe80::/64, the IPv6 half of the "link-local" keyword */
+#define IN_ADDR_PREFIX_IPV6_LINKLOCAL \
+ ((struct in_addr_prefix) { \
+ .family = AF_INET6, \
+ .address.in6.s6_addr[0] = 0xfe, \
+ .address.in6.s6_addr[1] = 0x80, \
+ .prefixlen = 64, \
+ })
+/* 224.0.0.0/4, the IPv4 half of the "multicast" keyword */
+#define IN_ADDR_PREFIX_IPV4_MULTICAST \
+ ((struct in_addr_prefix) { \
+ .family = AF_INET, \
+ .address.in.s_addr = htobe32((UINT32_C(224) << 24)), \
+ .prefixlen = 4, \
+ })
+/* ff00::/8, the IPv6 half of the "multicast" keyword */
+#define IN_ADDR_PREFIX_IPV6_MULTICAST \
+ ((struct in_addr_prefix) { \
+ .family = AF_INET6, \
+ .address.in6.s6_addr[0] = 0xff, \
+ .prefixlen = 8, \
+ })
+
+static void in_addr_prefix_hash_func(const struct in_addr_prefix *a, struct siphash *state) {
+        size_t address_size;
+
+        assert(a);
+        assert(state);
+
+        /* Feeds family, prefix length and the family-sized part of the address into the hash state */
+        address_size = FAMILY_ADDRESS_SIZE(a->family);
+
+        siphash24_compress(&a->family, sizeof(a->family), state);
+        siphash24_compress(&a->prefixlen, sizeof(a->prefixlen), state);
+        siphash24_compress(&a->address, address_size, state);
+}
+
+static int in_addr_prefix_compare_func(const struct in_addr_prefix *x, const struct in_addr_prefix *y) {
+        int r;
+
+        assert(x);
+        assert(y);
+
+        /* Orders by family first, then by prefix length, then by the family-sized part of the address */
+        r = CMP(x->family, y->family);
+        if (r == 0)
+                r = CMP(x->prefixlen, y->prefixlen);
+        if (r == 0)
+                r = memcmp(&x->address, &y->address, FAMILY_ADDRESS_SIZE(x->family));
+
+        return r;
+}
+
+DEFINE_HASH_OPS(in_addr_prefix_hash_ops, struct in_addr_prefix, in_addr_prefix_hash_func, in_addr_prefix_compare_func); /* hash ops without a key destructor */
+DEFINE_HASH_OPS_WITH_KEY_DESTRUCTOR(in_addr_prefix_hash_ops_free, struct in_addr_prefix, in_addr_prefix_hash_func, in_addr_prefix_compare_func, free); /* same functions, with free() as key destructor; used by in_addr_prefix_add() */
+
+int in_addr_prefix_add(Set **prefixes, const struct in_addr_prefix *prefix) {
+        struct in_addr_prefix *masked;
+
+        assert(prefixes);
+        assert(prefix);
+        assert(IN_SET(prefix->family, AF_INET, AF_INET6));
+
+        /* Stores a heap-allocated copy of the prefix in the set, allocating the set if necessary. The
+         * address of the copy is passed through in_addr_mask() with the prefix length first. */
+
+        masked = newdup(struct in_addr_prefix, prefix, 1);
+        if (!masked)
+                return -ENOMEM;
+
+        (void) in_addr_mask(masked->family, &masked->address, masked->prefixlen);
+
+        /* The copy is handed over to set_ensure_consume() */
+        return set_ensure_consume(prefixes, &in_addr_prefix_hash_ops_free, masked);
+}
+
+int in_addr_prefixes_reduce(Set *prefixes) {
+ uint32_t ipv4_prefixlen_bits = 0;
+ uint64_t ipv6_prefixlen_bits[128 / sizeof(uint64_t)] = {};
+ uint8_t ipv4_prefixlens[32] = {}, ipv6_prefixlens[128] = {};
+ bool ipv4_has_any = false, ipv6_has_any = false;
+ size_t ipv4_n_prefixlens = 0, ipv6_n_prefixlens = 0;
+ struct in_addr_prefix *p;
+ /* Removes from the set every prefix that is already covered by another entry of the same family:
+ * either by a /0 prefix, or by a shorter prefix that contains it. Always returns 0.
+ *
+ * First pass: record per family which prefix lengths occur at all (one bit per length) and
+ * whether a /0 prefix is present. */
+ SET_FOREACH(p, prefixes)
+ switch (p->family) {
+ case AF_INET:
+ assert(p->prefixlen <= 32);
+ if (p->prefixlen == 0)
+ ipv4_has_any = true;
+ else
+ ipv4_prefixlen_bits |= UINT32_C(1) << (p->prefixlen - 1); /* bit N-1 is set if some /N prefix exists */
+ break;
+ case AF_INET6:
+ assert(p->prefixlen <= 128);
+ if (p->prefixlen == 0)
+ ipv6_has_any = true;
+ else /* NOTE(review): index and shift use sizeof(uint64_t) (a byte count) rather than the bit width, so only the low sizeof(uint64_t) bits of each word are used; the lookup further down uses the same arithmetic, so both sides agree */
+ ipv6_prefixlen_bits[(p->prefixlen - 1) / sizeof(uint64_t)] |=
+ UINT64_C(1) << ((p->prefixlen - 1) % sizeof(uint64_t));
+ break;
+ default:
+ assert_not_reached();
+ }
+ /* Turn the bitmaps into ascending lists of the prefix lengths that occur. Not needed for a family that has a /0 prefix, as that already covers everything else */
+ if (!ipv4_has_any)
+ for (size_t i = 0; i < 32; i++)
+ if (ipv4_prefixlen_bits & (UINT32_C(1) << i))
+ ipv4_prefixlens[ipv4_n_prefixlens++] = i + 1;
+
+ if (!ipv6_has_any)
+ for (size_t i = 0; i < 128; i++)
+ if (ipv6_prefixlen_bits[i / sizeof(uint64_t)] &
+ (UINT64_C(1) << (i % sizeof(uint64_t))))
+ ipv6_prefixlens[ipv6_n_prefixlens++] = i + 1;
+ /* Second pass: check each prefix against the shorter prefix lengths that occur in its family */
+ SET_FOREACH(p, prefixes) {
+ uint8_t *prefixlens;
+ bool covered;
+ size_t *n;
+ /* The /0 prefixes themselves are always kept */
+ if (p->prefixlen == 0)
+ continue;
+ /* Pick the per-family list; a prefix starts out as covered if its family has a /0 prefix */
+ switch (p->family) {
+ case AF_INET:
+ prefixlens = ipv4_prefixlens;
+ n = &ipv4_n_prefixlens;
+ covered = ipv4_has_any;
+ break;
+ case AF_INET6:
+ prefixlens = ipv6_prefixlens;
+ n = &ipv6_n_prefixlens;
+ covered = ipv6_has_any;
+ break;
+ default:
+ assert_not_reached();
+ }
+
+ for (size_t i = 0; i < *n; i++) {
+ struct in_addr_prefix tmp;
+
+ if (covered)
+ break;
+ /* The list is ascending, so none of the remaining candidates is shorter than this prefix */
+ if (prefixlens[i] >= p->prefixlen)
+ break;
+ /* Mask a copy of this prefix down to the candidate length and look the result up in the set */
+ tmp = *p;
+ tmp.prefixlen = prefixlens[i];
+ (void) in_addr_mask(tmp.family, &tmp.address, tmp.prefixlen);
+
+ covered = set_contains(prefixes, &tmp);
+ }
+ /* NOTE(review): this removes the entry currently being iterated; presumably SET_FOREACH tolerates that — confirm */
+ if (covered)
+ free(set_remove(prefixes, p));
+ }
+
+ return 0;
+}
+
+int in_addr_prefixes_merge(Set **dest, Set *src) {
+        struct in_addr_prefix *entry;
+
+        assert(dest);
+
+        /* Adds a copy of every prefix in 'src' to '*dest'; stops at the first failure and returns it */
+        SET_FOREACH(entry, src) {
+                int r = in_addr_prefix_add(dest, entry);
+
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+bool in_addr_prefixes_is_any(Set *prefixes) {
+        /* True only if both 0.0.0.0/0 and ::/0 are in the set */
+        if (!set_contains(prefixes, &IN_ADDR_PREFIX_IPV4_ANY))
+                return false;
+
+        return set_contains(prefixes, &IN_ADDR_PREFIX_IPV6_ANY);
+}
+
+/* Adds the IPv4 and/or IPv6 prefix of a keyword to the set, depending on which families 'ltype' permits. */
+static int in_addr_prefix_add_by_ltype(
+                Set **prefixes,
+                int ltype,
+                const struct in_addr_prefix *ipv4,
+                const struct in_addr_prefix *ipv6) {
+
+        int r;
+
+        if (ltype != AF_INET6) {
+                r = in_addr_prefix_add(prefixes, ipv4);
+                if (r < 0)
+                        return r;
+        }
+
+        if (ltype != AF_INET) {
+                r = in_addr_prefix_add(prefixes, ipv6);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+int config_parse_in_addr_prefixes(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        Set **prefixes = ASSERT_PTR(data);
+        int r;
+
+        assert(IN_SET(ltype, AF_UNSPEC, AF_INET, AF_INET6));
+
+        /* Parses a whitespace-separated list of address prefixes into the set pointed to by 'data'.
+         * 'ltype' restricts the address family (AF_UNSPEC permits both). Besides literal prefixes the
+         * keywords "any", "link-local" and "multicast" as well as names accepted by is_localhost() are
+         * understood. An empty value resets the set. Invalid words are logged and skipped; syntax
+         * errors from word extraction are logged and end parsing with 0; allocation failures are
+         * reported via log_oom(). */
+
+        if (isempty(rvalue)) {
+                *prefixes = set_free(*prefixes);
+                return 0;
+        }
+
+        for (const char *p = rvalue;;) {
+                _cleanup_free_ char *word = NULL;
+
+                r = extract_first_word(&p, &word, NULL, 0);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r, "Invalid syntax, ignoring: %s", rvalue);
+                        return 0;
+                }
+                if (r == 0)
+                        return 0;
+
+                if (streq(word, "any"))
+                        /* "any" is a shortcut for 0.0.0.0/0 and ::/0 */
+                        r = in_addr_prefix_add_by_ltype(
+                                        prefixes, ltype,
+                                        &IN_ADDR_PREFIX_IPV4_ANY,
+                                        &IN_ADDR_PREFIX_IPV6_ANY);
+
+                else if (is_localhost(word))
+                        /* "localhost" is a shortcut for 127.0.0.0/8 and ::1/128 */
+                        r = in_addr_prefix_add_by_ltype(
+                                        prefixes, ltype,
+                                        &IN_ADDR_PREFIX_IPV4_LOCALHOST,
+                                        &IN_ADDR_PREFIX_IPV6_LOCALHOST);
+
+                else if (streq(word, "link-local"))
+                        /* "link-local" is a shortcut for 169.254.0.0/16 and fe80::/64 */
+                        r = in_addr_prefix_add_by_ltype(
+                                        prefixes, ltype,
+                                        &IN_ADDR_PREFIX_IPV4_LINKLOCAL,
+                                        &IN_ADDR_PREFIX_IPV6_LINKLOCAL);
+
+                else if (streq(word, "multicast"))
+                        /* "multicast" is a shortcut for 224.0.0.0/4 and ff00::/8 */
+                        r = in_addr_prefix_add_by_ltype(
+                                        prefixes, ltype,
+                                        &IN_ADDR_PREFIX_IPV4_MULTICAST,
+                                        &IN_ADDR_PREFIX_IPV6_MULTICAST);
+
+                else {
+                        struct in_addr_prefix a;
+
+                        if (ltype == AF_UNSPEC)
+                                r = in_addr_prefix_from_string_auto(word, &a.family, &a.address, &a.prefixlen);
+                        else {
+                                a.family = ltype;
+                                r = in_addr_prefix_from_string(word, a.family, &a.address, &a.prefixlen);
+                        }
+                        if (r < 0) {
+                                log_syntax(unit, LOG_WARNING, filename, line, r,
+                                           "Invalid address prefix is specified in [%s] %s=, ignoring assignment: %s",
+                                           section, lvalue, word);
+                                continue;
+                        }
+
+                        r = in_addr_prefix_add(prefixes, &a);
+                }
+
+                /* As before, any failure to add an entry is reported as out-of-memory */
+                if (r < 0)
+                        return log_oom();
+        }
+}
diff --git a/src/shared/in-addr-prefix-util.h b/src/shared/in-addr-prefix-util.h
new file mode 100644
index 0000000..53aaad3
--- /dev/null
+++ b/src/shared/in-addr-prefix-util.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "conf-parser.h"
+#include "in-addr-util.h"
+#include "set.h"
+
+ /* An IP address together with a prefix length, i.e. one entry of the sets handled by this module. */
+ struct in_addr_prefix {
+ int family; /* address family of 'address'; the config parser stores its AF_* ltype (or the auto-detected family) here */
+ uint8_t prefixlen; /* prefix length, as filled in by in_addr_prefix_from_string*() */
+ union in_addr_union address;
+ };
+
+int in_addr_prefix_add(Set **prefixes, const struct in_addr_prefix *prefix);
+int in_addr_prefixes_reduce(Set *prefixes);
+int in_addr_prefixes_merge(Set **dest, Set *src);
+/* Returns true if a set contains the two items necessary for "any" (0.0.0.0/0 and ::/0). */
+bool in_addr_prefixes_is_any(Set *prefixes);
+
+extern const struct hash_ops in_addr_prefix_hash_ops;
+extern const struct hash_ops in_addr_prefix_hash_ops_free;
+
+CONFIG_PARSER_PROTOTYPE(config_parse_in_addr_prefixes);
diff --git a/src/shared/initreq.h b/src/shared/initreq.h
new file mode 100644
index 0000000..da9783c
--- /dev/null
+++ b/src/shared/initreq.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: LGPL-2.0-or-later */
+/*
+ * initreq.h Interface to talk to init through /dev/initctl.
+ *
+ * Copyright (C) 1995-2004 Miquel van Smoorenburg
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * Version: @(#)initreq.h 1.28 31-Mar-2004 MvS
+ */
+
+#pragma once
+
+#include <sys/param.h>
+
+#if defined(__FreeBSD_kernel__)
+# define INIT_FIFO "/etc/.initctl"
+#else
+# define INIT_FIFO "/dev/initctl"
+#endif
+
+#define INIT_MAGIC 0x03091969
+#define INIT_CMD_START 0
+#define INIT_CMD_RUNLVL 1
+#define INIT_CMD_POWERFAIL 2
+#define INIT_CMD_POWERFAILNOW 3
+#define INIT_CMD_POWEROK 4
+#define INIT_CMD_BSD 5
+#define INIT_CMD_SETENV 6
+#define INIT_CMD_UNSETENV 7
+
+#define INIT_CMD_CHANGECONS 12345
+
+#ifdef MAXHOSTNAMELEN
+# define INITRQ_HLEN MAXHOSTNAMELEN
+#else
+# define INITRQ_HLEN 64
+#endif
+
+/*
+ * This is what BSD 4.4 uses when talking to init.
+ * Linux doesn't use this right now.
+ */
+struct init_request_bsd {
+ char gen_id[8]; /* Beats me.. telnetd uses "fe" */
+ char tty_id[16]; /* Tty name minus /dev/tty */
+ char host[INITRQ_HLEN]; /* Hostname */
+ char term_type[16]; /* Terminal type */
+ int signal; /* Signal to send */
+ int pid; /* Process to send to */
+ char exec_name[128]; /* Program to execute */
+ char reserved[128]; /* For future expansion. */
+};
+
+/*
+ * Because of legacy interfaces, "runlevel" and "sleeptime"
+ * aren't in a separate struct in the union.
+ *
+ * The weird sizes are because init expects the whole
+ * struct to be 384 bytes.
+ */
+struct init_request {
+ int magic; /* Magic number */
+ int cmd; /* What kind of request */
+ int runlevel; /* Runlevel to change to */
+ int sleeptime; /* Time between TERM and KILL */
+ union {
+ struct init_request_bsd bsd;
+ char data[368];
+ } i;
+};
diff --git a/src/shared/install-file.c b/src/shared/install-file.c
new file mode 100644
index 0000000..3b4d651
--- /dev/null
+++ b/src/shared/install-file.c
@@ -0,0 +1,270 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/ioctl.h>
+
+#include "btrfs-util.h"
+#include "chattr-util.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "fs-util.h"
+#include "install-file.h"
+#include "missing_syscall.h"
+#include "rm-rf.h"
+#include "sync-util.h"
+
+ int fs_make_very_read_only(int fd) {
+         struct stat st;
+         int r;
+
+         assert(fd >= 0);
+
+         /* Makes the inode behind the fd as read-only as we can manage. This is intended for OS images
+          * (loopback files or whole directory trees); what is done depends on the inode type:
+          *
+          * - btrfs subvolume: the subvolume is flagged read-only as a whole
+          * - any other directory: FS_IMMUTABLE_FL is set on it
+          * - regular file: the w bits are dropped from the access mode
+          * - block device: BLKROSET is used to switch it to read-only
+          *
+          * The access mode of directories is deliberately left alone: the mode of an image's root dir is
+          * quite visible once the image is mounted, and we want the inside of the image to stay as it is.
+          * Subvolume flags and FS_IMMUTABLE_FL are far less visible. For regular files the mode change is
+          * fine, because the mode of a loopback file is not exposed by the file system stored inside it.
+          *
+          * Returns 0 on success, -EBADFD for any other inode type, or a negative errno-style error. */
+
+         if (fstat(fd, &st) < 0)
+                 return -errno;
+
+         if (S_ISDIR(st.st_mode)) {
+                 if (btrfs_might_be_subvol(&st)) {
+                         r = btrfs_subvol_set_read_only_fd(fd, true);
+                         if (r >= 0)
+                                 return 0;
+
+                         /* "Not supported" and EINVAL make us fall back to the immutable flag below */
+                         if (!ERRNO_IS_NOT_SUPPORTED(r) && r != -EINVAL)
+                                 return r;
+                 }
+
+                 r = chattr_fd(fd, FS_IMMUTABLE_FL, FS_IMMUTABLE_FL, NULL);
+                 return r < 0 ? r : 0;
+         }
+
+         if (S_ISREG(st.st_mode)) {
+                 /* Nothing to do if no w bit is set */
+                 if ((st.st_mode & 0222) == 0)
+                         return 0;
+
+                 if (fchmod(fd, st.st_mode & 07555) < 0)
+                         return -errno;
+
+                 return 0;
+         }
+
+         if (S_ISBLK(st.st_mode)) {
+                 int ro = 1;
+
+                 if (ioctl(fd, BLKROSET, &ro) < 0)
+                         return -errno;
+
+                 return 0;
+         }
+
+         return -EBADFD;
+ }
+
+ static int unlinkat_maybe_dir(int dirfd, const char *pathname) {
+
+         /* Removes 'pathname' relative to 'dirfd'. A plain unlinkat() is attempted first; only if that
+          * fails with EISDIR the call is repeated with AT_REMOVEDIR. Returns 0 or a negative errno. */
+
+         if (unlinkat(dirfd, pathname, 0) >= 0)
+                 return 0;
+
+         if (errno != EISDIR)
+                 return -errno;
+
+         if (unlinkat(dirfd, pathname, AT_REMOVEDIR) >= 0)
+                 return 0;
+
+         return -errno;
+ }
+
+ /* Moves source_name (relative to source_atfd) to target_name (relative to target_atfd), optionally syncing
+ * the inode before the move, marking it read-only after the move and syncing the parent directory at the
+ * end, as selected by flags. Returns 0 on success or a negative errno-style error code. */
+ int install_file(int source_atfd, const char *source_name,
+ int target_atfd, const char *target_name,
+ InstallFileFlags flags) {
+
+ _cleanup_close_ int rofd = -EBADF;
+ int r;
+
+ /* Moves a file or directory tree into place, with some bells and whistles:
+ *
+ * 1. Optionally syncs before/after to ensure file installation can be used as barrier
+ * 2. Optionally marks the file/directory read-only using fs_make_very_read_only()
+ * 3. Optionally operates in replacing or in non-replacing mode.
+ * 4. If it replaces will remove the old tree if needed.
+ */
+
+ assert(source_atfd >= 0 || source_atfd == AT_FDCWD);
+ assert(source_name);
+ assert(target_atfd >= 0 || target_atfd == AT_FDCWD);
+
+ /* If target_name is specified as NULL no renaming takes place. Instead it is assumed the file is
+ * already in place, and only the syncing/read-only marking shall be applied. Note that with
+ * target_name=NULL and flags=0 this call is a NOP */
+
+ if ((flags & (INSTALL_FSYNC|INSTALL_FSYNC_FULL|INSTALL_SYNCFS|INSTALL_READ_ONLY)) != 0) {
+ _cleanup_close_ int pfd = -EBADF;
+ struct stat st;
+
+ /* Open an O_PATH fd for the source if we need to sync things or mark things read only. */
+
+ pfd = openat(source_atfd, source_name, O_PATH|O_CLOEXEC|O_NOFOLLOW);
+ if (pfd < 0)
+ return -errno;
+
+ if (fstat(pfd, &st) < 0)
+ return -errno;
+
+ switch (st.st_mode & S_IFMT) {
+
+ case S_IFREG: {
+ _cleanup_close_ int regfd = -EBADF;
+
+ regfd = fd_reopen(pfd, O_RDONLY|O_CLOEXEC);
+ if (regfd < 0)
+ return regfd;
+
+ if ((flags & (INSTALL_FSYNC_FULL|INSTALL_SYNCFS)) != 0) {
+ /* If this is just a regular file (as opposed to a fully populated directory)
+ * let's downgrade INSTALL_SYNCFS to INSTALL_FSYNC_FULL, after all this is
+ * going to be a single inode we install */
+ r = fsync_full(regfd);
+ if (r < 0)
+ return r;
+ } else if (flags & INSTALL_FSYNC) {
+ if (fsync(regfd) < 0)
+ return -errno;
+ }
+
+ /* Keep the fd around, so that the read-only marking can be applied after the rename */
+ if (flags & INSTALL_READ_ONLY)
+ rofd = TAKE_FD(regfd);
+
+ break;
+ }
+
+ case S_IFDIR: {
+ _cleanup_close_ int dfd = -EBADF;
+
+ dfd = fd_reopen(pfd, O_RDONLY|O_DIRECTORY|O_CLOEXEC);
+ if (dfd < 0)
+ return dfd;
+
+ /* Strongest requested sync level wins: syncfs() > fsync_full() > fsync() */
+ if (flags & INSTALL_SYNCFS) {
+ if (syncfs(dfd) < 0)
+ return -errno;
+ } else if (flags & INSTALL_FSYNC_FULL) {
+ r = fsync_full(dfd);
+ if (r < 0)
+ return r;
+ } else if (flags & INSTALL_FSYNC) {
+ if (fsync(dfd) < 0)
+ return -errno;
+ }
+
+ /* Keep the fd around, so that the read-only marking can be applied after the rename */
+ if (flags & INSTALL_READ_ONLY)
+ rofd = TAKE_FD(dfd);
+
+ break;
+ }
+
+ default:
+ /* Other inodes: char/block device inodes, fifos, symlinks, sockets don't need
+ * syncing themselves, as they only exist in the directory, and have no contents on
+ * disk */
+
+ if (target_name && (flags & (INSTALL_FSYNC_FULL|INSTALL_SYNCFS)) != 0) {
+ r = fsync_directory_of_file(pfd);
+ if (r < 0)
+ return r;
+ }
+
+ break;
+ }
+ }
+
+ if (target_name) {
+ /* Rename the file */
+
+ if (flags & INSTALL_REPLACE) {
+ /* First, try a simple renameat(), maybe that's enough */
+ if (renameat(source_atfd, source_name, target_atfd, target_name) < 0) {
+ _cleanup_close_ int dfd = -EBADF;
+
+ if (!IN_SET(errno, EEXIST, ENOTDIR, ENOTEMPTY, EISDIR, EBUSY))
+ return -errno;
+
+ /* Hmm, the target apparently existed already. Let's try to use
+ * RENAME_EXCHANGE. But let's first open the inode if it's a directory, so
+ * that we can later remove its contents if it's a directory. Why do this
+ * before the rename()? Mostly because if we have trouble opening the thing
+ * we want to know before we start actually modifying the file system. */
+
+ dfd = openat(target_atfd, target_name, O_RDONLY|O_DIRECTORY|O_CLOEXEC, 0);
+ if (dfd < 0 && errno != ENOTDIR)
+ return -errno;
+
+ if (renameat2(source_atfd, source_name, target_atfd, target_name, RENAME_EXCHANGE) < 0) {
+
+ if (!ERRNO_IS_NOT_SUPPORTED(errno) && errno != EINVAL)
+ return -errno;
+
+ /* The exchange didn't work, let's remove the target first, and try again */
+
+ if (dfd >= 0)
+ (void) rm_rf_children(TAKE_FD(dfd), REMOVE_PHYSICAL|REMOVE_SUBVOLUME|REMOVE_CHMOD, NULL);
+
+ r = unlinkat_maybe_dir(target_atfd, target_name);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to remove target directory: %m");
+
+ if (renameat(source_atfd, source_name, target_atfd, target_name) < 0)
+ return -errno;
+ } else {
+ /* The exchange worked, hence let's remove the source (i.e. the old target) */
+ if (dfd >= 0)
+ (void) rm_rf_children(TAKE_FD(dfd), REMOVE_PHYSICAL|REMOVE_SUBVOLUME|REMOVE_CHMOD, NULL);
+
+ r = unlinkat_maybe_dir(source_atfd, source_name);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to remove replaced target directory: %m");
+ }
+ }
+ } else {
+ r = rename_noreplace(source_atfd, source_name, target_atfd, target_name);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ /* rofd is only set if INSTALL_READ_ONLY was requested for a regular file or directory */
+ if (rofd >= 0) {
+ r = fs_make_very_read_only(rofd);
+ if (r < 0)
+ return r;
+ }
+
+ /* Finally sync the directory the inode now lives in: the target's parent if we renamed, otherwise
+ * the source's parent */
+ if ((flags & (INSTALL_FSYNC_FULL|INSTALL_SYNCFS)) != 0) {
+ if (target_name)
+ r = fsync_parent_at(target_atfd, target_name);
+ else
+ r = fsync_parent_at(source_atfd, source_name);
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+ }
diff --git a/src/shared/install-file.h b/src/shared/install-file.h
new file mode 100644
index 0000000..c37254f
--- /dev/null
+++ b/src/shared/install-file.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+int fs_make_very_read_only(int fd);
+
+ /* Bit flags accepted by install_file(); they may be OR-ed together. */
+ typedef enum InstallFileFlags {
+ INSTALL_REPLACE = 1 << 0, /* Replace an existing inode */
+ INSTALL_READ_ONLY = 1 << 1, /* Call fs_make_very_read_only() to make the inode comprehensively read-only */
+ INSTALL_FSYNC = 1 << 2, /* fsync() file contents before moving file in */
+ INSTALL_FSYNC_FULL = 1 << 3, /* like INSTALL_FSYNC, but also fsync() parent dir before+after moving file in */
+ INSTALL_SYNCFS = 1 << 4, /* syncfs() before moving file in, fsync() parent dir after moving file in */
+ } InstallFileFlags;
+
+int install_file(int source_atfd, const char *source_name, int target_atfd, const char *target_name, InstallFileFlags flags);
diff --git a/src/shared/install-printf.c b/src/shared/install-printf.c
new file mode 100644
index 0000000..3cc7093
--- /dev/null
+++ b/src/shared/install-printf.c
@@ -0,0 +1,125 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include "format-util.h"
+#include "install-printf.h"
+#include "install.h"
+#include "macro.h"
+#include "specifier.h"
+#include "string-util.h"
+#include "unit-name.h"
+#include "user-util.h"
+
+ static int specifier_prefix_and_instance(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+         const InstallInfo *info = ASSERT_PTR(userdata);
+         _cleanup_free_ char *prefix = NULL;
+         char *joined;
+         int r;
+
+         /* Resolves to the "prefix and instance" part of the unit name. If that part ends in "@" and a
+          * default instance is configured, the default instance is appended. */
+
+         r = unit_name_to_prefix_and_instance(info->name, &prefix);
+         if (r < 0)
+                 return r;
+
+         if (!endswith(prefix, "@") || !info->default_instance) {
+                 *ret = TAKE_PTR(prefix);
+                 return 0;
+         }
+
+         joined = strjoin(prefix, info->default_instance);
+         if (!joined)
+                 return -ENOMEM;
+
+         *ret = joined;
+         return 0;
+ }
+
+ static int specifier_name(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+         const InstallInfo *info = ASSERT_PTR(userdata);
+         char *copy;
+
+         /* Resolves to the unit name. For a template name with a configured default instance, the
+          * default instance is filled in; otherwise the name is copied verbatim. */
+
+         if (unit_name_is_valid(info->name, UNIT_NAME_TEMPLATE) && info->default_instance)
+                 return unit_name_replace_instance(info->name, info->default_instance, ret);
+
+         copy = strdup(info->name);
+         if (copy) {
+                 *ret = copy;
+                 return 0;
+         }
+
+         return -ENOMEM;
+ }
+
+ static int specifier_prefix(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+         const InstallInfo *info = ASSERT_PTR(userdata);
+         const char *unit = info->name;
+
+         /* Resolves to the prefix of the unit name, as computed by unit_name_to_prefix(). */
+         return unit_name_to_prefix(unit, ret);
+ }
+
+ /* Resolves to the instance part of the unit name. If the name carries no (or an empty) instance, the
+  * configured default instance is used instead, or the empty string if there is none.
+  *
+  * On success stores a newly allocated string in *ret (owned by the caller) and returns 0; on failure
+  * returns a negative errno-style error code and leaves *ret untouched. */
+ static int specifier_instance(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+         const InstallInfo *i = ASSERT_PTR(userdata);
+         /* Owned here until handed over to the caller, so that the error path below cannot leak it */
+         _cleanup_free_ char *instance = NULL;
+         int r;
+
+         r = unit_name_to_instance(i->name, &instance);
+         if (r < 0)
+                 return r;
+
+         if (isempty(instance)) {
+                 /* free_and_strdup() leaves the old string in place when it fails, hence 'instance'
+                  * must still be released on this return path. */
+                 r = free_and_strdup(&instance, strempty(i->default_instance));
+                 if (r < 0)
+                         return r;
+         }
+
+         *ret = TAKE_PTR(instance);
+         return 0;
+ }
+
+ static int specifier_last_component(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+         _cleanup_free_ char *prefix = NULL;
+         const char *sep;
+         char *tail;
+         int r;
+
+         assert(ret);
+
+         /* Resolves to whatever follows the last '-' of the unit name's prefix, or to the whole prefix
+          * if it contains no dash at all. */
+
+         r = specifier_prefix(specifier, data, root, userdata, &prefix);
+         if (r < 0)
+                 return r;
+
+         sep = strrchr(prefix, '-');
+         if (!sep) {
+                 *ret = TAKE_PTR(prefix);
+                 return 0;
+         }
+
+         tail = strdup(sep + 1);
+         if (!tail)
+                 return -ENOMEM;
+
+         *ret = tail;
+         return 0;
+ }
+
+ /* Expands the specifiers in 'format' for the unit described by 'info' and stores the result in *ret.
+ * The expansion is limited to UNIT_NAME_MAX and uses info->root as root. Returns whatever
+ * specifier_printf() returns. */
+ int install_name_printf(
+ RuntimeScope scope,
+ const InstallInfo *info,
+ const char *format,
+ char **ret) {
+ /* This is similar to unit_name_printf() */
+
+ /* Unit-specific specifiers, all resolved from 'info' which is passed as userdata below:
+ * %i instance, %j last dash-separated component of the prefix, %n unit name,
+ * %N prefix and instance, %p prefix. */
+ const Specifier table[] = {
+ { 'i', specifier_instance, NULL },
+ { 'j', specifier_last_component, NULL },
+ { 'n', specifier_name, NULL },
+ { 'N', specifier_prefix_and_instance, NULL },
+ { 'p', specifier_prefix, NULL },
+
+ COMMON_SYSTEM_SPECIFIERS,
+
+ COMMON_CREDS_SPECIFIERS(scope),
+ {}
+ };
+
+ assert(info);
+ assert(format);
+ assert(ret);
+
+ return specifier_printf(format, UNIT_NAME_MAX, table, info->root, info, ret);
+ }
diff --git a/src/shared/install-printf.h b/src/shared/install-printf.h
new file mode 100644
index 0000000..8c7842b
--- /dev/null
+++ b/src/shared/install-printf.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "install.h"
+#include "unit-name.h"
+
+int install_name_printf(
+ RuntimeScope scope,
+ const InstallInfo *info,
+ const char *format,
+ char **ret);
diff --git a/src/shared/install.c b/src/shared/install.c
new file mode 100644
index 0000000..0f4dab4
--- /dev/null
+++ b/src/shared/install.c
@@ -0,0 +1,3760 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <fnmatch.h>
+#include <limits.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "chase.h"
+#include "conf-files.h"
+#include "conf-parser.h"
+#include "constants.h"
+#include "dirent-util.h"
+#include "errno-list.h"
+#include "extract-word.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "fs-util.h"
+#include "hashmap.h"
+#include "install-printf.h"
+#include "install.h"
+#include "locale-util.h"
+#include "log.h"
+#include "macro.h"
+#include "mkdir-label.h"
+#include "path-lookup.h"
+#include "path-util.h"
+#include "rm-rf.h"
+#include "set.h"
+#include "special.h"
+#include "stat-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "strv.h"
+#include "unit-file.h"
+
+#define UNIT_FILE_FOLLOW_SYMLINK_MAX 64
+
+ /* Bit flags (may be OR-ed together). NOTE(review): the users of these flags are outside of this
+ * excerpt; going by the names they select how unit files are searched/loaded — confirm at the call
+ * sites. */
+ typedef enum SearchFlags {
+ SEARCH_LOAD = 1 << 0,
+ SEARCH_FOLLOW_CONFIG_SYMLINKS = 1 << 1,
+ SEARCH_DROPIN = 1 << 2,
+ } SearchFlags;
+
+ /* State shared across one install operation. NOTE(review): presumably will_process holds the units
+ * still queued and have_processed those already handled — confirm against the code using it. */
+ typedef struct {
+ RuntimeScope scope;
+ OrderedHashmap *will_process;
+ OrderedHashmap *have_processed;
+ } InstallContext;
+
+ /* One preset rule. 'pattern' and 'instances' are owned by the rule: unit_file_preset_rule_done()
+ * releases them with free() and strv_free() respectively. */
+ struct UnitFilePresetRule {
+ char *pattern;
+ PresetAction action;
+ char **instances;
+ };
+
+ /* Maps each PresetAction to a human-readable string. NB! strings use past tense. */
+ static const char *const preset_action_past_tense_table[_PRESET_ACTION_MAX] = {
+ [PRESET_UNKNOWN] = "unknown",
+ [PRESET_ENABLE] = "enabled",
+ [PRESET_DISABLE] = "disabled",
+ [PRESET_IGNORE] = "ignored",
+ };
+
+ /* NOTE(review): presumably generates the to-string lookup function for the table above — confirm
+ * against the macro definition. */
+ DEFINE_STRING_TABLE_LOOKUP_TO_STRING(preset_action_past_tense, PresetAction);
+
+ static bool install_info_has_rules(const InstallInfo *i) {
+         assert(i);
+
+         /* True if at least one of the aliases/wanted_by/required_by/upheld_by lists is non-empty. */
+
+         if (!strv_isempty(i->aliases))
+                 return true;
+
+         if (!strv_isempty(i->wanted_by))
+                 return true;
+
+         if (!strv_isempty(i->required_by))
+                 return true;
+
+         return !strv_isempty(i->upheld_by);
+ }
+
+ static bool install_info_has_also(const InstallInfo *i) {
+         bool empty;
+
+         assert(i);
+
+         /* True if the 'also' list has at least one entry. */
+         empty = strv_isempty(i->also);
+         return !empty;
+ }
+
+ static void unit_file_preset_rule_done(UnitFilePresetRule *rule) {
+         assert(rule);
+
+         /* Releases the memory owned by the rule; the rule structure itself is not freed. */
+         strv_free(rule->instances);
+         free(rule->pattern);
+ }
+
+ /* Releases everything owned by the preset list: each rule's contents and the rule array itself. The
+  * UnitFilePresets structure is not freed, but it is left empty (NULL array, zero rules), so that
+  * calling this function again on the same object is harmless. Accepts NULL as a no-op. */
+ void unit_file_presets_done(UnitFilePresets *p) {
+         if (!p)
+                 return;
+
+         FOREACH_ARRAY(rule, p->rules, p->n_rules)
+                 unit_file_preset_rule_done(rule);
+
+         /* Reset the pointer along with the counter, so that no dangling pointer stays behind */
+         p->rules = mfree(p->rules);
+         p->n_rules = 0;
+ }
+
+ /* Maps each InstallMode to its string name. */
+ static const char *const install_mode_table[_INSTALL_MODE_MAX] = {
+ [INSTALL_MODE_REGULAR] = "regular",
+ [INSTALL_MODE_LINKED] = "linked",
+ [INSTALL_MODE_ALIAS] = "alias",
+ [INSTALL_MODE_MASKED] = "masked",
+ };
+
+ /* NOTE(review): presumably generates a file-local to-string lookup function for the table above —
+ * confirm against the macro definition. */
+ DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(install_mode, InstallMode);
+
+ static int in_search_path(const LookupPaths *lp, const char *path) {
+         _cleanup_free_ char *dir = NULL;
+         int r = path_extract_directory(ASSERT_PTR(path), &dir);
+
+         /* Tells whether the directory containing 'path' is one of the entries of lp->search_path.
+          * Returns a negative error if the directory cannot be extracted. */
+
+         if (r < 0)
+                 return r;
+
+         return path_strv_contains(ASSERT_PTR(lp)->search_path, dir);
+ }
+
+ static int underneath_search_path(const LookupPaths *lp, const char *path) {
+         /* Returns 1 if 'path' has one of the lp->search_path entries as prefix, 0 otherwise. */
+
+         return path_startswith_strv(ASSERT_PTR(path), ASSERT_PTR(lp)->search_path) ? 1 : 0;
+ }
+
+ static const char* skip_root(const char *root_dir, const char *path) {
+         const char *rest;
+
+         assert(path);
+
+         /* Strips the 'root_dir' prefix off 'path'. Without a root directory the path is returned as is.
+          * Returns NULL if the path is not below the root directory. The result points into 'path' and
+          * always begins with a slash. */
+
+         if (!root_dir)
+                 return path;
+
+         rest = path_startswith(path, root_dir);
+         if (!rest)
+                 return NULL;
+
+         if (rest[0] == '/')
+                 return rest;
+
+         /* The remainder lacks a leading slash: step back onto the separator in front of it, if any */
+         if (rest != path && rest[-1] == '/')
+                 return rest - 1;
+
+         return NULL;
+ }
+
+ static int path_is_generator(const LookupPaths *lp, const char *path) {
+         _cleanup_free_ char *dir = NULL;
+         int r;
+
+         assert(lp);
+         assert(path);
+
+         /* Tells whether the directory containing 'path' is one of the three generator directories.
+          * Returns a negative error if the directory cannot be extracted. */
+
+         r = path_extract_directory(path, &dir);
+         if (r < 0)
+                 return r;
+
+         if (path_equal_ptr(dir, lp->generator))
+                 return true;
+
+         if (path_equal_ptr(dir, lp->generator_early))
+                 return true;
+
+         return path_equal_ptr(dir, lp->generator_late);
+ }
+
+ static int path_is_transient(const LookupPaths *lp, const char *path) {
+         _cleanup_free_ char *dir = NULL;
+         int r;
+
+         assert(lp);
+         assert(path);
+
+         /* Tells whether the directory containing 'path' is the transient unit directory. Returns a
+          * negative error if the directory cannot be extracted. */
+
+         r = path_extract_directory(path, &dir);
+         return r < 0 ? r : path_equal_ptr(dir, lp->transient);
+ }
+
+ static int path_is_control(const LookupPaths *lp, const char *path) {
+         _cleanup_free_ char *dir = NULL;
+         int r;
+
+         assert(lp);
+         assert(path);
+
+         /* Tells whether the directory containing 'path' is the persistent or the runtime control
+          * directory. Returns a negative error if the directory cannot be extracted. */
+
+         r = path_extract_directory(path, &dir);
+         if (r < 0)
+                 return r;
+
+         if (path_equal_ptr(dir, lp->persistent_control))
+                 return true;
+
+         return path_equal_ptr(dir, lp->runtime_control);
+ }
+
+ static int path_is_config(const LookupPaths *lp, const char *path, bool check_parent) {
+         _cleanup_free_ char *parent = NULL;
+         const char *candidate;
+         int r;
+
+         assert(lp);
+         assert(path);
+
+         /* Tells whether 'path' (or, with check_parent, the directory containing it) is the persistent
+          * or the runtime configuration directory.
+          *
+          * Note that we do *not* have generic checks for /etc or /run in place, since with
+          * them we couldn't discern configuration from transient or generated units */
+
+         candidate = path;
+
+         if (check_parent) {
+                 r = path_extract_directory(path, &parent);
+                 if (r < 0)
+                         return r;
+
+                 candidate = parent;
+         }
+
+         if (path_equal_ptr(candidate, lp->persistent_config))
+                 return true;
+
+         return path_equal_ptr(candidate, lp->runtime_config);
+ }
+
+ static int path_is_runtime(const LookupPaths *lp, const char *path, bool check_parent) {
+         _cleanup_free_ char *parent = NULL;
+         const char *stripped, *candidate;
+         int r;
+
+         assert(lp);
+         assert(path);
+
+         /* Everything in /run is considered runtime. On top of that we also add
+          * explicit checks for the various runtime directories, as safety net. */
+
+         stripped = skip_root(lp->root_dir, path);
+         if (stripped && path_startswith(stripped, "/run"))
+                 return true;
+
+         candidate = path;
+
+         if (check_parent) {
+                 r = path_extract_directory(path, &parent);
+                 if (r < 0)
+                         return r;
+
+                 candidate = parent;
+         }
+
+         /* The known runtime directories, compared one by one against the path (or its parent) */
+         const char *runtime_dirs[] = {
+                 lp->runtime_config,
+                 lp->generator,
+                 lp->generator_early,
+                 lp->generator_late,
+                 lp->transient,
+                 lp->runtime_control,
+         };
+
+         for (size_t k = 0; k < ELEMENTSOF(runtime_dirs); k++)
+                 if (path_equal_ptr(candidate, runtime_dirs[k]))
+                         return true;
+
+         return false;
+ }
+
+ static int path_is_vendor_or_generator(const LookupPaths *lp, const char *path) {
+         const char *stripped;
+
+         assert(lp);
+         assert(path);
+
+         /* Tells whether 'path', with the root directory prefix removed, is below /usr, is located in a
+          * generator directory, or equals SYSTEM_DATA_UNIT_DIR. Paths outside of the root directory
+          * yield 0. Note that any non-zero result of path_is_generator(), including a negative error,
+          * counts as a match here. */
+
+         stripped = skip_root(lp->root_dir, path);
+         if (!stripped)
+                 return 0;
+
+         if (path_startswith(stripped, "/usr") || path_is_generator(lp, stripped))
+                 return true;
+
+         return path_equal(stripped, SYSTEM_DATA_UNIT_DIR);
+ }
+
+ static const char* config_path_from_flags(const LookupPaths *lp, UnitFileFlags flags) {
+         bool runtime, portable;
+
+         assert(lp);
+
+         /* Picks the directory to operate on: UNIT_FILE_PORTABLE selects the "attached" directories
+          * over the "config" ones, UNIT_FILE_RUNTIME the runtime variant over the persistent one. */
+
+         runtime = FLAGS_SET(flags, UNIT_FILE_RUNTIME);
+         portable = FLAGS_SET(flags, UNIT_FILE_PORTABLE);
+
+         if (portable)
+                 return runtime ? lp->runtime_attached : lp->persistent_attached;
+
+         return runtime ? lp->runtime_config : lp->persistent_config;
+ }
+
+ InstallChangeType install_changes_add(
+                 InstallChange **changes,
+                 size_t *n_changes,
+                 InstallChangeType type, /* INSTALL_CHANGE_SYMLINK, _UNLINK, _IS_MASKED, _IS_DANGLING, … if positive or errno if negative */
+                 const char *path,
+                 const char *source) {
+
+         _cleanup_free_ char *path_copy = NULL, *source_copy = NULL;
+         InstallChange *grown;
+         int r;
+
+         assert(!changes == !n_changes);
+         assert(INSTALL_CHANGE_TYPE_VALID(type));
+
+         /* Message formatting requires <path> to be set. */
+         assert(path);
+
+         /* Appends a change or error record to the array. The value returned is normally 'type' itself,
+          * i.e. possibly the error that was passed in, but may also be an error generated here, such as
+          * -ENOMEM. Without an array to append to, 'type' is returned right away. */
+
+         if (!changes)
+                 return type;
+
+         /* Make room for one more entry first */
+         grown = reallocarray(*changes, *n_changes + 1, sizeof(InstallChange));
+         if (!grown)
+                 return -ENOMEM;
+         *changes = grown;
+
+         r = path_simplify_alloc(path, &path_copy);
+         if (r < 0)
+                 return r;
+
+         r = path_simplify_alloc(source, &source_copy);
+         if (r < 0)
+                 return r;
+
+         /* The new entry takes ownership of both copies */
+         grown[*n_changes] = (InstallChange) {
+                 .type = type,
+                 .path = TAKE_PTR(path_copy),
+                 .source = TAKE_PTR(source_copy),
+         };
+         (*n_changes)++;
+
+         return type;
+ }
+
+ void install_changes_free(InstallChange *changes, size_t n_changes) {
+         assert(changes || n_changes == 0);
+
+         /* Frees the path and source strings of each of the n_changes entries, then the array itself. */
+
+         FOREACH_ARRAY(c, changes, n_changes) {
+                 free(c->path);
+                 free(c->source);
+         }
+
+         free(changes);
+ }
+
+ /* Logs one message per recorded change. Entries with a non-negative type are informational and are
+ * suppressed if 'quiet' is set; entries with a negative type are errors and are always logged, using
+ * 'verb' to describe the failed operation. If 'r' is negative but no error entry was logged, a generic
+ * failure message is logged for 'r' at the end. */
+ void install_changes_dump(int r, const char *verb, const InstallChange *changes, size_t n_changes, bool quiet) {
+ int err = 0;
+
+ assert(changes || n_changes == 0);
+ /* If verb is not specified, errors are not allowed! */
+ assert(verb || r >= 0);
+
+ for (size_t i = 0; i < n_changes; i++) {
+ if (changes[i].type < 0)
+ assert(verb);
+ assert(changes[i].path);
+
+ /* When making changes here, make sure to also change install_error() in dbus-manager.c. */
+
+ switch (changes[i].type) {
+ case INSTALL_CHANGE_SYMLINK:
+ if (!quiet)
+ log_info("Created symlink %s %s %s.",
+ changes[i].path,
+ special_glyph(SPECIAL_GLYPH_ARROW_RIGHT),
+ changes[i].source);
+ break;
+ case INSTALL_CHANGE_UNLINK:
+ if (!quiet)
+ log_info("Removed \"%s\".", changes[i].path);
+ break;
+ case INSTALL_CHANGE_IS_MASKED:
+ if (!quiet)
+ log_info("Unit %s is masked, ignoring.", changes[i].path);
+ break;
+ case INSTALL_CHANGE_IS_MASKED_GENERATOR:
+ if (!quiet)
+ log_info("Unit %s is masked via a generator and cannot be unmasked.",
+ changes[i].path);
+ break;
+ case INSTALL_CHANGE_IS_DANGLING:
+ if (!quiet)
+ log_info("Unit %s is an alias to a unit that is not present, ignoring.",
+ changes[i].path);
+ break;
+ case INSTALL_CHANGE_DESTINATION_NOT_PRESENT:
+ if (!quiet)
+ log_warning("Unit %s is added as a dependency to a non-existent unit %s.",
+ changes[i].source, changes[i].path);
+ break;
+ case INSTALL_CHANGE_AUXILIARY_FAILED:
+ if (!quiet)
+ log_warning("Failed to enable auxiliary unit %s, ignoring.", changes[i].path);
+ break;
+ /* From here on: error entries. 'err' keeps the return value of the most recent error log call. */
+ case -EEXIST:
+ if (changes[i].source)
+ err = log_error_errno(changes[i].type,
+ "Failed to %s unit, file \"%s\" already exists and is a symlink to \"%s\".",
+ verb, changes[i].path, changes[i].source);
+ else
+ err = log_error_errno(changes[i].type,
+ "Failed to %s unit, file \"%s\" already exists.",
+ verb, changes[i].path);
+ break;
+ case -ERFKILL:
+ err = log_error_errno(changes[i].type, "Failed to %s unit, unit %s is masked.",
+ verb, changes[i].path);
+ break;
+ case -EADDRNOTAVAIL:
+ err = log_error_errno(changes[i].type, "Failed to %s unit, unit %s is transient or generated.",
+ verb, changes[i].path);
+ break;
+ case -ETXTBSY:
+ err = log_error_errno(changes[i].type, "Failed to %s unit, file %s is under the systemd unit hierarchy already.",
+ verb, changes[i].path);
+ break;
+ case -EBADSLT:
+ err = log_error_errno(changes[i].type, "Failed to %s unit, invalid specifier in \"%s\".",
+ verb, changes[i].path);
+ break;
+ case -EIDRM:
+ err = log_error_errno(changes[i].type, "Failed to %s %s, destination unit %s is a non-template unit.",
+ verb, changes[i].source, changes[i].path);
+ break;
+ case -EUCLEAN:
+ err = log_error_errno(changes[i].type,
+ "Failed to %s unit, \"%s\" is not a valid unit name.",
+ verb, changes[i].path);
+ break;
+ case -ELOOP:
+ err = log_error_errno(changes[i].type, "Failed to %s unit, refusing to operate on linked unit file %s.",
+ verb, changes[i].path);
+ break;
+ case -EXDEV:
+ if (changes[i].source)
+ err = log_error_errno(changes[i].type, "Failed to %s unit, cannot alias %s as %s.",
+ verb, changes[i].source, changes[i].path);
+ else
+ err = log_error_errno(changes[i].type, "Failed to %s unit, invalid unit reference \"%s\".",
+ verb, changes[i].path);
+ break;
+ case -ENOENT:
+ err = log_error_errno(changes[i].type, "Failed to %s unit, unit %s does not exist.",
+ verb, changes[i].path);
+ break;
+ case -EUNATCH:
+ err = log_error_errno(changes[i].type, "Failed to %s unit, cannot resolve specifiers in \"%s\".",
+ verb, changes[i].path);
+ break;
+ default:
+ /* Any type without a dedicated message above must be an error code */
+ assert(changes[i].type < 0);
+ err = log_error_errno(changes[i].type, "Failed to %s unit, file \"%s\": %m",
+ verb, changes[i].path);
+ }
+ }
+
+ /* Only log the overall failure if no specific error message was logged above */
+ if (r < 0 && err >= 0)
+ log_error_errno(r, "Failed to %s: %m.", verb);
+ }
+
+/**
+ * Checks if two symlink targets (starting from src) are equivalent as far as the unit enablement logic is
+ * concerned. If the target is in the unit search path, then anything with the same name is equivalent.
+ * If outside the unit search path, paths must be identical.
+ */
+static int chroot_unit_symlinks_equivalent(
+ const LookupPaths *lp,
+ const char *src,
+ const char *target_a,
+ const char *target_b) {
+
+ assert(lp);
+ assert(src);
+ assert(target_a);
+ assert(target_b);
+
+ /* This will give incorrect results if the paths are relative and go outside
+ * of the chroot. False negatives are possible. */
+
+ const char *root = lp->root_dir ?: "/";
+ _cleanup_free_ char *dirname = NULL;
+ int r;
+
+ if (!path_is_absolute(target_a) || !path_is_absolute(target_b)) {
+ r = path_extract_directory(src, &dirname);
+ if (r < 0)
+ return r;
+ }
+
+ _cleanup_free_ char *a = path_join(path_is_absolute(target_a) ? root : dirname, target_a);
+ _cleanup_free_ char *b = path_join(path_is_absolute(target_b) ? root : dirname, target_b);
+ if (!a || !b)
+ return log_oom();
+
+ r = path_equal_or_inode_same(a, b, 0);
+ if (r != 0)
+ return r;
+
+ _cleanup_free_ char *a_name = NULL, *b_name = NULL;
+ r = path_extract_filename(a, &a_name);
+ if (r < 0)
+ return r;
+ r = path_extract_filename(b, &b_name);
+ if (r < 0)
+ return r;
+
+ return streq(a_name, b_name) &&
+ path_startswith_strv(a, lp->search_path) &&
+ path_startswith_strv(b, lp->search_path);
+}
+
+ /* Creates the symlink new_path → old_path (with the root directory prefix stripped from old_path) and
+  * records what was done in the changes array.
+  *
+  * If new_path already exists as a symlink with an equivalent target, nothing is changed. If it exists
+  * with a different target, it is replaced atomically when 'force' is set, and -EEXIST is recorded
+  * otherwise. A non-symlink at new_path is reported as -EEXIST as well.
+  *
+  * Returns 1 if the symlink was created or already exists and points to the right place, or a negative
+  * errno-style error code on failure. */
+ static int create_symlink(
+                 const LookupPaths *lp,
+                 const char *old_path,
+                 const char *new_path,
+                 bool force,
+                 InstallChange **changes,
+                 size_t *n_changes) {
+
+         _cleanup_free_ char *dest = NULL;
+         const char *rp;
+         int r;
+
+         assert(lp);
+         assert(old_path);
+         assert(new_path);
+
+         rp = skip_root(lp->root_dir, old_path);
+         if (rp)
+                 old_path = rp;
+
+         /* Actually create a symlink, and remember that we did. This function is
+          * smart enough to check if there's already a valid symlink in place. */
+
+         (void) mkdir_parents_label(new_path, 0755);
+
+         if (symlink(old_path, new_path) >= 0) {
+                 r = install_changes_add(changes, n_changes, INSTALL_CHANGE_SYMLINK, new_path, old_path);
+                 if (r < 0)
+                         return r;
+                 return 1;
+         }
+
+         if (errno != EEXIST)
+                 return install_changes_add(changes, n_changes, -errno, new_path, NULL);
+
+         r = readlink_malloc(new_path, &dest);
+         if (r < 0) {
+                 /* translate EINVAL (non-symlink exists) to EEXIST */
+                 if (r == -EINVAL)
+                         r = -EEXIST;
+
+                 return install_changes_add(changes, n_changes, r, new_path, NULL);
+         }
+
+         /* The comparison may fail (e.g. on OOM) and then returns a negative value. That must not be
+          * mistaken for "equivalent", hence check for errors explicitly. */
+         r = chroot_unit_symlinks_equivalent(lp, new_path, dest, old_path);
+         if (r < 0)
+                 return install_changes_add(changes, n_changes, r, new_path, NULL);
+         if (r > 0) {
+                 log_debug("Symlink %s %s %s already exists",
+                           new_path, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), dest);
+                 return 1;
+         }
+
+         if (!force)
+                 return install_changes_add(changes, n_changes, -EEXIST, new_path, dest);
+
+         r = symlink_atomic(old_path, new_path);
+         if (r < 0)
+                 return install_changes_add(changes, n_changes, r, new_path, NULL);
+
+         /* Replacing is reported as removal of the old symlink followed by creation of the new one */
+         r = install_changes_add(changes, n_changes, INSTALL_CHANGE_UNLINK, new_path, NULL);
+         if (r < 0)
+                 return r;
+         r = install_changes_add(changes, n_changes, INSTALL_CHANGE_SYMLINK, new_path, old_path);
+         if (r < 0)
+                 return r;
+
+         return 1;
+ }
+
+ static int mark_symlink_for_removal(
+                 Set **remove_symlinks_to,
+                 const char *p) {
+
+         char *simplified;
+         int r;
+
+         assert(p);
+
+         /* Puts a simplified copy of 'p' into the set, allocating the set if necessary. Returns 0 if
+          * set_consume() reports -EEXIST, 1 otherwise on success, and a negative error on failure. */
+
+         r = set_ensure_allocated(remove_symlinks_to, &path_hash_ops);
+         if (r < 0)
+                 return r;
+
+         r = path_simplify_alloc(p, &simplified);
+         if (r < 0)
+                 return r;
+
+         /* The copy is handed over to set_consume() here, we do not free it ourselves */
+         r = set_consume(*remove_symlinks_to, simplified);
+         if (r == -EEXIST)
+                 return 0;
+
+         return r < 0 ? r : 1;
+ }
+
+ /* Walks the directory referred to by 'fd' (whose path is 'path') recursively and removes every symlink
+ * with a valid unit name that matches an entry of 'remove_symlinks_to'. With 'dry_run' nothing is
+ * unlinked, but the removals are still recorded in 'changes'. Each removed symlink is added to the set
+ * itself; '*restart' is set if that added a new entry (and this is not a dry run).
+ *
+ * Takes ownership of 'fd': it is closed on return, on failure too. Returns 0 on success, otherwise a
+ * negative errno-style error code; for errors that do not abort the walk the first one is returned. */
+ static int remove_marked_symlinks_fd(
+ Set *remove_symlinks_to,
+ int fd,
+ const char *path,
+ const char *config_path,
+ const LookupPaths *lp,
+ bool dry_run,
+ bool *restart,
+ InstallChange **changes,
+ size_t *n_changes) {
+
+ _cleanup_closedir_ DIR *d = NULL;
+ int r = 0;
+
+ assert(remove_symlinks_to);
+ assert(fd >= 0);
+ assert(path);
+ assert(config_path);
+ assert(lp);
+ assert(restart);
+
+ d = fdopendir(fd);
+ if (!d) {
+ safe_close(fd);
+ return -errno;
+ }
+
+ rewinddir(d);
+
+ FOREACH_DIRENT(de, d, return -errno)
+
+ if (de->d_type == DT_DIR) {
+ _cleanup_free_ char *p = NULL;
+ int nfd, q;
+
+ nfd = openat(fd, de->d_name, O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW);
+ if (nfd < 0) {
+ if (errno == ENOENT)
+ continue;
+
+ /* Remember the first error only, and carry on with the next entry */
+ if (r == 0)
+ r = -errno;
+ continue;
+ }
+
+ p = path_make_absolute(de->d_name, path);
+ if (!p) {
+ safe_close(nfd);
+ return -ENOMEM;
+ }
+
+ /* This will close nfd, regardless whether it succeeds or not */
+ q = remove_marked_symlinks_fd(remove_symlinks_to, nfd, p, config_path, lp, dry_run, restart, changes, n_changes);
+ if (q < 0 && r == 0)
+ r = q;
+
+ } else if (de->d_type == DT_LNK) {
+ _cleanup_free_ char *p = NULL;
+ bool found;
+ int q;
+
+ if (!unit_name_is_valid(de->d_name, UNIT_NAME_ANY))
+ continue;
+
+ p = path_make_absolute(de->d_name, path);
+ if (!p)
+ return -ENOMEM;
+ path_simplify(p);
+
+ /* We remove all links pointing to a file or path that is marked, as well as all
+ * files sharing the same name as a file that is marked, and files sharing the same
+ * name after the instance has been removed. Do path chasing only if we don't already
+ * know that we want to remove the symlink. */
+ found = set_contains(remove_symlinks_to, de->d_name);
+
+ if (!found) {
+ _cleanup_free_ char *template = NULL;
+
+ /* -EINVAL is not treated as failure here, the template check is simply skipped then */
+ q = unit_name_template(de->d_name, &template);
+ if (q < 0 && q != -EINVAL)
+ return q;
+ if (q >= 0)
+ found = set_contains(remove_symlinks_to, template);
+ }
+
+ if (!found) {
+ _cleanup_free_ char *dest = NULL;
+
+ q = chase(p, lp->root_dir, CHASE_NONEXISTENT, &dest, NULL);
+ if (q == -ENOENT)
+ continue;
+ if (q < 0) {
+ log_debug_errno(q, "Failed to resolve symlink \"%s\": %m", p);
+ install_changes_add(changes, n_changes, q, p, NULL);
+
+ if (r == 0)
+ r = q;
+ continue;
+ }
+
+ /* Match on the resolved path as well as on its last component */
+ found = set_contains(remove_symlinks_to, dest) ||
+ set_contains(remove_symlinks_to, basename(dest));
+
+ }
+
+
+ if (!found)
+ continue;
+
+ if (!dry_run) {
+ if (unlinkat(fd, de->d_name, 0) < 0 && errno != ENOENT) {
+ if (r == 0)
+ r = -errno;
+ install_changes_add(changes, n_changes, -errno, p, NULL);
+ continue;
+ }
+
+ (void) rmdir_parents(p, config_path);
+ }
+
+ q = install_changes_add(changes, n_changes, INSTALL_CHANGE_UNLINK, p, NULL);
+ if (q < 0)
+ return q;
+
+ /* Now, remember the full path (but with the root prefix removed) of
+ * the symlink we just removed, and remove any symlinks to it, too. */
+
+ const char *rp = skip_root(lp->root_dir, p);
+ q = mark_symlink_for_removal(&remove_symlinks_to, rp ?: p);
+ if (q < 0)
+ return q;
+ if (q > 0 && !dry_run)
+ *restart = true;
+ }
+
+ return r;
+ }
+
+/* Removes all symlinks below 'config_path' that match 'remove_symlinks_to'. The directory tree is
+ * scanned repeatedly for as long as a scan asks for a restart.
+ *
+ * Returns 0 if the set is empty or 'config_path' does not exist, otherwise the first non-zero
+ * result of the scans, or a negative errno if the directory cannot be opened or duplicated. */
+static int remove_marked_symlinks(
+                Set *remove_symlinks_to,
+                const char *config_path,
+                const LookupPaths *lp,
+                bool dry_run,
+                InstallChange **changes,
+                size_t *n_changes) {
+
+        _cleanup_close_ int fd = -EBADF;
+        int r = 0;
+
+        assert(config_path);
+        assert(lp);
+
+        if (set_size(remove_symlinks_to) <= 0)
+                return 0;
+
+        fd = open(config_path, O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC);
+        if (fd < 0)
+                return errno == ENOENT ? 0 : -errno;
+
+        for (;;) {
+                bool restart = false;
+                int dup_fd, q;
+
+                /* Each pass consumes a descriptor, hence hand out a duplicate and keep ours */
+                dup_fd = fcntl(fd, F_DUPFD_CLOEXEC, 3);
+                if (dup_fd < 0)
+                        return -errno;
+
+                /* This takes possession of dup_fd and closes it */
+                q = remove_marked_symlinks_fd(remove_symlinks_to, dup_fd, config_path, config_path, lp, dry_run, &restart, changes, n_changes);
+                if (r == 0)
+                        r = q;
+
+                if (!restart)
+                        break;
+        }
+
+        return r;
+}
+
+/* Checks whether 'name' is one of the names unit 'i' is known under: its own name, one of its
+ * aliases, or — for a template with a default instance — the template instantiated with that
+ * instance.
+ *
+ * Returns true or false, or a negative errno if instantiating the template fails with anything
+ * other than -EINVAL. */
+static int is_symlink_with_known_name(const InstallInfo *i, const char *name) {
+        _cleanup_free_ char *instantiated = NULL;
+        int r;
+
+        if (streq(name, i->name) || strv_contains(i->aliases, name))
+                return true;
+
+        /* Look for template symlink matching DefaultInstance */
+        if (!i->default_instance || !unit_name_is_valid(i->name, UNIT_NAME_TEMPLATE))
+                return false;
+
+        r = unit_name_replace_instance(i->name, i->default_instance, &instantiated);
+        if (r == -EINVAL)
+                return false;
+        if (r < 0)
+                return r;
+
+        return streq(name, instantiated);
+}
+
+/* Looks through the symlinks in the open directory 'dir' (located at 'dir_path') for ones that
+ * refer to the unit described by 'info'.
+ *
+ * ignore_destination: do not read the link target; only the symlink's own name, and the template
+ * derived from that name, are compared against info->name.
+ * match_name: a matching symlink only counts if its name is accepted by
+ * is_symlink_with_known_name().
+ * ignore_same_name: when the destination is being looked at, do not compare the symlink's own
+ * name against info->name.
+ * same_name_link: set to true when a symlink matches by both name and destination and its path
+ * equals 'config_path'/info->name; such a link is not counted as a match.
+ *
+ * 'root_dir' is not referenced in the body.
+ *
+ * Returns 1 as soon as a match is found and a negative errno on fatal errors. Otherwise returns 0,
+ * or the first error from reading a link target that was skipped over. */
+static int find_symlinks_in_directory(
+ DIR *dir,
+ const char *dir_path,
+ const char *root_dir,
+ const InstallInfo *info,
+ bool ignore_destination,
+ bool match_name,
+ bool ignore_same_name,
+ const char *config_path,
+ bool *same_name_link) {
+
+ int r = 0;
+
+ FOREACH_DIRENT(de, dir, return -errno) {
+ bool found_path = false, found_dest = false, b = false;
+ int q;
+
+ if (de->d_type != DT_LNK)
+ continue;
+
+ if (!ignore_destination) {
+ _cleanup_free_ char *dest = NULL;
+
+ /* Acquire symlink destination */
+ q = readlinkat_malloc(dirfd(dir), de->d_name, &dest);
+ if (q == -ENOENT)
+ continue;
+ if (q < 0) {
+ if (r == 0)
+ r = q;
+ continue;
+ }
+
+ /* Make absolute */
+ if (!path_is_absolute(dest)) {
+ char *x;
+
+ x = path_join(dir_path, dest);
+ if (!x)
+ return -ENOMEM;
+
+ free_and_replace(dest, x);
+ }
+
+ /* Check if what the symlink points to matches what we are looking for */
+ found_dest = streq(basename(dest), info->name);
+ }
+
+ assert(unit_name_is_valid(info->name, UNIT_NAME_ANY));
+
+ /* Check if the symlink itself matches what we are looking for.
+ *
+ * If ignore_destination is specified, we only look at the source name.
+ *
+ * If ignore_same_name is specified, we are in one of the directories which
+ * have lower priority than the unit file, and even if a file or symlink with
+ * this name was found, we should ignore it. */
+
+ if (ignore_destination || !ignore_same_name)
+ found_path = streq(de->d_name, info->name);
+
+ /* In name-only mode, an instance symlink whose template equals the name we look for is
+ * treated like a destination match. */
+ if (!found_path && ignore_destination) {
+ _cleanup_free_ char *template = NULL;
+
+ q = unit_name_template(de->d_name, &template);
+ if (q < 0 && q != -EINVAL)
+ return q;
+ if (q >= 0)
+ found_dest = streq(template, info->name);
+ }
+
+ if (found_path && found_dest) {
+ _cleanup_free_ char *p = NULL, *t = NULL;
+
+ /* Filter out same name links in the main config path */
+ p = path_make_absolute(de->d_name, dir_path);
+ t = path_make_absolute(info->name, config_path);
+
+ if (!p || !t)
+ return -ENOMEM;
+
+ b = path_equal(p, t);
+ }
+
+ if (b)
+ *same_name_link = true;
+ else if (found_path || found_dest) {
+ if (!match_name)
+ return 1;
+
+ /* Check if symlink name is in the set of names used by [Install] */
+ q = is_symlink_with_known_name(info, de->d_name);
+ if (q < 0)
+ return q;
+ if (q > 0)
+ return 1;
+ }
+ }
+
+ return r;
+}
+
+/* Searches 'config_path' for symlinks referring to unit 'i'. First every ".wants", ".requires" and
+ * ".upholds" sub-directory is checked by symlink name only; if none of them yields a match, the
+ * symlinks directly inside 'config_path' are checked including their destination.
+ *
+ * A missing or inaccessible 'config_path' (ENOENT, ENOTDIR, EACCES) counts as "nothing found".
+ * Sub-directories that cannot be opened or scanned are logged and skipped.
+ *
+ * Returns 1 if a match was found in a sub-directory, a negative errno on failure, and otherwise
+ * whatever find_symlinks_in_directory() returns for 'config_path' itself. */
+static int find_symlinks(
+                const char *root_dir,
+                const InstallInfo *i,
+                bool match_name,
+                bool ignore_same_name,
+                const char *config_path,
+                bool *same_name_link) {
+
+        _cleanup_closedir_ DIR *config_dir = NULL;
+
+        assert(i);
+        assert(config_path);
+        assert(same_name_link);
+
+        config_dir = opendir(config_path);
+        if (!config_dir)
+                return IN_SET(errno, ENOENT, ENOTDIR, EACCES) ? 0 : -errno;
+
+        FOREACH_DIRENT(de, config_dir, return -errno) {
+                _cleanup_free_ const char *subdir_path = NULL;
+                _cleanup_closedir_ DIR *subdir = NULL;
+                const char *extension;
+                int r;
+
+                if (de->d_type != DT_DIR)
+                        continue;
+
+                extension = strrchr(de->d_name, '.');
+                if (!STRPTR_IN_SET(extension, ".wants", ".requires", ".upholds"))
+                        continue;
+
+                subdir_path = path_join(config_path, de->d_name);
+                if (!subdir_path)
+                        return -ENOMEM;
+
+                subdir = opendir(subdir_path);
+                if (!subdir) {
+                        log_error_errno(errno, "Failed to open directory \"%s\" while scanning for symlinks, ignoring: %m", subdir_path);
+                        continue;
+                }
+
+                r = find_symlinks_in_directory(subdir, subdir_path, root_dir, i,
+                                               /* ignore_destination= */ true,
+                                               /* match_name= */ match_name,
+                                               /* ignore_same_name= */ ignore_same_name,
+                                               config_path,
+                                               same_name_link);
+                if (r > 0)
+                        return 1;
+                if (r < 0)
+                        log_debug_errno(r, "Failed to look up symlinks in \"%s\": %m", subdir_path);
+        }
+
+        /* Nothing suitable in the .wants, .requires or .upholds directories, hence check for
+         * linked unit files in the directory itself. */
+        rewinddir(config_dir);
+        return find_symlinks_in_directory(config_dir, config_path, root_dir, i,
+                                          /* ignore_destination= */ false,
+                                          /* match_name= */ match_name,
+                                          /* ignore_same_name= */ ignore_same_name,
+                                          config_path,
+                                          same_name_link);
+}
+
+/* Determines the enablement state of unit 'info' by looking for symlinks to it in every directory
+ * of lp->search_path, in order.
+ *
+ * Returns 1 and stores the result in '*state' if a state could be determined, 0 if no relevant
+ * symlink was found ('*state' is left untouched then), and a negative errno on failure.
+ *
+ * Precedence of the results, highest first:
+ * UNIT_FILE_ENABLED — symlink found in lp->persistent_config, or (user scope only) in a
+ * directory for which path_is_user_config_dir() is true; returned right away
+ * UNIT_FILE_ENABLED_RUNTIME — symlink found in a runtime directory
+ * UNIT_FILE_STATIC — symlink found elsewhere and info->name is an instance
+ * UNIT_FILE_LINKED — same-name link found in lp->persistent_config
+ * UNIT_FILE_LINKED_RUNTIME — same-name link found in a runtime directory */
+static int find_symlinks_in_scope(
+ RuntimeScope scope,
+ const LookupPaths *lp,
+ const InstallInfo *info,
+ bool match_name,
+ UnitFileState *state) {
+
+ bool same_name_link_runtime = false, same_name_link_config = false;
+ bool enabled_in_runtime = false, enabled_at_all = false;
+ bool ignore_same_name = false;
+ int r;
+
+ assert(lp);
+ assert(info);
+
+ /* As we iterate over the list of search paths in lp->search_path, we may encounter "same name"
+ * symlinks. The ones which are "below" (i.e. have lower priority) than the unit file itself are
+ * effectively masked, so we should ignore them. */
+
+ STRV_FOREACH(p, lp->search_path) {
+ bool same_name_link = false;
+
+ r = find_symlinks(lp->root_dir, info, match_name, ignore_same_name, *p, &same_name_link);
+ if (r < 0)
+ return r;
+ if (r > 0) {
+ /* We found symlinks in this dir? Yay! Let's see where precisely it is enabled. */
+
+ if (path_equal_ptr(*p, lp->persistent_config)) {
+ /* This is the best outcome, let's return it immediately. */
+ *state = UNIT_FILE_ENABLED;
+ return 1;
+ }
+
+ /* look for global enablement of user units */
+ if (scope == RUNTIME_SCOPE_USER && path_is_user_config_dir(*p)) {
+ *state = UNIT_FILE_ENABLED;
+ return 1;
+ }
+
+ /* Only remember the finding, a later directory may still give a better result */
+ r = path_is_runtime(lp, *p, false);
+ if (r < 0)
+ return r;
+ if (r > 0)
+ enabled_in_runtime = true;
+ else
+ enabled_at_all = true;
+
+ } else if (same_name_link) {
+ if (path_equal_ptr(*p, lp->persistent_config))
+ same_name_link_config = true;
+ else {
+ r = path_is_runtime(lp, *p, false);
+ if (r < 0)
+ return r;
+ if (r > 0)
+ same_name_link_runtime = true;
+ }
+ }
+
+ /* Check if next iteration will be "below" the unit file (either a regular file
+ * or a symlink), and hence should be ignored */
+ if (!ignore_same_name && path_startswith(info->path, *p))
+ ignore_same_name = true;
+ }
+
+ if (enabled_in_runtime) {
+ *state = UNIT_FILE_ENABLED_RUNTIME;
+ return 1;
+ }
+
+ /* Here's a special rule: if the unit we are looking for is an instance, and it symlinked in the search path
+ * outside of runtime and configuration directory, then we consider it statically enabled. Note we do that only
+ * for instance, not for regular names, as those are merely aliases, while instances explicitly instantiate
+ * something, and hence are a much stronger concept. */
+ if (enabled_at_all && unit_name_is_valid(info->name, UNIT_NAME_INSTANCE)) {
+ *state = UNIT_FILE_STATIC;
+ return 1;
+ }
+
+ /* Hmm, we didn't find it, but maybe we found the same name
+ * link? */
+ if (same_name_link_config) {
+ *state = UNIT_FILE_LINKED;
+ return 1;
+ }
+ if (same_name_link_runtime) {
+ *state = UNIT_FILE_LINKED_RUNTIME;
+ return 1;
+ }
+
+ return 0;
+}
+
+/* Releases everything 'i' owns and resets the respective fields to NULL. The structure itself is
+ * not freed. Does nothing if 'i' is NULL. */
+static void install_info_clear(InstallInfo *i) {
+        if (!i)
+                return;
+
+        /* Plain strings */
+        i->name = mfree(i->name);
+        i->path = mfree(i->path);
+        i->root = mfree(i->root);
+        i->default_instance = mfree(i->default_instance);
+        i->symlink_target = mfree(i->symlink_target);
+
+        /* String lists */
+        i->aliases = strv_free(i->aliases);
+        i->wanted_by = strv_free(i->wanted_by);
+        i->required_by = strv_free(i->required_by);
+        i->upheld_by = strv_free(i->upheld_by);
+        i->also = strv_free(i->also);
+}
+
+/* Frees 'i' together with everything it owns. Always returns NULL, so that the result can be
+ * assigned back to the pointer that was freed. */
+static InstallInfo* install_info_free(InstallInfo *i) {
+        if (!i)
+                return NULL;
+
+        install_info_clear(i);
+        free(i);
+
+        return NULL;
+}
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(InstallInfo*, install_info_free);
+
+/* Releases both hashmaps of 'ctx' along with all InstallInfo objects stored in them, and resets the
+ * two fields to what the free call returns. */
+static void install_context_done(InstallContext *ctx) {
+        assert(ctx);
+
+        /* The two maps are released independently of each other */
+        ctx->have_processed = ordered_hashmap_free_with_destructor(ctx->have_processed, install_info_free);
+        ctx->will_process = ordered_hashmap_free_with_destructor(ctx->will_process, install_info_free);
+}
+
+/* Looks up the InstallInfo registered under 'name': ctx->have_processed is consulted first, then
+ * ctx->will_process. Returns whatever the second lookup yields if the first one finds nothing. */
+static InstallInfo *install_info_find(InstallContext *ctx, const char *name) {
+        InstallInfo *processed = ordered_hashmap_get(ctx->have_processed, name);
+
+        return processed ?: ordered_hashmap_get(ctx->will_process, name);
+}
+
+/* Checks whether the loaded unit file is one we should process, or is masked, transient or
+ * generated and thus not subject to enable/disable operations.
+ *
+ * Returns 0 if the unit may be processed. Otherwise the refusal is registered through
+ * install_changes_add() — with -ERFKILL for masked units, -EADDRNOTAVAIL for generated or
+ * transient ones — and that call's result is returned. */
+static int install_info_may_process(
+                const InstallInfo *i,
+                const LookupPaths *lp,
+                InstallChange **changes,
+                size_t *n_changes) {
+
+        int refusal;
+
+        assert(i);
+        assert(lp);
+
+        if (i->install_mode == INSTALL_MODE_MASKED)
+                refusal = -ERFKILL;
+        else if (path_is_generator(lp, i->path) || path_is_transient(lp, i->path))
+                refusal = -EADDRNOTAVAIL;
+        else
+                return 0;
+
+        return install_changes_add(changes, n_changes, refusal, i->path, NULL);
+}
+
+/**
+ * Registers a unit under 'name' in InstallContext.will_process, unless an entry of that name
+ * already exists in either of the context's hashmaps, in which case that entry is reused. If
+ * 'name' is NULL it is derived from the last component of 'path'.
+ *
+ * An existing entry stays auxiliary only if it was auxiliary before and 'auxiliary' is true. If
+ * 'ret' is non-NULL it receives the entry.
+ *
+ * Returns negative on error (-EINVAL for an invalid unit name), 0 if the unit was already known,
+ * 1 otherwise.
+ */
+static int install_info_add(
+                InstallContext *ctx,
+                const char *name,
+                const char *path,
+                const char *root,
+                bool auxiliary,
+                InstallInfo **ret) {
+
+        _cleanup_(install_info_freep) InstallInfo *fresh = NULL;
+        InstallInfo *known;
+        int r;
+
+        assert(ctx);
+
+        if (!name) {
+                /* 'name' and 'path' must not both be null. Check here 'path' using assert_se() to
+                 * workaround a bug in gcc that generates a -Wnonnull warning when calling basename(),
+                 * but this cannot be possible in any code path (See #6119). */
+                assert_se(path);
+                name = basename(path);
+        }
+
+        if (!unit_name_is_valid(name, UNIT_NAME_ANY))
+                return -EINVAL;
+
+        known = install_info_find(ctx, name);
+        if (known) {
+                known->auxiliary = known->auxiliary && auxiliary;
+
+                if (ret)
+                        *ret = known;
+                return 0;
+        }
+
+        fresh = new(InstallInfo, 1);
+        if (!fresh)
+                return -ENOMEM;
+
+        *fresh = (InstallInfo) {
+                .install_mode = _INSTALL_MODE_INVALID,
+                .auxiliary = auxiliary,
+        };
+
+        fresh->name = strdup(name);
+        if (!fresh->name)
+                return -ENOMEM;
+
+        if (root) {
+                fresh->root = strdup(root);
+                if (!fresh->root)
+                        return -ENOMEM;
+        }
+
+        if (path) {
+                fresh->path = strdup(path);
+                if (!fresh->path)
+                        return -ENOMEM;
+        }
+
+        r = ordered_hashmap_ensure_put(&ctx->will_process, &string_hash_ops, fresh->name, fresh);
+        if (r < 0)
+                return r;
+
+        /* The hashmap references the object now, hence disarm the cleanup handler */
+        if (ret)
+                *ret = fresh;
+        TAKE_PTR(fresh);
+
+        return 1;
+}
+
+/* Config parser callback for Alias=: forwards to config_parse_strv() if the type of 'unit' may be
+ * aliased, and otherwise logs a warning and returns the result of that log_syntax() call. */
+static int config_parse_alias(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        assert(unit);
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        UnitType type = unit_name_to_type(unit);
+
+        if (unit_type_may_alias(type))
+                return config_parse_strv(unit, filename, line, section, section_line,
+                                         lvalue, ltype, rvalue, data, userdata);
+
+        return log_syntax(unit, LOG_WARNING, filename, line, 0,
+                          "Alias= is not allowed for %s units, ignoring.",
+                          unit_type_to_string(type));
+}
+
+/* Config parser callback for Also=: splits 'rvalue' into words, resolves each one with
+ * install_name_printf(), registers the resulting unit as auxiliary in the InstallContext passed
+ * as 'data', and appends the name to the 'also' list of the InstallInfo passed as 'userdata'.
+ *
+ * Returns 0 once all words are consumed, the result of log_syntax() if a word cannot be resolved,
+ * and a negative errno on any other failure. */
+static int config_parse_also(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        InstallInfo *info = ASSERT_PTR(userdata);
+        InstallContext *ctx = ASSERT_PTR(data);
+
+        assert(unit);
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        for (;;) {
+                _cleanup_free_ char *word = NULL, *resolved = NULL;
+                int r;
+
+                r = extract_first_word(&rvalue, &word, NULL, 0);
+                if (r < 0)
+                        return r;
+                if (r == 0)
+                        return 0;
+
+                r = install_name_printf(ctx->scope, info, word, &resolved);
+                if (r < 0)
+                        return log_syntax(unit, LOG_WARNING, filename, line, r,
+                                          "Failed to resolve unit name in Also=\"%s\": %m", word);
+
+                r = install_info_add(ctx, resolved, NULL, info->root, /* auxiliary= */ true, NULL);
+                if (r < 0)
+                        return r;
+
+                /* Hands the string over to the list, which releases it if it cannot be appended */
+                r = strv_consume(&info->also, TAKE_PTR(resolved));
+                if (r < 0)
+                        return r;
+        }
+}
+
+/* Config parser callback for DefaultInstance=: resolves 'rvalue' with install_name_printf() and
+ * stores it as the default instance of the InstallInfo passed as 'userdata'. An empty result
+ * resets the default instance to NULL.
+ *
+ * The setting is silently ignored when 'unit' is an instance, and ignored with a warning when it
+ * is not a template. */
+static int config_parse_default_instance(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        InstallContext *ctx = ASSERT_PTR(data);
+        InstallInfo *info = ASSERT_PTR(userdata);
+        _cleanup_free_ char *instance = NULL;
+        int r;
+
+        assert(unit);
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        /* When enabling an instance, we might be using a template unit file,
+         * but we should ignore DefaultInstance silently. */
+        if (unit_name_is_valid(unit, UNIT_NAME_INSTANCE))
+                return 0;
+
+        if (!unit_name_is_valid(unit, UNIT_NAME_TEMPLATE))
+                return log_syntax(unit, LOG_WARNING, filename, line, 0,
+                                  "DefaultInstance= only makes sense for template units, ignoring.");
+
+        r = install_name_printf(ctx->scope, info, rvalue, &instance);
+        if (r < 0)
+                return log_syntax(unit, LOG_WARNING, filename, line, r,
+                                  "Failed to resolve instance name in DefaultInstance=\"%s\": %m", rvalue);
+
+        if (isempty(instance))
+                instance = mfree(instance);
+        else if (!unit_instance_is_valid(instance))
+                return log_syntax(unit, LOG_WARNING, filename, line, SYNTHETIC_ERRNO(EINVAL),
+                                  "Invalid DefaultInstance= value \"%s\".", instance);
+
+        return free_and_replace(info->default_instance, instance);
+}
+
+/* Loads, or merely classifies, the unit file or drop-in at 'path' on behalf of 'info'.
+ *
+ * flags:
+ * SEARCH_DROPIN — 'path' is a drop-in; it is opened via chase_and_open() relative to 'root_dir'
+ * and never changes info->install_mode.
+ * SEARCH_LOAD — actually parse the file. Without it a main unit file is only lstat()ed to set
+ * info->install_mode, and a drop-in is skipped entirely.
+ *
+ * For main unit files the name in info->name must map to a unit type, and templates/instances are
+ * refused with -EINVAL for types that cannot be templated.
+ *
+ * Returns a negative errno on failure; in the lstat() path a symlink yields -ELOOP, a directory
+ * -EISDIR and any other non-regular file -ENOTTY. Returns 0 if nothing was parsed (not loading, or
+ * the file is empty/masked). After parsing, returns the total number of entries in info->aliases,
+ * info->wanted_by, info->required_by and info->upheld_by. */
+static int unit_file_load(
+ InstallContext *ctx,
+ InstallInfo *info,
+ const char *path,
+ const char *root_dir,
+ SearchFlags flags) {
+
+ /* For Alias=, WantedBy=, RequiredBy= and UpheldBy= the last column points at the list to fill;
+ * for DefaultInstance= and Also= it carries 'info' resp. 'ctx' to the parser callbacks. */
+ const ConfigTableItem items[] = {
+ { "Install", "Alias", config_parse_alias, 0, &info->aliases },
+ { "Install", "WantedBy", config_parse_strv, 0, &info->wanted_by },
+ { "Install", "RequiredBy", config_parse_strv, 0, &info->required_by },
+ { "Install", "UpheldBy", config_parse_strv, 0, &info->upheld_by },
+ { "Install", "DefaultInstance", config_parse_default_instance, 0, info },
+ { "Install", "Also", config_parse_also, 0, ctx },
+ {}
+ };
+
+ UnitType type;
+ _cleanup_fclose_ FILE *f = NULL;
+ _cleanup_close_ int fd = -EBADF;
+ struct stat st;
+ int r;
+
+ assert(info);
+ assert(path);
+
+ if (!(flags & SEARCH_DROPIN)) {
+ /* Loading or checking for the main unit file… */
+
+ type = unit_name_to_type(info->name);
+ if (type < 0)
+ return -EINVAL;
+ if (unit_name_is_valid(info->name, UNIT_NAME_TEMPLATE|UNIT_NAME_INSTANCE) && !unit_type_may_template(type))
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+ "%s: unit type %s cannot be templated, ignoring.", path, unit_type_to_string(type));
+
+ if (!(flags & SEARCH_LOAD)) {
+ if (lstat(path, &st) < 0)
+ return -errno;
+
+ if (null_or_empty(&st))
+ info->install_mode = INSTALL_MODE_MASKED;
+ else if (S_ISREG(st.st_mode))
+ info->install_mode = INSTALL_MODE_REGULAR;
+ else if (S_ISLNK(st.st_mode))
+ return -ELOOP;
+ else if (S_ISDIR(st.st_mode))
+ return -EISDIR;
+ else
+ return -ENOTTY;
+
+ return 0;
+ }
+
+ /* NOTE(review): O_NOFOLLOW presumably makes this fail with ELOOP if 'path' is a symlink,
+ * matching the explicit -ELOOP of the lstat() path above — confirm. */
+ fd = open(path, O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW);
+ if (fd < 0)
+ return -errno;
+ } else {
+ /* Operating on a drop-in file. If we aren't supposed to load the unit file drop-ins don't matter, let's hence shortcut this. */
+
+ if (!(flags & SEARCH_LOAD))
+ return 0;
+
+ fd = chase_and_open(path, root_dir, 0, O_RDONLY|O_CLOEXEC|O_NOCTTY, NULL);
+ if (fd < 0)
+ return fd;
+ }
+
+ if (fstat(fd, &st) < 0)
+ return -errno;
+
+ if (null_or_empty(&st)) {
+ if ((flags & SEARCH_DROPIN) == 0)
+ info->install_mode = INSTALL_MODE_MASKED;
+
+ return 0;
+ }
+
+ r = stat_verify_regular(&st);
+ if (r < 0)
+ return r;
+
+ /* From here on the stream owns the descriptor */
+ f = take_fdopen(&fd, "r");
+ if (!f)
+ return -errno;
+
+ /* ctx is only needed if we actually load the file (it's referenced from items[] btw, in case you wonder.) */
+ assert(ctx);
+
+ r = config_parse(info->name, path, f,
+ "Install\0"
+ "-Unit\0"
+ "-Automount\0"
+ "-Device\0"
+ "-Mount\0"
+ "-Path\0"
+ "-Scope\0"
+ "-Service\0"
+ "-Slice\0"
+ "-Socket\0"
+ "-Swap\0"
+ "-Target\0"
+ "-Timer\0",
+ config_item_table_lookup, items,
+ 0, info,
+ NULL);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to parse \"%s\": %m", info->name);
+
+ if ((flags & SEARCH_DROPIN) == 0)
+ info->install_mode = INSTALL_MODE_REGULAR;
+
+ return
+ (int) strv_length(info->aliases) +
+ (int) strv_length(info->wanted_by) +
+ (int) strv_length(info->required_by) +
+ (int) strv_length(info->upheld_by);
+}
+
+/* Loads the unit file at 'path' via unit_file_load(). If that reports -ELOOP for a main unit file
+ * (i.e. SEARCH_DROPIN is not set), the symlink is resolved into info->symlink_target and
+ * info->install_mode is set accordingly:
+ *
+ * INSTALL_MODE_MASKED — the target is null or empty
+ * INSTALL_MODE_LINKED — unit_file_resolve_symlink() returned > 0
+ * INSTALL_MODE_ALIAS — otherwise
+ *
+ * Returns 0 in the symlink case, the result of unit_file_load() in all other cases, and a negative
+ * errno if resolving or checking the symlink target fails. */
+static int unit_file_load_or_readlink(
+                InstallContext *ctx,
+                InstallInfo *info,
+                const char *path,
+                const LookupPaths *lp,
+                SearchFlags flags) {
+
+        int r;
+
+        r = unit_file_load(ctx, info, path, lp->root_dir, flags);
+
+        bool is_symlinked_unit = r == -ELOOP && !(flags & SEARCH_DROPIN);
+        if (!is_symlinked_unit)
+                return r;
+
+        /* This is a symlink, let's read and verify it. */
+        r = unit_file_resolve_symlink(lp->root_dir, lp->search_path,
+                                      NULL, AT_FDCWD, path,
+                                      true, &info->symlink_target);
+        if (r < 0)
+                return r;
+        bool outside_search_path = r > 0;
+
+        r = null_or_empty_path_with_root(info->symlink_target, lp->root_dir);
+        if (r < 0 && r != -ENOENT)
+                return log_debug_errno(r, "Failed to stat %s: %m", info->symlink_target);
+
+        info->install_mode =
+                r > 0               ? INSTALL_MODE_MASKED :
+                outside_search_path ? INSTALL_MODE_LINKED :
+                                      INSTALL_MODE_ALIAS;
+
+        return 0;
+}
+
+/* Locates and loads the unit file for 'info', followed by its drop-ins.
+ *
+ * Nothing is done (0 is returned) if info->install_mode is already set. If info->path is set, only
+ * that file is loaded and no drop-ins are looked at. Otherwise the directories of lp->search_path
+ * are tried in order for info->name and — if that is an instance and was not found — for its
+ * template; ENOENT, ENOTDIR and EACCES on a candidate just move on to the next directory. The
+ * path of the file that was found is stored in info->path.
+ *
+ * Unless the unit turned out to be masked, all "*.conf" files from the "<name>.d" (and, for
+ * instances, "<template>.d") directories below the search path are loaded with SEARCH_DROPIN added
+ * to 'flags'.
+ *
+ * Returns the result of loading the main unit file, -ENOENT if no unit file was found, or a
+ * negative errno on other failures. */
+static int unit_file_search(
+ InstallContext *ctx,
+ InstallInfo *info,
+ const LookupPaths *lp,
+ SearchFlags flags) {
+
+ const char *dropin_dir_name = NULL, *dropin_template_dir_name = NULL;
+ _cleanup_strv_free_ char **dirs = NULL, **files = NULL;
+ _cleanup_free_ char *template = NULL;
+ bool found_unit = false;
+ int r, result;
+
+ assert(info);
+ assert(lp);
+
+ /* Was this unit already loaded? */
+ if (info->install_mode != _INSTALL_MODE_INVALID)
+ return 0;
+
+ if (info->path)
+ return unit_file_load_or_readlink(ctx, info, info->path, lp, flags);
+
+ assert(info->name);
+
+ if (unit_name_is_valid(info->name, UNIT_NAME_INSTANCE)) {
+ r = unit_name_template(info->name, &template);
+ if (r < 0)
+ return r;
+ }
+
+ STRV_FOREACH(p, lp->search_path) {
+ _cleanup_free_ char *path = NULL;
+
+ path = path_join(*p, info->name);
+ if (!path)
+ return -ENOMEM;
+
+ r = unit_file_load_or_readlink(ctx, info, path, lp, flags);
+ if (r >= 0) {
+ info->path = TAKE_PTR(path);
+ result = r;
+ found_unit = true;
+ break;
+ } else if (!IN_SET(r, -ENOENT, -ENOTDIR, -EACCES))
+ return r;
+ }
+
+ if (!found_unit && template) {
+
+ /* Unit file doesn't exist, however instance
+ * enablement was requested. We will check if it is
+ * possible to load template unit file. */
+
+ STRV_FOREACH(p, lp->search_path) {
+ _cleanup_free_ char *path = NULL;
+
+ path = path_join(*p, template);
+ if (!path)
+ return -ENOMEM;
+
+ r = unit_file_load_or_readlink(ctx, info, path, lp, flags);
+ if (r >= 0) {
+ info->path = TAKE_PTR(path);
+ result = r;
+ found_unit = true;
+ break;
+ } else if (!IN_SET(r, -ENOENT, -ENOTDIR, -EACCES))
+ return r;
+ }
+ }
+
+ if (!found_unit)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOENT),
+ "Cannot find unit %s%s%s.",
+ info->name, template ? " or " : "", strempty(template));
+
+ /* 'result' is set whenever found_unit is true, i.e. on every path reaching this point */
+ if (info->install_mode == INSTALL_MODE_MASKED)
+ return result;
+
+ /* Search for drop-in directories */
+
+ dropin_dir_name = strjoina(info->name, ".d");
+ STRV_FOREACH(p, lp->search_path) {
+ char *path;
+
+ path = path_join(*p, dropin_dir_name);
+ if (!path)
+ return -ENOMEM;
+
+ r = strv_consume(&dirs, path);
+ if (r < 0)
+ return r;
+ }
+
+ if (template) {
+ dropin_template_dir_name = strjoina(template, ".d");
+ STRV_FOREACH(p, lp->search_path) {
+ char *path;
+
+ path = path_join(*p, dropin_template_dir_name);
+ if (!path)
+ return -ENOMEM;
+
+ r = strv_consume(&dirs, path);
+ if (r < 0)
+ return r;
+ }
+ }
+
+ /* Load drop-in conf files */
+
+ r = conf_files_list_strv(&files, ".conf", NULL, 0, (const char**) dirs);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to get list of conf files: %m");
+
+ STRV_FOREACH(p, files) {
+ r = unit_file_load_or_readlink(ctx, info, *p, lp, flags | SEARCH_DROPIN);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to load conf file \"%s\": %m", *p);
+ }
+
+ return result;
+}
+
+/* Follows the symlink recorded in info->symlink_target by one step: the target becomes the new
+ * info->path, the install mode is reset, and the file at the new path is loaded.
+ *
+ * Returns -EINVAL if 'info' is not an alias or linked unit or has no symlink target, -EXDEV if the
+ * target's file name differs from info->name (unless 'ignore_different_name' is set), and otherwise
+ * the result of unit_file_load_or_readlink(). */
+static int install_info_follow(
+                InstallContext *ctx,
+                InstallInfo *info,
+                const LookupPaths *lp,
+                SearchFlags flags,
+                bool ignore_different_name) {
+
+        assert(ctx);
+        assert(info);
+
+        if (!IN_SET(info->install_mode, INSTALL_MODE_ALIAS, INSTALL_MODE_LINKED) ||
+            !info->symlink_target)
+                return -EINVAL;
+
+        if (!ignore_different_name) {
+                const char *target_name = basename(info->symlink_target);
+
+                /* If the basename doesn't match, the caller should add a complete new entry for this. */
+                if (!streq(target_name, info->name))
+                        return -EXDEV;
+        }
+
+        free_and_replace(info->path, info->symlink_target);
+        info->install_mode = _INSTALL_MODE_INVALID;
+
+        return unit_file_load_or_readlink(ctx, info, info->path, lp, flags);
+}
+
+/**
+ * Search for the unit file. If the unit name is a symlink, follow the symlink to the
+ * target, maybe more than once. Propagate the instance name if present.
+ *
+ * On success returns 0 and, if 'ret' is non-NULL, stores the InstallInfo the traversal ended at
+ * in it; this may be a different object than 'start' if an alias to a differently named unit was
+ * followed.
+ *
+ * Returns -ELOOP if more than UNIT_FILE_FOLLOW_SYMLINK_MAX symlinks would have to be followed, or
+ * if a symlink in a configuration directory is hit while SEARCH_FOLLOW_CONFIG_SYMLINKS is not set.
+ * Returns -ENOLINK if the unit an alias points to cannot be found. Other negative errnos are
+ * passed through from the helpers.
+ */
+static int install_info_traverse(
+ InstallContext *ctx,
+ const LookupPaths *lp,
+ InstallInfo *start,
+ SearchFlags flags,
+ InstallInfo **ret) {
+
+ InstallInfo *i;
+ unsigned k = 0;
+ int r;
+
+ assert(lp);
+ assert(start);
+ assert(ctx);
+
+ r = unit_file_search(ctx, start, lp, flags);
+ if (r < 0)
+ return r;
+
+ i = start;
+ while (IN_SET(i->install_mode, INSTALL_MODE_ALIAS, INSTALL_MODE_LINKED)) {
+ /* Follow the symlink */
+
+ if (++k > UNIT_FILE_FOLLOW_SYMLINK_MAX)
+ return -ELOOP;
+
+ if (!(flags & SEARCH_FOLLOW_CONFIG_SYMLINKS)) {
+ r = path_is_config(lp, i->path, true);
+ if (r < 0)
+ return r;
+ if (r > 0)
+ return -ELOOP;
+ }
+
+ r = install_info_follow(ctx, i, lp, flags,
+ /* If linked, don't look at the target name */
+ /* ignore_different_name= */ i->install_mode == INSTALL_MODE_LINKED);
+ /* -EXDEV: the symlink points to a unit file with a different name */
+ if (r == -EXDEV && i->symlink_target) {
+ _cleanup_free_ char *buffer = NULL;
+ const char *bn;
+
+ /* Target is an alias, create a new install info object and continue with that. */
+
+ bn = basename(i->symlink_target);
+
+ if (unit_name_is_valid(i->name, UNIT_NAME_INSTANCE) &&
+ unit_name_is_valid(bn, UNIT_NAME_TEMPLATE)) {
+
+ _cleanup_free_ char *instance = NULL;
+
+ r = unit_name_to_instance(i->name, &instance);
+ if (r < 0)
+ return r;
+
+ r = unit_name_replace_instance(bn, instance, &buffer);
+ if (r < 0)
+ return r;
+
+ if (streq(buffer, i->name)) {
+
+ /* We filled in the instance, and the target stayed the same? If so,
+ * then let's honour the link as it is. */
+
+ r = install_info_follow(ctx, i, lp, flags, true);
+ if (r < 0)
+ return r;
+
+ continue;
+ }
+
+ bn = buffer;
+ }
+
+ /* From here on 'i' refers to the entry for the alias target */
+ r = install_info_add(ctx, bn, NULL, lp->root_dir, /* auxiliary= */ false, &i);
+ if (r < 0)
+ return r;
+
+ /* Try again, with the new target we found. */
+ r = unit_file_search(ctx, i, lp, flags);
+ if (r == -ENOENT)
+ /* Translate error code to highlight this specific case */
+ return -ENOLINK;
+ }
+
+ if (r < 0)
+ return r;
+ }
+
+ if (ret)
+ *ret = i;
+
+ return 0;
+}
+
+/**
+ * Registers 'name_or_path' via install_info_add(): an absolute path is passed as the unit's path,
+ * with lp->root_dir prefixed to it, anything else is passed as the unit's name. In both cases
+ * lp->root_dir is recorded as root and the entry is added as non-auxiliary.
+ */
+static int install_info_add_auto(
+                InstallContext *ctx,
+                const LookupPaths *lp,
+                const char *name_or_path,
+                InstallInfo **ret) {
+
+        const char *prefixed;
+
+        assert(ctx);
+        assert(name_or_path);
+
+        if (!path_is_absolute(name_or_path))
+                return install_info_add(ctx, name_or_path, NULL, lp->root_dir, /* auxiliary= */ false, ret);
+
+        prefixed = prefix_roota(lp->root_dir, name_or_path);
+
+        return install_info_add(ctx, NULL, prefixed, lp->root_dir, /* auxiliary= */ false, ret);
+}
+
+/* Registers the unit given by 'name_or_path' in 'ctx' and then locates its unit file, following
+ * symlinks as permitted by 'flags'. On success the resulting entry is passed back through 'ret' by
+ * install_info_traverse().
+ *
+ * Returns the result of the traversal, or the negative errno of whichever step failed; a failure
+ * is additionally registered in 'changes' under 'name_or_path'. */
+static int install_info_discover(
+                InstallContext *ctx,
+                const LookupPaths *lp,
+                const char *name_or_path,
+                SearchFlags flags,
+                InstallInfo **ret,
+                InstallChange **changes,
+                size_t *n_changes) {
+
+        InstallInfo *added;
+        int r;
+
+        assert(ctx);
+        assert(lp);
+        assert(name_or_path);
+
+        r = install_info_add_auto(ctx, lp, name_or_path, &added);
+        if (r < 0) {
+                install_changes_add(changes, n_changes, r, name_or_path, NULL);
+                return r;
+        }
+
+        r = install_info_traverse(ctx, lp, added, flags, ret);
+        if (r < 0)
+                install_changes_add(changes, n_changes, r, name_or_path, NULL);
+
+        return r;
+}
+
+/* Like install_info_discover(), but additionally verifies with install_info_may_process() that the
+ * unit that was found is subject to enable/disable operations at all.
+ *
+ * 'ret' is optional. If it is non-NULL it receives the discovered entry as soon as discovery
+ * succeeded, i.e. also when the subsequent check refuses the unit.
+ *
+ * Returns a negative errno if discovery fails, otherwise the result of
+ * install_info_may_process(). */
+static int install_info_discover_and_check(
+                InstallContext *ctx,
+                const LookupPaths *lp,
+                const char *name_or_path,
+                SearchFlags flags,
+                InstallInfo **ret,
+                InstallChange **changes,
+                size_t *n_changes) {
+
+        InstallInfo *info = NULL;
+        int r;
+
+        /* Always discover into a local variable: the check below needs the entry even if the caller
+         * is not interested in it, and install_info_may_process() asserts on a NULL entry. */
+        r = install_info_discover(ctx, lp, name_or_path, flags, &info, changes, n_changes);
+        if (r < 0)
+                return r;
+
+        if (ret)
+                *ret = info;
+
+        return install_info_may_process(info, lp, changes, n_changes);
+}
+
+int unit_file_verify_alias(
+ const InstallInfo *info,
+ const char *dst,
+ char **ret_dst,
+ InstallChange **changes,
+ size_t *n_changes) {
+
+ _cleanup_free_ char *dst_updated = NULL;
+ int r;
+
+ /* Verify that dst is either a valid alias or a valid .wants/.requires symlink for the target
+ * unit *info. Return negative on error or if not compatible, zero on success.
+ *
+ * On success *ret_dst is always written: it receives the updated name in cases where "instance
+ * propagation" happens, i.e. when the instance part is inserted into dst, and NULL otherwise,
+ * so that the caller can easily distinguish the case where instance propagation occurred.
+ * On failure *ret_dst is left untouched.
+ *
+ * Returns:
+ * -EXDEV when the alias doesn't match the unit,
+ * -EUCLEAN when the name is invalid,
+ * -ELOOP when the alias is to the unit itself.
+ */
+
+ const char *path_alias = strrchr(dst, '/');
+ if (path_alias) {
+ /* This branch covers legacy Alias= function of creating .wants and .requires symlinks. */
+ _cleanup_free_ char *dir = NULL;
+ char *p;
+
+ path_alias ++; /* skip over slash */
+
+ r = path_extract_directory(dst, &dir);
+ if (r < 0)
+ return log_error_errno(r, "Failed to extract parent directory from '%s': %m", dst);
+
+ p = endswith(dir, ".wants");
+ if (!p)
+ p = endswith(dir, ".requires");
+ if (!p) {
+ install_changes_add(changes, n_changes, -EXDEV, dst, NULL);
+ return log_debug_errno(SYNTHETIC_ERRNO(EXDEV), "Invalid path \"%s\" in alias.", dir);
+ }
+
+ *p = '\0'; /* dir should now be a unit name */
+
+ UnitNameFlags type = unit_name_classify(dir);
+ if (type < 0) {
+ install_changes_add(changes, n_changes, -EXDEV, dst, NULL);
+ return log_debug_errno(SYNTHETIC_ERRNO(EXDEV),
+ "Invalid unit name component \"%s\" in alias.", dir);
+ }
+
+ const bool instance_propagation = type == UNIT_NAME_TEMPLATE;
+
+ /* That's the name we want to use for verification. */
+ r = unit_symlink_name_compatible(path_alias, info->name, instance_propagation);
+ if (r < 0)
+ return log_error_errno(r, "Failed to verify alias validity: %m");
+ if (r == 0) {
+ install_changes_add(changes, n_changes, -EXDEV, dst, info->name);
+ return log_debug_errno(SYNTHETIC_ERRNO(EXDEV),
+ "Invalid unit \"%s\" symlink \"%s\".",
+ info->name, dst);
+ }
+
+ } else {
+ /* If the symlink target has an instance set and the symlink source doesn't, we "propagate
+ * the instance", i.e. instantiate the symlink source with the target instance. */
+ if (unit_name_is_valid(dst, UNIT_NAME_TEMPLATE)) {
+ _cleanup_free_ char *inst = NULL;
+
+ UnitNameFlags type = unit_name_to_instance(info->name, &inst);
+ if (type < 0) {
+ install_changes_add(changes, n_changes, -EUCLEAN, info->name, NULL);
+ return log_debug_errno(type, "Failed to extract instance name from \"%s\": %m", info->name);
+ }
+
+ if (type == UNIT_NAME_INSTANCE) {
+ r = unit_name_replace_instance(dst, inst, &dst_updated);
+ if (r < 0)
+ return log_error_errno(r, "Failed to build unit name from %s+%s: %m",
+ dst, inst);
+ }
+ }
+
+ r = unit_validate_alias_symlink_or_warn(LOG_DEBUG, dst_updated ?: dst, info->name);
+ if (r == -ELOOP) /* -ELOOP means self-alias, which we (quietly) ignore */
+ return r;
+ /* Any other failure is registered as a change; -EINVAL is reported as -EXDEV */
+ if (r < 0)
+ return install_changes_add(changes, n_changes,
+ r == -EINVAL ? -EXDEV : r,
+ dst_updated ?: dst,
+ info->name);
+ }
+
+ *ret_dst = TAKE_PTR(dst_updated);
+ return 0;
+}
+
+/* Creates the symlinks for all entries of info->aliases below 'config_path', each pointing to
+ * info->path. Every alias is first expanded with install_name_printf() and checked with
+ * unit_file_verify_alias(); self-aliases (-ELOOP) are skipped silently. An existing alias symlink
+ * whose target does not exist is replaced even if 'force' is not set.
+ *
+ * Failures for individual aliases do not stop the loop. Returns the first error that occurred, or
+ * else the result of the last create_symlink() call (0 if there was none). Running out of memory
+ * aborts immediately with -ENOMEM. */
+static int install_info_symlink_alias(
+                RuntimeScope scope,
+                InstallInfo *info,
+                const LookupPaths *lp,
+                const char *config_path,
+                bool force,
+                InstallChange **changes,
+                size_t *n_changes) {
+
+        int r = 0;
+
+        assert(info);
+        assert(lp);
+        assert(config_path);
+
+        STRV_FOREACH(s, info->aliases) {
+                _cleanup_free_ char *link_path = NULL, *dst = NULL, *dst_updated = NULL;
+                int q;
+
+                q = install_name_printf(scope, info, *s, &dst);
+                if (q < 0) {
+                        install_changes_add(changes, n_changes, q, *s, NULL);
+                        if (r >= 0)
+                                r = q;
+                        continue;
+                }
+
+                q = unit_file_verify_alias(info, dst, &dst_updated, changes, n_changes);
+                if (q == -ELOOP)
+                        continue;
+                if (q < 0) {
+                        if (r >= 0)
+                                r = q;
+                        continue;
+                }
+
+                link_path = path_make_absolute(dst_updated ?: dst, config_path);
+                if (!link_path)
+                        return -ENOMEM;
+
+                q = chase(link_path, lp->root_dir, CHASE_NONEXISTENT, NULL, NULL);
+                if (q < 0 && q != -ENOENT) {
+                        if (r >= 0)
+                                r = q;
+                        continue;
+                }
+
+                /* symlink target does not exist? */
+                const bool broken = q == 0;
+
+                q = create_symlink(lp, info->path, link_path, force || broken, changes, n_changes);
+                if (r >= 0)
+                        r = q;
+        }
+
+        return r;
+}
+
+static int install_info_symlink_wants(
+ RuntimeScope scope,
+ UnitFileFlags file_flags,
+ InstallInfo *info,
+ const LookupPaths *lp,
+ const char *config_path,
+ char **list,
+ const char *suffix,
+ InstallChange **changes,
+ size_t *n_changes) {
+ /* For each entry of 'list' (after install_name_printf() expansion), creates the symlink
+  * "<config_path>/<entry><suffix><n>" pointing to info->path, where <n> is info->name or, for a
+  * template with a default instance, the instantiated name. Per-entry problems are recorded through
+  * install_changes_add() and the loop moves on; the first negative result is sticky and is what
+  * gets returned. A failed allocation, and a failure to record the "destination not present" note,
+  * abort immediately. Returns 0 right away when 'list' is empty. */
+ _cleanup_(install_info_clear) InstallInfo instance = {
+ .install_mode = _INSTALL_MODE_INVALID,
+ };
+
+ UnitNameFlags valid_dst_type = UNIT_NAME_ANY;
+ const char *n;
+ int r = 0, q;
+
+ assert(info);
+ assert(lp);
+ assert(config_path);
+
+ if (strv_isempty(list))
+ return 0;
+
+ if (unit_name_is_valid(info->name, UNIT_NAME_PLAIN | UNIT_NAME_INSTANCE))
+ /* Not a template unit. Use the name directly. */
+ n = info->name;
+
+ else if (info->default_instance) {
+ /* If this is a template, and we have a default instance, use it. */
+
+ r = unit_name_replace_instance(info->name, info->default_instance, &instance.name);
+ if (r < 0)
+ return r;
+
+ r = unit_file_search(NULL, &instance, lp, SEARCH_FOLLOW_CONFIG_SYMLINKS);
+ if (r < 0)
+ return r;
+
+ if (instance.install_mode == INSTALL_MODE_MASKED)
+ return install_changes_add(changes, n_changes, -ERFKILL, instance.path, NULL);
+
+ n = instance.name;
+
+ } else {
+ /* We have a template, but no instance yet. When used with an instantiated unit, we will get
+ * the instance from that unit. Cannot be used with non-instance units. */
+
+ valid_dst_type = UNIT_NAME_INSTANCE | UNIT_NAME_TEMPLATE;
+ n = info->name;
+ }
+
+ r = 0; /* restart the accumulation: r may hold a leftover value from the lookups above */
+ STRV_FOREACH(s, list) {
+ _cleanup_free_ char *path = NULL, *dst = NULL;
+
+ q = install_name_printf(scope, info, *s, &dst);
+ if (q < 0) {
+ install_changes_add(changes, n_changes, q, *s, NULL);
+ if (r >= 0)
+ r = q;
+
+ continue;
+ }
+
+ if (!unit_name_is_valid(dst, valid_dst_type)) {
+ /* Generate a proper error here: EUCLEAN if the name is generally bad, EIDRM if the
+ * template status doesn't match. If we are doing presets don't bother reporting the
+ * error. This also covers cases like 'systemctl preset serial-getty@.service', which
+ * has no DefaultInstance, so there is nothing we can do. At the same time,
+ * 'systemctl enable serial-getty@.service' should fail, the user should specify an
+ * instance like in 'systemctl enable serial-getty@ttyS0.service'.
+ */
+ if (file_flags & UNIT_FILE_IGNORE_AUXILIARY_FAILURE)
+ continue;
+
+ if (unit_name_is_valid(dst, UNIT_NAME_ANY))
+ q = install_changes_add(changes, n_changes, -EIDRM, dst, n);
+ else
+ q = install_changes_add(changes, n_changes, -EUCLEAN, dst, NULL);
+ if (r >= 0)
+ r = q;
+
+ continue;
+ }
+
+ path = strjoin(config_path, "/", dst, suffix, n);
+ if (!path)
+ return -ENOMEM;
+
+ q = create_symlink(lp, info->path, path, true, changes, n_changes); /* the 'force' slot is always true here */
+ if ((q < 0 && r >= 0) || r == 0) /* record the first error; while r is still 0 just take over q */
+ r = q;
+
+ if (unit_file_exists(scope, lp, dst) == 0) {
+ q = install_changes_add(changes, n_changes, INSTALL_CHANGE_DESTINATION_NOT_PRESENT, dst, info->path);
+ if (q < 0)
+ return q;
+ }
+ }
+
+ return r;
+}
+
+static int install_info_symlink_link(
+                InstallInfo *info,
+                const LookupPaths *lp,
+                const char *config_path,
+                bool force,
+                InstallChange **changes,
+                size_t *n_changes) {
+
+        _cleanup_free_ char *link = NULL;
+        int found;
+
+        /* Links the unit file itself into config_path under info->name. Nothing is done (and 0 is
+         * returned) when in_search_path() reports that info->path already is in the search path. */
+
+        assert(info);
+        assert(lp);
+        assert(config_path);
+        assert(info->path);
+
+        found = in_search_path(lp, info->path);
+        if (found != 0)
+                /* Either an error (passed through) or "already there" (mapped to 0). */
+                return found < 0 ? found : 0;
+
+        link = path_join(config_path, info->name);
+        if (!link)
+                return -ENOMEM;
+
+        return create_symlink(lp, info->path, link, force, changes, n_changes);
+}
+
+static int install_info_apply(
+                RuntimeScope scope,
+                UnitFileFlags file_flags,
+                InstallInfo *info,
+                const LookupPaths *lp,
+                const char *config_path,
+                InstallChange **changes,
+                size_t *n_changes) {
+
+        int result;
+
+        /* Applies the install rules of one unit: first the link to the unit file itself, then the alias
+         * symlinks, then the .wants/, .requires/ and .upholds/ symlinks, in that order. Units whose
+         * install mode is not INSTALL_MODE_REGULAR are left alone and yield 0. */
+
+        assert(info);
+        assert(lp);
+        assert(config_path);
+
+        if (info->install_mode != INSTALL_MODE_REGULAR)
+                return 0;
+
+        const bool force = file_flags & UNIT_FILE_FORCE;
+
+        /* Links to the unit file do not count towards the "carries_install_info" count. If linking the
+         * file failed, do not try to create the other symlinks, because they might be pointing to a
+         * non-existent or wrong unit. */
+        result = install_info_symlink_link(info, lp, config_path, force, changes, n_changes);
+        if (result < 0)
+                return result;
+
+        result = install_info_symlink_alias(scope, info, lp, config_path, force, changes, n_changes);
+
+        const struct {
+                char **units;
+                const char *suffix;
+        } dep_dirs[] = {
+                { info->wanted_by,   ".wants/"    },
+                { info->required_by, ".requires/" },
+                { info->upheld_by,   ".upholds/"  },
+        };
+
+        for (size_t k = 0; k < sizeof(dep_dirs) / sizeof(dep_dirs[0]); k++) {
+                int q;
+
+                q = install_info_symlink_wants(scope, file_flags, info, lp, config_path,
+                                               dep_dirs[k].units, dep_dirs[k].suffix, changes, n_changes);
+                /* Every step runs regardless; its result is only taken over while ours is still 0. */
+                if (result == 0)
+                        result = q;
+        }
+
+        return result;
+}
+
+static int install_context_apply(
+ InstallContext *ctx,
+ const LookupPaths *lp,
+ UnitFileFlags file_flags,
+ const char *config_path,
+ SearchFlags flags,
+ InstallChange **changes,
+ size_t *n_changes) {
+ /* Drains ctx->will_process: each entry is moved to ctx->have_processed, traversed with
+  * install_info_traverse() and, if its install mode is INSTALL_MODE_REGULAR, handed to
+  * install_info_apply(). Returns 0 if there was nothing to process. Otherwise the result is either
+  * the first negative value returned by install_info_apply(), or the sum of its non-negative results
+  * plus one for every masked unit encountered. A failed traversal of a non-auxiliary unit, a failed
+  * hashmap move and a failed install_changes_add() end the loop early. */
+ InstallInfo *i;
+ int r;
+
+ assert(ctx);
+ assert(lp);
+ assert(config_path);
+
+ if (ordered_hashmap_isempty(ctx->will_process))
+ return 0;
+
+ r = ordered_hashmap_ensure_allocated(&ctx->have_processed, &string_hash_ops);
+ if (r < 0)
+ return r;
+
+ r = 0;
+ while ((i = ordered_hashmap_first(ctx->will_process))) {
+ int q;
+
+ q = ordered_hashmap_move_one(ctx->have_processed, ctx->will_process, i->name);
+ if (q < 0)
+ return q;
+
+ q = install_info_traverse(ctx, lp, i, flags, NULL);
+ if (q < 0) {
+ if (i->auxiliary) { /* failures of auxiliary units are only noted, never fatal */
+ q = install_changes_add(changes, n_changes, INSTALL_CHANGE_AUXILIARY_FAILED, i->name, NULL);
+ if (q < 0)
+ return q;
+ continue;
+ }
+
+ return install_changes_add(changes, n_changes, q, i->name, NULL);
+ }
+
+ /* We can attempt to process a masked unit when a different unit
+ * that we were processing specifies it in Also=. */
+ if (i->install_mode == INSTALL_MODE_MASKED) {
+ q = install_changes_add(changes, n_changes, INSTALL_CHANGE_IS_MASKED, i->path, NULL);
+ if (q < 0)
+ return q;
+ if (r >= 0)
+ /* Assume that something *could* have been enabled here,
+ * avoid "empty [Install] section" warning. */
+ r += 1;
+ continue;
+ }
+
+ if (i->install_mode != INSTALL_MODE_REGULAR)
+ continue;
+
+ q = install_info_apply(ctx->scope, file_flags, i, lp, config_path, changes, n_changes);
+ if (r >= 0) { /* once r went negative it is kept; later units are still applied */
+ if (q < 0)
+ r = q;
+ else
+ r += q;
+ }
+ }
+
+ return r;
+}
+
+static int install_context_mark_for_removal(
+ InstallContext *ctx,
+ const LookupPaths *lp,
+ Set **remove_symlinks_to,
+ const char *config_path,
+ InstallChange **changes,
+ size_t *n_changes) {
+
+ InstallInfo *i;
+ int r;
+
+ assert(ctx);
+ assert(lp);
+ assert(config_path);
+
+ /* Marks all items for removal: every entry of ctx->will_process is moved to ctx->have_processed,
+  * traversed, and its name is passed to mark_symlink_for_removal(), unless the unit turns out to be
+  * masked or to have a non-regular install mode, in which case it is skipped. Dangling symlinks
+  * (-ENOLINK), missing auxiliary units and other lookup failures are logged at debug level and the
+  * name is still marked. Returns 0 on success or a negative value on the early-exit paths below.
+  * config_path is only asserted here, not otherwise used.
+  * NOTE(review): for a missing non-auxiliary unit install_changes_add() is handed the negative
+  * error itself; whether that branch can ever continue depends on install_changes_add()'s return
+  * convention, which is not visible in this chunk. */
+
+ if (ordered_hashmap_isempty(ctx->will_process))
+ return 0;
+
+ r = ordered_hashmap_ensure_allocated(&ctx->have_processed, &string_hash_ops);
+ if (r < 0)
+ return r;
+
+ while ((i = ordered_hashmap_first(ctx->will_process))) {
+
+ r = ordered_hashmap_move_one(ctx->have_processed, ctx->will_process, i->name);
+ if (r < 0)
+ return r;
+
+ r = install_info_traverse(ctx, lp, i, SEARCH_LOAD|SEARCH_FOLLOW_CONFIG_SYMLINKS, NULL);
+ if (r == -ENOLINK) {
+ log_debug_errno(r, "Name %s leads to a dangling symlink, removing name.", i->name);
+ r = install_changes_add(changes, n_changes, INSTALL_CHANGE_IS_DANGLING, i->path ?: i->name, NULL);
+ if (r < 0)
+ return r;
+ } else if (r == -ENOENT) {
+ if (i->auxiliary) /* some unit specified in Also= or similar is missing */
+ log_debug_errno(r, "Auxiliary unit of %s not found, removing name.", i->name);
+ else {
+ log_debug_errno(r, "Unit %s not found, removing name.", i->name);
+ r = install_changes_add(changes, n_changes, r, i->path ?: i->name, NULL);
+ if (r < 0)
+ return r;
+ }
+ } else if (r < 0) {
+ log_debug_errno(r, "Failed to find unit %s, removing name: %m", i->name);
+ install_changes_add(changes, n_changes, r, i->path ?: i->name, NULL); /* result deliberately not checked */
+ } else if (i->install_mode == INSTALL_MODE_MASKED) {
+ log_debug("Unit file %s is masked, ignoring.", i->name);
+ install_changes_add(changes, n_changes, INSTALL_CHANGE_IS_MASKED, i->path ?: i->name, NULL);
+ continue;
+ } else if (i->install_mode != INSTALL_MODE_REGULAR) {
+ log_debug("Unit %s has install mode %s, ignoring.",
+ i->name, install_mode_to_string(i->install_mode) ?: "invalid");
+ continue;
+ }
+
+ r = mark_symlink_for_removal(remove_symlinks_to, i->name);
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+
+int unit_file_mask(
+                RuntimeScope scope,
+                UnitFileFlags flags,
+                const char *root_dir,
+                char **names,
+                InstallChange **changes,
+                size_t *n_changes) {
+
+        _cleanup_(lookup_paths_free) LookupPaths lp = {};
+        const char *config_path;
+        int ret;
+
+        /* Masks each named unit by creating a symlink to /dev/null under the runtime or persistent
+         * configuration directory (selected by UNIT_FILE_RUNTIME). Invalid names and failed symlinks
+         * do not stop the loop; the first such error is what gets returned. */
+
+        assert(scope >= 0);
+        assert(scope < _RUNTIME_SCOPE_MAX);
+
+        ret = lookup_paths_init(&lp, scope, 0, root_dir);
+        if (ret < 0)
+                return ret;
+
+        if (flags & UNIT_FILE_RUNTIME)
+                config_path = lp.runtime_config;
+        else
+                config_path = lp.persistent_config;
+        if (!config_path)
+                return -ENXIO;
+
+        STRV_FOREACH(unit, names) {
+                _cleanup_free_ char *mask_link = NULL;
+                int k;
+
+                if (!unit_name_is_valid(*unit, UNIT_NAME_ANY)) {
+                        /* Remember the bad name, but keep masking the remaining ones. */
+                        if (ret == 0)
+                                ret = -EINVAL;
+                        continue;
+                }
+
+                mask_link = path_make_absolute(*unit, config_path);
+                if (!mask_link)
+                        return -ENOMEM;
+
+                k = create_symlink(&lp, "/dev/null", mask_link, flags & UNIT_FILE_FORCE, changes, n_changes);
+                if (k < 0 && ret >= 0)
+                        ret = k;
+        }
+
+        return ret;
+}
+
+int unit_file_unmask(
+ RuntimeScope scope,
+ UnitFileFlags flags,
+ const char *root_dir,
+ char **names,
+ InstallChange **changes,
+ size_t *n_changes) {
+ /* Counterpart of unit_file_mask(): every name whose entry below config_path passes
+  * null_or_empty_path() is queued, unlinked (the unlink is skipped when UNIT_FILE_DRY_RUN is set)
+  * and reported as INSTALL_CHANGE_UNLINK; afterwards remove_marked_symlinks() is run on
+  * config_path. When root_dir is unset, masked units whose path path_is_generator() accepts are
+  * additionally reported as INSTALL_CHANGE_IS_MASKED_GENERATOR. An invalid name aborts with
+  * -EINVAL before anything is removed. */
+ _cleanup_(lookup_paths_free) LookupPaths lp = {};
+ _cleanup_set_free_free_ Set *remove_symlinks_to = NULL;
+ _cleanup_strv_free_ char **todo = NULL;
+ const char *config_path;
+ size_t n_todo = 0;
+ int r, q;
+
+ assert(scope >= 0);
+ assert(scope < _RUNTIME_SCOPE_MAX);
+
+ r = lookup_paths_init(&lp, scope, 0, root_dir);
+ if (r < 0)
+ return r;
+
+ config_path = (flags & UNIT_FILE_RUNTIME) ? lp.runtime_config : lp.persistent_config;
+ if (!config_path)
+ return -ENXIO;
+
+ bool dry_run = flags & UNIT_FILE_DRY_RUN;
+
+ STRV_FOREACH(name, names) {
+ if (!unit_name_is_valid(*name, UNIT_NAME_ANY))
+ return -EINVAL;
+
+ /* If root_dir is set, we don't care about kernel command line or generators.
+ * But if it is not set, we need to check for interference. */
+ if (!root_dir) {
+ _cleanup_(install_info_clear) InstallInfo info = {
+ .name = *name, /* We borrow *name temporarily… */
+ .install_mode = _INSTALL_MODE_INVALID,
+ };
+
+ r = unit_file_search(NULL, &info, &lp, 0);
+ if (r < 0) {
+ if (r != -ENOENT)
+ log_debug_errno(r, "Failed to look up unit %s, ignoring: %m", info.name);
+ } else if (info.install_mode == INSTALL_MODE_MASKED &&
+ path_is_generator(&lp, info.path)) {
+ r = install_changes_add(changes, n_changes,
+ INSTALL_CHANGE_IS_MASKED_GENERATOR, info.name, info.path);
+ if (r < 0)
+ return r;
+ }
+
+ TAKE_PTR(info.name); /* … and give it back here */
+ }
+
+ _cleanup_free_ char *path = path_make_absolute(*name, config_path);
+ if (!path)
+ return -ENOMEM;
+
+ r = null_or_empty_path(path);
+ if (r == -ENOENT) /* nothing at that path: nothing to unmask */
+ continue;
+ if (r < 0)
+ return r;
+ if (r == 0) /* something is there, but null_or_empty_path() does not consider it a mask */
+ continue;
+
+ if (!GREEDY_REALLOC0(todo, n_todo + 2)) /* +2: the new entry and, presumably, the NULL terminator of the strv */
+ return -ENOMEM;
+
+ todo[n_todo] = strdup(*name);
+ if (!todo[n_todo])
+ return -ENOMEM;
+
+ n_todo++;
+ }
+
+ strv_uniq(todo);
+
+ r = 0;
+ STRV_FOREACH(i, todo) {
+ _cleanup_free_ char *path = NULL;
+ const char *rp;
+
+ path = path_make_absolute(*i, config_path);
+ if (!path)
+ return -ENOMEM;
+
+ if (!dry_run && unlink(path) < 0) {
+ if (errno != ENOENT) {
+ if (r >= 0)
+ r = -errno;
+ install_changes_add(changes, n_changes, -errno, path, NULL);
+ }
+
+ continue;
+ }
+
+ q = install_changes_add(changes, n_changes, INSTALL_CHANGE_UNLINK, path, NULL);
+ if (q < 0)
+ return q;
+
+ rp = skip_root(lp.root_dir, path);
+ q = mark_symlink_for_removal(&remove_symlinks_to, rp ?: path);
+ if (q < 0)
+ return q;
+ }
+
+ q = remove_marked_symlinks(remove_symlinks_to, config_path, &lp, dry_run, changes, n_changes);
+ if (r >= 0)
+ r = q;
+
+ return r;
+}
+
+int unit_file_link(
+ RuntimeScope scope,
+ UnitFileFlags flags,
+ const char *root_dir,
+ char **files,
+ InstallChange **changes,
+ size_t *n_changes) {
+ /* Links unit files given by absolute path into config_path. Each file must have a valid unit name
+  * as basename and must be a regular file (checked with lstat() on the path joined to lp.root_dir).
+  * Files for which in_search_path() is positive are skipped silently; files for which
+  * underneath_search_path() is positive are refused with -ETXTBSY. Any validation failure returns
+  * immediately through install_changes_add(), before a single symlink is created. Failures while
+  * creating the symlinks are collected and the first one is returned. */
+ _cleanup_(lookup_paths_free) LookupPaths lp = {};
+ _cleanup_strv_free_ char **todo = NULL;
+ const char *config_path;
+ size_t n_todo = 0;
+ int r, q;
+
+ assert(scope >= 0);
+ assert(scope < _RUNTIME_SCOPE_MAX);
+
+ r = lookup_paths_init(&lp, scope, 0, root_dir);
+ if (r < 0)
+ return r;
+
+ config_path = (flags & UNIT_FILE_RUNTIME) ? lp.runtime_config : lp.persistent_config;
+ if (!config_path)
+ return -ENXIO;
+
+ STRV_FOREACH(file, files) {
+ _cleanup_free_ char *full = NULL;
+ struct stat st;
+ char *fn;
+
+ if (!path_is_absolute(*file))
+ return install_changes_add(changes, n_changes, -EINVAL, *file, NULL);
+
+ fn = basename(*file);
+ if (!unit_name_is_valid(fn, UNIT_NAME_ANY))
+ return install_changes_add(changes, n_changes, -EUCLEAN, *file, NULL);
+
+ full = path_join(lp.root_dir, *file);
+ if (!full)
+ return -ENOMEM;
+
+ if (lstat(full, &st) < 0)
+ return install_changes_add(changes, n_changes, -errno, *file, NULL);
+
+ r = stat_verify_regular(&st);
+ if (r < 0)
+ return install_changes_add(changes, n_changes, r, *file, NULL);
+
+ r = in_search_path(&lp, *file);
+ if (r < 0)
+ return install_changes_add(changes, n_changes, r, *file, NULL);
+ if (r > 0)
+ /* A silent noop if the file is already in the search path. */
+ continue;
+
+ r = underneath_search_path(&lp, *file);
+ if (r > 0) /* a positive answer is turned into an error and handled by the check below */
+ r = -ETXTBSY;
+ if (r < 0)
+ return install_changes_add(changes, n_changes, r, *file, NULL);
+
+ if (!GREEDY_REALLOC0(todo, n_todo + 2)) /* +2: the new entry and, presumably, the NULL terminator of the strv */
+ return -ENOMEM;
+
+ todo[n_todo] = strdup(*file);
+ if (!todo[n_todo])
+ return -ENOMEM;
+
+ n_todo++;
+ }
+
+ strv_uniq(todo);
+
+ r = 0;
+ STRV_FOREACH(i, todo) {
+ _cleanup_free_ char *new_path = NULL;
+
+ new_path = path_make_absolute(basename(*i), config_path);
+ if (!new_path)
+ return -ENOMEM;
+
+ q = create_symlink(&lp, *i, new_path, flags & UNIT_FILE_FORCE, changes, n_changes);
+ if (q < 0 && r >= 0)
+ r = q;
+ }
+
+ return r;
+}
+
+static int path_shall_revert(const LookupPaths *lp, const char *path) {
+        int verdict;
+
+        assert(lp);
+        assert(path);
+
+        /* Tells whether 'path' is one whose drop-in directories shall be removed on revert. The three
+         * classifiers are asked in turn (config, control, transient); the first non-zero answer,
+         * whether an error or a match, is returned as is. */
+
+        verdict = path_is_config(lp, path, true);
+        if (verdict == 0)
+                verdict = path_is_control(lp, path);
+        if (verdict == 0)
+                verdict = path_is_transient(lp, path);
+
+        return verdict;
+}
+
+int unit_file_revert(
+                RuntimeScope scope,
+                const char *root_dir,
+                char **names,
+                InstallChange **changes,
+                size_t *n_changes) {
+
+        _cleanup_set_free_free_ Set *remove_symlinks_to = NULL;
+        _cleanup_(lookup_paths_free) LookupPaths lp = {};
+        _cleanup_strv_free_ char **todo = NULL;
+        size_t n_todo = 0;
+        int r, q;
+
+        /* Puts a unit file back into vendor state. This means:
+         *
+         * a) we remove all drop-in snippets added by the user ("config"), added to transient units
+         *    ("transient"), and added via "systemctl set-property" ("control"), but not if the drop-in is
+         *    generated ("generated").
+         *
+         * b) if there's a vendor unit file (i.e. one in /usr) we remove any configured overriding unit files
+         *    (i.e. in "config", but not in "transient" or "control" or even "generated").
+         *
+         * We remove all that in both the runtime and the persistent directories, if that applies.
+         *
+         * Returns a negative value on failure. An invalid unit name aborts with -EINVAL before anything
+         * is removed. While removing, the first rm_rf() failure is remembered and returned, but the
+         * remaining items are still processed.
+         */
+
+        r = lookup_paths_init(&lp, scope, 0, root_dir);
+        if (r < 0)
+                return r;
+
+        /* Phase 1: collect everything that shall go away in 'todo'. */
+        STRV_FOREACH(name, names) {
+                bool has_vendor = false;
+
+                if (!unit_name_is_valid(*name, UNIT_NAME_ANY))
+                        return -EINVAL;
+
+                STRV_FOREACH(p, lp.search_path) {
+                        _cleanup_free_ char *path = NULL, *dropin = NULL;
+                        struct stat st;
+
+                        path = path_make_absolute(*name, *p);
+                        if (!path)
+                                return -ENOMEM;
+
+                        r = RET_NERRNO(lstat(path, &st));
+                        if (r < 0) {
+                                if (r != -ENOENT)
+                                        return install_changes_add(changes, n_changes, r, path, NULL);
+                        } else if (S_ISREG(st.st_mode)) {
+                                /* Check if there's a vendor version */
+                                r = path_is_vendor_or_generator(&lp, path);
+                                if (r < 0)
+                                        return install_changes_add(changes, n_changes, r, path, NULL);
+                                if (r > 0)
+                                        has_vendor = true;
+                        }
+
+                        dropin = strjoin(path, ".d");
+                        if (!dropin)
+                                return -ENOMEM;
+
+                        r = RET_NERRNO(lstat(dropin, &st));
+                        if (r < 0) {
+                                if (r != -ENOENT)
+                                        return install_changes_add(changes, n_changes, r, dropin, NULL);
+                        } else if (S_ISDIR(st.st_mode)) {
+                                /* Remove the drop-ins */
+                                r = path_shall_revert(&lp, dropin);
+                                if (r < 0)
+                                        return install_changes_add(changes, n_changes, r, dropin, NULL);
+                                if (r > 0) {
+                                        if (!GREEDY_REALLOC0(todo, n_todo + 2))
+                                                return -ENOMEM;
+
+                                        todo[n_todo++] = TAKE_PTR(dropin);
+                                }
+                        }
+                }
+
+                if (!has_vendor)
+                        continue;
+
+                /* OK, there's a vendor version, hence drop all configuration versions */
+                STRV_FOREACH(p, lp.search_path) {
+                        _cleanup_free_ char *path = NULL;
+                        struct stat st;
+
+                        path = path_make_absolute(*name, *p);
+                        if (!path)
+                                return -ENOMEM;
+
+                        r = RET_NERRNO(lstat(path, &st));
+                        if (r < 0) {
+                                if (r != -ENOENT)
+                                        return install_changes_add(changes, n_changes, r, path, NULL);
+                        } else if (S_ISREG(st.st_mode) || S_ISLNK(st.st_mode)) {
+                                r = path_is_config(&lp, path, true);
+                                if (r < 0)
+                                        return install_changes_add(changes, n_changes, r, path, NULL);
+                                if (r > 0) {
+                                        if (!GREEDY_REALLOC0(todo, n_todo + 2))
+                                                return -ENOMEM;
+
+                                        todo[n_todo++] = TAKE_PTR(path);
+                                }
+                        }
+                }
+        }
+
+        strv_uniq(todo);
+
+        /* Phase 2: remove what was collected and report it. */
+        r = 0;
+        STRV_FOREACH(i, todo) {
+                _cleanup_strv_free_ char **fs = NULL;
+                const char *rp;
+
+                /* Best effort: the listing is only used for reporting the removed files. */
+                (void) get_files_in_directory(*i, &fs);
+
+                q = rm_rf(*i, REMOVE_ROOT|REMOVE_PHYSICAL);
+                if (q < 0 && q != -ENOENT) {
+                        /* Removal failed: never report this item as unlinked, regardless of whether an
+                         * earlier failure was already recorded. Only the first error is kept. */
+                        if (r >= 0)
+                                r = q;
+                        continue;
+                }
+
+                STRV_FOREACH(j, fs) {
+                        _cleanup_free_ char *t = NULL;
+
+                        t = path_join(*i, *j);
+                        if (!t)
+                                return -ENOMEM;
+
+                        q = install_changes_add(changes, n_changes, INSTALL_CHANGE_UNLINK, t, NULL);
+                        if (q < 0)
+                                return q;
+                }
+
+                q = install_changes_add(changes, n_changes, INSTALL_CHANGE_UNLINK, *i, NULL);
+                if (q < 0)
+                        return q;
+
+                rp = skip_root(lp.root_dir, *i);
+                q = mark_symlink_for_removal(&remove_symlinks_to, rp ?: *i);
+                if (q < 0)
+                        return q;
+        }
+
+        /* Phase 3: drop symlinks that pointed at what was just removed, in both config directories. */
+        q = remove_marked_symlinks(remove_symlinks_to, lp.runtime_config, &lp, false, changes, n_changes);
+        if (r >= 0)
+                r = q;
+
+        q = remove_marked_symlinks(remove_symlinks_to, lp.persistent_config, &lp, false, changes, n_changes);
+        if (r >= 0)
+                r = q;
+
+        return r;
+}
+
+int unit_file_add_dependency(
+ RuntimeScope scope,
+ UnitFileFlags file_flags,
+ const char *root_dir,
+ char **names,
+ const char *target,
+ UnitDependency dep,
+ InstallChange **changes,
+ size_t *n_changes) {
+ /* Makes 'target' want or require each unit in 'names': the wanted_by/required_by list of every
+  * named unit is replaced by a list holding just the discovered target name, and the resulting
+  * symlinks are then created via install_context_apply() below the runtime or persistent config
+  * directory (selected by UNIT_FILE_RUNTIME). An invalid target name is reported as -EUCLEAN.
+  * Discovery failures of the target or of any named unit abort before anything is applied. */
+ _cleanup_(lookup_paths_free) LookupPaths lp = {};
+ _cleanup_(install_context_done) InstallContext ctx = { .scope = scope };
+ InstallInfo *info, *target_info;
+ const char *config_path;
+ int r;
+
+ assert(scope >= 0);
+ assert(scope < _RUNTIME_SCOPE_MAX);
+ assert(target);
+ assert(IN_SET(dep, UNIT_WANTS, UNIT_REQUIRES));
+
+ if (!unit_name_is_valid(target, UNIT_NAME_ANY))
+ return install_changes_add(changes, n_changes, -EUCLEAN, target, NULL);
+
+ r = lookup_paths_init(&lp, scope, 0, root_dir);
+ if (r < 0)
+ return r;
+
+ config_path = (file_flags & UNIT_FILE_RUNTIME) ? lp.runtime_config : lp.persistent_config;
+ if (!config_path)
+ return -ENXIO;
+
+ r = install_info_discover_and_check(&ctx, &lp, target, SEARCH_FOLLOW_CONFIG_SYMLINKS,
+ &target_info, changes, n_changes);
+ if (r < 0)
+ return r;
+
+ assert(target_info->install_mode == INSTALL_MODE_REGULAR);
+
+ STRV_FOREACH(name, names) {
+ char ***l;
+
+ r = install_info_discover_and_check(&ctx, &lp, *name,
+ SEARCH_FOLLOW_CONFIG_SYMLINKS,
+ &info, changes, n_changes);
+ if (r < 0)
+ return r;
+
+ assert(info->install_mode == INSTALL_MODE_REGULAR);
+
+ /* We didn't actually load anything from the unit
+ * file, but instead just add in our new symlink to
+ * create. */
+
+ if (dep == UNIT_WANTS)
+ l = &info->wanted_by;
+ else if (dep == UNIT_REQUIRES)
+ l = &info->required_by;
+ else /* NOTE(review): not reachable while the assert above limits dep to UNIT_WANTS/UNIT_REQUIRES */
+ l = &info->upheld_by;
+
+ strv_free(*l); /* any previous list content is dropped and replaced by the single target */
+ *l = strv_new(target_info->name);
+ if (!*l)
+ return -ENOMEM;
+ }
+
+ return install_context_apply(&ctx, &lp, file_flags, config_path,
+ SEARCH_FOLLOW_CONFIG_SYMLINKS, changes, n_changes);
+}
+
+static int do_unit_file_enable(
+                const LookupPaths *lp,
+                RuntimeScope scope,
+                UnitFileFlags flags,
+                const char *config_path,
+                char **names_or_paths,
+                InstallChange **changes,
+                size_t *n_changes) {
+
+        _cleanup_(install_context_done) InstallContext ctx = { .scope = scope };
+
+        /* Step 1: discover and check every requested unit. The first failure aborts before any
+         * symlink is touched. */
+        STRV_FOREACH(item, names_or_paths) {
+                InstallInfo *found;
+                int k;
+
+                k = install_info_discover_and_check(
+                                &ctx, lp, *item,
+                                SEARCH_LOAD | SEARCH_FOLLOW_CONFIG_SYMLINKS,
+                                &found, changes, n_changes);
+                if (k < 0)
+                        return k;
+
+                assert(found->install_mode == INSTALL_MODE_REGULAR);
+        }
+
+        /* Step 2: apply. The value returned is the number of symlink rules that were supposed to be
+         * created, not the number actually created. This is useful to determine whether the passed
+         * units had any installation data at all. */
+        return install_context_apply(&ctx, lp, flags, config_path,
+                                     SEARCH_LOAD, changes, n_changes);
+}
+
+int unit_file_enable(
+                RuntimeScope scope,
+                UnitFileFlags flags,
+                const char *root_dir,
+                char **names_or_paths,
+                InstallChange **changes,
+                size_t *n_changes) {
+
+        _cleanup_(lookup_paths_free) LookupPaths lp = {};
+        const char *config_path;
+        int r;
+
+        /* Public entry point for enabling: sets up the lookup paths, picks the configuration directory
+         * from the flags and leaves the actual work to do_unit_file_enable(). Returns -ENXIO when no
+         * configuration directory is available. */
+
+        assert(scope >= 0);
+        assert(scope < _RUNTIME_SCOPE_MAX);
+
+        r = lookup_paths_init(&lp, scope, 0, root_dir);
+        if (r < 0)
+                return r;
+
+        config_path = config_path_from_flags(&lp, flags);
+
+        return config_path
+                ? do_unit_file_enable(&lp, scope, flags, config_path, names_or_paths, changes, n_changes)
+                : -ENXIO;
+}
+
+static int do_unit_file_disable(
+ const LookupPaths *lp,
+ RuntimeScope scope,
+ UnitFileFlags flags,
+ const char *config_path,
+ char **names,
+ InstallChange **changes,
+ size_t *n_changes) {
+ /* Disables the named units: each name is validated (-EUCLEAN if invalid), added to a fresh
+  * install context and traversed; then everything in the context is marked for removal and the
+  * marked symlinks below config_path are removed (only simulated with UNIT_FILE_DRY_RUN).
+  * Returns a negative value on failure. Otherwise returns a boolean: true if a modification was
+  * recorded in *changes or any named unit carried install rules or Also= entries.
+  * NOTE(review): 'changes' and 'n_changes' are dereferenced unconditionally in the final return;
+  * confirm that every caller passes non-NULL pointers. */
+ _cleanup_(install_context_done) InstallContext ctx = { .scope = scope };
+ _cleanup_set_free_free_ Set *remove_symlinks_to = NULL;
+ InstallInfo *info;
+ bool has_install_info = false;
+ int r;
+
+ STRV_FOREACH(name, names) {
+ if (!unit_name_is_valid(*name, UNIT_NAME_ANY))
+ return install_changes_add(changes, n_changes, -EUCLEAN, *name, NULL);
+
+ r = install_info_add(&ctx, *name, NULL, lp->root_dir, /* auxiliary= */ false, &info);
+ if (r >= 0)
+ r = install_info_traverse(&ctx, lp, info, SEARCH_LOAD|SEARCH_FOLLOW_CONFIG_SYMLINKS, NULL);
+
+ if (r < 0)
+ return install_changes_add(changes, n_changes, r, *name, NULL);
+
+ /* If we enable multiple units, some with install info and others without,
+ * the "empty [Install] section" warning is not shown. Let's make the behavior
+ * of disable align with that. */
+ has_install_info = has_install_info || install_info_has_rules(info) || install_info_has_also(info);
+ }
+
+ r = install_context_mark_for_removal(&ctx, lp, &remove_symlinks_to, config_path, changes, n_changes);
+ if (r >= 0)
+ r = remove_marked_symlinks(remove_symlinks_to, config_path, lp, flags & UNIT_FILE_DRY_RUN, changes, n_changes);
+
+ if (r < 0)
+ return r;
+
+ /* The warning is shown only if it's a no-op */
+ return install_changes_have_modification(*changes, *n_changes) || has_install_info;
+}
+
+int unit_file_disable(
+                RuntimeScope scope,
+                UnitFileFlags flags,
+                const char *root_dir,
+                char **files,
+                InstallChange **changes,
+                size_t *n_changes) {
+
+        _cleanup_(lookup_paths_free) LookupPaths lp = {};
+        const char *where;
+        int rc;
+
+        /* Public entry point for disabling: initializes the lookup paths for the scope, resolves the
+         * configuration directory matching the flags (-ENXIO if there is none) and delegates to
+         * do_unit_file_disable(). */
+
+        assert(scope >= 0);
+        assert(scope < _RUNTIME_SCOPE_MAX);
+
+        rc = lookup_paths_init(&lp, scope, 0, root_dir);
+        if (rc < 0)
+                return rc;
+
+        where = config_path_from_flags(&lp, flags);
+        if (where)
+                return do_unit_file_disable(&lp, scope, flags, where, files, changes, n_changes);
+
+        return -ENXIO;
+}
+
+static int normalize_linked_files(
+ RuntimeScope scope,
+ const LookupPaths *lp,
+ char **names_or_paths,
+ char ***ret_names,
+ char ***ret_files) {
+
+ /* This is similar to normalize_filenames()/normalize_names() in src/systemctl/,
+ * but operates on real unit names. For each argument we look up the actual path
+ * where the unit is found. This way linked units can be re-enabled successfully.
+ *
+ * On success *ret_names receives the filename component of every argument, and *ret_files
+ * receives, per argument, either the discovered unit path with the i->root prefix stripped or,
+ * when no such path is available, the argument itself. Discovery is only attempted for arguments
+ * that are not paths, and its failure is merely logged. Arguments that path_extract_filename()
+ * flags as directories are refused with EISDIR. Both outputs are only written on success. */
+
+ _cleanup_strv_free_ char **files = NULL, **names = NULL;
+ int r;
+
+ STRV_FOREACH(a, names_or_paths) {
+ _cleanup_(install_context_done) InstallContext ctx = { .scope = scope };
+ InstallInfo *i = NULL;
+ _cleanup_free_ char *n = NULL;
+
+ r = path_extract_filename(*a, &n);
+ if (r < 0)
+ return r;
+ if (r == O_DIRECTORY)
+ return log_debug_errno(SYNTHETIC_ERRNO(EISDIR),
+ "Unexpected path to a directory \"%s\", refusing.", *a);
+
+ if (!is_path(*a)) {
+ r = install_info_discover(&ctx, lp, n, SEARCH_LOAD|SEARCH_FOLLOW_CONFIG_SYMLINKS, &i, NULL, NULL);
+ if (r < 0)
+ log_debug_errno(r, "Failed to discover unit \"%s\", operating on name: %m", n);
+ }
+
+ r = strv_consume(&names, TAKE_PTR(n));
+ if (r < 0)
+ return r;
+
+ const char *p = NULL;
+ if (i && i->path && i->root)
+ /* Use startswith here, because we know that paths are normalized, and
+ * path_startswith() would give us a relative path, but we need an absolute path
+ * relative to i->root.
+ *
+ * In other words: /var/tmp/instroot.1234/etc/systemd/system/frobnicator.service
+ * is replaced by /etc/systemd/system/frobnicator.service, which is "absolute"
+ * in a sense, but only makes sense "relative" to /var/tmp/instroot.1234/.
+ */
+ p = startswith(i->path, i->root);
+
+ r = strv_extend(&files, p ?: *a); /* copied here, before ctx (which owns i) goes out of scope */
+ if (r < 0)
+ return r;
+ }
+
+ *ret_names = TAKE_PTR(names);
+ *ret_files = TAKE_PTR(files);
+ return 0;
+}
+
+int unit_file_reenable(
+                RuntimeScope scope,
+                UnitFileFlags flags,
+                const char *root_dir,
+                char **names_or_paths,
+                InstallChange **changes,
+                size_t *n_changes) {
+
+        _cleanup_(lookup_paths_free) LookupPaths lp = {};
+        _cleanup_strv_free_ char **names = NULL, **files = NULL;
+        int r;
+
+        /* Disable followed by enable. The arguments are first split into plain unit names (used for
+         * the disable step) and file paths (used for the enable step). Any failure before the enable
+         * step is returned as is. */
+
+        assert(scope >= 0);
+        assert(scope < _RUNTIME_SCOPE_MAX);
+
+        r = lookup_paths_init(&lp, scope, 0, root_dir);
+        if (r < 0)
+                return r;
+
+        const char *config_path = config_path_from_flags(&lp, flags);
+        if (!config_path)
+                return -ENXIO;
+
+        r = normalize_linked_files(scope, &lp, names_or_paths, &names, &files);
+        if (r >= 0)
+                /* First, the disable command is invoked with only the basename... */
+                r = do_unit_file_disable(&lp, scope, flags, config_path, names, changes, n_changes);
+        if (r < 0)
+                return r;
+
+        /* ...but the enable command with the full name. */
+        return do_unit_file_enable(&lp, scope, flags, config_path, files, changes, n_changes);
+}
+
+int unit_file_set_default(
+ RuntimeScope scope,
+ UnitFileFlags flags,
+ const char *root_dir,
+ const char *name,
+ InstallChange **changes,
+ size_t *n_changes) {
+ /* Points the SPECIAL_DEFAULT_TARGET symlink at the unit file of 'name'. 'name' must be a target
+  * unit and must not be SPECIAL_DEFAULT_TARGET itself, otherwise -EINVAL is returned. The symlink is
+  * always created below lp.persistent_config; of 'flags' only UNIT_FILE_FORCE is looked at here. */
+ _cleanup_(lookup_paths_free) LookupPaths lp = {};
+ _cleanup_(install_context_done) InstallContext ctx = { .scope = scope };
+ InstallInfo *info;
+ const char *new_path;
+ int r;
+
+ assert(scope >= 0);
+ assert(scope < _RUNTIME_SCOPE_MAX);
+ assert(name);
+
+ if (unit_name_to_type(name) != UNIT_TARGET) /* this also validates the name */
+ return -EINVAL;
+ if (streq(name, SPECIAL_DEFAULT_TARGET))
+ return -EINVAL;
+
+ r = lookup_paths_init(&lp, scope, 0, root_dir);
+ if (r < 0)
+ return r;
+
+ r = install_info_discover_and_check(&ctx, &lp, name, 0, &info, changes, n_changes);
+ if (r < 0)
+ return r;
+
+ new_path = strjoina(lp.persistent_config, "/" SPECIAL_DEFAULT_TARGET); /* new_path is const char* and is never freed in this function */
+ return create_symlink(&lp, info->path, new_path, flags & UNIT_FILE_FORCE, changes, n_changes);
+}
+
+int unit_file_get_default(
+                RuntimeScope scope,
+                const char *root_dir,
+                char **name) {
+
+        _cleanup_(lookup_paths_free) LookupPaths lp = {};
+        _cleanup_(install_context_done) InstallContext ctx = { .scope = scope };
+        InstallInfo *resolved;
+        char *copy;
+        int r;
+
+        /* Resolves SPECIAL_DEFAULT_TARGET, following configuration symlinks, and hands back a newly
+         * allocated copy of the resolved unit name in *name. *name is left untouched on failure. */
+
+        assert(scope >= 0);
+        assert(scope < _RUNTIME_SCOPE_MAX);
+        assert(name);
+
+        r = lookup_paths_init(&lp, scope, 0, root_dir);
+        if (r < 0)
+                return r;
+
+        r = install_info_discover(
+                        &ctx, &lp, SPECIAL_DEFAULT_TARGET, SEARCH_FOLLOW_CONFIG_SYMLINKS,
+                        &resolved, NULL, NULL);
+        if (r < 0)
+                return r;
+
+        /* The info object belongs to ctx, which is torn down on return, hence the copy. */
+        copy = strdup(resolved->name);
+        if (!copy)
+                return -ENOMEM;
+
+        *name = copy;
+        return 0;
+}
+
+int unit_file_lookup_state(
+ RuntimeScope scope,
+ const LookupPaths *lp,
+ const char *name,
+ UnitFileState *ret) {
+ /* Determines the enablement state of unit 'name'. Returns 0 on success, storing the state in *ret
+  * unless ret is NULL (in which case the call merely checks that the unit can be discovered),
+  * -EINVAL for an invalid unit name, or the error of the failing lookup step. For regular unit files
+  * the checks run in this order and the first match wins: alias, generated, transient, state reported
+  * by find_symlinks_in_scope() for the unit's own symlinks, indirect (known under other names), and
+  * finally disabled/indirect/static depending on the install information. */
+ _cleanup_(install_context_done) InstallContext ctx = { .scope = scope };
+ InstallInfo *info;
+ UnitFileState state;
+ int r;
+
+ assert(lp);
+ assert(name);
+
+ if (!unit_name_is_valid(name, UNIT_NAME_ANY))
+ return -EINVAL;
+
+ r = install_info_discover(&ctx, lp, name, SEARCH_LOAD|SEARCH_FOLLOW_CONFIG_SYMLINKS,
+ &info, NULL, NULL);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to discover unit %s: %m", name);
+
+ assert(IN_SET(info->install_mode, INSTALL_MODE_REGULAR, INSTALL_MODE_MASKED));
+ log_debug("Found unit %s at %s (%s)", name, strna(info->path),
+ info->install_mode == INSTALL_MODE_REGULAR ? "regular file" : "mask");
+
+ /* Shortcut things, if the caller just wants to know if this unit exists. */
+ if (!ret)
+ return 0;
+
+ switch (info->install_mode) {
+
+ case INSTALL_MODE_MASKED:
+ r = path_is_runtime(lp, info->path, true);
+ if (r < 0)
+ return r;
+
+ state = r > 0 ? UNIT_FILE_MASKED_RUNTIME : UNIT_FILE_MASKED;
+ break;
+
+ case INSTALL_MODE_REGULAR:
+ /* Check if the name we were querying is actually an alias */
+ if (!streq(name, basename(info->path)) && !unit_name_is_valid(info->name, UNIT_NAME_INSTANCE)) {
+ state = UNIT_FILE_ALIAS;
+ break;
+ }
+
+ r = path_is_generator(lp, info->path);
+ if (r < 0)
+ return r;
+ if (r > 0) {
+ state = UNIT_FILE_GENERATED;
+ break;
+ }
+
+ r = path_is_transient(lp, info->path);
+ if (r < 0)
+ return r;
+ if (r > 0) {
+ state = UNIT_FILE_TRANSIENT;
+ break;
+ }
+
+ /* Check if any of the Alias= symlinks have been created.
+ * We ignore other aliases, and only check those that would
+ * be created by systemctl enable for this unit. */
+ r = find_symlinks_in_scope(scope, lp, info, true, &state);
+ if (r < 0)
+ return r;
+ if (r > 0) /* 'state' is the one handed to find_symlinks_in_scope() above */
+ break;
+
+ /* Check if the file is known under other names. If it is,
+ * it might be in use. Report that as UNIT_FILE_INDIRECT. */
+ r = find_symlinks_in_scope(scope, lp, info, false, &state);
+ if (r < 0)
+ return r;
+ if (r > 0)
+ state = UNIT_FILE_INDIRECT;
+ else {
+ if (install_info_has_rules(info))
+ state = UNIT_FILE_DISABLED;
+ else if (install_info_has_also(info))
+ state = UNIT_FILE_INDIRECT;
+ else
+ state = UNIT_FILE_STATIC;
+ }
+
+ break;
+
+ default:
+ assert_not_reached();
+ }
+
+ *ret = state;
+ return 0;
+}
+
+int unit_file_get_state(
+                RuntimeScope scope,
+                const char *root_dir,
+                const char *name,
+                UnitFileState *ret) {
+
+        _cleanup_(lookup_paths_free) LookupPaths lp = {};
+        int rc;
+
+        /* Convenience wrapper: builds the lookup paths for scope/root_dir and then asks
+         * unit_file_lookup_state() for the state of 'name'. */
+
+        assert(scope >= 0);
+        assert(scope < _RUNTIME_SCOPE_MAX);
+        assert(name);
+
+        rc = lookup_paths_init(&lp, scope, 0, root_dir);
+
+        return rc < 0 ? rc : unit_file_lookup_state(scope, &lp, name, ret);
+}
+
+int unit_file_exists(RuntimeScope scope, const LookupPaths *lp, const char *name) {
+        _cleanup_(install_context_done) InstallContext c = { .scope = scope };
+        int found;
+
+        /* Tri-state answer: 1 if the unit can be discovered, 0 if discovery fails with -ENOENT, and a
+         * negative value for every other failure (including -EINVAL for an invalid unit name). */
+
+        assert(lp);
+        assert(name);
+
+        if (!unit_name_is_valid(name, UNIT_NAME_ANY))
+                return -EINVAL;
+
+        found = install_info_discover(&c, lp, name, 0, NULL, NULL, NULL);
+        if (found >= 0)
+                return 1;
+
+        return found == -ENOENT ? 0 : found;
+}
+
+static int split_pattern_into_name_and_instances(const char *pattern, char **out_unit_name, char ***out_instances) {
+        _cleanup_strv_free_ char **instances = NULL;
+        _cleanup_free_ char *unit_name = NULL;
+        int r;
+
+        /* Splits a preset rule argument such as "foo@.service a b c" into the unit name (first word)
+         * and the list of instance names (remaining words).
+         *
+         * On success returns 0 and always writes both outputs: *out_unit_name receives the newly
+         * allocated name and *out_instances the newly allocated instance list, or NULL if the pattern
+         * holds nothing but the name. On failure neither output is touched. Returns -EINVAL if no name
+         * could be extracted or if instances are given for a name that is not a template, -ENOMEM on
+         * allocation failure, or the error of extract_first_word(). */
+
+        assert(pattern);
+        assert(out_instances);
+        assert(out_unit_name);
+
+        r = extract_first_word(&pattern, &unit_name, NULL, EXTRACT_RETAIN_ESCAPE);
+        if (r < 0)
+                return r;
+        if (!unit_name)
+                /* Nothing was extracted (empty pattern). Refuse rather than hand a NULL name back to
+                 * the caller as if it were a success. */
+                return -EINVAL;
+
+        /* We handle the instances logic when unit name is extracted */
+        if (pattern) {
+                /* We only create instances when a rule of templated unit
+                 * is seen. A rule like enable foo@.service a b c will
+                 * result in an array of (a, b, c) as instance names */
+                if (!unit_name_is_valid(unit_name, UNIT_NAME_TEMPLATE))
+                        return -EINVAL;
+
+                instances = strv_split(pattern, WHITESPACE);
+                if (!instances)
+                        return -ENOMEM;
+        }
+
+        /* Written unconditionally, so that callers never read an unset pointer. */
+        *out_instances = TAKE_PTR(instances);
+        *out_unit_name = TAKE_PTR(unit_name);
+
+        return 0;
+}
+
+static int presets_find_config(RuntimeScope scope, const char *root_dir, char ***files) {
+        static const char* const system_dirs[] = {CONF_PATHS("systemd/system-preset"), NULL};
+        static const char* const user_dirs[] = {CONF_PATHS_USR("systemd/user-preset"), NULL};
+        const char* const* dirs;
+
+        /* Lists the ".preset" files that apply to the given scope: the system preset directories for
+         * the system scope, the user preset directories for the global and user scopes. */
+
+        assert(scope >= 0);
+        assert(scope < _RUNTIME_SCOPE_MAX);
+
+        switch (scope) {
+
+        case RUNTIME_SCOPE_SYSTEM:
+                dirs = system_dirs;
+                break;
+
+        case RUNTIME_SCOPE_GLOBAL:
+        case RUNTIME_SCOPE_USER:
+                dirs = user_dirs;
+                break;
+
+        default:
+                assert_not_reached();
+        }
+
+        return conf_files_list_strv(files, ".preset", root_dir, 0, dirs);
+}
+/* Parses the preset files applying to `scope` below `root_dir` and stores the resulting rule
+ * list in *presets. Rules are appended in the order they are read: files in the order returned
+ * by presets_find_config(), lines from top to bottom. Files that cannot be opened because of
+ * ENOENT are skipped, empty lines and lines starting with a comment character are ignored, and
+ * lines that cannot be parsed are logged and skipped.
+ *
+ * Returns 0 on success, in which case *presets is marked initialized. On failure a negative
+ * errno-style value is returned and *presets is not written (parsing happens into the local
+ * `ps`, which is only handed over at the very end). */
+static int read_presets(RuntimeScope scope, const char *root_dir, UnitFilePresets *presets) {
+ _cleanup_(unit_file_presets_done) UnitFilePresets ps = {};
+ _cleanup_strv_free_ char **files = NULL;
+ int r;
+
+ assert(scope >= 0);
+ assert(scope < _RUNTIME_SCOPE_MAX);
+ assert(presets);
+
+ r = presets_find_config(scope, root_dir, &files);
+ if (r < 0)
+ return r;
+
+ STRV_FOREACH(p, files) {
+ _cleanup_fclose_ FILE *f = NULL;
+ int n = 0; /* line counter, only used for the diagnostics below */
+
+ f = fopen(*p, "re");
+ if (!f) {
+ if (errno == ENOENT)
+ continue;
+
+ return -errno;
+ }
+
+ for (;;) {
+ _cleanup_free_ char *line = NULL;
+ _cleanup_(unit_file_preset_rule_done) UnitFilePresetRule rule = {};
+ const char *parameter;
+
+ r = read_stripped_line(f, LONG_LINE_MAX, &line);
+ if (r < 0)
+ return r;
+ if (r == 0) /* no further line in this file */
+ break;
+
+ n++;
+
+ if (isempty(line))
+ continue;
+ if (strchr(COMMENTS, line[0]))
+ continue;
+
+ parameter = first_word(line, "enable");
+ if (parameter) {
+ char *unit_name;
+ char **instances = NULL; /* stays NULL unless the rule lists instances */
+
+ /* Unit_name will remain the same as parameter when no instances are specified */
+ r = split_pattern_into_name_and_instances(parameter, &unit_name, &instances);
+ if (r < 0) {
+ log_syntax(NULL, LOG_WARNING, *p, n, r, "Couldn't parse line '%s'. Ignoring.", line);
+ continue;
+ }
+
+ rule = (UnitFilePresetRule) {
+ .pattern = unit_name,
+ .action = PRESET_ENABLE,
+ .instances = instances,
+ };
+ }
+
+ parameter = first_word(line, "disable");
+ if (parameter) {
+ char *pattern;
+
+ pattern = strdup(parameter);
+ if (!pattern)
+ return -ENOMEM;
+
+ rule = (UnitFilePresetRule) {
+ .pattern = pattern,
+ .action = PRESET_DISABLE,
+ };
+ }
+
+ parameter = first_word(line, "ignore");
+ if (parameter) {
+ char *pattern;
+
+ pattern = strdup(parameter);
+ if (!pattern)
+ return -ENOMEM;
+
+ rule = (UnitFilePresetRule) {
+ .pattern = pattern,
+ .action = PRESET_IGNORE,
+ };
+ }
+
+ if (rule.action) { /* non-zero only if one of the three keywords above matched */
+ if (!GREEDY_REALLOC(ps.rules, ps.n_rules + 1))
+ return -ENOMEM;
+
+ ps.rules[ps.n_rules++] = TAKE_STRUCT(rule); /* hand the rule over to the array */
+ continue;
+ }
+
+ log_syntax(NULL, LOG_WARNING, *p, n, 0, "Couldn't parse line '%s'. Ignoring.", line);
+ }
+ }
+
+ ps.initialized = true;
+ *presets = TAKE_STRUCT(ps);
+
+ return 0;
+}
+
+/* Checks whether a rule that carries an explicit instance list applies to unit_name.
+ *
+ * Returns 0 (no match) if the caller passed no `ret`, if the rule has no instance list, or if
+ * the rule's pattern differs from the template name derived from unit_name. Otherwise:
+ *   - if unit_name is itself a template, *ret receives one newly allocated unit name per
+ *     listed instance and 1 is returned;
+ *   - if unit_name is an instance, 1 is returned when its instance string is listed and 0 when
+ *     it is not; *ret is not written in this case.
+ * Failures of the unit name helpers are returned as negative values. */
+static int pattern_match_multiple_instances(
+                const UnitFilePresetRule rule,
+                const char *unit_name,
+                char ***ret) {
+
+        _cleanup_free_ char *template_name = NULL;
+        _cleanup_strv_free_ char **expanded = NULL;
+        int k;
+
+        if (!ret || !rule.instances)
+                return 0;
+
+        k = unit_name_template(unit_name, &template_name);
+        if (k < 0)
+                return k;
+        if (!streq(rule.pattern, template_name))
+                return 0;
+
+        if (!unit_name_is_valid(unit_name, UNIT_NAME_TEMPLATE)) {
+                /* Not a template, hence an instance name: look its instance string up in the list. */
+                _cleanup_free_ char *instance = NULL;
+
+                k = unit_name_to_instance(unit_name, &instance);
+                if (k < 0)
+                        return k;
+
+                return strv_find(rule.instances, instance) ? 1 : 0;
+        }
+
+        /* A template: instantiate it once for every instance the rule lists. */
+        STRV_FOREACH(inst, rule.instances) {
+                _cleanup_free_ char *full = NULL;
+
+                k = unit_name_replace_instance(unit_name, *inst, &full);
+                if (k < 0)
+                        return k;
+
+                k = strv_consume(&expanded, TAKE_PTR(full));
+                if (k < 0)
+                        return k;
+        }
+
+        *ret = TAKE_PTR(expanded);
+        return 1;
+}
+
+/* Looks up which preset action applies to unit `name`. The first rule that matches wins; a rule
+ * matches either through its instance list (see pattern_match_multiple_instances(), which may
+ * fill in *instance_name_list) or because its pattern matches `name` as a glob. If no rule
+ * matches, the unit is treated as enabled.
+ *
+ * Returns one of PRESET_ENABLE, PRESET_DISABLE, PRESET_IGNORE, or -EINVAL if `name` is not a
+ * valid unit name. */
+static int query_presets(const char *name, const UnitFilePresets *presets, char ***instance_name_list) {
+        PresetAction verdict = PRESET_UNKNOWN;
+
+        if (!unit_name_is_valid(name, UNIT_NAME_ANY))
+                return -EINVAL;
+
+        for (size_t idx = 0; idx < presets->n_rules; idx++) {
+                bool hit;
+
+                /* Errors of the instance matcher are not fatal here, they count as "no match". */
+                hit = pattern_match_multiple_instances(presets->rules[idx], name, instance_name_list) > 0;
+                if (!hit)
+                        hit = fnmatch(presets->rules[idx].pattern, name, FNM_NOESCAPE) == 0;
+                if (!hit)
+                        continue;
+
+                verdict = presets->rules[idx].action;
+                break;
+        }
+
+        switch (verdict) {
+
+        case PRESET_UNKNOWN:
+                log_debug("Preset files don't specify rule for %s. Enabling.", name);
+                return PRESET_ENABLE;
+
+        case PRESET_ENABLE:
+                if (!instance_name_list || !*instance_name_list) {
+                        log_debug("Preset files say enable %s.", name);
+                        return PRESET_ENABLE;
+                }
+
+                STRV_FOREACH(s, *instance_name_list)
+                        log_debug("Preset files say enable %s.", *s);
+                return PRESET_ENABLE;
+
+        case PRESET_DISABLE:
+                log_debug("Preset files say disable %s.", name);
+                return PRESET_DISABLE;
+
+        case PRESET_IGNORE:
+                log_debug("Preset files say ignore %s.", name);
+                return PRESET_IGNORE;
+
+        default:
+                assert_not_reached();
+        }
+}
+
+/* Returns the preset action for unit `name`. If the caller supplies `cached`, the preset files
+ * are parsed into it on first use and reused afterwards; without it a temporary rule set is
+ * read and released again before returning. A failure to read the preset files is returned as
+ * a negative value. */
+PresetAction unit_file_query_preset(RuntimeScope scope, const char *root_dir, const char *name, UnitFilePresets *cached) {
+        _cleanup_(unit_file_presets_done) UnitFilePresets tmp = {};
+        UnitFilePresets *rules = cached ? cached : &tmp;
+        int r;
+
+        if (!rules->initialized) {
+                r = read_presets(scope, root_dir, rules);
+                if (r < 0)
+                        return r;
+        }
+
+        return query_presets(name, rules, NULL);
+}
+
+/* Applies a prepared preset operation: unless `mode` is "enable only", the symlinks of the units
+ * collected in `minus` are removed; unless it is "disable only", the units collected in `plus`
+ * are installed. The `files` parameter is not used by this function.
+ *
+ * If marking units for removal fails, that error is returned right away. Otherwise the
+ * installation step is still attempted even if removing symlinks failed; the removal error
+ * then takes precedence, followed by an installation error. On success the results of both
+ * steps are added up. */
+static int execute_preset(
+                UnitFileFlags file_flags,
+                InstallContext *plus,
+                InstallContext *minus,
+                const LookupPaths *lp,
+                const char *config_path,
+                char **files,
+                UnitFilePresetMode mode,
+                InstallChange **changes,
+                size_t *n_changes) {
+
+        int removed = 0, installed;
+
+        assert(plus);
+        assert(minus);
+        assert(lp);
+        assert(config_path);
+
+        if (mode != UNIT_FILE_PRESET_ENABLE_ONLY) {
+                _cleanup_set_free_free_ Set *remove_symlinks_to = NULL;
+
+                removed = install_context_mark_for_removal(minus, lp, &remove_symlinks_to, config_path, changes, n_changes);
+                if (removed < 0)
+                        return removed;
+
+                removed = remove_marked_symlinks(remove_symlinks_to, config_path, lp, false, changes, n_changes);
+        }
+
+        if (mode == UNIT_FILE_PRESET_DISABLE_ONLY)
+                return removed;
+
+        /* Returns number of symlinks that were supposed to be installed. */
+        installed = install_context_apply(plus, lp,
+                                          file_flags | UNIT_FILE_IGNORE_AUXILIARY_FAILURE,
+                                          config_path,
+                                          SEARCH_LOAD, changes, n_changes);
+
+        if (removed < 0)
+                return removed;
+        if (installed < 0)
+                return installed;
+
+        return removed + installed;
+}
+
+/* Sorts one unit into the `plus` (to be enabled) or `minus` (to be disabled) install context
+ * according to the preset rules. Units already present in either context and names that turn
+ * out to be aliases of another unit are skipped with a return value of 0. If the matching rule
+ * lists instances, each listed instance is added to `plus` instead of `name` itself.
+ *
+ * Returns a negative value on error; non-negative return values are whatever the last helper
+ * called (or query_presets(), for ignored units) returned. */
+static int preset_prepare_one(
+                RuntimeScope scope,
+                InstallContext *plus,
+                InstallContext *minus,
+                LookupPaths *lp,
+                const char *name,
+                const UnitFilePresets *presets,
+                InstallChange **changes,
+                size_t *n_changes) {
+
+        _cleanup_(install_context_done) InstallContext tmp = { .scope = scope };
+        _cleanup_strv_free_ char **instance_name_list = NULL;
+        InstallInfo *info;
+        int r;
+
+        if (install_info_find(plus, name) || install_info_find(minus, name))
+                return 0;
+
+        /* Resolve the name in a scratch context first, so that aliases can be told apart. */
+        r = install_info_discover(&tmp, lp, name, SEARCH_FOLLOW_CONFIG_SYMLINKS,
+                                  &info, changes, n_changes);
+        if (r < 0)
+                return r;
+        if (!streq(name, info->name)) {
+                log_debug("Skipping %s because it is an alias for %s.", name, info->name);
+                return 0;
+        }
+
+        r = query_presets(name, presets, &instance_name_list);
+        if (r < 0)
+                return r;
+
+        switch (r) {
+
+        case PRESET_ENABLE:
+                if (!instance_name_list)
+                        return install_info_discover_and_check(plus, lp, name, SEARCH_LOAD|SEARCH_FOLLOW_CONFIG_SYMLINKS,
+                                                               &info, changes, n_changes);
+
+                STRV_FOREACH(s, instance_name_list) {
+                        r = install_info_discover_and_check(plus, lp, *s, SEARCH_LOAD|SEARCH_FOLLOW_CONFIG_SYMLINKS,
+                                                            &info, changes, n_changes);
+                        if (r < 0)
+                                return r;
+                }
+
+                return r;
+
+        case PRESET_DISABLE:
+                return install_info_discover(minus, lp, name, SEARCH_FOLLOW_CONFIG_SYMLINKS,
+                                             &info, changes, n_changes);
+
+        default:
+                return r;
+        }
+}
+
+/* Applies the preset policy to the units listed in `names`. Symlinks are managed in the runtime
+ * configuration directory if UNIT_FILE_RUNTIME is set in `file_flags`, in the persistent one
+ * otherwise; -ENXIO is returned if the selected directory is not known. All units are
+ * classified first and the changes are only carried out afterwards by execute_preset(), whose
+ * result is returned. Any earlier failure is returned as a negative value. */
+int unit_file_preset(
+                RuntimeScope scope,
+                UnitFileFlags file_flags,
+                const char *root_dir,
+                char **names,
+                UnitFilePresetMode mode,
+                InstallChange **changes,
+                size_t *n_changes) {
+
+        _cleanup_(install_context_done) InstallContext plus = {}, minus = {};
+        _cleanup_(lookup_paths_free) LookupPaths paths = {};
+        _cleanup_(unit_file_presets_done) UnitFilePresets rules = {};
+        const char *target_dir;
+        int r;
+
+        assert(scope >= 0);
+        assert(scope < _RUNTIME_SCOPE_MAX);
+        assert(mode < _UNIT_FILE_PRESET_MODE_MAX);
+
+        r = lookup_paths_init(&paths, scope, 0, root_dir);
+        if (r < 0)
+                return r;
+
+        if (file_flags & UNIT_FILE_RUNTIME)
+                target_dir = paths.runtime_config;
+        else
+                target_dir = paths.persistent_config;
+        if (!target_dir)
+                return -ENXIO;
+
+        r = read_presets(scope, root_dir, &rules);
+        if (r < 0)
+                return r;
+
+        STRV_FOREACH(unit, names) {
+                r = preset_prepare_one(scope, &plus, &minus, &paths, *unit, &rules, changes, n_changes);
+                if (r < 0)
+                        return r;
+        }
+
+        return execute_preset(file_flags, &plus, &minus, &paths, target_dir, names, mode, changes, n_changes);
+}
+
+/* Applies the preset policy to every unit file found in the unit search path. Each directory of
+ * the search path is scanned for regular files and symlinks with valid unit names; missing
+ * directories are skipped. A fixed set of per-unit errors is tolerated while classifying (see
+ * the comment in the loop), all other errors abort the operation. The collected changes are
+ * then carried out by execute_preset(), whose result is returned. */
+int unit_file_preset_all(
+                RuntimeScope scope,
+                UnitFileFlags file_flags,
+                const char *root_dir,
+                UnitFilePresetMode mode,
+                InstallChange **changes,
+                size_t *n_changes) {
+
+        _cleanup_(install_context_done) InstallContext plus = {}, minus = {};
+        _cleanup_(lookup_paths_free) LookupPaths paths = {};
+        _cleanup_(unit_file_presets_done) UnitFilePresets rules = {};
+        const char *target_dir = NULL;
+        int r;
+
+        assert(scope >= 0);
+        assert(scope < _RUNTIME_SCOPE_MAX);
+        assert(mode < _UNIT_FILE_PRESET_MODE_MAX);
+
+        r = lookup_paths_init(&paths, scope, 0, root_dir);
+        if (r < 0)
+                return r;
+
+        if (file_flags & UNIT_FILE_RUNTIME)
+                target_dir = paths.runtime_config;
+        else
+                target_dir = paths.persistent_config;
+        if (!target_dir)
+                return -ENXIO;
+
+        r = read_presets(scope, root_dir, &rules);
+        if (r < 0)
+                return r;
+
+        STRV_FOREACH(dir, paths.search_path) {
+                _cleanup_closedir_ DIR *d = NULL;
+
+                d = opendir(*dir);
+                if (!d) {
+                        if (errno != ENOENT)
+                                return -errno;
+
+                        continue;
+                }
+
+                FOREACH_DIRENT(de, d, return -errno) {
+
+                        if (!unit_name_is_valid(de->d_name, UNIT_NAME_ANY) ||
+                            !IN_SET(de->d_type, DT_LNK, DT_REG))
+                                continue;
+
+                        r = preset_prepare_one(scope, &plus, &minus, &paths, de->d_name, &rules, changes, n_changes);
+                        if (r >= 0)
+                                continue;
+
+                        /* Ignore generated/transient/missing/invalid units when applying preset, propagate other errors.
+                         * Coordinate with install_changes_dump() above. */
+                        if (IN_SET(r, -EEXIST, -ERFKILL, -EADDRNOTAVAIL, -EBADSLT, -EIDRM, -EUCLEAN, -ELOOP, -ENOENT, -EUNATCH, -EXDEV))
+                                continue;
+
+                        return r;
+                }
+        }
+
+        return execute_preset(file_flags, &plus, &minus, &paths, target_dir, NULL, mode, changes, n_changes);
+}
+
+/* Releases a UnitFileList entry together with the path string it owns. Accepts NULL and always
+ * returns NULL, so that the result can be assigned back to the pointer being freed. */
+static UnitFileList* unit_file_list_free(UnitFileList *f) {
+        if (f) {
+                free(f->path);
+                free(f);
+        }
+
+        return NULL;
+}
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(UnitFileList*, unit_file_list_free);
+
+DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(
+                unit_file_list_hash_ops_free,
+                char,
+                string_hash_func,
+                string_compare_func,
+                UnitFileList,
+                unit_file_list_free);
+
+/* Fills the hashmap `h` with one UnitFileList entry per unit file found in the unit search path.
+ * Only regular files and symlinks with valid unit names are considered; names already present
+ * in `h` are not added again, so the first directory providing a name wins. `patterns` (glob
+ * patterns on the file name) and `states` (state names) act as filters; an empty list means no
+ * filtering. Units whose state cannot be determined are listed as UNIT_FILE_BAD.
+ *
+ * Search path entries that do not exist are skipped silently; entries that are not directories
+ * or not accessible are skipped with a debug message. Returns 0 on success, a negative
+ * errno-style value otherwise. Entries are keyed by the file name part of their own path. */
+int unit_file_get_list(
+                RuntimeScope scope,
+                const char *root_dir,
+                Hashmap *h,
+                char **states,
+                char **patterns) {
+
+        _cleanup_(lookup_paths_free) LookupPaths paths = {};
+        int r;
+
+        assert(scope >= 0);
+        assert(scope < _RUNTIME_SCOPE_MAX);
+        assert(h);
+
+        r = lookup_paths_init(&paths, scope, 0, root_dir);
+        if (r < 0)
+                return r;
+
+        STRV_FOREACH(dirname, paths.search_path) {
+                _cleanup_closedir_ DIR *d = NULL;
+
+                d = opendir(*dirname);
+                if (!d) {
+                        if (errno == ENOENT)
+                                continue;
+                        if (!IN_SET(errno, ENOTDIR, EACCES))
+                                return -errno;
+
+                        log_debug_errno(errno, "Failed to open \"%s\": %m", *dirname);
+                        continue;
+                }
+
+                FOREACH_DIRENT(de, d, return -errno) {
+                        _cleanup_(unit_file_list_freep) UnitFileList *entry = NULL;
+
+                        if (!unit_name_is_valid(de->d_name, UNIT_NAME_ANY) ||
+                            !strv_fnmatch_or_empty(patterns, de->d_name, FNM_NOESCAPE) ||
+                            hashmap_get(h, de->d_name) ||
+                            !IN_SET(de->d_type, DT_LNK, DT_REG))
+                                continue;
+
+                        entry = new0(UnitFileList, 1);
+                        if (!entry)
+                                return -ENOMEM;
+
+                        entry->path = path_make_absolute(de->d_name, *dirname);
+                        if (!entry->path)
+                                return -ENOMEM;
+
+                        r = unit_file_lookup_state(scope, &paths, de->d_name, &entry->state);
+                        if (r < 0)
+                                entry->state = UNIT_FILE_BAD;
+
+                        /* Apply the state filter; a filtered-out entry is released by the cleanup handler. */
+                        if (!strv_isempty(states) &&
+                            !strv_contains(states, unit_file_state_to_string(entry->state)))
+                                continue;
+
+                        r = hashmap_put(h, basename(entry->path), entry);
+                        if (r < 0)
+                                return r;
+
+                        TAKE_PTR(entry); /* now owned by the hashmap */
+                }
+        }
+
+        return 0;
+}
+/* Names of the unit file states, indexed by UnitFileState value. NOTE(review): the
+ * DEFINE_STRING_TABLE_LOOKUP() invocation below presumably generates the
+ * unit_file_state_to_string()/unit_file_state_from_string() pair declared in install.h; confirm
+ * against the macro definition. */
+static const char* const unit_file_state_table[_UNIT_FILE_STATE_MAX] = {
+ [UNIT_FILE_ENABLED] = "enabled",
+ [UNIT_FILE_ENABLED_RUNTIME] = "enabled-runtime",
+ [UNIT_FILE_LINKED] = "linked",
+ [UNIT_FILE_LINKED_RUNTIME] = "linked-runtime",
+ [UNIT_FILE_ALIAS] = "alias",
+ [UNIT_FILE_MASKED] = "masked",
+ [UNIT_FILE_MASKED_RUNTIME] = "masked-runtime",
+ [UNIT_FILE_STATIC] = "static",
+ [UNIT_FILE_DISABLED] = "disabled",
+ [UNIT_FILE_INDIRECT] = "indirect",
+ [UNIT_FILE_GENERATED] = "generated",
+ [UNIT_FILE_TRANSIENT] = "transient",
+ [UNIT_FILE_BAD] = "bad",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(unit_file_state, UnitFileState);
+/* Names of the non-negative InstallChangeType values, indexed by the enum value. Negative
+ * (errno-style) change types have no entry here. NOTE(review): the macro below presumably
+ * generates install_change_type_to_string()/install_change_type_from_string(). */
+static const char* const install_change_type_table[_INSTALL_CHANGE_TYPE_MAX] = {
+ [INSTALL_CHANGE_SYMLINK] = "symlink",
+ [INSTALL_CHANGE_UNLINK] = "unlink",
+ [INSTALL_CHANGE_IS_MASKED] = "masked",
+ [INSTALL_CHANGE_IS_MASKED_GENERATOR] = "masked by generator",
+ [INSTALL_CHANGE_IS_DANGLING] = "dangling",
+ [INSTALL_CHANGE_DESTINATION_NOT_PRESENT] = "destination not present",
+ [INSTALL_CHANGE_AUXILIARY_FAILED] = "auxiliary unit failed",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(install_change_type, InstallChangeType);
+/* Names of the preset modes, indexed by UnitFilePresetMode value. NOTE(review): the macro below
+ * presumably generates unit_file_preset_mode_to_string()/unit_file_preset_mode_from_string(). */
+static const char* const unit_file_preset_mode_table[_UNIT_FILE_PRESET_MODE_MAX] = {
+ [UNIT_FILE_PRESET_FULL] = "full",
+ [UNIT_FILE_PRESET_ENABLE_ONLY] = "enable-only",
+ [UNIT_FILE_PRESET_DISABLE_ONLY] = "disable-only",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(unit_file_preset_mode, UnitFilePresetMode);
diff --git a/src/shared/install.h b/src/shared/install.h
new file mode 100644
index 0000000..bc0c6db
--- /dev/null
+++ b/src/shared/install.h
@@ -0,0 +1,244 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+typedef enum UnitFilePresetMode UnitFilePresetMode;
+typedef enum InstallChangeType InstallChangeType;
+typedef enum UnitFileFlags UnitFileFlags;
+typedef enum InstallMode InstallMode;
+typedef struct InstallChange InstallChange;
+typedef struct UnitFileList UnitFileList;
+typedef struct InstallInfo InstallInfo;
+
+#include "hashmap.h"
+#include "macro.h"
+#include "path-lookup.h"
+#include "strv.h"
+#include "unit-file.h"
+#include "unit-name.h"
+/* Selects which parts of a preset operation are carried out: with ENABLE_ONLY no symlinks are
+ * removed, with DISABLE_ONLY none are installed, FULL does both (see execute_preset() in
+ * install.c). */
+enum UnitFilePresetMode {
+ UNIT_FILE_PRESET_FULL,
+ UNIT_FILE_PRESET_ENABLE_ONLY,
+ UNIT_FILE_PRESET_DISABLE_ONLY,
+ _UNIT_FILE_PRESET_MODE_MAX,
+ _UNIT_FILE_PRESET_MODE_INVALID = -EINVAL,
+};
+/* Kinds of entries that can be recorded in an InstallChange list. The named kinds are
+ * non-negative (starting at 0); the enum additionally spans the negative errno range so that
+ * error codes can be stored in the same type. */
+enum InstallChangeType {
+ INSTALL_CHANGE_SYMLINK,
+ INSTALL_CHANGE_UNLINK,
+ INSTALL_CHANGE_IS_MASKED,
+ INSTALL_CHANGE_IS_MASKED_GENERATOR,
+ INSTALL_CHANGE_IS_DANGLING,
+ INSTALL_CHANGE_DESTINATION_NOT_PRESENT,
+ INSTALL_CHANGE_AUXILIARY_FAILED,
+ _INSTALL_CHANGE_TYPE_MAX,
+ _INSTALL_CHANGE_INVALID = -EINVAL,
+ _INSTALL_CHANGE_ERRNO_MAX = -ERRNO_MAX, /* Ensure this type covers the whole negative errno range */
+};
+
+/* Returns true if `t` is either one of the named change types or lies within the negative errno
+ * range covered by the enum, i.e. -ERRNO_MAX <= t < _INSTALL_CHANGE_TYPE_MAX. */
+static inline bool INSTALL_CHANGE_TYPE_VALID(InstallChangeType t) {
+        if (t < _INSTALL_CHANGE_ERRNO_MAX)
+                return false;
+
+        return t < _INSTALL_CHANGE_TYPE_MAX;
+}
+/* Bit flags modifying unit file operations. The first three values are exposed over D-Bus and
+ * must keep their values; _UNIT_FILE_FLAGS_MASK_PUBLIC is the union of exactly those three. */
+enum UnitFileFlags {
+ UNIT_FILE_RUNTIME = 1 << 0, /* Public API via DBUS, do not change */
+ UNIT_FILE_FORCE = 1 << 1, /* Public API via DBUS, do not change */
+ UNIT_FILE_PORTABLE = 1 << 2, /* Public API via DBUS, do not change */
+ UNIT_FILE_DRY_RUN = 1 << 3,
+ UNIT_FILE_IGNORE_AUXILIARY_FAILURE = 1 << 4,
+ _UNIT_FILE_FLAGS_MASK_PUBLIC = UNIT_FILE_RUNTIME|UNIT_FILE_PORTABLE|UNIT_FILE_FORCE,
+};
+
+/* type can be either one of the INSTALL_CHANGE_SYMLINK, INSTALL_CHANGE_UNLINK, … listed above, or a negative
+ * errno value.
+ *
+ * If source is specified, it should be the contents of the path symlink. In case of an error, source should
+ * be the existing symlink contents or NULL. */
+struct InstallChange {
+ int type; /* INSTALL_CHANGE_SYMLINK, … if non-negative (INSTALL_CHANGE_SYMLINK itself is 0), errno if negative */
+ char *path;
+ char *source;
+};
+
+/* Returns true if at least one of the `n_changes` entries records an actual modification, i.e.
+ * a symlink that was created or removed. All other entry types (and an empty list) yield
+ * false. */
+static inline bool install_changes_have_modification(const InstallChange* changes, size_t n_changes) {
+        const InstallChange *c = changes;
+
+        for (size_t left = n_changes; left > 0; left--, c++) {
+                if (c->type == INSTALL_CHANGE_SYMLINK)
+                        return true;
+                if (c->type == INSTALL_CHANGE_UNLINK)
+                        return true;
+        }
+
+        return false;
+}
+/* One entry of the hashmap filled by unit_file_get_list(): the absolute path of a unit file and
+ * its state. The entry is inserted under basename(path) — presumably a pointer into `path`
+ * itself, so the key would live exactly as long as the entry; confirm which basename() is in
+ * use. */
+struct UnitFileList {
+ char *path;
+ UnitFileState state;
+};
+/* Stored in InstallInfo.install_mode. NOTE(review): judging by the names this describes how a
+ * unit file was found (regular file, linked, alias symlink, masked); the code assigning these
+ * values is not part of this header, confirm against install.c. */
+enum InstallMode {
+ INSTALL_MODE_REGULAR,
+ INSTALL_MODE_LINKED,
+ INSTALL_MODE_ALIAS,
+ INSTALL_MODE_MASKED,
+ _INSTALL_MODE_MAX,
+ _INSTALL_MODE_INVALID = -EINVAL,
+};
+/* Per-unit record used by the install logic; preset_prepare_one() in install.c compares `name`
+ * against the requested unit name to detect aliases. NOTE(review): the string lists
+ * presumably mirror the unit's [Install] section settings of the same names and are owned by
+ * this structure — confirm against the parser and destructor in install.c. */
+struct InstallInfo {
+ char *name;
+ char *path;
+ char *root;
+
+ char **aliases;
+ char **wanted_by;
+ char **required_by;
+ char **upheld_by;
+ char **also;
+
+ char *default_instance;
+ char *symlink_target;
+
+ InstallMode install_mode;
+ bool auxiliary;
+};
+
+int unit_file_enable(
+ RuntimeScope scope,
+ UnitFileFlags flags,
+ const char *root_dir,
+ char **names_or_paths,
+ InstallChange **changes,
+ size_t *n_changes);
+int unit_file_disable(
+ RuntimeScope scope,
+ UnitFileFlags flags,
+ const char *root_dir,
+ char **names,
+ InstallChange **changes,
+ size_t *n_changes);
+int unit_file_reenable(
+ RuntimeScope scope,
+ UnitFileFlags flags,
+ const char *root_dir,
+ char **names_or_paths,
+ InstallChange **changes,
+ size_t *n_changes);
+int unit_file_preset(
+ RuntimeScope scope,
+ UnitFileFlags flags,
+ const char *root_dir,
+ char **names,
+ UnitFilePresetMode mode,
+ InstallChange **changes,
+ size_t *n_changes);
+int unit_file_preset_all(
+ RuntimeScope scope,
+ UnitFileFlags flags,
+ const char *root_dir,
+ UnitFilePresetMode mode,
+ InstallChange **changes,
+ size_t *n_changes);
+int unit_file_mask(
+ RuntimeScope scope,
+ UnitFileFlags flags,
+ const char *root_dir,
+ char **names,
+ InstallChange **changes,
+ size_t *n_changes);
+int unit_file_unmask(
+ RuntimeScope scope,
+ UnitFileFlags flags,
+ const char *root_dir,
+ char **names,
+ InstallChange **changes,
+ size_t *n_changes);
+int unit_file_link(
+ RuntimeScope scope,
+ UnitFileFlags flags,
+ const char *root_dir,
+ char **files,
+ InstallChange **changes,
+ size_t *n_changes);
+int unit_file_revert(
+ RuntimeScope scope,
+ const char *root_dir,
+ char **names,
+ InstallChange **changes,
+ size_t *n_changes);
+int unit_file_set_default(
+ RuntimeScope scope,
+ UnitFileFlags flags,
+ const char *root_dir,
+ const char *name,
+ InstallChange **changes,
+ size_t *n_changes);
+int unit_file_get_default(
+ RuntimeScope scope,
+ const char *root_dir,
+ char **name);
+int unit_file_add_dependency(
+ RuntimeScope scope,
+ UnitFileFlags flags,
+ const char *root_dir,
+ char **names,
+ const char *target,
+ UnitDependency dep,
+ InstallChange **changes,
+ size_t *n_changes);
+
+int unit_file_lookup_state(
+ RuntimeScope scope,
+ const LookupPaths *paths,
+ const char *name,
+ UnitFileState *ret);
+
+int unit_file_get_state(RuntimeScope scope, const char *root_dir, const char *filename, UnitFileState *ret);
+int unit_file_exists(RuntimeScope scope, const LookupPaths *paths, const char *name);
+
+int unit_file_get_list(RuntimeScope scope, const char *root_dir, Hashmap *h, char **states, char **patterns);
+
+extern const struct hash_ops unit_file_list_hash_ops_free;
+
+InstallChangeType install_changes_add(InstallChange **changes, size_t *n_changes, InstallChangeType type, const char *path, const char *source);
+void install_changes_free(InstallChange *changes, size_t n_changes);
+void install_changes_dump(int r, const char *verb, const InstallChange *changes, size_t n_changes, bool quiet);
+
+int unit_file_verify_alias(
+ const InstallInfo *info,
+ const char *dst,
+ char **ret_dst,
+ InstallChange **changes,
+ size_t *n_changes);
+
+typedef struct UnitFilePresetRule UnitFilePresetRule;
+/* A parsed set of preset rules: `rules` holds `n_rules` entries in the order they were read.
+ * `initialized` is set by read_presets() once parsing succeeded; unit_file_query_preset() uses
+ * it to decide whether a caller-provided cache still has to be filled. Release with
+ * unit_file_presets_done(). */
+typedef struct {
+ UnitFilePresetRule *rules;
+ size_t n_rules;
+ bool initialized;
+} UnitFilePresets;
+/* Action a preset rule requests for a unit. PRESET_UNKNOWN is 0 and serves as "no rule
+ * matched"; query_presets() in install.c turns it into PRESET_ENABLE. The type also spans the
+ * negative errno range because unit_file_query_preset() returns errors through it. */
+typedef enum PresetAction {
+ PRESET_UNKNOWN,
+ PRESET_ENABLE,
+ PRESET_DISABLE,
+ PRESET_IGNORE,
+ _PRESET_ACTION_MAX,
+ _PRESET_ACTION_INVALID = -EINVAL,
+ _PRESET_ACTION_ERRNO_MAX = -ERRNO_MAX, /* Ensure this type covers the whole negative errno range */
+} PresetAction;
+
+const char *preset_action_past_tense_to_string(PresetAction action);
+
+void unit_file_presets_done(UnitFilePresets *p);
+PresetAction unit_file_query_preset(RuntimeScope scope, const char *root_dir, const char *name, UnitFilePresets *cached);
+
+const char *unit_file_state_to_string(UnitFileState s) _const_;
+UnitFileState unit_file_state_from_string(const char *s) _pure_;
+/* from_string conversion is unreliable because of the overlap between -EPERM and -1 for error. */
+
+const char *install_change_type_to_string(InstallChangeType t) _const_;
+InstallChangeType install_change_type_from_string(const char *s) _pure_;
+
+const char *unit_file_preset_mode_to_string(UnitFilePresetMode m) _const_;
+UnitFilePresetMode unit_file_preset_mode_from_string(const char *s) _pure_;
diff --git a/src/shared/ip-protocol-list.c b/src/shared/ip-protocol-list.c
new file mode 100644
index 0000000..14155b6
--- /dev/null
+++ b/src/shared/ip-protocol-list.c
@@ -0,0 +1,84 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <netinet/in.h>
+
+#include "alloc-util.h"
+#include "ip-protocol-list.h"
+#include "macro.h"
+#include "parse-util.h"
+#include "string-util.h"
+
+static const struct ip_protocol_name* lookup_ip_protocol(register const char *str, register GPERF_LEN_TYPE len);
+
+#include "ip-protocol-from-name.h"
+#include "ip-protocol-to-name.h"
+
+/* Maps an IP protocol number to its name using the generated ip_protocol_names[] table.
+ * Returns NULL for negative numbers and for numbers beyond the end of the table; numbers inside
+ * the table yield whatever the table holds at that index. */
+const char *ip_protocol_to_name(int id) {
+        if (id >= 0 && (size_t) id < ELEMENTSOF(ip_protocol_names))
+                return ip_protocol_names[id];
+
+        return NULL;
+}
+
+/* Looks up an IP protocol by name via the generated lookup_ip_protocol() function. Returns the
+ * protocol number, or -EINVAL if the name is not known. The name is passed on verbatim, no case
+ * conversion takes place here. */
+int ip_protocol_from_name(const char *name) {
+        const struct ip_protocol_name *entry;
+
+        assert(name);
+
+        entry = lookup_ip_protocol(name, strlen(name));
+
+        return entry ? entry->id : -EINVAL;
+}
+
+/* Parses an IP protocol given either by name or by number.
+ *
+ * The empty string yields IPPROTO_IP. Names are tried as given first and then lowercased. If
+ * neither is known, the string is parsed as an integer: parse errors are returned as is,
+ * negative numbers yield -ERANGE, and unless `relaxed` is set, numbers without a known protocol
+ * name yield -EPROTONOSUPPORT. Returns -ENOMEM if the lowercase copy cannot be allocated. */
+int parse_ip_protocol_full(const char *s, bool relaxed) {
+        int r, number;
+
+        assert(s);
+
+        if (isempty(s))
+                return IPPROTO_IP;
+
+        /* Fast path: most callers pass a lowercase protocol name, which needs no copy. */
+        r = ip_protocol_from_name(s);
+        if (r >= 0)
+                return r;
+
+        /* Heap copy rather than strdupa(): the string may come from the command line or from
+         * configuration files. */
+        _cleanup_free_ char *lowered = strdup(s);
+        if (!lowered)
+                return -ENOMEM;
+
+        r = ip_protocol_from_name(ascii_strlower(lowered));
+        if (r >= 0)
+                return r;
+
+        r = safe_atoi(lowered, &number);
+        if (r < 0)
+                return r;
+        if (number < 0)
+                return -ERANGE;
+
+        /* In relaxed mode any non-negative number is accepted, named or not. */
+        if (relaxed || ip_protocol_to_name(number))
+                return number;
+
+        return -EPROTONOSUPPORT;
+}
+
+/* Like ip_protocol_to_name(), but only for IPPROTO_TCP and IPPROTO_UDP; every other protocol
+ * number yields NULL. */
+const char *ip_protocol_to_tcp_udp(int id) {
+        if (!IN_SET(id, IPPROTO_TCP, IPPROTO_UDP))
+                return NULL;
+
+        return ip_protocol_to_name(id);
+}
+
+/* Like ip_protocol_from_name(), but only accepts names resolving to IPPROTO_TCP or IPPROTO_UDP;
+ * anything else, including unknown names, yields -EINVAL. */
+int ip_protocol_from_tcp_udp(const char *ip_protocol) {
+        int id;
+
+        id = ip_protocol_from_name(ip_protocol);
+        if (IN_SET(id, IPPROTO_TCP, IPPROTO_UDP))
+                return id;
+
+        return -EINVAL;
+}
diff --git a/src/shared/ip-protocol-list.h b/src/shared/ip-protocol-list.h
new file mode 100644
index 0000000..a0875ef
--- /dev/null
+++ b/src/shared/ip-protocol-list.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+const char *ip_protocol_to_name(int id);
+int ip_protocol_from_name(const char *name);
+int parse_ip_protocol_full(const char *s, bool relaxed);
+static inline int parse_ip_protocol(const char *s) { /* strict variant: passes relaxed=false to parse_ip_protocol_full() */
+ return parse_ip_protocol_full(s, false);
+}
+
+const char *ip_protocol_to_tcp_udp(int id);
+int ip_protocol_from_tcp_udp(const char *ip_protocol);
diff --git a/src/shared/ip-protocol-to-name.awk b/src/shared/ip-protocol-to-name.awk
new file mode 100644
index 0000000..a0671e7
--- /dev/null
+++ b/src/shared/ip-protocol-to-name.awk
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+BEGIN{ # open the definition of the C name table
+ print "static const char* const ip_protocol_names[] = { "
+}
+!/HOPOPTS/ { # every input line not matching HOPOPTS yields one designated initializer: field 1 as IPPROTO_ suffix, lowercased as name
+ printf " [IPPROTO_%s] = \"%s\",\n", $1, tolower($1)
+}
+END{ # close the table
+ print "};"
+}
diff --git a/src/shared/ipvlan-util.c b/src/shared/ipvlan-util.c
new file mode 100644
index 0000000..1f2e2ff
--- /dev/null
+++ b/src/shared/ipvlan-util.c
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <net/if.h>
+
+#include "ipvlan-util.h"
+#include "string-table.h"
+/* Names of the IPVLAN modes, indexed by IPVlanMode value. NOTE(review): the macro below
+ * presumably generates ipvlan_mode_to_string()/ipvlan_mode_from_string() as declared in
+ * ipvlan-util.h. */
+static const char* const ipvlan_mode_table[_NETDEV_IPVLAN_MODE_MAX] = {
+ [NETDEV_IPVLAN_MODE_L2] = "L2",
+ [NETDEV_IPVLAN_MODE_L3] = "L3",
+ [NETDEV_IPVLAN_MODE_L3S] = "L3S",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(ipvlan_mode, IPVlanMode);
+/* Names of the IPVLAN flags, indexed by IPVlanFlags value. NOTE(review): the macro below
+ * presumably generates ipvlan_flags_to_string()/ipvlan_flags_from_string() as declared in
+ * ipvlan-util.h. "BRIGDE" is the spelling of the enumerator in ipvlan-util.h. */
+static const char* const ipvlan_flags_table[_NETDEV_IPVLAN_FLAGS_MAX] = {
+ [NETDEV_IPVLAN_FLAGS_BRIGDE] = "bridge",
+ [NETDEV_IPVLAN_FLAGS_PRIVATE] = "private",
+ [NETDEV_IPVLAN_FLAGS_VEPA] = "vepa",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(ipvlan_flags, IPVlanFlags);
diff --git a/src/shared/ipvlan-util.h b/src/shared/ipvlan-util.h
new file mode 100644
index 0000000..a475b37
--- /dev/null
+++ b/src/shared/ipvlan-util.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <netinet/in.h>
+#include <linux/if_link.h>
+
+#include "macro.h"
+/* IPVLAN operating modes. The values are taken directly from the IPVLAN_MODE_* constants
+ * (presumably provided by <linux/if_link.h>, included above). */
+typedef enum IPVlanMode {
+ NETDEV_IPVLAN_MODE_L2 = IPVLAN_MODE_L2,
+ NETDEV_IPVLAN_MODE_L3 = IPVLAN_MODE_L3,
+ NETDEV_IPVLAN_MODE_L3S = IPVLAN_MODE_L3S,
+ _NETDEV_IPVLAN_MODE_MAX,
+ _NETDEV_IPVLAN_MODE_INVALID = -EINVAL,
+} IPVlanMode;
+/* IPVLAN flags. PRIVATE and VEPA take the values of the IPVLAN_F_* constants, while the
+ * first enumerator has no explicit value and is therefore 0. NOTE(review): "BRIGDE" looks like
+ * a misspelling of "BRIDGE", but the identifier is referenced by the string table in
+ * ipvlan-util.c and possibly elsewhere, so it is left untouched here. */
+typedef enum IPVlanFlags {
+ NETDEV_IPVLAN_FLAGS_BRIGDE,
+ NETDEV_IPVLAN_FLAGS_PRIVATE = IPVLAN_F_PRIVATE,
+ NETDEV_IPVLAN_FLAGS_VEPA = IPVLAN_F_VEPA,
+ _NETDEV_IPVLAN_FLAGS_MAX,
+ _NETDEV_IPVLAN_FLAGS_INVALID = -EINVAL,
+} IPVlanFlags;
+
+const char *ipvlan_mode_to_string(IPVlanMode d) _const_;
+IPVlanMode ipvlan_mode_from_string(const char *d) _pure_;
+
+const char *ipvlan_flags_to_string(IPVlanFlags d) _const_;
+IPVlanFlags ipvlan_flags_from_string(const char *d) _pure_;
diff --git a/src/shared/journal-file-util.c b/src/shared/journal-file-util.c
new file mode 100644
index 0000000..e444a2b
--- /dev/null
+++ b/src/shared/journal-file-util.c
@@ -0,0 +1,534 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <pthread.h>
+#include <unistd.h>
+
+#include "chattr-util.h"
+#include "copy.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "format-util.h"
+#include "journal-authenticate.h"
+#include "journal-file-util.h"
+#include "path-util.h"
+#include "random-util.h"
+#include "set.h"
+#include "stat-util.h"
+#include "sync-util.h"
+
+#define PAYLOAD_BUFFER_SIZE (16U * 1024U)
+#define MINIMUM_HOLE_SIZE (1U * 1024U * 1024U / 2U)
+
+/* Deallocates the space between the end of the tail object and the end of the file, keeping the
+ * file size unchanged. Ranges shorter than MINIMUM_HOLE_SIZE are left alone.
+ *
+ * Returns 0 on success or when nothing was done, -EOPNOTSUPP (synthetic) when the file system does
+ * not support hole punching, or another negative errno. */
+static int journal_file_end_punch_hole(JournalFile *f) {
+        uint64_t tail_end, file_size, hole_size;
+        int r;
+
+        r = journal_file_tail_end_by_pread(f, &tail_end);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to determine end of tail object: %m");
+
+        file_size = (uint64_t) f->last_stat.st_size;
+        assert(tail_end <= file_size);
+
+        hole_size = file_size - tail_end;
+        if (hole_size >= MINIMUM_HOLE_SIZE &&
+            fallocate(f->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, tail_end, hole_size) < 0) {
+                if (!ERRNO_IS_NOT_SUPPORTED(errno))
+                        return log_debug_errno(errno, "Failed to punch hole at end of journal file %s: %m", f->path);
+
+                return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), /* Make recognizable */
+                                       "Hole punching not supported by backing file system, skipping.");
+        }
+
+        return 0;
+}
+
+/* Punches a hole over the unused item slots at the end of the last entry array of a chain.
+ *
+ * p is the file offset of the first entry array object of the chain, n_entries the number of items
+ * the chain is expected to hold.
+ *
+ * Returns 0 if nothing had to be done or the hole was punched, -EBADMSG if the chain has room for
+ * fewer than n_entries items, -EOPNOTSUPP (synthetic) if the file system does not support hole
+ * punching, or the negative error from journal_file_read_object_header()/fallocate(). */
+static int journal_file_entry_array_punch_hole(JournalFile *f, uint64_t p, uint64_t n_entries) {
+ Object o;
+ uint64_t offset, sz, n_items = 0, n_unused;
+ int r;
+
+ if (n_entries == 0)
+ return 0;
+
+ /* Walk the whole chain and sum up its capacity. When the loop ends, p is the offset of the last
+ * array in the chain and o holds that array's header. */
+ for (uint64_t q = p; q != 0; q = le64toh(o.entry_array.next_entry_array_offset)) {
+ r = journal_file_read_object_header(f, OBJECT_ENTRY_ARRAY, q, &o);
+ if (r < 0)
+ return r;
+
+ n_items += journal_file_entry_array_n_items(f, &o);
+ p = q;
+ }
+
+ /* The loop never ran (the chain is empty), hence o is not initialized: nothing to do. */
+ if (p == 0)
+ return 0;
+
+ if (n_entries > n_items)
+ return -EBADMSG;
+
+ /* Amount of unused items in the final entry array. */
+ /* NOTE(review): this assumes all unused slots live in the last array of the chain — confirm. */
+ n_unused = n_items - n_entries;
+
+ if (n_unused == 0)
+ return 0;
+
+ /* offset: file position of the first unused item slot; sz: distance from there to the end of
+ * the last entry array object. */
+ offset = p + offsetof(Object, entry_array.items) +
+ (journal_file_entry_array_n_items(f, &o) - n_unused) * journal_file_entry_array_item_size(f);
+ sz = p + le64toh(o.object.size) - offset;
+
+ if (sz < MINIMUM_HOLE_SIZE)
+ return 0;
+
+ if (fallocate(f->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, sz) < 0) {
+ if (ERRNO_IS_NOT_SUPPORTED(errno))
+ return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), /* Make recognizable */
+ "Hole punching not supported by backing file system, skipping.");
+
+ return log_debug_errno(errno, "Failed to punch hole in entry array of %s: %m", f->path);
+ }
+
+ return 0;
+}
+
+/* Punches holes into the unused tails of the entry arrays of a journal file: first the global entry
+ * array chain referenced from the header, then the entry array chain of every data object that is
+ * reachable through the data hash table.
+ *
+ * Returns 0 when done, a negative errno if the global chain or the hash table could not be
+ * processed, and -EOPNOTSUPP as soon as hole punching turns out to be unsupported. Any other
+ * failure on an individual data object is ignored. */
+static int journal_file_punch_holes(JournalFile *f) {
+ HashItem items[PAYLOAD_BUFFER_SIZE / sizeof(HashItem)];
+ uint64_t p, sz;
+ ssize_t n = SSIZE_MAX; /* any positive value works, it only has to let the first iteration run */
+ int r;
+
+ r = journal_file_entry_array_punch_hole(
+ f, le64toh(f->header->entry_array_offset), le64toh(f->header->n_entries));
+ if (r < 0)
+ return r;
+
+ p = le64toh(f->header->data_hash_table_offset);
+ sz = le64toh(f->header->data_hash_table_size);
+
+ /* Read the hash table in chunks of at most sizeof(items) bytes. The loop stops at the end of
+ * the table, or when a read yields less than one complete HashItem (n == 0). */
+ for (uint64_t i = p; i < p + sz && n > 0; i += n) {
+ size_t m = MIN(sizeof(items), p + sz - i);
+ n = pread(f->fd, items, m, i);
+ if (n < 0)
+ return log_debug_errno(errno, "Failed to read hash table items: %m");
+
+ /* Let's ignore any partial hash items by rounding down to the nearest multiple of HashItem. */
+ n -= n % sizeof(HashItem);
+
+ for (size_t j = 0; j < (size_t) n / sizeof(HashItem); j++) {
+ Object o;
+
+ /* Follow the collision chain of this hash bucket. */
+ for (uint64_t q = le64toh(items[j].head_hash_offset); q != 0;
+ q = le64toh(o.data.next_hash_offset)) {
+
+ r = journal_file_read_object_header(f, OBJECT_DATA, q, &o);
+ if (r < 0) {
+ log_debug_errno(r, "Invalid data object: %m, ignoring");
+ break;
+ }
+
+ if (le64toh(o.data.n_entries) == 0)
+ continue;
+
+ /* NOTE(review): one less than n_entries is passed — presumably because one
+ * entry is referenced from the data object itself rather than from the
+ * array chain; confirm against the journal file format. */
+ r = journal_file_entry_array_punch_hole(
+ f, le64toh(o.data.entry_array_offset), le64toh(o.data.n_entries) - 1);
+ if (r == -EOPNOTSUPP)
+ return -EOPNOTSUPP;
+
+ /* Ignore other errors */
+ }
+ }
+ }
+
+ return 0;
+}
+
+/* This may be called from a separate thread to prevent blocking the caller for the duration of fsync().
+ * As a result we use atomic operations on f->offline_state for inter-thread communications with
+ * journal_file_set_offline() and journal_file_set_online().
+ *
+ * The function loops over f->offline_state until it reaches OFFLINE_DONE (or finds the unexpected
+ * OFFLINE_JOINED). Every transition is done with a compare-and-exchange; if that fails, the state
+ * was changed concurrently and the loop re-evaluates the new state. */
+static void journal_file_set_offline_internal(JournalFile *f) {
+ int r;
+
+ assert(f);
+ assert(f->fd >= 0);
+ assert(f->header);
+
+ for (;;) {
+ switch (f->offline_state) {
+ /* Cancellation requested: acknowledge by moving to OFFLINE_DONE and stop. */
+ case OFFLINE_CANCEL: {
+ OfflineState tmp_state = OFFLINE_CANCEL;
+ if (!__atomic_compare_exchange_n(&f->offline_state, &tmp_state, OFFLINE_DONE,
+ false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
+ continue;
+ }
+ return;
+
+ /* Restart requested: go back to OFFLINE_SYNCING and run the sequence again. */
+ case OFFLINE_AGAIN_FROM_SYNCING: {
+ OfflineState tmp_state = OFFLINE_AGAIN_FROM_SYNCING;
+ if (!__atomic_compare_exchange_n(&f->offline_state, &tmp_state, OFFLINE_SYNCING,
+ false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
+ continue;
+ }
+ break;
+
+ /* Same as above, the restart merely was requested while in OFFLINE_OFFLINING. */
+ case OFFLINE_AGAIN_FROM_OFFLINING: {
+ OfflineState tmp_state = OFFLINE_AGAIN_FROM_OFFLINING;
+ if (!__atomic_compare_exchange_n(&f->offline_state, &tmp_state, OFFLINE_SYNCING,
+ false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
+ continue;
+ }
+ break;
+
+ case OFFLINE_SYNCING:
+ /* Hole punching is best-effort and only done for files that are being archived. */
+ if (f->archive) {
+ (void) journal_file_end_punch_hole(f);
+ (void) journal_file_punch_holes(f);
+ }
+
+ (void) fsync(f->fd);
+
+ /* Only touch the header if nobody changed the state while we were syncing. */
+ {
+ OfflineState tmp_state = OFFLINE_SYNCING;
+ if (!__atomic_compare_exchange_n(&f->offline_state, &tmp_state, OFFLINE_OFFLINING,
+ false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
+ continue;
+ }
+
+ f->header->state = f->archive ? STATE_ARCHIVED : STATE_OFFLINE;
+ (void) fsync(f->fd);
+
+ /* If we've archived the journal file, first try to re-enable COW on the file. If the
+ * FS_NOCOW_FL flag was never set or we successfully removed it, continue. If we fail
+ * to remove the flag on the archived file, rewrite the file without the NOCOW flag.
+ * We need this fallback because on some filesystems (BTRFS), the NOCOW flag cannot
+ * be removed after data has been written to a file. The only way to remove it is to
+ * copy all data to a new file without the NOCOW flag set. */
+
+ /* Note that both "continue" and "break" below end up in the next iteration of the
+ * enclosing for loop, which then handles the OFFLINE_OFFLINING state. */
+ if (f->archive) {
+ r = chattr_fd(f->fd, 0, FS_NOCOW_FL, NULL);
+ if (r >= 0)
+ continue;
+
+ log_debug_errno(r, "Failed to re-enable copy-on-write for %s: %m, rewriting file", f->path);
+
+ r = copy_file_atomic_full(FORMAT_PROC_FD_PATH(f->fd), f->path, f->mode,
+ 0,
+ FS_NOCOW_FL,
+ COPY_REPLACE | COPY_FSYNC | COPY_HOLES | COPY_ALL_XATTRS,
+ NULL, NULL);
+ if (r < 0) {
+ log_debug_errno(r, "Failed to rewrite %s: %m", f->path);
+ continue;
+ }
+ }
+
+ break;
+
+ /* Header written and synced: publish completion, unless a restart/cancel came in. */
+ case OFFLINE_OFFLINING: {
+ OfflineState tmp_state = OFFLINE_OFFLINING;
+ if (!__atomic_compare_exchange_n(&f->offline_state, &tmp_state, OFFLINE_DONE,
+ false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
+ continue;
+ }
+ _fallthrough_;
+ case OFFLINE_DONE:
+ return;
+
+ case OFFLINE_JOINED:
+ log_debug("OFFLINE_JOINED unexpected offline state for journal_file_set_offline_internal()");
+ return;
+ }
+ }
+}
+
+/* pthread entry point: names the thread "journal-offline" and runs the offlining state machine on
+ * the JournalFile passed in as arg. Always returns NULL. */
+static void * journal_file_set_offline_thread(void *arg) {
+        (void) pthread_setname_np(pthread_self(), "journal-offline");
+        journal_file_set_offline_internal((JournalFile*) arg);
+        return NULL;
+}
+
+/* Trigger a restart if the offline thread is mid-flight in a restartable state.
+ *
+ * Returns true if a restart is now pending: either one had already been requested, or the state
+ * was switched from OFFLINE_CANCEL/OFFLINE_SYNCING to OFFLINE_AGAIN_FROM_SYNCING, or from
+ * OFFLINE_OFFLINING to OFFLINE_AGAIN_FROM_OFFLINING. Returns false for every other state. A failed
+ * compare-and-exchange means the state changed concurrently, in which case it is re-evaluated. */
+static bool journal_file_set_offline_try_restart(JournalFile *f) {
+ for (;;) {
+ switch (f->offline_state) {
+ case OFFLINE_AGAIN_FROM_SYNCING:
+ case OFFLINE_AGAIN_FROM_OFFLINING:
+ return true;
+
+ case OFFLINE_CANCEL: {
+ OfflineState tmp_state = OFFLINE_CANCEL;
+ if (!__atomic_compare_exchange_n(&f->offline_state, &tmp_state, OFFLINE_AGAIN_FROM_SYNCING,
+ false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
+ continue;
+ }
+ return true;
+
+ case OFFLINE_SYNCING: {
+ OfflineState tmp_state = OFFLINE_SYNCING;
+ if (!__atomic_compare_exchange_n(&f->offline_state, &tmp_state, OFFLINE_AGAIN_FROM_SYNCING,
+ false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
+ continue;
+ }
+ return true;
+
+ case OFFLINE_OFFLINING: {
+ OfflineState tmp_state = OFFLINE_OFFLINING;
+ if (!__atomic_compare_exchange_n(&f->offline_state, &tmp_state, OFFLINE_AGAIN_FROM_OFFLINING,
+ false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
+ continue;
+ }
+ return true;
+
+ default:
+ return false;
+ }
+ }
+}
+
+/* Sets a journal offline.
+ *
+ * If wait is false then an offline is dispatched in a separate thread for a
+ * subsequent journal_file_set_offline() or journal_file_set_online() of the
+ * same journal to synchronize with.
+ *
+ * If wait is true, then either an existing offline thread will be restarted
+ * and joined, or if none exists the offline is simply performed in this
+ * context without involving another thread.
+ *
+ * Returns 0 on success, -EPERM if the file is not writable, -EINVAL if it has
+ * no file descriptor or header, or a negative errno from joining a previous
+ * offline thread, from pthread_sigmask() or from pthread_create().
+ */
+int journal_file_set_offline(JournalFile *f, bool wait) {
+        int target_state;
+        bool restarted;
+        int r;
+
+        assert(f);
+
+        if (!journal_file_writable(f))
+                return -EPERM;
+
+        if (f->fd < 0 || !f->header)
+                return -EINVAL;
+
+        target_state = f->archive ? STATE_ARCHIVED : STATE_OFFLINE;
+
+        /* An offlining journal is implicitly online and may modify f->header->state,
+         * we must also join any potentially lingering offline thread when already in
+         * the desired offline state.
+         */
+        if (!journal_file_is_offlining(f) && f->header->state == target_state)
+                return journal_file_set_offline_thread_join(f);
+
+        /* Restart an in-flight offline thread and wait if needed, or join a lingering done one. */
+        restarted = journal_file_set_offline_try_restart(f);
+        if (!restarted || wait) {
+                r = journal_file_set_offline_thread_join(f);
+                if (r < 0)
+                        return r;
+        }
+
+        if (restarted)
+                return 0;
+
+        /* Initiate a new offline. */
+        f->offline_state = OFFLINE_SYNCING;
+
+        if (wait) {
+                /* Without using a thread if waiting. */
+                journal_file_set_offline_internal(f);
+
+                assert(f->offline_state == OFFLINE_DONE);
+                f->offline_state = OFFLINE_JOINED;
+
+        } else {
+                sigset_t ss, saved_ss;
+                int k;
+
+                assert_se(sigfillset(&ss) >= 0);
+                /* Don't block SIGBUS since the offlining thread accesses a memory mapped file.
+                 * Asynchronous SIGBUS signals can safely be handled by either thread. */
+                assert_se(sigdelset(&ss, SIGBUS) >= 0);
+
+                r = pthread_sigmask(SIG_BLOCK, &ss, &saved_ss);
+                if (r > 0) {
+                        /* No thread was started, hence don't leave the state machine in
+                         * OFFLINE_SYNCING: later calls would treat the file as being offlined by a
+                         * thread that does not exist. Same handling as for a failing
+                         * pthread_create() below. */
+                        f->offline_state = OFFLINE_JOINED;
+                        return -r;
+                }
+
+                r = pthread_create(&f->offline_thread, NULL, journal_file_set_offline_thread, f);
+
+                /* Restore the signal mask regardless of whether the thread could be created. */
+                k = pthread_sigmask(SIG_SETMASK, &saved_ss, NULL);
+                if (r > 0) {
+                        f->offline_state = OFFLINE_JOINED;
+                        return -r;
+                }
+                if (k > 0)
+                        return -k;
+        }
+
+        return 0;
+}
+
+/* Returns true unless f->offline_state is OFFLINE_DONE or OFFLINE_JOINED. A full memory fence is
+ * issued before the state is read. */
+bool journal_file_is_offlining(JournalFile *f) {
+        assert(f);
+
+        __atomic_thread_fence(__ATOMIC_SEQ_CST);
+
+        return !IN_SET(f->offline_state, OFFLINE_DONE, OFFLINE_JOINED);
+}
+
+/* Appends a tag to a sealed, writable journal file. A failure to do so is only logged at debug
+ * level. Without HAVE_GCRYPT this only asserts that f is set. */
+void journal_file_write_final_tag(JournalFile *f) {
+        assert(f);
+#if HAVE_GCRYPT
+        if (JOURNAL_HEADER_SEALED(f->header) && journal_file_writable(f)) {
+                int r;
+
+                r = journal_file_append_tag(f);
+                if (r < 0)
+                        log_debug_errno(r, "Failed to append tag when closing journal: %m");
+        }
+#endif
+}
+
+/* Writes the final tag, dispatches a still pending post-change notification, takes the file
+ * offline synchronously and closes it. A NULL f is accepted and yields NULL; otherwise the result
+ * of journal_file_close() is returned. */
+JournalFile* journal_file_offline_close(JournalFile *f) {
+        if (f) {
+                journal_file_write_final_tag(f);
+
+                /* Don't lose a notification that is still queued on the timer. */
+                if (sd_event_source_get_enabled(f->post_change_timer, NULL) > 0)
+                        journal_file_post_change(f);
+                sd_event_source_disable_unref(f->post_change_timer);
+
+                journal_file_set_offline(f, true);
+
+                return journal_file_close(f);
+        }
+
+        return NULL;
+}
+
+/* Starts closing f. If a deferred_closes set is given and f can be added to it, the file is only
+ * taken offline asynchronously and NULL is returned; closing it is then left to whoever drains the
+ * set. Otherwise the file is taken offline and closed right away via journal_file_offline_close(). */
+JournalFile* journal_file_initiate_close(JournalFile *f, Set *deferred_closes) {
+        int r;
+
+        assert(f);
+
+        if (!deferred_closes)
+                return journal_file_offline_close(f);
+
+        r = set_put(deferred_closes, f);
+        if (r >= 0) {
+                (void) journal_file_set_offline(f, false);
+                return NULL;
+        }
+
+        log_debug_errno(r, "Failed to add file to deferred close set, closing immediately.");
+        return journal_file_offline_close(f);
+}
+
+/* Rotates the journal file *f: archives it, opens a new file under the path reported by
+ * journal_file_archive() (using the old file as template) and starts closing the old one.
+ *
+ * Returns the negative error of journal_file_archive() if archiving fails, in which case *f is left
+ * untouched. Otherwise returns the result of journal_file_open(); note that *f is replaced by
+ * new_file even if opening failed. */
+int journal_file_rotate(
+ JournalFile **f,
+ MMapCache *mmap_cache,
+ JournalFileFlags file_flags,
+ uint64_t compress_threshold_bytes,
+ Set *deferred_closes) {
+
+ _cleanup_free_ char *path = NULL;
+ JournalFile *new_file = NULL;
+ int r;
+
+ assert(f);
+ assert(*f);
+
+ journal_file_write_final_tag(*f);
+ r = journal_file_archive(*f, &path);
+ if (r < 0)
+ return r;
+
+ /* Finish all closes that were deferred earlier before another one is queued below. */
+ set_clear_with_destructor(deferred_closes, journal_file_offline_close);
+
+ r = journal_file_open(
+ /* fd= */ -1,
+ path,
+ (*f)->open_flags,
+ file_flags,
+ (*f)->mode,
+ compress_threshold_bytes,
+ /* metrics= */ NULL,
+ mmap_cache,
+ /* template= */ *f,
+ &new_file);
+
+ /* The old file is closed (or queued for closing) regardless of whether the open succeeded. */
+ journal_file_initiate_close(*f, deferred_closes);
+ *f = new_file;
+
+ return r;
+}
+
+/* Opens a journal file like journal_file_open(), but if that fails with one of the errors listed
+ * below while a writable ".journal" file was requested with O_CREAT, the existing file is disposed
+ * of via journal_file_dispose() and the open is retried once.
+ *
+ * Returns the result of the first journal_file_open() if no retry applies, the negative error of
+ * journal_file_dispose() if that fails, or otherwise the result of the second journal_file_open(). */
+int journal_file_open_reliably(
+ const char *fname,
+ int open_flags,
+ JournalFileFlags file_flags,
+ mode_t mode,
+ uint64_t compress_threshold_bytes,
+ JournalMetrics *metrics,
+ MMapCache *mmap_cache,
+ JournalFile *template,
+ JournalFile **ret) {
+
+ /* Only set when the broken file itself is opened as template below; closed automatically on return. */
+ _cleanup_(journal_file_offline_closep) JournalFile *old_file = NULL;
+ int r;
+
+ r = journal_file_open(
+ /* fd= */ -1,
+ fname,
+ open_flags,
+ file_flags,
+ mode,
+ compress_threshold_bytes,
+ metrics,
+ mmap_cache,
+ template,
+ ret);
+ /* Success and all errors not listed here are passed through to the caller as they are. */
+ if (!IN_SET(r,
+ -EBADMSG, /* Corrupted */
+ -EADDRNOTAVAIL, /* Referenced object offset out of bounds */
+ -ENODATA, /* Truncated */
+ -EHOSTDOWN, /* Other machine */
+ -EPROTONOSUPPORT, /* Incompatible feature */
+ -EBUSY, /* Unclean shutdown */
+ -ESHUTDOWN, /* Already archived */
+ -EIO, /* IO error, including SIGBUS on mmap */
+ -EIDRM)) /* File has been deleted */
+ return r;
+
+ /* Replacing the file only makes sense if the caller asked for a writable file that may be created. */
+ if ((open_flags & O_ACCMODE) == O_RDONLY)
+ return r;
+
+ if (!(open_flags & O_CREAT))
+ return r;
+
+ if (!endswith(fname, ".journal"))
+ return r;
+
+ /* The file is corrupted. Rotate it away and try it again (but only once) */
+ log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
+
+ if (!template) {
+ /* The file is corrupted and no template is specified. Try opening it read-only as the
+ * template before rotating to inherit its sequence number and ID. */
+ r = journal_file_open(-1, fname,
+ (open_flags & ~(O_ACCMODE|O_CREAT|O_EXCL)) | O_RDONLY,
+ file_flags, 0, compress_threshold_bytes, NULL,
+ mmap_cache, NULL, &old_file);
+ if (r < 0)
+ log_debug_errno(r, "Failed to continue sequence from file %s, ignoring: %m", fname);
+ else
+ template = old_file;
+ }
+
+ r = journal_file_dispose(AT_FDCWD, fname);
+ if (r < 0)
+ return r;
+
+ return journal_file_open(-1, fname, open_flags, file_flags, mode, compress_threshold_bytes, metrics,
+ mmap_cache, template, ret);
+}
diff --git a/src/shared/journal-file-util.h b/src/shared/journal-file-util.h
new file mode 100644
index 0000000..f9426c4
--- /dev/null
+++ b/src/shared/journal-file-util.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "journal-file.h"
+
+int journal_file_set_offline(JournalFile *f, bool wait);
+bool journal_file_is_offlining(JournalFile *f);
+void journal_file_write_final_tag(JournalFile *f);
+JournalFile* journal_file_offline_close(JournalFile *f);
+DEFINE_TRIVIAL_CLEANUP_FUNC(JournalFile*, journal_file_offline_close);
+
+int journal_file_open_reliably(
+ const char *fname,
+ int open_flags,
+ JournalFileFlags file_flags,
+ mode_t mode,
+ uint64_t compress_threshold_bytes,
+ JournalMetrics *metrics,
+ MMapCache *mmap_cache,
+ JournalFile *template,
+ JournalFile **ret);
+
+JournalFile* journal_file_initiate_close(JournalFile *f, Set *deferred_closes);
+int journal_file_rotate(
+ JournalFile **f,
+ MMapCache *mmap_cache,
+ JournalFileFlags file_flags,
+ uint64_t compress_threshold_bytes,
+ Set *deferred_closes);
diff --git a/src/shared/journal-importer.c b/src/shared/journal-importer.c
new file mode 100644
index 0000000..83e9834
--- /dev/null
+++ b/src/shared/journal-importer.c
@@ -0,0 +1,482 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <malloc.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "errno-util.h"
+#include "escape.h"
+#include "fd-util.h"
+#include "io-util.h"
+#include "journal-file.h"
+#include "journal-importer.h"
+#include "journal-util.h"
+#include "parse-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "unaligned.h"
+
+/* Parser states of a JournalImporter, stored in its "state" field and advanced by
+ * journal_importer_process_data(). */
+enum {
+ IMPORTER_STATE_LINE = 0, /* waiting to read, or reading line */
+ IMPORTER_STATE_DATA_START, /* reading binary data header */
+ IMPORTER_STATE_DATA, /* reading binary data */
+ IMPORTER_STATE_DATA_FINISH, /* expecting newline */
+ IMPORTER_STATE_EOF, /* done */
+};
+
+/* Releases everything the importer holds: the file descriptor (unless it is marked as passive),
+ * the name, the input buffer and the iovec array. */
+void journal_importer_cleanup(JournalImporter *imp) {
+        if (imp->fd >= 0)
+                if (!imp->passive_fd) {
+                        log_debug("Closing %s (fd=%d)", imp->name ?: "importer", imp->fd);
+                        safe_close(imp->fd);
+                }
+
+        free(imp->name);
+        free(imp->buf);
+        iovw_free_contents(&imp->iovw, false);
+}
+
+/* Grows imp->buf so that it can hold at least size bytes. The iovec entries are then rebased from
+ * the previous buffer address to the current one. Returns the GREEDY_REALLOC() result, i.e. NULL on
+ * allocation failure. */
+static char* realloc_buffer(JournalImporter *imp, size_t size) {
+        char *previous = ASSERT_PTR(imp)->buf;
+        char *grown = GREEDY_REALLOC(imp->buf, size);
+
+        if (grown)
+                iovw_rebase(&imp->iovw, previous, imp->buf);
+
+        return grown;
+}
+
+/* Extracts the next newline-terminated line from the importer's buffer, reading more data from
+ * imp->fd as needed.
+ *
+ * Returns 1 and sets *line/*size (the size includes the trailing newline) if a line is available,
+ * 0 if read() reported end of file, -EAGAIN if more data is needed but the fd is passive (or the
+ * read would block), -ENOBUFS if no newline showed up within DATA_SIZE_MAX bytes, or another
+ * negative errno. *line points into imp->buf. */
+static int get_line(JournalImporter *imp, char **line, size_t *size) {
+ ssize_t n;
+ char *c = NULL;
+
+ assert(imp);
+ assert(imp->state == IMPORTER_STATE_LINE);
+ assert(imp->offset <= imp->filled);
+ assert(imp->filled <= MALLOC_SIZEOF_SAFE(imp->buf));
+ assert(imp->fd >= 0);
+
+ for (;;) {
+ if (imp->buf) {
+ /* Skip what earlier iterations already searched for a newline. */
+ size_t start = MAX(imp->scanned, imp->offset);
+
+ c = memchr(imp->buf + start, '\n',
+ imp->filled - start);
+ if (c)
+ break;
+ }
+
+ imp->scanned = imp->filled;
+ if (imp->scanned >= DATA_SIZE_MAX)
+ return log_warning_errno(SYNTHETIC_ERRNO(ENOBUFS),
+ "Entry is bigger than %u bytes.",
+ DATA_SIZE_MAX);
+
+ if (imp->passive_fd)
+ /* we have to wait for some data to come to us */
+ return -EAGAIN;
+
+ /* We know that imp->filled is at most DATA_SIZE_MAX, so if
+ we reallocate it, we'll increase the size at least a bit. */
+ assert_cc(DATA_SIZE_MAX < ENTRY_SIZE_MAX);
+ if (MALLOC_SIZEOF_SAFE(imp->buf) - imp->filled < LINE_CHUNK &&
+ !realloc_buffer(imp, MIN(imp->filled + LINE_CHUNK, ENTRY_SIZE_MAX)))
+ return log_oom();
+
+ assert(imp->buf);
+ assert(MALLOC_SIZEOF_SAFE(imp->buf) - imp->filled >= LINE_CHUNK ||
+ MALLOC_SIZEOF_SAFE(imp->buf) >= ENTRY_SIZE_MAX);
+
+ /* Fill whatever free space the buffer has behind the data received so far. */
+ n = read(imp->fd,
+ imp->buf + imp->filled,
+ MALLOC_SIZEOF_SAFE(imp->buf) - imp->filled);
+ if (n < 0) {
+ if (errno != EAGAIN)
+ log_error_errno(errno, "read(%d, ..., %zu): %m",
+ imp->fd,
+ MALLOC_SIZEOF_SAFE(imp->buf) - imp->filled);
+ return -errno;
+ } else if (n == 0)
+ return 0;
+
+ imp->filled += n;
+ }
+
+ /* c points at the newline: hand out the line including it and advance past it. */
+ *line = imp->buf + imp->offset;
+ *size = c + 1 - imp->buf - imp->offset;
+ imp->offset += *size;
+
+ return 1;
+}
+
+/* Makes sure that at least size bytes are available in the buffer starting at imp->offset, reading
+ * from imp->fd as needed, and then consumes them.
+ *
+ * Returns 1 and sets *data (pointing into imp->buf) on success, 0 if read() reported end of file
+ * before enough data arrived, -EAGAIN if more data is needed but the fd is passive (or the read
+ * would block), or another negative errno. */
+static int fill_fixed_size(JournalImporter *imp, void **data, size_t size) {
+
+        assert(imp);
+        assert(IN_SET(imp->state, IMPORTER_STATE_DATA_START, IMPORTER_STATE_DATA, IMPORTER_STATE_DATA_FINISH));
+        assert(size <= DATA_SIZE_MAX);
+        assert(imp->offset <= imp->filled);
+        assert(imp->filled <= MALLOC_SIZEOF_SAFE(imp->buf));
+        assert(imp->fd >= 0);
+        assert(data);
+
+        while (imp->filled - imp->offset < size) {
+                /* read() returns ssize_t; don't narrow it to int (same type as used in get_line()). */
+                ssize_t n;
+
+                if (imp->passive_fd)
+                        /* we have to wait for some data to come to us */
+                        return -EAGAIN;
+
+                if (!realloc_buffer(imp, imp->offset + size))
+                        return log_oom();
+
+                n = read(imp->fd, imp->buf + imp->filled,
+                         MALLOC_SIZEOF_SAFE(imp->buf) - imp->filled);
+                if (n < 0) {
+                        if (errno != EAGAIN)
+                                log_error_errno(errno, "read(%d, ..., %zu): %m", imp->fd,
+                                                MALLOC_SIZEOF_SAFE(imp->buf) - imp->filled);
+                        return -errno;
+                } else if (n == 0)
+                        return 0;
+
+                imp->filled += n;
+        }
+
+        *data = imp->buf + imp->offset;
+        imp->offset += size;
+
+        return 1;
+}
+
+/* Reads the little-endian 64-bit length header of a binary field and stores it in imp->data_size.
+ *
+ * Returns 1 on success, 0 on end of file, -EINVAL if the declared length exceeds DATA_SIZE_MAX, or
+ * the negative error of fill_fixed_size(). If the length is rejected, imp->data_size is left at 0. */
+static int get_data_size(JournalImporter *imp) {
+        uint64_t declared;
+        void *data;
+        int r;
+
+        assert(imp);
+        assert(imp->state == IMPORTER_STATE_DATA_START);
+        assert(imp->data_size == 0);
+
+        r = fill_fixed_size(imp, &data, sizeof(uint64_t));
+        if (r <= 0)
+                return r;
+
+        /* Validate the full 64-bit value before storing it in a size_t: where size_t is only 32
+         * bits wide, an oversized length could otherwise wrap around and pass the limit check. */
+        declared = unaligned_read_le64(data);
+        if (declared > DATA_SIZE_MAX)
+                return log_warning_errno(SYNTHETIC_ERRNO(EINVAL),
+                                         "Stream declares field with size %" PRIu64 " > DATA_SIZE_MAX = %u",
+                                         declared, DATA_SIZE_MAX);
+
+        imp->data_size = (size_t) declared;
+        if (imp->data_size == 0)
+                log_warning("Binary field with zero length");
+
+        return 1;
+}
+
+/* Fetches the imp->data_size bytes of payload of a binary field. Returns 1 with *data set on
+ * success, or whatever non-positive value fill_fixed_size() reported. */
+static int get_data_data(JournalImporter *imp, void **data) {
+        int r;
+
+        assert(imp);
+        assert(data);
+        assert(imp->state == IMPORTER_STATE_DATA);
+
+        r = fill_fixed_size(imp, data, imp->data_size);
+
+        return r > 0 ? 1 : r;
+}
+
+/* Consumes the single byte that follows the payload of a binary field and checks that it is a
+ * newline. Returns 1 if so, -EINVAL if a different character was found, or whatever non-positive
+ * value fill_fixed_size() reported. */
+static int get_data_newline(JournalImporter *imp) {
+        char escaped[4];
+        char *c;
+        int r, len;
+
+        assert(imp);
+        assert(imp->state == IMPORTER_STATE_DATA_FINISH);
+
+        r = fill_fixed_size(imp, (void**) &c, 1);
+        if (r <= 0)
+                return r;
+
+        assert(c);
+        if (*c == '\n')
+                return 1;
+
+        /* Escape the unexpected character so that it can be logged safely. */
+        len = cescape_char(*c, escaped);
+        return log_warning_errno(SYNTHETIC_ERRNO(EINVAL),
+                                 "Expected newline, got '%.*s'", len, escaped);
+}
+
+/* Handles fields of the export format that need special treatment. line is a NUL-terminated
+ * "FIELD=value" string.
+ *
+ * Returns 1 if the field was consumed here and must not be stored as a regular field, 0 if the
+ * caller shall store it as usual, or a negative errno if the value could not be parsed or is out of
+ * range. */
+static int process_special_field(JournalImporter *imp, char *line) {
+ const char *value;
+ char buf[CELLESCAPE_DEFAULT_LENGTH];
+ int r;
+
+ assert(line);
+
+ if (STARTSWITH_SET(line, "__CURSOR=", "__SEQNUM=", "__SEQNUM_ID="))
+ /* ignore __CURSOR=, __SEQNUM=, __SEQNUM_ID= which we cannot replicate */
+ return 1;
+
+ /* Timestamps are kept in imp->ts rather than being stored as fields. */
+ value = startswith(line, "__REALTIME_TIMESTAMP=");
+ if (value) {
+ uint64_t x;
+
+ r = safe_atou64(value, &x);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to parse __REALTIME_TIMESTAMP '%s': %m",
+ cellescape(buf, sizeof buf, value));
+ else if (!VALID_REALTIME(x)) {
+ log_warning("__REALTIME_TIMESTAMP out of range, ignoring: %"PRIu64, x);
+ return -ERANGE;
+ }
+
+ imp->ts.realtime = x;
+ return 1;
+ }
+
+ value = startswith(line, "__MONOTONIC_TIMESTAMP=");
+ if (value) {
+ uint64_t x;
+
+ r = safe_atou64(value, &x);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to parse __MONOTONIC_TIMESTAMP '%s': %m",
+ cellescape(buf, sizeof buf, value));
+ else if (!VALID_MONOTONIC(x)) {
+ log_warning("__MONOTONIC_TIMESTAMP out of range, ignoring: %"PRIu64, x);
+ return -ERANGE;
+ }
+
+ imp->ts.monotonic = x;
+ return 1;
+ }
+
+ /* Just a single underline, but it needs special treatment too. */
+ value = startswith(line, "_BOOT_ID=");
+ if (value) {
+ r = sd_id128_from_string(value, &imp->boot_id);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to parse _BOOT_ID '%s': %m",
+ cellescape(buf, sizeof buf, value));
+
+ /* store the field in the usual fashion too */
+ return 0;
+ }
+
+ /* Any other field with a double underscore prefix is dropped with a notice. */
+ value = startswith(line, "__");
+ if (value) {
+ log_notice("Unknown dunder line __%s, ignoring.", cellescape(buf, sizeof buf, value));
+ return 1;
+ }
+
+ /* no dunder */
+ return 0;
+}
+
+/* Runs one step of the importer state machine on the data that is buffered or readable from the fd.
+ *
+ * Returns 1 when an empty line was received, i.e. the entry collected in imp->iovw is complete;
+ * 0 when the caller shall call again (a field or part of one was processed, an invalid field was
+ * skipped, or end of input was hit, which sets the state to IMPORTER_STATE_EOF); or a negative
+ * errno, including -EAGAIN if more input is needed. */
+int journal_importer_process_data(JournalImporter *imp) {
+ int r;
+
+ switch (imp->state) {
+ case IMPORTER_STATE_LINE: {
+ char *line, *sep;
+ size_t n = 0;
+
+ assert(imp->data_size == 0);
+
+ r = get_line(imp, &line, &n);
+ if (r < 0)
+ return r;
+ if (r == 0) {
+ imp->state = IMPORTER_STATE_EOF;
+ return 0;
+ }
+ assert(n > 0);
+ assert(line[n-1] == '\n');
+
+ if (n == 1) {
+ log_trace("Received empty line, event is ready");
+ return 1;
+ }
+
+ /* MESSAGE=xxx\n
+ or
+ COREDUMP\n
+ LLLLLLLL0011223344...\n
+ */
+ sep = memchr(line, '=', n);
+ if (sep) {
+ /* Text field: name and value are on the same line. */
+ /* chomp newline */
+ n--;
+
+ if (!journal_field_valid(line, sep - line, true)) {
+ char buf[64], *t;
+
+ t = strndupa_safe(line, sep - line);
+ log_debug("Ignoring invalid field: \"%s\"",
+ cellescape(buf, sizeof buf, t));
+
+ return 0;
+ }
+
+ line[n] = '\0';
+ r = process_special_field(imp, line);
+ /* A positive result means the field was consumed and is not stored in the iovec. */
+ if (r != 0)
+ return r < 0 ? r : 0;
+
+ r = iovw_put(&imp->iovw, line, n);
+ if (r < 0)
+ return r;
+ } else {
+ /* Binary field: the line only carries the field name, length and payload follow. */
+ if (!journal_field_valid(line, n - 1, true)) {
+ char buf[64], *t;
+
+ t = strndupa_safe(line, n - 1);
+ log_debug("Ignoring invalid field: \"%s\"",
+ cellescape(buf, sizeof buf, t));
+
+ return 0;
+ }
+
+ /* replace \n with = */
+ line[n-1] = '=';
+
+ /* field_len covers the name plus the '=' that was just written. */
+ imp->field_len = n;
+ imp->state = IMPORTER_STATE_DATA_START;
+
+ /* we cannot put the field in iovec until we have all data */
+ }
+
+ log_trace("Received: %.*s (%s)", (int) n, line, sep ? "text" : "binary");
+
+ return 0; /* continue */
+ }
+
+ case IMPORTER_STATE_DATA_START:
+ assert(imp->data_size == 0);
+
+ r = get_data_size(imp);
+ // log_debug("get_data_size() -> %d", r);
+ if (r < 0)
+ return r;
+ if (r == 0) {
+ imp->state = IMPORTER_STATE_EOF;
+ return 0;
+ }
+
+ /* A zero-length payload skips the data state and goes straight to the trailing newline. */
+ imp->state = imp->data_size > 0 ?
+ IMPORTER_STATE_DATA : IMPORTER_STATE_DATA_FINISH;
+
+ return 0; /* continue */
+
+ case IMPORTER_STATE_DATA: {
+ void *data;
+ char *field;
+
+ assert(imp->data_size > 0);
+
+ r = get_data_data(imp, &data);
+ // log_debug("get_data_data() -> %d", r);
+ if (r < 0)
+ return r;
+ if (r == 0) {
+ imp->state = IMPORTER_STATE_EOF;
+ return 0;
+ }
+
+ assert(data);
+
+ /* The buffer holds "NAME=" + 8 byte length + payload. Move "NAME=" forward over the
+ * length so that name and payload become one contiguous field. */
+ field = (char*) data - sizeof(uint64_t) - imp->field_len;
+ memmove(field + sizeof(uint64_t), field, imp->field_len);
+
+ r = iovw_put(&imp->iovw, field + sizeof(uint64_t), imp->field_len + imp->data_size);
+ if (r < 0)
+ return r;
+
+ imp->state = IMPORTER_STATE_DATA_FINISH;
+
+ return 0; /* continue */
+ }
+
+ case IMPORTER_STATE_DATA_FINISH:
+ r = get_data_newline(imp);
+ // log_debug("get_data_newline() -> %d", r);
+ if (r < 0)
+ return r;
+ if (r == 0) {
+ imp->state = IMPORTER_STATE_EOF;
+ return 0;
+ }
+
+ /* The binary field is complete, reset for the next line. */
+ imp->data_size = 0;
+ imp->state = IMPORTER_STATE_LINE;
+
+ return 0; /* continue */
+ default:
+ assert_not_reached();
+ }
+}
+
+/* Appends size bytes from data to the importer's buffer, growing the buffer as needed. Must not be
+ * called once the importer reached IMPORTER_STATE_EOF.
+ *
+ * Returns 0 on success; if the buffer cannot be grown, the ENOMEM error is logged and the result of
+ * log_error_errno() is returned. */
+int journal_importer_push_data(JournalImporter *imp, const char *data, size_t size) {
+ assert(imp);
+ assert(imp->state != IMPORTER_STATE_EOF);
+
+ /* NOTE(review): no upper bound is enforced on imp->filled + size here — presumably callers
+ * limit how much they push; verify. */
+ if (!realloc_buffer(imp, imp->filled + size))
+ return log_error_errno(ENOMEM,
+ "Failed to store received data of size %zu "
+ "(in addition to existing %zu bytes with %zu filled): %m",
+ size, MALLOC_SIZEOF_SAFE(imp->buf), imp->filled);
+
+ memcpy(imp->buf + imp->filled, data, size);
+ imp->filled += size;
+
+ return 0;
+}
+
+/* Drops the collected iovec entries together with the already processed part of the buffer they
+ * point into, and shrinks the buffer if it has become much larger than its contents. */
+void journal_importer_drop_iovw(JournalImporter *imp) {
+ size_t remain, target;
+
+ /* This function drops processed data along with the iovw that points at it */
+
+ iovw_free_contents(&imp->iovw, false);
+
+ /* possibly reset buffer position */
+ remain = imp->filled - imp->offset;
+
+ if (remain == 0) /* no brainer */
+ imp->offset = imp->scanned = imp->filled = 0;
+ else if (imp->offset > MALLOC_SIZEOF_SAFE(imp->buf) - imp->filled &&
+ imp->offset > remain) {
+ /* More space is wasted in front of the live data than is free behind it: move the
+ * remainder to the start. As offset > remain the two ranges do not overlap, hence
+ * memcpy() is fine. */
+ memcpy(imp->buf, imp->buf + imp->offset, remain);
+ imp->offset = imp->scanned = 0;
+ imp->filled = remain;
+ }
+
+ /* Halve the target size as long as it stays above 16 * LINE_CHUNK and the buffer would be less
+ * than half full. */
+ target = MALLOC_SIZEOF_SAFE(imp->buf);
+ while (target > 16 * LINE_CHUNK && imp->filled < target / 2)
+ target /= 2;
+ if (target < MALLOC_SIZEOF_SAFE(imp->buf)) {
+ char *tmp;
+ size_t old_size;
+
+ old_size = MALLOC_SIZEOF_SAFE(imp->buf);
+
+ /* Failing to shrink is harmless: the old, larger buffer simply stays in use. */
+ tmp = realloc(imp->buf, target);
+ if (!tmp)
+ log_warning("Failed to reallocate buffer to (smaller) size %zu",
+ target);
+ else {
+ log_debug("Reallocated buffer from %zu to %zu bytes",
+ old_size, target);
+ imp->buf = tmp;
+ }
+ }
+}
+
+/* Returns true once the importer has reached IMPORTER_STATE_EOF, i.e. its input is exhausted. */
+bool journal_importer_eof(const JournalImporter *imp) {
+ return imp->state == IMPORTER_STATE_EOF;
+}
diff --git a/src/shared/journal-importer.h b/src/shared/journal-importer.h
new file mode 100644
index 0000000..d84dcc4
--- /dev/null
+++ b/src/shared/journal-importer.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#pragma once
+
+#include <stddef.h>
+#include <stdbool.h>
+#include <sys/uio.h>
+
+#include "sd-id128.h"
+
+#include "io-util.h"
+#include "iovec-wrapper.h"
+#include "time-util.h"
+
+/* Make sure not to make this smaller than the maximum coredump size.
+ * See JOURNAL_SIZE_MAX in coredump.c */
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+#define ENTRY_SIZE_MAX (1024*1024*770u)
+#define DATA_SIZE_MAX (1024*1024*768u)
+#else
+#define ENTRY_SIZE_MAX (1024*1024*13u)
+#define DATA_SIZE_MAX (1024*1024*11u)
+#endif
+/* Granularity in which the line buffer is grown. Parenthesized like the limits above, so that the
+ * macro expands correctly inside larger expressions (e.g. x / LINE_CHUNK). */
+#define LINE_CHUNK (8*1024u)
+
+/* The maximum number of fields in an entry */
+#define ENTRY_FIELD_COUNT_MAX 1024u
+
+/* State of a parser for the journal export format, see journal-importer.c. */
+typedef struct JournalImporter {
+ int fd; /* file descriptor the data is read from */
+ bool passive_fd; /* if set, the importer neither read()s from nor closes fd; data is then fed in via journal_importer_push_data() */
+ char *name; /* used in log messages; freed by journal_importer_cleanup() */
+
+ char *buf;
+ size_t offset; /* offset to the beginning of live data in the buffer */
+ size_t scanned; /* number of bytes since the beginning of data without a newline */
+ size_t filled; /* total number of bytes in the buffer */
+
+ size_t field_len; /* used for binary fields: the field name length */
+ size_t data_size; /* and the size of the binary data chunk being processed */
+
+ struct iovec_wrapper iovw; /* fields of the entry being assembled; the entries point into buf */
+
+ int state; /* one of the IMPORTER_STATE_* values defined in journal-importer.c */
+ dual_timestamp ts; /* filled from __REALTIME_TIMESTAMP= and __MONOTONIC_TIMESTAMP= */
+ sd_id128_t boot_id; /* filled from _BOOT_ID= */
+} JournalImporter;
+
+/* Initializer resp. compound literal for an importer on the given fd; every other member is
+ * zero-initialized. */
+#define JOURNAL_IMPORTER_INIT(_fd) { .fd = (_fd), .iovw = {} }
+#define JOURNAL_IMPORTER_MAKE(_fd) (JournalImporter) JOURNAL_IMPORTER_INIT(_fd)
+
+void journal_importer_cleanup(JournalImporter *);
+int journal_importer_process_data(JournalImporter *);
+int journal_importer_push_data(JournalImporter *, const char *data, size_t size);
+void journal_importer_drop_iovw(JournalImporter *);
+bool journal_importer_eof(const JournalImporter *);
+
+/* Returns the total number of bytes currently held in the importer's buffer.
+ * NOTE(review): this is imp->filled, not imp->filled - imp->offset, so it includes data that was
+ * already consumed but not yet dropped — confirm that this is what callers expect. */
+static inline size_t journal_importer_bytes_remaining(const JournalImporter *imp) {
+ return imp->filled;
+}
diff --git a/src/shared/journal-util.c b/src/shared/journal-util.c
new file mode 100644
index 0000000..d73d7c4
--- /dev/null
+++ b/src/shared/journal-util.c
@@ -0,0 +1,188 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "acl-util.h"
+#include "bus-error.h"
+#include "bus-locator.h"
+#include "bus-util.h"
+#include "fd-util.h"
+#include "fs-util.h"
+#include "hashmap.h"
+#include "journal-internal.h"
+#include "journal-util.h"
+#include "log.h"
+#include "strv.h"
+#include "user-util.h"
+
+/* Checks whether the calling user should be able to read the system journal and, if not, prints a
+ * hint naming the groups that have access.
+ *
+ * Returns 0 if access is expected to work (root, member of 'systemd-journal', or
+ * acl_search_groups() reports a match), 1 if a hint was printed, or a negative errno. */
+static int access_check_var_log_journal(sd_journal *j, bool want_other_users) {
+ int r;
+
+ assert(j);
+
+ /* If we are root, we should have access, don't warn. */
+ if (getuid() == 0)
+ return 0;
+
+ /* If we are in the 'systemd-journal' group, we should have
+ * access too. */
+ r = in_group("systemd-journal");
+ if (r < 0)
+ return log_error_errno(r, "Failed to check if we are in the 'systemd-journal' group: %m");
+ if (r > 0)
+ return 0;
+
+#if HAVE_ACL
+ _cleanup_strv_free_ char **g = NULL;
+ const char* dir;
+
+ /* Use /run/log/journal if it exists, /var/log/journal otherwise. */
+ if (laccess("/run/log/journal", F_OK) >= 0)
+ dir = "/run/log/journal";
+ else
+ dir = "/var/log/journal";
+
+ /* If we are in any of the groups listed in the journal ACLs,
+ * then all is good, too. Let's enumerate all groups from the
+ * default ACL of the directory, which generally should allow
+ * access to most journal files too. */
+ r = acl_search_groups(dir, &g);
+ if (r < 0)
+ return log_error_errno(r, "Failed to search journal ACL: %m");
+ if (r > 0)
+ return 0;
+
+ /* Print a pretty list, if there were ACLs set. */
+ if (!strv_isempty(g)) {
+ _cleanup_free_ char *s = NULL;
+
+ /* There are groups in the ACL, let's list them */
+ r = strv_extend(&g, "systemd-journal");
+ if (r < 0)
+ return log_oom();
+
+ /* 'systemd-journal' may have been in the ACL already, hence deduplicate. */
+ strv_sort(g);
+ strv_uniq(g);
+
+ s = strv_join(g, "', '");
+ if (!s)
+ return log_oom();
+
+ log_notice("Hint: You are currently not seeing messages from %s.\n"
+ " Users in groups '%s' can see all messages.\n"
+ " Pass -q to turn off this notice.",
+ want_other_users ? "other users and the system" : "the system",
+ s);
+ return 1;
+ }
+#endif
+
+ /* If no ACLs were found, print a short version of the message. */
+ log_notice("Hint: You are currently not seeing messages from %s.\n"
+ " Users in the 'systemd-journal' group can see all messages. Pass -q to\n"
+ " turn off this notice.",
+ want_other_users ? "other users and the system" : "the system");
+
+ return 1;
+}
+
+/* Returns the result of looking up -EACCES in the journal's error map, i.e. whether some journal
+ * file or directory could not be accessed due to insufficient permissions. */
+int journal_access_blocked(sd_journal *j) {
+ return hashmap_contains(j->errors, INT_TO_PTR(-EACCES));
+}
+
+/* Reports the problems that were recorded in j->errors while opening journal files.
+ *
+ * With quiet set, the "no journal files" notice and the permission hint are suppressed; the
+ * per-file warnings are printed in any case. Returns the (negative) result of log_error_errno()
+ * for EACCES if access was blocked and no file could be opened at all, 0 otherwise. */
+int journal_access_check_and_warn(sd_journal *j, bool quiet, bool want_other_users) {
+ void *code;
+ char *path;
+ int r = 0;
+
+ assert(j);
+
+ if (hashmap_isempty(j->errors)) {
+ if (ordered_hashmap_isempty(j->files) && !quiet)
+ log_notice("No journal files were found.");
+
+ return 0;
+ }
+
+ if (journal_access_blocked(j)) {
+ if (!quiet)
+ (void) access_check_var_log_journal(j, want_other_users);
+
+ if (ordered_hashmap_isempty(j->files))
+ r = log_error_errno(EACCES, "No journal files were opened due to insufficient permissions.");
+ }
+
+ /* In j->errors the key is the error code and the value the path it occurred on. */
+ HASHMAP_FOREACH_KEY(path, code, j->errors) {
+ int err;
+
+ err = abs(PTR_TO_INT(code));
+
+ switch (err) {
+ case EACCES:
+ /* Permission problems were already handled above. */
+ continue;
+
+ case ENODATA:
+ log_warning_errno(err, "Journal file %s is truncated, ignoring file.", path);
+ break;
+
+ case EPROTONOSUPPORT:
+ log_warning_errno(err, "Journal file %1$s uses an unsupported feature, ignoring file.\n"
+ "Use SYSTEMD_LOG_LEVEL=debug journalctl --file=%1$s to see the details.",
+ path);
+ break;
+
+ case EBADMSG:
+ log_warning_errno(err, "Journal file %s corrupted, ignoring file.", path);
+ break;
+
+ case ETOOMANYREFS:
+ log_warning_errno(err, "Too many journal files (limit is at %u) in scope, ignoring file '%s'.", JOURNAL_FILES_MAX, path);
+ break;
+
+ default:
+ log_warning_errno(err, "An error was encountered while opening journal file or directory %s, ignoring file: %m", path);
+ break;
+ }
+ }
+
+ return r;
+}
+
+/* Opens the journal of the named machine: asks the machine manager over the system bus for a file
+ * descriptor of the machine's root directory and opens the journal below it.
+ *
+ * Requires an effective UID of 0. On success returns 0 and stores the journal in *ret; on failure
+ * returns a negative errno after logging the reason. */
+int journal_open_machine(sd_journal **ret, const char *machine) {
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL;
+ _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
+ _cleanup_(sd_journal_closep) sd_journal *j = NULL;
+ _cleanup_close_ int machine_fd = -EBADF;
+ int fd, r;
+
+ assert(ret);
+ assert(machine);
+
+ if (geteuid() != 0)
+ /* The file descriptor returned by OpenMachineRootDirectory() will be owned by users/groups of
+ * the container, thus we need root privileges to override them. */
+ return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Using the --machine= switch requires root privileges.");
+
+ r = sd_bus_open_system(&bus);
+ if (r < 0)
+ return log_error_errno(r, "Failed to open system bus: %m");
+
+ r = bus_call_method(bus, bus_machine_mgr, "OpenMachineRootDirectory", &error, &reply, "s", machine);
+ if (r < 0)
+ return log_error_errno(r, "Failed to open root directory of machine '%s': %s",
+ machine, bus_error_message(&error, r));
+
+ r = sd_bus_message_read(reply, "h", &fd);
+ if (r < 0)
+ return bus_log_parse_error(r);
+
+ /* Duplicate the descriptor read from the reply into one of our own (>= 3, close-on-exec). */
+ machine_fd = fcntl(fd, F_DUPFD_CLOEXEC, 3);
+ if (machine_fd < 0)
+ return log_error_errno(errno, "Failed to duplicate file descriptor: %m");
+
+ r = sd_journal_open_directory_fd(&j, machine_fd, SD_JOURNAL_OS_ROOT | SD_JOURNAL_TAKE_DIRECTORY_FD);
+ if (r < 0)
+ return log_error_errno(r, "Failed to open journal in machine '%s': %m", machine);
+
+ /* The journal took over the descriptor (SD_JOURNAL_TAKE_DIRECTORY_FD), hence disarm our cleanup. */
+ TAKE_FD(machine_fd);
+ *ret = TAKE_PTR(j);
+ return 0;
+}
diff --git a/src/shared/journal-util.h b/src/shared/journal-util.h
new file mode 100644
index 0000000..afad249
--- /dev/null
+++ b/src/shared/journal-util.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+#include <sys/types.h>
+
+#include "sd-journal.h"
+
+int journal_access_blocked(sd_journal *j);
+int journal_access_check_and_warn(sd_journal *j, bool quiet, bool want_other_users);
+int journal_open_machine(sd_journal **ret, const char *machine);
diff --git a/src/shared/json-internal.h b/src/shared/json-internal.h
new file mode 100644
index 0000000..a94befa
--- /dev/null
+++ b/src/shared/json-internal.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#pragma once
+
+#include "json.h"
+
+/* This header should include all prototypes only the JSON parser itself and
+ * its tests need access to. Normal code consuming the JSON parser should not
+ * interface with this. */
+
+/* Storage for a single simple (non-string, non-container) JSON value. All members overlap; which member is
+ * valid is determined externally by whoever holds the value. */
+typedef union JsonValue {
+ /* Encodes a simple value. This structure is generally 8 bytes wide (as double is 64-bit). */
+ bool boolean;     /* a boolean value */
+ double real;      /* a floating point number */
+ int64_t integer;  /* a signed integer */
+ uint64_t unsig;   /* an unsigned integer */
+} JsonValue;
+
+/* Let's protect us against accidental structure size changes on our most relevant arch */
+#ifdef __x86_64__
+assert_cc(sizeof(JsonValue) == 8U);
+#endif
+
+#define JSON_VALUE_NULL ((JsonValue) {})
+
+/* We use fake JsonVariant objects for some special values, in order to avoid memory allocations for them. Note that
+ * effectively this means that there are multiple ways to encode the same objects: via these magic values or as
+ * properly allocated JsonVariant. We convert between both on-the-fly as necessary. */
+/* Small integers that, once cast to JsonVariant*, serve as the "magic" pointers. Each enumerator is paired
+ * with a macro of the same name (minus one leading underscore) that performs the cast. Numbering starts at 1,
+ * so that no magic pointer ever equals NULL. */
+enum
+{
+ _JSON_VARIANT_MAGIC_TRUE = 1,
+#define JSON_VARIANT_MAGIC_TRUE ((JsonVariant*) _JSON_VARIANT_MAGIC_TRUE)
+ _JSON_VARIANT_MAGIC_FALSE,
+#define JSON_VARIANT_MAGIC_FALSE ((JsonVariant*) _JSON_VARIANT_MAGIC_FALSE)
+ _JSON_VARIANT_MAGIC_NULL,
+#define JSON_VARIANT_MAGIC_NULL ((JsonVariant*) _JSON_VARIANT_MAGIC_NULL)
+ _JSON_VARIANT_MAGIC_ZERO_INTEGER,
+#define JSON_VARIANT_MAGIC_ZERO_INTEGER ((JsonVariant*) _JSON_VARIANT_MAGIC_ZERO_INTEGER)
+ _JSON_VARIANT_MAGIC_ZERO_UNSIGNED,
+#define JSON_VARIANT_MAGIC_ZERO_UNSIGNED ((JsonVariant*) _JSON_VARIANT_MAGIC_ZERO_UNSIGNED)
+ _JSON_VARIANT_MAGIC_ZERO_REAL,
+#define JSON_VARIANT_MAGIC_ZERO_REAL ((JsonVariant*) _JSON_VARIANT_MAGIC_ZERO_REAL)
+ _JSON_VARIANT_MAGIC_EMPTY_STRING,
+#define JSON_VARIANT_MAGIC_EMPTY_STRING ((JsonVariant*) _JSON_VARIANT_MAGIC_EMPTY_STRING)
+ _JSON_VARIANT_MAGIC_EMPTY_ARRAY,
+#define JSON_VARIANT_MAGIC_EMPTY_ARRAY ((JsonVariant*) _JSON_VARIANT_MAGIC_EMPTY_ARRAY)
+ _JSON_VARIANT_MAGIC_EMPTY_OBJECT,
+#define JSON_VARIANT_MAGIC_EMPTY_OBJECT ((JsonVariant*) _JSON_VARIANT_MAGIC_EMPTY_OBJECT)
+ /* One past the last magic value: every pointer strictly below this (and not NULL) is a magic pointer. */
+ __JSON_VARIANT_MAGIC_MAX
+#define _JSON_VARIANT_MAGIC_MAX ((JsonVariant*) __JSON_VARIANT_MAGIC_MAX)
+};
+
+/* This is only safe as long as we don't define more than 4K magic pointers, i.e. the page size of the simplest
+ * architectures we support. That's because we rely on the fact that malloc() will never allocate from the first memory
+ * page, as it is a faulting page for catching NULL pointer dereferences. */
+assert_cc((unsigned) __JSON_VARIANT_MAGIC_MAX < 4096U);
+
+/* Token kinds of the JSON tokenizer (presumably these are the non-negative return values of json_tokenize()
+ * declared below -- its definition is not part of this header). */
+enum { /* JSON tokens */
+ JSON_TOKEN_END,
+ JSON_TOKEN_COLON,
+ JSON_TOKEN_COMMA,
+ JSON_TOKEN_OBJECT_OPEN,
+ JSON_TOKEN_OBJECT_CLOSE,
+ JSON_TOKEN_ARRAY_OPEN,
+ JSON_TOKEN_ARRAY_CLOSE,
+ JSON_TOKEN_STRING,
+ JSON_TOKEN_REAL,
+ JSON_TOKEN_INTEGER,
+ JSON_TOKEN_UNSIGNED,
+ JSON_TOKEN_BOOLEAN,
+ JSON_TOKEN_NULL,
+ /* Number of valid token kinds */
+ _JSON_TOKEN_MAX,
+ /* Negative marker that doubles as an errno-style error code */
+ _JSON_TOKEN_INVALID = -EINVAL,
+};
+
+int json_tokenize(const char **p, char **ret_string, JsonValue *ret_value, unsigned *ret_line, unsigned *ret_column, void **state, unsigned *line, unsigned *column);
diff --git a/src/shared/json.c b/src/shared/json.c
new file mode 100644
index 0000000..06c9e85
--- /dev/null
+++ b/src/shared/json.c
@@ -0,0 +1,5132 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <locale.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <sys/types.h>
+
+#include "sd-messages.h"
+
+#include "alloc-util.h"
+#include "errno-util.h"
+#include "escape.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "float.h"
+#include "hexdecoct.h"
+#include "json-internal.h"
+#include "json.h"
+#include "macro.h"
+#include "math-util.h"
+#include "memory-util.h"
+#include "memstream-util.h"
+#include "set.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "strv.h"
+#include "terminal-util.h"
+#include "user-util.h"
+#include "utf8.h"
+
+/* Refuse putting together variants with a larger depth than 2K by default (as a protection against overflowing stacks
+ * if code processes JSON objects recursively). Note that we store the depth in a uint16_t, hence make sure this
+ * remains under 2^16.
+ *
+ * The value first was 16k, but it was discovered to be too high on llvm/x86-64. See also:
+ * https://github.com/systemd/systemd/issues/10738
+ *
+ * The value then was 4k, but it was discovered to be too high on s390x/aarch64. See also:
+ * https://github.com/systemd/systemd/issues/14396 */
+
+#define DEPTH_MAX (2U*1024U)
+assert_cc(DEPTH_MAX <= UINT16_MAX);
+
+/* Reference-counted record naming where a JSON variant was parsed from. Allocated with the name stored
+ * inline in the trailing flexible array (see json_source_new()). */
+typedef struct JsonSource {
+ /* When we parse from a file or similar, encodes the filename, to indicate the source of a json variant */
+ unsigned n_ref;       /* reference counter, starts at 1 */
+ unsigned max_line;    /* presumably the highest line number seen in this source -- maintained elsewhere */
+ unsigned max_column;  /* presumably the highest column number seen in this source -- maintained elsewhere */
+ char name[];          /* NUL-terminated source name */
+} JsonSource;
+
+/* On x86-64 this whole structure should have a size of 5 * 64 bit = 40 bytes */
+struct JsonVariant {
+ union {
+ /* We either maintain a reference counter for this variant itself, or we are embedded into an
+ * array/object, in which case only that surrounding object is ref-counted. (Valid if 'is_embedded'
+ * is false, see below.) */
+ unsigned n_ref;
+
+ /* If this JsonVariant is part of an array/object, then this field points to the surrounding
+ * JSON_VARIANT_ARRAY/JSON_VARIANT_OBJECT object. (Valid if 'is_embedded' is true, see below.) */
+ JsonVariant *parent;
+ };
+
+ /* If this was parsed from some file or buffer, this stores where from, as well as the source line/column */
+ JsonSource *source;
+ unsigned line, column;
+
+ /* The current 'depth' of the JsonVariant, i.e. how many levels of member variants this has */
+ uint16_t depth;
+
+ JsonVariantType type:8;
+
+ /* A marker whether this variant is embedded into an array/object or not. If true, the 'parent' pointer above
+ * is valid. If false, the 'n_ref' field above is valid instead. */
+ bool is_embedded:1;
+
+ /* In some conditions (for example, if this object is part of an array of strings or objects), we don't store
+ * any data inline, but instead simply reference an external object and act as surrogate of it. In that case
+ * this bool is set, and the external object is referenced through the .reference field below. */
+ bool is_reference:1;
+
+ /* While comparing two arrays, we use this for marking what we already have seen */
+ bool is_marked:1;
+
+ /* Erase from memory when freeing */
+ bool sensitive:1;
+
+ /* If this is an object the fields are strictly ordered by name */
+ bool sorted:1;
+
+ /* If in addition to this object all objects referenced by it are also ordered strictly by name */
+ bool normalized:1;
+
+ union {
+ /* For simple types we store the value in-line. */
+ JsonValue value;
+
+ /* For objects and arrays we store the number of elements immediately following */
+ size_t n_elements;
+
+ /* If is_reference as indicated above is set, this is where the reference object is actually stored. */
+ JsonVariant *reference;
+
+ /* Strings are placed immediately after the structure. Note that when this is a JsonVariant
+ * embedded into an array we might encode strings up to INLINE_STRING_MAX characters
+ * directly inside the element, while longer strings are stored as references. When this
+ * object is not embedded into an array, but stand-alone, we allocate the right size for the
+ * whole structure, i.e. the array might be much larger than INLINE_STRING_MAX. */
+ DECLARE_FLEX_ARRAY(char, string);
+ };
+};
+
+/* Inside string arrays we have a series of JsonVariant structures one after the other. In this case, strings longer
+ * than INLINE_STRING_MAX are stored as references, and all shorter ones inline. (This means — on x86-64 — strings up
+ * to 7 chars are stored within the array elements, and all others in separate allocations) */
+#define INLINE_STRING_MAX (sizeof(JsonVariant) - offsetof(JsonVariant, string) - 1U)
+
+/* Let's make sure this structure isn't increased in size accidentally. This check is only for our most relevant arch
+ * (x86-64). */
+#if defined(__x86_64__) && __SIZEOF_POINTER__ == 8
+assert_cc(sizeof(JsonVariant) == 40U);
+assert_cc(INLINE_STRING_MAX == 7U);
+#endif
+
+/* Allocates a new JsonSource carrying a copy of 'name', with its reference counter set to 1 and the line/column
+ * maxima zeroed. Returns NULL if memory allocation fails. */
+static JsonSource* json_source_new(const char *name) {
+        JsonSource *src;
+        size_t len;
+
+        assert(name);
+
+        len = strlen(name);
+
+        src = malloc(offsetof(JsonSource, name) + len + 1);
+        if (!src)
+                return NULL;
+
+        src->n_ref = 1;
+        src->max_line = 0;
+        src->max_column = 0;
+        memcpy(src->name, name, len + 1); /* includes the trailing NUL */
+
+        return src;
+}
+
+DEFINE_PRIVATE_TRIVIAL_REF_UNREF_FUNC(JsonSource, json_source, mfree);
+
+/* Two sources are equal if they are the same object (this includes both being NULL), or if both are set and
+ * carry the same name. */
+static bool json_source_equal(JsonSource *a, JsonSource *b) {
+        if (a == b)
+                return true;
+
+        return a && b && streq(a->name, b->name);
+}
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(JsonSource*, json_source_unref);
+
+/* There are four kinds of JsonVariant* pointers:
+ *
+ * 1. NULL
+ * 2. A 'regular' one, i.e. pointing to malloc() memory
+ * 3. A 'magic' one, i.e. one of the special JSON_VARIANT_MAGIC_XYZ values, that encode a few very basic values directly in the pointer.
+ * 4. A 'const string' one, i.e. a pointer to a const string.
+ *
+ * The four kinds of pointers can be discerned like this:
+ *
+ * Detecting #1 is easy, just compare with NULL. Detecting #3 is similarly easy: all magic pointers are below
+ * _JSON_VARIANT_MAGIC_MAX (which is pretty low, within the first memory page, which is special on Linux and other
+ * OSes, as it is a faulting page). In order to discern #2 and #4 we check the lowest bit. If it's off it's #2,
+ * otherwise #4. This makes use of the fact that malloc() will return "maximum aligned" memory, which definitely
+ * means the pointer is even. This means we can use the uneven pointers to reference static strings, as long as we
+ * make sure that all static strings used like this are aligned to 2 (or higher), and that we mask the bit on
+ * access. The JSON_VARIANT_STRING_CONST() macro encodes strings as JsonVariant* pointers, with the bit set. */
+
+/* True if 'v' is one of the magic pointers, i.e. non-NULL but below _JSON_VARIANT_MAGIC_MAX. */
+static bool json_variant_is_magic(const JsonVariant *v) {
+        return v && v < _JSON_VARIANT_MAGIC_MAX;
+}
+
+/* True if 'v' encodes a pointer to a const string. A proper JsonVariant is aligned to whatever malloc() aligns
+ * things to, which is definitely not uneven. We hence use all uneven pointers (at or above the magic range) as
+ * indicators for const strings. NULL and magic pointers never qualify. */
+static bool json_variant_is_const_string(const JsonVariant *v) {
+        return v >= _JSON_VARIANT_MAGIC_MAX &&
+                (((uintptr_t) v) & 1) != 0;
+}
+
+/* True if 'v' points to an actual JsonVariant structure: neither NULL nor magic (i.e. at or above
+ * _JSON_VARIANT_MAGIC_MAX), and with the lowest pointer bit unset (i.e. not a const string). */
+static bool json_variant_is_regular(const JsonVariant *v) {
+        return v >= _JSON_VARIANT_MAGIC_MAX &&
+                (((uintptr_t) v) & 1) == 0;
+}
+
+/* Follows the chain of reference variants until something that is not a reference is reached: NULL, a
+ * magic/const-string pointer, or a regular variant without the is_reference flag. Returns that final pointer. */
+static JsonVariant *json_variant_dereference(JsonVariant *v) {
+
+        while (v && json_variant_is_regular(v) && v->is_reference)
+                v = v->reference;
+
+        return v;
+}
+
+/* Returns the nesting depth recorded in the (dereferenced) variant. NULL, magic and const string variants
+ * have depth 0. */
+static uint16_t json_variant_depth(JsonVariant *v) {
+        JsonVariant *target;
+
+        target = json_variant_dereference(v);
+
+        if (target && json_variant_is_regular(target))
+                return target->depth;
+
+        return 0;
+}
+
+/* Brings a variant pointer into its normalized form: references are fully resolved, and values that have a
+ * "magic" representation (booleans, null, zero numbers, empty string/array/object) are replaced by the
+ * matching magic pointer. Anything else is returned dereferenced but otherwise as-is. */
+static JsonVariant *json_variant_formalize(JsonVariant *v) {
+        JsonVariant *d;
+
+        if (!v)
+                return NULL;
+
+        d = json_variant_dereference(v);
+
+        switch (json_variant_type(d)) {
+
+        case JSON_VARIANT_BOOLEAN:
+                if (json_variant_boolean(d))
+                        return JSON_VARIANT_MAGIC_TRUE;
+                return JSON_VARIANT_MAGIC_FALSE;
+
+        case JSON_VARIANT_NULL:
+                return JSON_VARIANT_MAGIC_NULL;
+
+        case JSON_VARIANT_INTEGER:
+                if (json_variant_integer(d) == 0)
+                        return JSON_VARIANT_MAGIC_ZERO_INTEGER;
+                break;
+
+        case JSON_VARIANT_UNSIGNED:
+                if (json_variant_unsigned(d) == 0)
+                        return JSON_VARIANT_MAGIC_ZERO_UNSIGNED;
+                break;
+
+        case JSON_VARIANT_REAL:
+                if (iszero_safe(json_variant_real(d)))
+                        return JSON_VARIANT_MAGIC_ZERO_REAL;
+                break;
+
+        case JSON_VARIANT_STRING:
+                if (isempty(json_variant_string(d)))
+                        return JSON_VARIANT_MAGIC_EMPTY_STRING;
+                break;
+
+        case JSON_VARIANT_ARRAY:
+                if (json_variant_elements(d) == 0)
+                        return JSON_VARIANT_MAGIC_EMPTY_ARRAY;
+                break;
+
+        case JSON_VARIANT_OBJECT:
+                if (json_variant_elements(d) == 0)
+                        return JSON_VARIANT_MAGIC_EMPTY_OBJECT;
+                break;
+
+        default:
+                break;
+        }
+
+        /* No magic representation applies */
+        return d;
+}
+
+/* Like json_variant_formalize(), but leaves the variant untouched if it is not a regular variant or if it
+ * carries source/line/column information, so that this context is not lost. */
+static JsonVariant *json_variant_conservative_formalize(JsonVariant *v) {
+
+        if (!v)
+                return NULL;
+
+        /* Only regular variants without any location attached may be simplified */
+        if (json_variant_is_regular(v) && !v->source && v->line == 0 && v->column == 0)
+                return json_variant_formalize(v);
+
+        return v;
+}
+
+/* Allocates a zero-initialized stand-alone variant of the given type with room for 'space' bytes of payload
+ * (but never less than a full JsonVariant), and a reference counter of 1. Returns 0 on success, -EINVAL if
+ * 'ret' is NULL, -ENOMEM on allocation failure. */
+static int json_variant_new(JsonVariant **ret, JsonVariantType type, size_t space) {
+        JsonVariant *variant;
+        size_t sz;
+
+        assert_return(ret, -EINVAL);
+
+        sz = MAX(sizeof(JsonVariant),
+                 offsetof(JsonVariant, value) + space);
+
+        variant = malloc0(sz);
+        if (!variant)
+                return -ENOMEM;
+
+        variant->type = type;
+        variant->n_ref = 1;
+
+        *ret = variant;
+        return 0;
+}
+
+/* Creates a signed integer variant. Zero is represented by a magic pointer, without allocating memory.
+ * Returns 0 on success, negative errno-style value on failure. */
+int json_variant_new_integer(JsonVariant **ret, int64_t i) {
+        JsonVariant *variant = JSON_VARIANT_MAGIC_ZERO_INTEGER;
+        int r;
+
+        assert_return(ret, -EINVAL);
+
+        if (i != 0) {
+                r = json_variant_new(&variant, JSON_VARIANT_INTEGER, sizeof(i));
+                if (r < 0)
+                        return r;
+
+                variant->value.integer = i;
+        }
+
+        *ret = variant;
+        return 0;
+}
+
+/* Creates an unsigned integer variant. Zero is represented by a magic pointer, without allocating memory.
+ * Returns 0 on success, negative errno-style value on failure. */
+int json_variant_new_unsigned(JsonVariant **ret, uint64_t u) {
+        JsonVariant *variant = JSON_VARIANT_MAGIC_ZERO_UNSIGNED;
+        int r;
+
+        assert_return(ret, -EINVAL);
+
+        if (u != 0) {
+                r = json_variant_new(&variant, JSON_VARIANT_UNSIGNED, sizeof(u));
+                if (r < 0)
+                        return r;
+
+                variant->value.unsig = u;
+        }
+
+        *ret = variant;
+        return 0;
+}
+
+/* Creates a floating point variant. NaN and infinities become the magic 'null' variant, zero becomes the
+ * magic zero-real variant; only other values cause an allocation. Returns 0 on success, negative errno-style
+ * value on failure. */
+int json_variant_new_real(JsonVariant **ret, double d) {
+        JsonVariant *variant;
+        int class, r;
+
+        assert_return(ret, -EINVAL);
+
+        class = fpclassify(d);
+
+        if (class == FP_NAN || class == FP_INFINITE) {
+                /* JSON doesn't know NaN, +Infinity or -Infinity. Let's silently convert to 'null'. */
+                *ret = JSON_VARIANT_MAGIC_NULL;
+                return 0;
+        }
+
+        if (class == FP_ZERO) {
+                *ret = JSON_VARIANT_MAGIC_ZERO_REAL;
+                return 0;
+        }
+
+        r = json_variant_new(&variant, JSON_VARIANT_REAL, sizeof(d));
+        if (r < 0)
+                return r;
+
+        variant->value.real = d;
+
+        *ret = variant;
+        return 0;
+}
+
+/* Creates a boolean variant. Both values are represented by magic pointers, hence this never allocates.
+ * Returns 0, or -EINVAL if 'ret' is NULL. */
+int json_variant_new_boolean(JsonVariant **ret, bool b) {
+        assert_return(ret, -EINVAL);
+
+        *ret = b ? JSON_VARIANT_MAGIC_TRUE : JSON_VARIANT_MAGIC_FALSE;
+        return 0;
+}
+
+/* Creates a 'null' variant. This is always the magic null pointer, hence this never allocates. Returns 0, or
+ * -EINVAL if 'ret' is NULL. */
+int json_variant_new_null(JsonVariant **ret) {
+ assert_return(ret, -EINVAL);
+
+ *ret = JSON_VARIANT_MAGIC_NULL;
+ return 0;
+}
+
+/* Creates a string variant from the first 'n' bytes of 's'. Pass n == SIZE_MAX to use the full NUL-terminated
+ * string. A NULL 's' (with n being 0 or SIZE_MAX) yields a 'null' variant, an empty string yields the magic
+ * empty-string variant.
+ *
+ * Returns 0 on success, -EINVAL on invalid arguments or if the first 'n' bytes contain a NUL byte, -EUCLEAN
+ * if the string is not valid UTF-8, or another negative errno-style value if allocation fails. */
+int json_variant_new_stringn(JsonVariant **ret, const char *s, size_t n) {
+        JsonVariant *variant;
+        int r;
+
+        assert_return(ret, -EINVAL);
+
+        if (!s) {
+                assert_return(IN_SET(n, 0, SIZE_MAX), -EINVAL);
+                return json_variant_new_null(ret);
+        }
+
+        if (n != SIZE_MAX) {
+                /* don't allow embedded NUL, as we can't express that in JSON */
+                if (memchr(s, 0, n))
+                        return -EINVAL;
+        } else
+                n = strlen(s); /* determine length automatically */
+
+        if (n == 0) {
+                *ret = JSON_VARIANT_MAGIC_EMPTY_STRING;
+                return 0;
+        }
+
+        /* JSON strings must be valid UTF-8 */
+        if (!utf8_is_valid_n(s, n))
+                return -EUCLEAN;
+
+        r = json_variant_new(&variant, JSON_VARIANT_STRING, n + 1);
+        if (r < 0)
+                return r;
+
+        memcpy(variant->string, s, n);
+        variant->string[n] = 0;
+
+        *ret = variant;
+        return 0;
+}
+
+/* Creates a string variant holding the Base64 encoding of the 'n' bytes at 'p'. Returns 0 on success, negative
+ * errno-style value on failure. */
+int json_variant_new_base64(JsonVariant **ret, const void *p, size_t n) {
+        _cleanup_free_ char *encoded = NULL;
+        ssize_t len;
+
+        assert_return(ret, -EINVAL);
+        assert_return(n == 0 || p, -EINVAL);
+
+        len = base64mem(p, n, &encoded);
+        if (len < 0)
+                return len;
+
+        return json_variant_new_stringn(ret, encoded, len);
+}
+
+/* Creates a string variant holding the base32hex encoding (produced by base32hexmem() with 'false' as third
+ * argument) of the 'n' bytes at 'p'. Returns 0 on success, negative errno-style value on failure. */
+int json_variant_new_base32hex(JsonVariant **ret, const void *p, size_t n) {
+        _cleanup_free_ char *encoded = NULL;
+
+        assert_return(ret, -EINVAL);
+        assert_return(n == 0 || p, -EINVAL);
+
+        encoded = base32hexmem(p, n, false);
+        if (!encoded)
+                return -ENOMEM;
+
+        return json_variant_new_string(ret, encoded);
+}
+
+/* Creates a string variant holding the hexadecimal encoding of the 'n' bytes at 'p'. Returns 0 on success,
+ * negative errno-style value on failure. */
+int json_variant_new_hex(JsonVariant **ret, const void *p, size_t n) {
+        _cleanup_free_ char *encoded = NULL;
+
+        assert_return(ret, -EINVAL);
+        assert_return(n == 0 || p, -EINVAL);
+
+        encoded = hexmem(p, n);
+        if (!encoded)
+                return -ENOMEM;
+
+        /* Two hex digits per input byte */
+        return json_variant_new_stringn(ret, encoded, n*2);
+}
+
+/* Creates a string variant holding the output of octescape() for the 'n' bytes at 'p'. Returns 0 on success,
+ * negative errno-style value on failure. */
+int json_variant_new_octescape(JsonVariant **ret, const void *p, size_t n) {
+        _cleanup_free_ char *escaped = NULL;
+
+        assert_return(ret, -EINVAL);
+        assert_return(n == 0 || p, -EINVAL);
+
+        escaped = octescape(p, n);
+        if (!escaped)
+                return -ENOMEM;
+
+        return json_variant_new_string(ret, escaped);
+}
+
+/* Creates a string variant from the 128-bit ID, formatted via SD_ID128_TO_STRING(). Returns whatever
+ * json_variant_new_string() returns. */
+int json_variant_new_id128(JsonVariant **ret, sd_id128_t id) {
+ return json_variant_new_string(ret, SD_ID128_TO_STRING(id));
+}
+
+/* Creates a string variant from the 128-bit ID, formatted via SD_ID128_TO_UUID_STRING(). Returns whatever
+ * json_variant_new_string() returns. */
+int json_variant_new_uuid(JsonVariant **ret, sd_id128_t id) {
+ return json_variant_new_string(ret, SD_ID128_TO_UUID_STRING(id));
+}
+
+/* Fills in the type and payload of 'a' so that it represents the same value as 'b'. Simple values and short
+ * strings are copied into 'a' directly; longer strings, arrays and objects are not copied, instead 'a' is
+ * turned into a reference to 'b' (taking a reference on it). A NULL 'b' turns 'a' into a 'null' variant.
+ *
+ * Only the type, is_reference flag and payload of 'a' are written; callers initialize the rest of 'a'. */
+static void json_variant_set(JsonVariant *a, JsonVariant *b) {
+ assert(a);
+
+ b = json_variant_dereference(b);
+ if (!b) {
+ a->type = JSON_VARIANT_NULL;
+ return;
+ }
+
+ a->type = json_variant_type(b);
+ switch (a->type) {
+
+ case JSON_VARIANT_INTEGER:
+ a->value.integer = json_variant_integer(b);
+ break;
+
+ case JSON_VARIANT_UNSIGNED:
+ a->value.unsig = json_variant_unsigned(b);
+ break;
+
+ case JSON_VARIANT_REAL:
+ a->value.real = json_variant_real(b);
+ break;
+
+ case JSON_VARIANT_BOOLEAN:
+ a->value.boolean = json_variant_boolean(b);
+ break;
+
+ case JSON_VARIANT_STRING: {
+ const char *s;
+
+ assert_se(s = json_variant_string(b));
+
+ /* Short strings we can store inline. strnlen() is bounded so that we never scan more than
+ * INLINE_STRING_MAX+1 bytes of a long string. */
+ if (strnlen(s, INLINE_STRING_MAX+1) <= INLINE_STRING_MAX) {
+ strcpy(a->string, s);
+ break;
+ }
+
+ /* For longer strings, use a reference… */
+ _fallthrough_;
+ }
+
+ case JSON_VARIANT_ARRAY:
+ case JSON_VARIANT_OBJECT:
+ /* Reference the (possibly simplified) original instead of copying it */
+ a->is_reference = true;
+ a->reference = json_variant_ref(json_variant_conservative_formalize(b));
+ break;
+
+ case JSON_VARIANT_NULL:
+ /* Nothing to store besides the type */
+ break;
+
+ default:
+ assert_not_reached();
+ }
+}
+
+/* Copies the source/line/column location from 'from' to 'v', taking a new reference on the source object.
+ * Does nothing if 'from' is not a regular variant (i.e. NULL, magic or const string). */
+static void json_variant_copy_source(JsonVariant *v, JsonVariant *from) {
+        assert(v);
+
+        if (json_variant_is_regular(from)) {
+                v->source = json_source_ref(from->source);
+                v->line = from->line;
+                v->column = from->column;
+        }
+}
+
+/* Appends 'element' to 'array' by initializing the next embedded slot behind the array header. The caller must
+ * have allocated room for that slot. Updates the array's depth and element count, and clears its 'normalized'
+ * flag if the element is not normalized. Returns 0 on success, -ELNRNG if the element is nested too deeply
+ * (in which case the array is left unmodified). */
+static int _json_variant_array_put_element(JsonVariant *array, JsonVariant *element) {
+        JsonVariant *slot;
+        uint16_t depth;
+
+        assert(array);
+
+        depth = json_variant_depth(element);
+        if (depth >= DEPTH_MAX) /* Refuse too deep nesting */
+                return -ELNRNG;
+
+        /* The slots immediately follow the array header; pick the first unused one */
+        slot = array + 1 + array->n_elements;
+        array->n_elements++;
+
+        if (array->depth <= depth)
+                array->depth = depth + 1;
+
+        *slot = (JsonVariant) {
+                .is_embedded = true,
+                .parent = array,
+        };
+
+        json_variant_set(slot, element);
+        json_variant_copy_source(slot, element);
+
+        if (!json_variant_is_normalized(element))
+                array->normalized = false;
+
+        return 0;
+}
+
+/* Creates an array variant out of the 'n' variants in 'array'. An empty array is represented by a magic
+ * pointer, without allocating memory. Returns 0 on success, negative errno-style value on failure (-ELNRNG if
+ * an element is nested too deeply). */
+int json_variant_new_array(JsonVariant **ret, JsonVariant **array, size_t n) {
+        _cleanup_(json_variant_unrefp) JsonVariant *arr = NULL;
+        int r;
+
+        assert_return(ret, -EINVAL);
+
+        if (n == 0) {
+                *ret = JSON_VARIANT_MAGIC_EMPTY_ARRAY;
+                return 0;
+        }
+
+        assert_return(array, -EINVAL);
+
+        /* One header plus one embedded slot per element */
+        arr = new(JsonVariant, n + 1);
+        if (!arr)
+                return -ENOMEM;
+
+        *arr = (JsonVariant) {
+                .n_ref = 1,
+                .type = JSON_VARIANT_ARRAY,
+                .normalized = true,
+        };
+
+        /* Each successful call bumps arr->n_elements by one, hence it always equals 'i' here */
+        for (size_t i = 0; i < n; i++) {
+                r = _json_variant_array_put_element(arr, array[i]);
+                if (r < 0)
+                        return r;
+        }
+
+        *ret = TAKE_PTR(arr);
+        return 0;
+}
+
+/* Creates an array variant with one unsigned integer element per byte of the 'n' bytes at 'p'. An empty array
+ * is represented by a magic pointer, without allocating memory. Returns 0 on success, -EINVAL on invalid
+ * arguments, -ENOMEM on allocation failure. */
+int json_variant_new_array_bytes(JsonVariant **ret, const void *p, size_t n) {
+        const uint8_t *bytes = p;
+        JsonVariant *arr;
+
+        assert_return(ret, -EINVAL);
+
+        if (n == 0) {
+                *ret = JSON_VARIANT_MAGIC_EMPTY_ARRAY;
+                return 0;
+        }
+
+        assert_return(p, -EINVAL);
+
+        /* One header plus one embedded slot per byte */
+        arr = new(JsonVariant, n + 1);
+        if (!arr)
+                return -ENOMEM;
+
+        *arr = (JsonVariant) {
+                .n_ref = 1,
+                .type = JSON_VARIANT_ARRAY,
+                .n_elements = n,
+                .depth = 1,
+                .normalized = true,
+        };
+
+        for (size_t i = 0; i < n; i++)
+                arr[1 + i] = (JsonVariant) {
+                        .is_embedded = true,
+                        .parent = arr,
+                        .type = JSON_VARIANT_UNSIGNED,
+                        .value.unsig = bytes[i],
+                };
+
+        *ret = arr;
+        return 0;
+}
+
+/* Creates an array variant of strings from the string vector 'l'. An empty (or, presumably, NULL -- depends
+ * on strv_length()) vector yields the magic empty-array variant. Returns 0 on success, -EUCLEAN if a string
+ * is not valid UTF-8, or another negative errno-style value on failure. */
+int json_variant_new_array_strv(JsonVariant **ret, char **l) {
+ _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+ size_t n;
+ int r;
+
+ assert(ret);
+
+ n = strv_length(l);
+ if (n == 0) {
+ *ret = JSON_VARIANT_MAGIC_EMPTY_ARRAY;
+ return 0;
+ }
+
+ /* One header plus one embedded slot per string */
+ v = new(JsonVariant, n + 1);
+ if (!v)
+ return -ENOMEM;
+
+ *v = (JsonVariant) {
+ .n_ref = 1,
+ .type = JSON_VARIANT_ARRAY,
+ .depth = 1,
+ };
+
+ /* n_elements doubles as loop counter, so that on early return the cleanup handler releases exactly
+ * the slots that have been completed so far. */
+ for (v->n_elements = 0; v->n_elements < n; v->n_elements++) {
+ JsonVariant *w = v + 1 + v->n_elements;
+ size_t k;
+
+ *w = (JsonVariant) {
+ .is_embedded = true,
+ .parent = v,
+ .type = JSON_VARIANT_STRING,
+ };
+
+ k = strlen(l[v->n_elements]);
+
+ if (k > INLINE_STRING_MAX) {
+ /* If string is too long, store it as reference. */
+
+ r = json_variant_new_string(&w->reference, l[v->n_elements]);
+ if (r < 0)
+ return r;
+
+ w->is_reference = true;
+ } else {
+ if (!utf8_is_valid_n(l[v->n_elements], k)) /* JSON strings must be valid UTF-8 */
+ return -EUCLEAN;
+
+ /* Short string: copy inline, including the trailing NUL */
+ memcpy(w->string, l[v->n_elements], k+1);
+ }
+ }
+
+ v->normalized = true;
+
+ *ret = TAKE_PTR(v);
+ return 0;
+}
+
+/* Creates an object variant from 'array', which holds 'n' variants alternating between key (must be a
+ * string) and value. An empty object is represented by a magic pointer, without allocating memory.
+ *
+ * The resulting object is marked 'sorted' if the keys appear in strictly ascending strcmp() order, and
+ * 'normalized' if additionally all values are normalized.
+ *
+ * Returns 0 on success, -EINVAL on invalid arguments (including odd 'n' or a non-string key), -ELNRNG if an
+ * element is nested too deeply, -ENOMEM on allocation failure. */
+int json_variant_new_object(JsonVariant **ret, JsonVariant **array, size_t n) {
+ _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+ const char *prev = NULL;
+ bool sorted = true, normalized = true;
+
+ assert_return(ret, -EINVAL);
+ if (n == 0) {
+ *ret = JSON_VARIANT_MAGIC_EMPTY_OBJECT;
+ return 0;
+ }
+ assert_return(array, -EINVAL);
+ assert_return(n % 2 == 0, -EINVAL);
+
+ /* One header plus one embedded slot per key and per value */
+ v = new(JsonVariant, n + 1);
+ if (!v)
+ return -ENOMEM;
+
+ *v = (JsonVariant) {
+ .n_ref = 1,
+ .type = JSON_VARIANT_OBJECT,
+ };
+
+ /* n_elements doubles as loop counter, so that on early return the cleanup handler releases exactly
+ * the slots that have been completed so far. */
+ for (v->n_elements = 0; v->n_elements < n; v->n_elements++) {
+ JsonVariant *w = v + 1 + v->n_elements,
+ *c = array[v->n_elements];
+ uint16_t d;
+
+ /* Even indexes are keys, odd indexes are values */
+ if ((v->n_elements & 1) == 0) {
+ const char *k;
+
+ if (!json_variant_is_string(c))
+ return -EINVAL; /* Every second one needs to be a string, as it is the key name */
+
+ assert_se(k = json_variant_string(c));
+
+ /* A key that does not sort strictly after its predecessor (this includes duplicates)
+ * breaks the ordering */
+ if (prev && strcmp(k, prev) <= 0)
+ sorted = normalized = false;
+
+ prev = k;
+ } else if (!json_variant_is_normalized(c))
+ normalized = false;
+
+ d = json_variant_depth(c);
+ if (d >= DEPTH_MAX) /* Refuse too deep nesting */
+ return -ELNRNG;
+ if (d >= v->depth)
+ v->depth = d + 1;
+
+ *w = (JsonVariant) {
+ .is_embedded = true,
+ .parent = v,
+ };
+
+ json_variant_set(w, c);
+ json_variant_copy_source(w, c);
+ }
+
+ v->normalized = normalized;
+ v->sorted = sorted;
+
+ *ret = TAKE_PTR(v);
+ return 0;
+}
+
+/* Returns the number of bytes of the variant structure that are in use, i.e. the header plus the payload
+ * matching its type (for arrays/objects: only the header including the element counter, not the elements).
+ * Returns 0 for anything that is not a regular variant. */
+static size_t json_variant_size(JsonVariant* v) {
+        size_t sz;
+
+        if (!json_variant_is_regular(v))
+                return 0;
+
+        if (v->is_reference)
+                return offsetof(JsonVariant, reference) + sizeof(JsonVariant*);
+
+        switch (v->type) {
+
+        case JSON_VARIANT_STRING:
+                sz = offsetof(JsonVariant, string) + strlen(v->string) + 1;
+                break;
+
+        case JSON_VARIANT_REAL:
+                sz = offsetof(JsonVariant, value) + sizeof(double);
+                break;
+
+        case JSON_VARIANT_UNSIGNED:
+                sz = offsetof(JsonVariant, value) + sizeof(uint64_t);
+                break;
+
+        case JSON_VARIANT_INTEGER:
+                sz = offsetof(JsonVariant, value) + sizeof(int64_t);
+                break;
+
+        case JSON_VARIANT_BOOLEAN:
+                sz = offsetof(JsonVariant, value) + sizeof(bool);
+                break;
+
+        case JSON_VARIANT_ARRAY:
+        case JSON_VARIANT_OBJECT:
+                sz = offsetof(JsonVariant, n_elements) + sizeof(size_t);
+                break;
+
+        case JSON_VARIANT_NULL:
+                sz = offsetof(JsonVariant, value);
+                break;
+
+        default:
+                assert_not_reached();
+        }
+
+        return sz;
+}
+
+/* Releases everything a variant holds (source reference, referenced variant, embedded elements), but not the
+ * memory of the variant itself. If the variant is marked sensitive, or 'force_sensitive' is true (used when
+ * recursing from a sensitive container), the used part of the structure is erased afterwards and referenced
+ * variants are marked sensitive too. */
+static void json_variant_free_inner(JsonVariant *v, bool force_sensitive) {
+ bool sensitive;
+
+ assert(v);
+
+ if (!json_variant_is_regular(v))
+ return;
+
+ json_source_unref(v->source);
+
+ sensitive = v->sensitive || force_sensitive;
+
+ if (v->is_reference) {
+ /* Propagate sensitivity to the referenced variant before dropping our reference on it */
+ if (sensitive)
+ json_variant_sensitive(v->reference);
+
+ json_variant_unref(v->reference);
+ return;
+ }
+
+ /* The elements of arrays/objects are stored right behind the header */
+ if (IN_SET(v->type, JSON_VARIANT_ARRAY, JSON_VARIANT_OBJECT))
+ for (size_t i = 0; i < v->n_elements; i++)
+ json_variant_free_inner(v + 1 + i, sensitive);
+
+ if (sensitive)
+ explicit_bzero_safe(v, json_variant_size(v));
+}
+
+/* Returns the reference counter of 'v'. Yields 0 for everything that has no counter of its own: NULL,
+ * non-regular (magic/const string) variants and variants embedded into an array/object. */
+static unsigned json_variant_n_ref(const JsonVariant *v) {
+        if (!v)
+                return 0;
+
+        if (!json_variant_is_regular(v))
+                return 0;
+
+        if (v->is_embedded)
+                return 0;
+
+        assert(v->n_ref > 0);
+        return v->n_ref;
+}
+
+/* Takes a reference on 'v' and returns 'v'. For NULL and non-regular (magic/const string) variants this is a
+ * no-op. For embedded variants the reference is taken on the enclosing array/object instead. */
+JsonVariant *json_variant_ref(JsonVariant *v) {
+        if (!v || !json_variant_is_regular(v))
+                return v;
+
+        if (v->is_embedded) {
+                json_variant_ref(v->parent); /* ref the compounding variant instead */
+                return v;
+        }
+
+        assert(v->n_ref > 0);
+        v->n_ref++;
+
+        return v;
+}
+
+/* Drops a reference on 'v', freeing it when the last reference is gone. For NULL and non-regular
+ * (magic/const string) variants this is a no-op. For embedded variants the reference is dropped on the
+ * enclosing array/object instead. Always returns NULL. */
+JsonVariant *json_variant_unref(JsonVariant *v) {
+        if (!v || !json_variant_is_regular(v))
+                return NULL;
+
+        if (v->is_embedded) {
+                json_variant_unref(v->parent);
+                return NULL;
+        }
+
+        assert(v->n_ref > 0);
+        v->n_ref--;
+
+        if (v->n_ref > 0)
+                return NULL;
+
+        /* That was the last reference */
+        json_variant_free_inner(v, false);
+        free(v);
+
+        return NULL;
+}
+
+/* Drops one reference on each of the 'n' variants in 'array', then frees the array itself. */
+void json_variant_unref_many(JsonVariant **array, size_t n) {
+        size_t i = 0;
+
+        assert(array || n == 0);
+
+        while (i < n)
+                json_variant_unref(array[i++]);
+
+        free(array);
+}
+
+/* Returns the string stored in the variant. References are followed, const string pointers are decoded, the
+ * magic empty string yields "". Returns NULL if 'v' is NULL (silently) or not a string (with a debug log
+ * message). */
+const char *json_variant_string(JsonVariant *v) {
+        if (!v)
+                return NULL;
+
+        if (v == JSON_VARIANT_MAGIC_EMPTY_STRING)
+                return "";
+
+        if (!json_variant_is_magic(v)) {
+
+                if (json_variant_is_const_string(v)) {
+                        uintptr_t p = (uintptr_t) v;
+
+                        /* Strip the marker bit to get back the actual string pointer */
+                        assert((p & 1) != 0);
+                        return (const char*) (p ^ 1U);
+                }
+
+                if (v->is_reference)
+                        return json_variant_string(v->reference);
+
+                if (v->type == JSON_VARIANT_STRING)
+                        return v->string;
+        }
+
+        log_debug("Non-string JSON variant requested as string, returning NULL.");
+        return NULL;
+}
+
+/* Returns the boolean stored in the variant, following references. Returns false (with a debug log message)
+ * if 'v' is NULL or not a boolean. */
+bool json_variant_boolean(JsonVariant *v) {
+        if (v == JSON_VARIANT_MAGIC_TRUE)
+                return true;
+
+        if (v == JSON_VARIANT_MAGIC_FALSE)
+                return false;
+
+        if (v && json_variant_is_regular(v) && v->type == JSON_VARIANT_BOOLEAN) {
+                if (v->is_reference)
+                        return json_variant_boolean(v->reference);
+
+                return v->value.boolean;
+        }
+
+        log_debug("Non-boolean JSON variant requested as boolean, returning false.");
+        return false;
+}
+
+/* Returns the value of the variant as signed 64-bit integer, following references. Unsigned and real values
+ * are converted if that is possible without loss. Returns 0 (with a debug log message) if 'v' is NULL, not
+ * numeric, or cannot be converted. */
+int64_t json_variant_integer(JsonVariant *v) {
+        bool is_magic_zero;
+
+        if (!v)
+                goto mismatch;
+
+        is_magic_zero =
+                v == JSON_VARIANT_MAGIC_ZERO_INTEGER ||
+                v == JSON_VARIANT_MAGIC_ZERO_UNSIGNED ||
+                v == JSON_VARIANT_MAGIC_ZERO_REAL;
+        if (is_magic_zero)
+                return 0;
+
+        if (!json_variant_is_regular(v))
+                goto mismatch;
+
+        if (v->is_reference)
+                return json_variant_integer(v->reference);
+
+        if (v->type == JSON_VARIANT_INTEGER)
+                return v->value.integer;
+
+        if (v->type == JSON_VARIANT_UNSIGNED) {
+                if (v->value.unsig > INT64_MAX) {
+                        log_debug("Unsigned integer %" PRIu64 " requested as signed integer and out of range, returning 0.", v->value.unsig);
+                        return 0;
+                }
+
+                return (int64_t) v->value.unsig;
+        }
+
+        if (v->type == JSON_VARIANT_REAL) {
+                int64_t converted = (int64_t) v->value.real;
+
+                /* Only accept the conversion if it round-trips */
+                if (!fp_equal((double) converted, v->value.real)) {
+                        log_debug("Real %g requested as integer, and cannot be converted losslessly, returning 0.", v->value.real);
+                        return 0;
+                }
+
+                return converted;
+        }
+
+mismatch:
+        log_debug("Non-integer JSON variant requested as integer, returning 0.");
+        return 0;
+}
+
+/* Returns the value of the variant as unsigned 64-bit integer, following references. Signed and real values
+ * are converted if that is possible without loss. Returns 0 (with a debug log message) if 'v' is NULL, not
+ * numeric, or cannot be converted. */
+uint64_t json_variant_unsigned(JsonVariant *v) {
+        if (!v)
+                goto mismatch;
+        if (v == JSON_VARIANT_MAGIC_ZERO_INTEGER ||
+            v == JSON_VARIANT_MAGIC_ZERO_UNSIGNED ||
+            v == JSON_VARIANT_MAGIC_ZERO_REAL)
+                return 0;
+        if (!json_variant_is_regular(v))
+                goto mismatch;
+        if (v->is_reference)
+                /* Resolve through the unsigned accessor: going through json_variant_integer() would turn
+                 * referenced values above INT64_MAX into 0 and let negative ones wrap around to huge
+                 * unsigned values. */
+                return json_variant_unsigned(v->reference);
+
+        switch (v->type) {
+
+        case JSON_VARIANT_INTEGER:
+                if (v->value.integer >= 0)
+                        return (uint64_t) v->value.integer;
+
+                log_debug("Signed integer %" PRIi64 " requested as unsigned integer and out of range, returning 0.", v->value.integer);
+                return 0;
+
+        case JSON_VARIANT_UNSIGNED:
+                return v->value.unsig;
+
+        case JSON_VARIANT_REAL: {
+                uint64_t converted;
+
+                converted = (uint64_t) v->value.real;
+
+                /* Only accept the conversion if it round-trips */
+                if (fp_equal((double) converted, v->value.real))
+                        return converted;
+
+                log_debug("Real %g requested as unsigned integer, and cannot be converted losslessly, returning 0.", v->value.real);
+                return 0;
+        }
+
+        default:
+                break;
+        }
+
+mismatch:
+        log_debug("Non-integer JSON variant requested as unsigned, returning 0.");
+        return 0;
+}
+
+/* Returns the value of the variant as double, following references. Signed and unsigned integers are
+ * converted if that is possible without loss. Returns 0.0 if 'v' is NULL (silently), or (with a debug log
+ * message) if it is not numeric or cannot be converted. */
+double json_variant_real(JsonVariant *v) {
+        if (!v)
+                return 0.0;
+        if (v == JSON_VARIANT_MAGIC_ZERO_INTEGER ||
+            v == JSON_VARIANT_MAGIC_ZERO_UNSIGNED ||
+            v == JSON_VARIANT_MAGIC_ZERO_REAL)
+                return 0.0;
+        if (!json_variant_is_regular(v))
+                goto mismatch;
+        if (v->is_reference)
+                return json_variant_real(v->reference);
+
+        switch (v->type) {
+
+        case JSON_VARIANT_REAL:
+                return v->value.real;
+
+        case JSON_VARIANT_INTEGER: {
+                double converted = (double) v->value.integer;
+
+                /* Only accept the conversion if it round-trips */
+                if ((int64_t) converted == v->value.integer)
+                        return converted;
+
+                log_debug("Signed integer %" PRIi64 " requested as real, and cannot be converted losslessly, returning 0.", v->value.integer);
+                return 0.0;
+        }
+
+        case JSON_VARIANT_UNSIGNED: {
+                double converted = (double) v->value.unsig;
+
+                /* Only accept the conversion if it round-trips */
+                if ((uint64_t) converted == v->value.unsig)
+                        return converted;
+
+                log_debug("Unsigned integer %" PRIu64 " requested as real, and cannot be converted losslessly, returning 0.", v->value.unsig);
+                return 0.0;
+        }
+
+        default:
+                break;
+        }
+
+mismatch:
+        /* The message used to claim the variant was "requested as integer", which is misleading here. */
+        log_debug("Non-numeric JSON variant requested as real, returning 0.");
+        return 0.0;
+}
+
+/* Returns true if the variant is a number below zero, following references.
+ *
+ * This function is useful as checking whether numbers are negative is pretty complex since we have three
+ * types of numbers. And some JSON code (OCI for example) uses negative numbers to mark "not defined" numeric
+ * values.
+ *
+ * Returns false (with a debug log message) if 'v' is NULL or not numeric. */
+bool json_variant_is_negative(JsonVariant *v) {
+        bool is_magic_zero;
+
+        if (!v)
+                goto mismatch;
+
+        is_magic_zero =
+                v == JSON_VARIANT_MAGIC_ZERO_INTEGER ||
+                v == JSON_VARIANT_MAGIC_ZERO_UNSIGNED ||
+                v == JSON_VARIANT_MAGIC_ZERO_REAL;
+        if (is_magic_zero)
+                return false;
+
+        if (!json_variant_is_regular(v))
+                goto mismatch;
+
+        if (v->is_reference)
+                return json_variant_is_negative(v->reference);
+
+        if (v->type == JSON_VARIANT_REAL)
+                return v->value.real < 0;
+
+        if (v->type == JSON_VARIANT_INTEGER)
+                return v->value.integer < 0;
+
+        if (v->type == JSON_VARIANT_UNSIGNED)
+                return false;
+
+mismatch:
+        log_debug("Non-integer JSON variant tested for negativity, returning false.");
+        return false;
+}
+
+/* Returns true if the specified variant is NULL, a JSON null, or an object without any fields. */
+bool json_variant_is_blank_object(JsonVariant *v) {
+        if (!v || json_variant_is_null(v))
+                return true;
+
+        return json_variant_is_object(v) && json_variant_elements(v) == 0;
+}
+
+/* Returns true if the specified variant is NULL, a JSON null, or an array without any elements. */
+bool json_variant_is_blank_array(JsonVariant *v) {
+        if (!v || json_variant_is_null(v))
+                return true;
+
+        return json_variant_is_array(v) && json_variant_elements(v) == 0;
+}
+
+/* Returns the type of the variant: derived from the pointer value for const string and magic pointers, read
+ * from the structure otherwise. Returns _JSON_VARIANT_TYPE_INVALID for NULL. */
+JsonVariantType json_variant_type(JsonVariant *v) {
+
+        if (!v)
+                return _JSON_VARIANT_TYPE_INVALID;
+
+        if (json_variant_is_const_string(v))
+                return JSON_VARIANT_STRING;
+
+        /* Magic pointers are small integers in disguise, hence we can dispatch on the pointer value */
+        switch ((uintptr_t) v) {
+
+        case _JSON_VARIANT_MAGIC_TRUE:
+        case _JSON_VARIANT_MAGIC_FALSE:
+                return JSON_VARIANT_BOOLEAN;
+
+        case _JSON_VARIANT_MAGIC_NULL:
+                return JSON_VARIANT_NULL;
+
+        case _JSON_VARIANT_MAGIC_ZERO_INTEGER:
+                return JSON_VARIANT_INTEGER;
+
+        case _JSON_VARIANT_MAGIC_ZERO_UNSIGNED:
+                return JSON_VARIANT_UNSIGNED;
+
+        case _JSON_VARIANT_MAGIC_ZERO_REAL:
+                return JSON_VARIANT_REAL;
+
+        case _JSON_VARIANT_MAGIC_EMPTY_STRING:
+                return JSON_VARIANT_STRING;
+
+        case _JSON_VARIANT_MAGIC_EMPTY_ARRAY:
+                return JSON_VARIANT_ARRAY;
+
+        case _JSON_VARIANT_MAGIC_EMPTY_OBJECT:
+                return JSON_VARIANT_OBJECT;
+
+        default:
+                break;
+        }
+
+        /* Not a special pointer: the type is stored in the structure */
+        return v->type;
+}
+
+_function_no_sanitize_float_cast_overflow_
+bool json_variant_has_type(JsonVariant *v, JsonVariantType type) {
+ JsonVariantType rt;
+
+ /* Checks whether the variant may be treated as being of the specified type. Besides an exact type
+ * match this accepts the JSON_VARIANT_NUMBER pseudo type for all numeric variants, and accepts numeric
+ * variants whose value survives a conversion to the requested numeric type and back unchanged.
+ * Returns false if the variant is NULL after dereferencing. */
+
+ /* Note: we turn off ubsan float cast overflow detection for this function, since it would complain
+ * about our float casts but we do them explicitly to detect conversion errors. */
+
+ v = json_variant_dereference(v);
+ if (!v)
+ return false;
+
+ rt = json_variant_type(v);
+ if (rt == type)
+ return true;
+
+ /* If it's a const string, then it only can be a string, and if it is not, it's not */
+ if (json_variant_is_const_string(v))
+ return false;
+
+ /* All three magic zeroes qualify as integer, unsigned and as real */
+ if ((v == JSON_VARIANT_MAGIC_ZERO_INTEGER || v == JSON_VARIANT_MAGIC_ZERO_UNSIGNED || v == JSON_VARIANT_MAGIC_ZERO_REAL) &&
+ IN_SET(type, JSON_VARIANT_INTEGER, JSON_VARIANT_UNSIGNED, JSON_VARIANT_REAL, JSON_VARIANT_NUMBER))
+ return true;
+
+ /* All other magic variant types are only equal to themselves */
+ if (json_variant_is_magic(v))
+ return false;
+
+ /* Handle the "number" pseudo type */
+ if (type == JSON_VARIANT_NUMBER)
+ return IN_SET(rt, JSON_VARIANT_INTEGER, JSON_VARIANT_UNSIGNED, JSON_VARIANT_REAL);
+
+ /* Integer conversions are OK in many cases */
+ if (rt == JSON_VARIANT_INTEGER && type == JSON_VARIANT_UNSIGNED)
+ return v->value.integer >= 0;
+ if (rt == JSON_VARIANT_UNSIGNED && type == JSON_VARIANT_INTEGER)
+ return v->value.unsig <= INT64_MAX;
+
+ /* Any integer that can be converted losslessly to a real and back may also be considered a real */
+ if (rt == JSON_VARIANT_INTEGER && type == JSON_VARIANT_REAL)
+ return (int64_t) (double) v->value.integer == v->value.integer;
+ if (rt == JSON_VARIANT_UNSIGNED && type == JSON_VARIANT_REAL)
+ return (uint64_t) (double) v->value.unsig == v->value.unsig;
+
+ /* Any real that can be converted losslessly to an integer and back may also be considered an integer */
+ if (rt == JSON_VARIANT_REAL && type == JSON_VARIANT_INTEGER)
+ return fp_equal((double) (int64_t) v->value.real, v->value.real);
+ if (rt == JSON_VARIANT_REAL && type == JSON_VARIANT_UNSIGNED)
+ return fp_equal((double) (uint64_t) v->value.real, v->value.real);
+
+ return false;
+}
+
+size_t json_variant_elements(JsonVariant *v) {
+        /* Returns the number of element slots of an array or object (for objects keys and values are
+         * counted separately, as the by-index accessors in this file step through objects in pairs).
+         * NULL and the magic empty array/object yield 0; any other non-container is logged and yields 0
+         * too. References are followed. */
+
+        if (!v || v == JSON_VARIANT_MAGIC_EMPTY_ARRAY || v == JSON_VARIANT_MAGIC_EMPTY_OBJECT)
+                return 0;
+
+        if (json_variant_is_regular(v) && IN_SET(v->type, JSON_VARIANT_ARRAY, JSON_VARIANT_OBJECT)) {
+                if (v->is_reference)
+                        return json_variant_elements(v->reference);
+
+                return v->n_elements;
+        }
+
+        log_debug("Number of elements in non-array/non-object JSON variant requested, returning 0.");
+        return 0;
+}
+
+JsonVariant *json_variant_by_index(JsonVariant *v, size_t idx) {
+        /* Returns the element slot 'idx' of an array or object, or NULL if there is no such slot. NULL and
+         * the magic empty containers quietly yield NULL; other non-containers are logged first.
+         * References are followed. */
+
+        if (!v || v == JSON_VARIANT_MAGIC_EMPTY_ARRAY || v == JSON_VARIANT_MAGIC_EMPTY_OBJECT)
+                return NULL;
+
+        if (!json_variant_is_regular(v) ||
+            !IN_SET(v->type, JSON_VARIANT_ARRAY, JSON_VARIANT_OBJECT)) {
+                log_debug("Element in non-array/non-object JSON variant requested by index, returning NULL.");
+                return NULL;
+        }
+
+        if (v->is_reference)
+                return json_variant_by_index(v->reference, idx);
+
+        /* The element slots are laid out directly behind the container header */
+        return idx < v->n_elements ? json_variant_conservative_formalize(v + 1 + idx) : NULL;
+}
+
+JsonVariant *json_variant_by_key_full(JsonVariant *v, const char *key, JsonVariant **ret_key) {
+        /* Looks up the field named 'key' in the JSON object 'v'. Returns the field's value, or NULL if 'v'
+         * or 'key' is NULL, if 'v' is not an object, or if there is no such field. If 'ret_key' is
+         * non-NULL it is always written: with the key variant on success, with NULL otherwise. */
+
+        if (!v)
+                goto not_found;
+        if (!key)
+                goto not_found;
+        if (v == JSON_VARIANT_MAGIC_EMPTY_OBJECT)
+                goto not_found;
+        if (!json_variant_is_regular(v))
+                goto mismatch;
+        if (v->type != JSON_VARIANT_OBJECT)
+                goto mismatch;
+        if (v->is_reference)
+                /* Forward ret_key too, so that the caller's pointer is initialized on this path as well */
+                return json_variant_by_key_full(v->reference, key, ret_key);
+
+        if (v->sorted) {
+                size_t a = 0, b = v->n_elements/2;
+
+                /* If the variant is sorted we can use bisection to find the entry we need in O(log(n)) time.
+                 * 'a' and 'b' count key/value pairs, hence the pair 'i' lives in slots i*2 and i*2+1. */
+
+                while (b > a) {
+                        JsonVariant *p;
+                        const char *f;
+                        size_t i;
+                        int c;
+
+                        i = (a + b) / 2;
+                        p = json_variant_dereference(v + 1 + i*2);
+
+                        assert_se(f = json_variant_string(p));
+
+                        c = strcmp(key, f);
+                        if (c == 0) {
+                                if (ret_key)
+                                        *ret_key = json_variant_conservative_formalize(v + 1 + i*2);
+
+                                return json_variant_conservative_formalize(v + 1 + i*2 + 1);
+                        } else if (c < 0)
+                                b = i;
+                        else
+                                a = i + 1;
+                }
+
+                goto not_found;
+        }
+
+        /* The variant is not sorted, hence search for the field linearly */
+        for (size_t i = 0; i < v->n_elements; i += 2) {
+                JsonVariant *p;
+
+                p = json_variant_dereference(v + 1 + i);
+
+                if (!json_variant_has_type(p, JSON_VARIANT_STRING))
+                        continue;
+
+                if (streq(json_variant_string(p), key)) {
+
+                        if (ret_key)
+                                *ret_key = json_variant_conservative_formalize(v + 1 + i);
+
+                        return json_variant_conservative_formalize(v + 1 + i + 1);
+                }
+        }
+
+not_found:
+        if (ret_key)
+                *ret_key = NULL;
+
+        return NULL;
+
+mismatch:
+        log_debug("Element in non-object JSON variant requested by key, returning NULL.");
+        if (ret_key)
+                *ret_key = NULL;
+
+        return NULL;
+}
+
+JsonVariant *json_variant_by_key(JsonVariant *v, const char *key) {
+ /* Convenience wrapper around json_variant_by_key_full() for callers that only want the value. */
+ return json_variant_by_key_full(v, key, NULL);
+}
+
+bool json_variant_equal(JsonVariant *a, JsonVariant *b) {
+ JsonVariantType t;
+
+ /* Deep comparison of two variants. Both sides are formalized first, and identical pointers compare
+ * equal right away. Otherwise 'b' must be usable as the type of 'a' (see json_variant_has_type()),
+ * and the values are compared per type: arrays element by element in order, objects as unordered
+ * sets of key/value pairs. */
+
+ a = json_variant_formalize(a);
+ b = json_variant_formalize(b);
+
+ if (a == b)
+ return true;
+
+ t = json_variant_type(a);
+ if (!json_variant_has_type(b, t))
+ return false;
+
+ switch (t) {
+
+ case JSON_VARIANT_STRING:
+ return streq(json_variant_string(a), json_variant_string(b));
+
+ case JSON_VARIANT_INTEGER:
+ return json_variant_integer(a) == json_variant_integer(b);
+
+ case JSON_VARIANT_UNSIGNED:
+ return json_variant_unsigned(a) == json_variant_unsigned(b);
+
+ case JSON_VARIANT_REAL:
+ return fp_equal(json_variant_real(a), json_variant_real(b));
+
+ case JSON_VARIANT_BOOLEAN:
+ return json_variant_boolean(a) == json_variant_boolean(b);
+
+ case JSON_VARIANT_NULL:
+ return true;
+
+ case JSON_VARIANT_ARRAY: {
+ size_t n = json_variant_elements(a);
+ if (n != json_variant_elements(b))
+ return false;
+
+ for (size_t i = 0; i < n; i++)
+ if (!json_variant_equal(json_variant_by_index(a, i), json_variant_by_index(b, i)))
+ return false;
+
+ return true;
+ }
+
+ case JSON_VARIANT_OBJECT: {
+ size_t n = json_variant_elements(a);
+ if (n != json_variant_elements(b))
+ return false;
+
+ /* Iterate through all keys in 'a' */
+ for (size_t i = 0; i < n; i += 2) {
+ bool found = false;
+
+ /* Match them against all keys in 'b' */
+ for (size_t j = 0; j < n; j += 2) {
+ JsonVariant *key_b;
+
+ key_b = json_variant_by_index(b, j);
+
+ /* NOTE(review): is_marked is written through the pointer returned by
+ * json_variant_by_index(); this presumes the returned key is always a regular,
+ * writable variant and never a magic/const one — confirm. */
+
+ /* During the first iteration unmark everything */
+ if (i == 0)
+ key_b->is_marked = false;
+ else if (key_b->is_marked) /* In later iterations if we already marked something, don't bother with it again */
+ continue;
+
+ /* Already matched in this pass; on the first pass we only keep looping to clear the marks */
+ if (found)
+ continue;
+
+ if (json_variant_equal(json_variant_by_index(a, i), key_b) &&
+ json_variant_equal(json_variant_by_index(a, i+1), json_variant_by_index(b, j+1))) {
+ /* Key and values match! */
+ key_b->is_marked = found = true;
+
+ /* In the first iteration we continue the inner loop since we want to mark
+ * everything, otherwise exit the loop quickly after we found what we were
+ * looking for. */
+ if (i != 0)
+ break;
+ }
+ }
+
+ if (!found)
+ return false;
+ }
+
+ return true;
+ }
+
+ default:
+ assert_not_reached();
+ }
+}
+
+void json_variant_sensitive(JsonVariant *v) {
+        assert(v);
+
+        /* Flags a variant as "sensitive", meaning its memory is erased when the variant is destroyed.
+         * The flag can only be turned on, never off again. Variants that are not regular after
+         * formalization (magic variants, and const strings, which are compiled into the program anyway and
+         * hence unsuitable for secrets) silently ignore the request.
+         *
+         * The flag acts recursively: on destruction of an array or object it is handed down to all
+         * contained variants, and from there further down when those are destroyed in turn. */
+
+        v = json_variant_formalize(v);
+        if (json_variant_is_regular(v))
+                v->sensitive = true;
+}
+
+bool json_variant_is_sensitive(JsonVariant *v) {
+        /* Reports whether the variant carries the "sensitive" flag. Anything that is not a regular variant
+         * after formalization is never sensitive. */
+
+        v = json_variant_formalize(v);
+
+        return json_variant_is_regular(v) && v->sensitive;
+}
+
+static void json_variant_propagate_sensitive(JsonVariant *from, JsonVariant *to) {
+        /* Carries the "sensitive" flag over from one variant to another; a no-op if 'from' lacks it. */
+
+        if (!json_variant_is_sensitive(from))
+                return;
+
+        json_variant_sensitive(to);
+}
+
+int json_variant_get_source(JsonVariant *v, const char **ret_source, unsigned *ret_line, unsigned *ret_column) {
+        bool regular;
+
+        /* Reports where a variant was parsed from. Each output pointer is optional. Variants that are not
+         * regular carry no location, hence report a NULL source name and line/column 0. Returns 0, or
+         * -EINVAL if 'v' is NULL. */
+
+        assert_return(v, -EINVAL);
+
+        regular = json_variant_is_regular(v);
+
+        if (ret_source)
+                *ret_source = (regular && v->source) ? v->source->name : NULL;
+
+        if (ret_line)
+                *ret_line = regular ? v->line : 0;
+
+        if (ret_column)
+                *ret_column = regular ? v->column : 0;
+
+        return 0;
+}
+
+static int print_source(FILE *f, JsonVariant *v, JsonFormatFlags flags, bool whitespace) {
+        size_t w, k;
+
+        /* Writes the "[source:line:column] " location prefix of 'v' to 'f'. This only happens if both
+         * JSON_FORMAT_SOURCE and JSON_FORMAT_PRETTY are set and the variant is regular and carries any
+         * location information. If 'whitespace' is true, the same number of spaces is written instead, so
+         * that lines without a prefix line up with lines that have one. Always returns 0. */
+
+        if (!FLAGS_SET(flags, JSON_FORMAT_SOURCE|JSON_FORMAT_PRETTY))
+                return 0;
+
+        if (!json_variant_is_regular(v))
+                return 0;
+
+        if (!v->source && v->line == 0 && v->column == 0)
+                return 0;
+
+        /* The max width we need to format the line numbers for this source file */
+        w = (v->source && v->source->max_line > 0) ?
+                DECIMAL_STR_WIDTH(v->source->max_line) :
+                DECIMAL_STR_MAX(unsigned)-1;
+        k = (v->source && v->source->max_column > 0) ?
+                DECIMAL_STR_WIDTH(v->source->max_column) :
+                DECIMAL_STR_MAX(unsigned) -1;
+
+        if (whitespace) {
+                /* One term per piece emitted by the non-whitespace branch below; both must stay in sync */
+                size_t n = 1 + (v->source ? strlen(v->source->name) : 0) +
+                        ((v->source && (v->line > 0 || v->column > 0)) ? 1 : 0) +
+                        (v->line > 0 ? w : 0) +
+                        (((v->source || v->line > 0) && v->column > 0) ? 1 : 0) +
+                        (v->column > 0 ? k : 0) +
+                        2;
+
+                for (size_t i = 0; i < n; i++)
+                        fputc(' ', f);
+        } else {
+                fputc('[', f);
+
+                if (v->source)
+                        fputs(v->source->name, f);
+                if (v->source && (v->line > 0 || v->column > 0))
+                        fputc(':', f);
+                if (v->line > 0)
+                        fprintf(f, "%*u", (int) w, v->line);
+                /* Separator before the column: only if a column follows and something precedes it. This
+                 * matches the width computed in the whitespace branch above. */
+                if ((v->source || v->line > 0) && v->column > 0)
+                        fputc(':', f);
+                if (v->column > 0)
+                        fprintf(f, "%*u", (int) k, v->column);
+
+                fputc(']', f);
+                fputc(' ', f);
+        }
+
+        return 0;
+}
+
+static void json_format_string(FILE *f, const char *q, JsonFormatFlags flags) {
+        /* Writes 'q' to 'f' as a quoted JSON string literal. Quote, backslash and the common control
+         * characters get their short escapes, other characters below space are written as \uXXXX, and
+         * everything else is passed through unchanged. With JSON_FORMAT_COLOR the text between the quotes
+         * is wrapped in color sequences. */
+
+        assert(q);
+
+        fputc('"', f);
+
+        if (flags & JSON_FORMAT_COLOR)
+                fputs(ansi_green(), f);
+
+        for (const char *c = q; *c; c++) {
+                const char *escape = NULL;
+
+                switch (*c) {
+                case '"':
+                        escape = "\\\"";
+                        break;
+                case '\\':
+                        escape = "\\\\";
+                        break;
+                case '\b':
+                        escape = "\\b";
+                        break;
+                case '\f':
+                        escape = "\\f";
+                        break;
+                case '\n':
+                        escape = "\\n";
+                        break;
+                case '\r':
+                        escape = "\\r";
+                        break;
+                case '\t':
+                        escape = "\\t";
+                        break;
+                default:
+                        break;
+                }
+
+                if (escape)
+                        fputs(escape, f);
+                else if ((signed char) *c >= 0 && *c < ' ')
+                        fprintf(f, "\\u%04x", (unsigned) *c);
+                else
+                        fputc(*c, f);
+        }
+
+        if (flags & JSON_FORMAT_COLOR)
+                fputs(ANSI_NORMAL, f);
+
+        fputc('"', f);
+}
+
+static int json_format(FILE *f, JsonVariant *v, JsonFormatFlags flags, const char *prefix) {
+ int r;
+
+ /* Recursively serializes 'v' as JSON text to 'f'. 'prefix' is the indentation written in front of
+ * nested lines in pretty mode (NULL is treated as empty); each nesting level appends a tab to it.
+ * Returns 0 on success, -errno if newlocale() fails, -ENOMEM if the nested prefix cannot be
+ * allocated, or the error of a recursive call. */
+
+ assert(f);
+ assert(v);
+
+ switch (json_variant_type(v)) {
+
+ case JSON_VARIANT_REAL: {
+ locale_t loc, old_loc;
+
+ /* Format under a temporary "C" numeric locale, so the decimal separator does not depend on
+ * the caller's locale */
+ loc = newlocale(LC_NUMERIC_MASK, "C", (locale_t) 0);
+ if (loc == (locale_t) 0)
+ return -errno;
+
+ if (flags & JSON_FORMAT_COLOR)
+ fputs(ansi_highlight_blue(), f);
+
+ old_loc = uselocale(loc);
+ fprintf(f, "%.*e", DECIMAL_DIG, json_variant_real(v));
+ uselocale(old_loc);
+
+ if (flags & JSON_FORMAT_COLOR)
+ fputs(ANSI_NORMAL, f);
+
+ freelocale(loc);
+ break;
+ }
+
+ case JSON_VARIANT_INTEGER:
+ if (flags & JSON_FORMAT_COLOR)
+ fputs(ansi_highlight_blue(), f);
+
+ fprintf(f, "%" PRIdMAX, json_variant_integer(v));
+
+ if (flags & JSON_FORMAT_COLOR)
+ fputs(ANSI_NORMAL, f);
+ break;
+
+ case JSON_VARIANT_UNSIGNED:
+ if (flags & JSON_FORMAT_COLOR)
+ fputs(ansi_highlight_blue(), f);
+
+ fprintf(f, "%" PRIuMAX, json_variant_unsigned(v));
+
+ if (flags & JSON_FORMAT_COLOR)
+ fputs(ANSI_NORMAL, f);
+ break;
+
+ case JSON_VARIANT_BOOLEAN:
+
+ if (flags & JSON_FORMAT_COLOR)
+ fputs(ANSI_HIGHLIGHT, f);
+
+ if (json_variant_boolean(v))
+ fputs("true", f);
+ else
+ fputs("false", f);
+
+ if (flags & JSON_FORMAT_COLOR)
+ fputs(ANSI_NORMAL, f);
+
+ break;
+
+ case JSON_VARIANT_NULL:
+ if (flags & JSON_FORMAT_COLOR)
+ fputs(ANSI_HIGHLIGHT, f);
+
+ fputs("null", f);
+
+ if (flags & JSON_FORMAT_COLOR)
+ fputs(ANSI_NORMAL, f);
+ break;
+
+ case JSON_VARIANT_STRING:
+ json_format_string(f, json_variant_string(v), flags);
+ break;
+
+ case JSON_VARIANT_ARRAY: {
+ size_t n = json_variant_elements(v);
+ if (n == 0)
+ fputs("[]", f);
+ else {
+ _cleanup_free_ char *joined = NULL;
+ const char *prefix2;
+
+ /* In pretty mode elements are indented one tab deeper than the enclosing brackets */
+ if (flags & JSON_FORMAT_PRETTY) {
+ joined = strjoin(strempty(prefix), "\t");
+ if (!joined)
+ return -ENOMEM;
+
+ prefix2 = joined;
+ fputs("[\n", f);
+ } else {
+ prefix2 = strempty(prefix);
+ fputc('[', f);
+ }
+
+ for (size_t i = 0; i < n; i++) {
+ JsonVariant *e;
+
+ assert_se(e = json_variant_by_index(v, i));
+
+ if (i > 0) {
+ if (flags & JSON_FORMAT_PRETTY)
+ fputs(",\n", f);
+ else
+ fputc(',', f);
+ }
+
+ if (flags & JSON_FORMAT_PRETTY) {
+ print_source(f, e, flags, false);
+ fputs(prefix2, f);
+ }
+
+ r = json_format(f, e, flags, prefix2);
+ if (r < 0)
+ return r;
+ }
+
+ if (flags & JSON_FORMAT_PRETTY) {
+ fputc('\n', f);
+ /* Pad with blanks, so the closing bracket lines up with source-prefixed lines */
+ print_source(f, v, flags, true);
+ fputs(strempty(prefix), f);
+ }
+
+ fputc(']', f);
+ }
+ break;
+ }
+
+ case JSON_VARIANT_OBJECT: {
+ size_t n = json_variant_elements(v);
+ if (n == 0)
+ fputs("{}", f);
+ else {
+ _cleanup_free_ char *joined = NULL;
+ const char *prefix2;
+
+ if (flags & JSON_FORMAT_PRETTY) {
+ joined = strjoin(strempty(prefix), "\t");
+ if (!joined)
+ return -ENOMEM;
+
+ prefix2 = joined;
+ fputs("{\n", f);
+ } else {
+ prefix2 = strempty(prefix);
+ fputc('{', f);
+ }
+
+ /* Slots come in key/value pairs: even index is the key, the following odd index its value */
+ for (size_t i = 0; i < n; i += 2) {
+ JsonVariant *e;
+
+ e = json_variant_by_index(v, i);
+
+ if (i > 0) {
+ if (flags & JSON_FORMAT_PRETTY)
+ fputs(",\n", f);
+ else
+ fputc(',', f);
+ }
+
+ if (flags & JSON_FORMAT_PRETTY) {
+ print_source(f, e, flags, false);
+ fputs(prefix2, f);
+ }
+
+ r = json_format(f, e, flags, prefix2);
+ if (r < 0)
+ return r;
+
+ fputs(flags & JSON_FORMAT_PRETTY ? " : " : ":", f);
+
+ r = json_format(f, json_variant_by_index(v, i+1), flags, prefix2);
+ if (r < 0)
+ return r;
+ }
+
+ if (flags & JSON_FORMAT_PRETTY) {
+ fputc('\n', f);
+ print_source(f, v, flags, true);
+ fputs(strempty(prefix), f);
+ }
+
+ fputc('}', f);
+ }
+ break;
+ }
+
+ default:
+ assert_not_reached();
+ }
+
+ return 0;
+}
+
+int json_variant_format(JsonVariant *v, JsonFormatFlags flags, char **ret) {
+        _cleanup_(memstream_done) MemStream m = {};
+        size_t length;
+        FILE *stream;
+        int r;
+
+        /* Serializes 'v' into a newly allocated string stored in *ret. On success the return value is the
+         * length of that string, not counting the trailing NUL; on failure it is a negative errno-style
+         * error. */
+
+        assert_return(v, -EINVAL);
+        assert_return(ret, -EINVAL);
+
+        if (flags & JSON_FORMAT_OFF)
+                return -ENOEXEC;
+
+        stream = memstream_init(&m);
+        if (!stream)
+                return -ENOMEM;
+
+        r = json_variant_dump(v, flags, stream, NULL);
+        if (r >= 0)
+                r = memstream_finalize(&m, ret, &length);
+        if (r < 0)
+                return r;
+
+        return length;
+}
+
+int json_variant_dump(JsonVariant *v, JsonFormatFlags flags, FILE *f, const char *prefix) {
+        int r;
+
+        /* Writes 'v' as JSON text to 'f' (stdout if NULL). A NULL variant is written as an empty array if
+         * JSON_FORMAT_EMPTY_ARRAY is set, and is otherwise skipped with return value 0. The "auto" color
+         * and pretty flags are resolved here, and the framing requested via JSON_FORMAT_SSE, JSON_FORMAT_SEQ
+         * and the newline flags is added around the document.
+         *
+         * Returns 0 on success, the error of the formatter if serialization fails (in which case the
+         * trailing framing is not written), or the result of fflush_and_check() if JSON_FORMAT_FLUSH is
+         * set. */
+
+        if (!v) {
+                if (flags & JSON_FORMAT_EMPTY_ARRAY)
+                        v = JSON_VARIANT_MAGIC_EMPTY_ARRAY;
+                else
+                        return 0;
+        }
+
+        if (!f)
+                f = stdout;
+
+        print_source(f, v, flags, false);
+
+        if (((flags & (JSON_FORMAT_COLOR_AUTO|JSON_FORMAT_COLOR)) == JSON_FORMAT_COLOR_AUTO) && colors_enabled())
+                flags |= JSON_FORMAT_COLOR;
+
+        if (((flags & (JSON_FORMAT_PRETTY_AUTO|JSON_FORMAT_PRETTY)) == JSON_FORMAT_PRETTY_AUTO))
+                flags |= on_tty() ? JSON_FORMAT_PRETTY : JSON_FORMAT_NEWLINE;
+
+        if (flags & JSON_FORMAT_SSE)
+                fputs("data: ", f);
+        if (flags & JSON_FORMAT_SEQ)
+                fputc('\x1e', f); /* ASCII Record Separator */
+
+        /* Don't swallow formatter failures (e.g. out of memory): otherwise callers would treat a
+         * truncated document as success. */
+        r = json_format(f, v, flags, prefix);
+        if (r < 0)
+                return r;
+
+        if (flags & (JSON_FORMAT_PRETTY|JSON_FORMAT_SEQ|JSON_FORMAT_SSE|JSON_FORMAT_NEWLINE))
+                fputc('\n', f);
+        if (flags & JSON_FORMAT_SSE)
+                fputc('\n', f); /* In case of SSE add a second newline */
+
+        if (flags & JSON_FORMAT_FLUSH)
+                return fflush_and_check(f);
+        return 0;
+}
+
+int json_variant_filter(JsonVariant **v, char **to_remove) {
+ _cleanup_(json_variant_unrefp) JsonVariant *w = NULL;
+ _cleanup_free_ JsonVariant **array = NULL;
+ size_t n = 0, k = 0;
+ int r;
+
+ /* Removes all fields whose key is listed in 'to_remove' from the object *v, replacing *v with a newly
+ * built object if anything was removed. Returns the number of removed fields, 0 if there was nothing
+ * to do (blank object, empty list, or no match), -EINVAL if *v is not an object or has a non-string
+ * key, or another negative error on allocation/construction failure. */
+
+ assert(v);
+
+ if (json_variant_is_blank_object(*v))
+ return 0;
+ if (!json_variant_is_object(*v))
+ return -EINVAL;
+
+ if (strv_isempty(to_remove))
+ return 0;
+
+ for (size_t i = 0; i < json_variant_elements(*v); i += 2) {
+ JsonVariant *p;
+
+ p = json_variant_by_index(*v, i);
+ if (!json_variant_has_type(p, JSON_VARIANT_STRING))
+ return -EINVAL;
+
+ if (strv_contains(to_remove, json_variant_string(p))) {
+ /* First match: only now allocate the output array, and copy over everything seen so
+ * far. At least one pair is dropped, hence two slots less than the input suffice. */
+ if (!array) {
+ array = new(JsonVariant*, json_variant_elements(*v) - 2);
+ if (!array)
+ return -ENOMEM;
+
+ for (k = 0; k < i; k++)
+ array[k] = json_variant_by_index(*v, k);
+ }
+
+ n++;
+ } else if (array) {
+ /* We are already building a filtered copy, keep this key/value pair */
+ array[k++] = p;
+ array[k++] = json_variant_by_index(*v, i + 1);
+ }
+ }
+
+ if (n == 0)
+ return 0;
+
+ r = json_variant_new_object(&w, array, k);
+ if (r < 0)
+ return r;
+
+ json_variant_propagate_sensitive(*v, w);
+ JSON_VARIANT_REPLACE(*v, TAKE_PTR(w));
+
+ return (int) n;
+}
+
+int json_variant_set_field(JsonVariant **v, const char *field, JsonVariant *value) {
+ _cleanup_(json_variant_unrefp) JsonVariant *field_variant = NULL, *w = NULL;
+ _cleanup_free_ JsonVariant **array = NULL;
+ size_t k = 0;
+ int r;
+
+ /* Sets the field 'field' of the object *v to 'value', by building a new object and replacing *v with
+ * it. A blank *v (see json_variant_is_blank_object()) is treated as an empty object. Existing fields
+ * of that name are dropped and the new key/value pair is appended at the end. Returns 1 on success,
+ * -EINVAL if *v is not an object or has a non-string key, or another negative error on
+ * allocation/construction failure. */
+
+ assert(v);
+ assert(field);
+
+ if (json_variant_is_blank_object(*v)) {
+ array = new(JsonVariant*, 2);
+ if (!array)
+ return -ENOMEM;
+
+ } else {
+ if (!json_variant_is_object(*v))
+ return -EINVAL;
+
+ for (size_t i = 0; i < json_variant_elements(*v); i += 2) {
+ JsonVariant *p;
+
+ p = json_variant_by_index(*v, i);
+ if (!json_variant_is_string(p))
+ return -EINVAL;
+
+ if (streq(json_variant_string(p), field)) {
+
+ /* Field exists already: one pair goes away and one is added, hence the old
+ * slot count is enough. Copy everything seen before the match. */
+ if (!array) {
+ array = new(JsonVariant*, json_variant_elements(*v));
+ if (!array)
+ return -ENOMEM;
+
+ for (k = 0; k < i; k++)
+ array[k] = json_variant_by_index(*v, k);
+ }
+
+ } else if (array) {
+ array[k++] = p;
+ array[k++] = json_variant_by_index(*v, i + 1);
+ }
+ }
+
+ /* Field was not present: copy all existing slots and leave room for one more pair */
+ if (!array) {
+ array = new(JsonVariant*, json_variant_elements(*v) + 2);
+ if (!array)
+ return -ENOMEM;
+
+ for (k = 0; k < json_variant_elements(*v); k++)
+ array[k] = json_variant_by_index(*v, k);
+ }
+ }
+
+ r = json_variant_new_string(&field_variant, field);
+ if (r < 0)
+ return r;
+
+ array[k++] = field_variant;
+ array[k++] = value;
+
+ r = json_variant_new_object(&w, array, k);
+ if (r < 0)
+ return r;
+
+ json_variant_propagate_sensitive(*v, w);
+ JSON_VARIANT_REPLACE(*v, TAKE_PTR(w));
+
+ return 1;
+}
+
+int json_variant_set_fieldb(JsonVariant **v, const char *field, ...) {
+        _cleanup_(json_variant_unrefp) JsonVariant *built = NULL;
+        va_list args;
+        int r;
+
+        /* Like json_variant_set_field(), but the value is constructed from the variadic arguments via
+         * json_buildv(). */
+
+        va_start(args, field);
+        r = json_buildv(&built, args);
+        va_end(args);
+
+        return r < 0 ? r : json_variant_set_field(v, field, built);
+}
+
+int json_variant_set_field_string(JsonVariant **v, const char *field, const char *value) {
+        _cleanup_(json_variant_unrefp) JsonVariant *wrapped = NULL;
+        int r;
+
+        /* Wraps 'value' in a string variant and assigns it to the field 'field' of *v. */
+
+        r = json_variant_new_string(&wrapped, value);
+
+        return r < 0 ? r : json_variant_set_field(v, field, wrapped);
+}
+
+int json_variant_set_field_integer(JsonVariant **v, const char *field, int64_t i) {
+        _cleanup_(json_variant_unrefp) JsonVariant *wrapped = NULL;
+        int r;
+
+        /* Wraps 'i' in a signed integer variant and assigns it to the field 'field' of *v. */
+
+        r = json_variant_new_integer(&wrapped, i);
+
+        return r < 0 ? r : json_variant_set_field(v, field, wrapped);
+}
+
+int json_variant_set_field_unsigned(JsonVariant **v, const char *field, uint64_t u) {
+        _cleanup_(json_variant_unrefp) JsonVariant *wrapped = NULL;
+        int r;
+
+        /* Wraps 'u' in an unsigned integer variant and assigns it to the field 'field' of *v. */
+
+        r = json_variant_new_unsigned(&wrapped, u);
+
+        return r < 0 ? r : json_variant_set_field(v, field, wrapped);
+}
+
+int json_variant_set_field_boolean(JsonVariant **v, const char *field, bool b) {
+        _cleanup_(json_variant_unrefp) JsonVariant *wrapped = NULL;
+        int r;
+
+        /* Wraps 'b' in a boolean variant and assigns it to the field 'field' of *v. */
+
+        r = json_variant_new_boolean(&wrapped, b);
+
+        return r < 0 ? r : json_variant_set_field(v, field, wrapped);
+}
+
+int json_variant_set_field_strv(JsonVariant **v, const char *field, char **l) {
+        _cleanup_(json_variant_unrefp) JsonVariant *wrapped = NULL;
+        int r;
+
+        /* Turns the string list 'l' into an array variant and assigns it to the field 'field' of *v. */
+
+        r = json_variant_new_array_strv(&wrapped, l);
+
+        return r < 0 ? r : json_variant_set_field(v, field, wrapped);
+}
+
+int json_variant_merge_object(JsonVariant **v, JsonVariant *m) {
+ _cleanup_(json_variant_unrefp) JsonVariant *w = NULL;
+ _cleanup_free_ JsonVariant **array = NULL;
+ size_t v_elements, m_elements, k;
+ bool v_blank, m_blank;
+ int r;
+
+ /* Merges the fields of object 'm' into object *v. Fields of *v that also exist in 'm' are dropped in
+ * favour of the ones from 'm'. Returns 0 if 'm' is blank and nothing changed, 1 if *v was replaced,
+ * -EINVAL if either side is neither blank nor an object (or *v has a non-string key), -ENOMEM on
+ * size overflow or allocation failure, or the error of json_variant_new_object(). */
+
+ m = json_variant_dereference(m);
+
+ v_blank = json_variant_is_blank_object(*v);
+ m_blank = json_variant_is_blank_object(m);
+
+ if (!v_blank && !json_variant_is_object(*v))
+ return -EINVAL;
+ if (!m_blank && !json_variant_is_object(m))
+ return -EINVAL;
+
+ if (m_blank)
+ return 0; /* nothing to do */
+
+ /* Nothing to merge into: simply take a reference on 'm' */
+ if (v_blank) {
+ JSON_VARIANT_REPLACE(*v, json_variant_ref(m));
+ return 1;
+ }
+
+ v_elements = json_variant_elements(*v);
+ m_elements = json_variant_elements(m);
+ if (v_elements > SIZE_MAX - m_elements) /* overflow check */
+ return -ENOMEM;
+
+ array = new(JsonVariant*, v_elements + m_elements);
+ if (!array)
+ return -ENOMEM;
+
+ /* First the fields of *v that 'm' does not override ... */
+ k = 0;
+ for (size_t i = 0; i < v_elements; i += 2) {
+ JsonVariant *u;
+
+ u = json_variant_by_index(*v, i);
+ if (!json_variant_is_string(u))
+ return -EINVAL;
+
+ if (json_variant_by_key(m, json_variant_string(u)))
+ continue; /* skip if exists in second variant */
+
+ array[k++] = u;
+ array[k++] = json_variant_by_index(*v, i + 1);
+ }
+
+ /* ... then all slots (keys and values alike) of 'm' */
+ for (size_t i = 0; i < m_elements; i++)
+ array[k++] = json_variant_by_index(m, i);
+
+ r = json_variant_new_object(&w, array, k);
+ if (r < 0)
+ return r;
+
+ json_variant_propagate_sensitive(*v, w);
+ json_variant_propagate_sensitive(m, w);
+ JSON_VARIANT_REPLACE(*v, TAKE_PTR(w));
+
+ return 1;
+}
+
+int json_variant_merge_objectb(JsonVariant **v, ...) {
+        _cleanup_(json_variant_unrefp) JsonVariant *built = NULL;
+        va_list args;
+        int r;
+
+        /* Like json_variant_merge_object(), but the object to merge in is constructed from the variadic
+         * arguments via json_buildv(). */
+
+        va_start(args, v);
+        r = json_buildv(&built, args);
+        va_end(args);
+
+        return r < 0 ? r : json_variant_merge_object(v, built);
+}
+
+int json_variant_append_array(JsonVariant **v, JsonVariant *element) {
+        _cleanup_(json_variant_unrefp) JsonVariant *nv = NULL;
+        bool blank;
+        int r;
+
+        /* Appends 'element' to the array *v. A NULL or JSON null *v is treated as an empty array. If we
+         * hold the only reference to *v the array is grown in place, otherwise a new array is built and
+         * *v is replaced by it. Returns -EINVAL if *v is neither blank nor an array, -ENOMEM on allocation
+         * failure, otherwise the result of the array construction helpers (0 on the replace path). */
+
+        assert(v);
+        assert(element);
+
+        if (!*v || json_variant_is_null(*v))
+                blank = true;
+        else if (json_variant_is_array(*v))
+                blank = json_variant_elements(*v) == 0;
+        else
+                return -EINVAL;
+
+        if (blank) {
+                r = json_variant_new_array(&nv, (JsonVariant*[]) { element }, 1);
+                if (r < 0)
+                        return r;
+        } else if (json_variant_n_ref(*v) == 1) {
+                /* Let's bump the reference count on element. We can't do the realloc if we're appending *v
+                 * to itself, or one of the objects embedded in *v to *v. If the reference count grows, we
+                 * need to fall back to the other method below. */
+
+                _unused_ _cleanup_(json_variant_unrefp) JsonVariant *dummy = json_variant_ref(element);
+                if (json_variant_n_ref(*v) == 1) {
+                        /* We hold the only reference. Let's mutate the object. */
+                        size_t size = json_variant_elements(*v);
+                        void *old = *v;
+
+                        if (!GREEDY_REALLOC(*v, size + 1 + 1))
+                                return -ENOMEM;
+
+                        if (old != *v)
+                                /* Readjust the parent pointers to the new address. The embedded elements
+                                 * occupy the slots [1 .. size] right behind the array header, hence all
+                                 * of them, including the first one, need to be re-pointed. */
+                                for (size_t i = 0; i < size; i++)
+                                        (*v)[1 + i].parent = *v;
+
+                        return _json_variant_array_put_element(*v, element);
+                }
+        }
+
+        if (!blank) {
+                size_t size = json_variant_elements(*v);
+
+                _cleanup_free_ JsonVariant **array = new(JsonVariant*, size + 1);
+                if (!array)
+                        return -ENOMEM;
+
+                for (size_t i = 0; i < size; i++)
+                        array[i] = json_variant_by_index(*v, i);
+
+                array[size] = element;
+
+                r = json_variant_new_array(&nv, array, size + 1);
+                if (r < 0)
+                        return r;
+        }
+
+        json_variant_propagate_sensitive(*v, nv);
+        JSON_VARIANT_REPLACE(*v, TAKE_PTR(nv));
+
+        return 0;
+}
+
+int json_variant_append_arrayb(JsonVariant **v, ...) {
+        _cleanup_(json_variant_unrefp) JsonVariant *built = NULL;
+        va_list args;
+        int r;
+
+        /* Like json_variant_append_array(), but the element to append is constructed from the variadic
+         * arguments via json_buildv(). */
+
+        va_start(args, v);
+        r = json_buildv(&built, args);
+        va_end(args);
+
+        return r < 0 ? r : json_variant_append_array(v, built);
+}
+
+JsonVariant *json_variant_find(JsonVariant *haystack, JsonVariant *needle) {
+ JsonVariant *i;
+
+ /* Find a json object in an array. Returns NULL if not found, or if the array is not actually an array.
+ * Elements are compared with json_variant_equal(), i.e. by value, and the first match is returned. */
+
+ JSON_VARIANT_ARRAY_FOREACH(i, haystack)
+ if (json_variant_equal(i, needle))
+ return i;
+
+ return NULL;
+}
+
+int json_variant_append_array_nodup(JsonVariant **v, JsonVariant *element) {
+        /* Appends 'element' to the array *v, unless an equal element is in there already, in which case
+         * nothing is changed and 0 is returned. */
+
+        assert(v);
+
+        return json_variant_find(*v, element) ? 0 : json_variant_append_array(v, element);
+}
+
+int json_variant_strv(JsonVariant *v, char ***ret) {
+ char **l = NULL;
+ bool sensitive;
+ int r;
+
+ /* Converts a JSON array of strings into a newly allocated, NULL-terminated string list stored in
+ * *ret. A NULL or JSON null variant yields an empty list. Returns 0 on success, -EINVAL if 'v' is not
+ * an array or contains a non-string element, -ENOMEM on allocation failure. */
+
+ assert(ret);
+
+ if (!v || json_variant_is_null(v)) {
+ l = new0(char*, 1);
+ if (!l)
+ return -ENOMEM;
+
+ *ret = l;
+ return 0;
+ }
+
+ if (!json_variant_is_array(v))
+ return -EINVAL;
+
+ sensitive = json_variant_is_sensitive(v);
+
+ size_t n = json_variant_elements(v);
+ l = new(char*, n+1);
+ if (!l)
+ return -ENOMEM;
+
+ for (size_t i = 0; i < n; i++) {
+ JsonVariant *e;
+
+ assert_se(e = json_variant_by_index(v, i));
+ sensitive = sensitive || json_variant_is_sensitive(e);
+
+ if (!json_variant_is_string(e)) {
+ /* Terminate the partially filled list, so that the cleanup below knows where to stop */
+ l[i] = NULL;
+ r = -EINVAL;
+ goto fail;
+ }
+
+ l[i] = strdup(json_variant_string(e));
+ if (!l[i]) {
+ /* l[i] is NULL here, i.e. the partial list is terminated already */
+ r = -ENOMEM;
+ goto fail;
+ }
+ }
+
+ l[n] = NULL;
+ *ret = TAKE_PTR(l);
+
+ return 0;
+
+fail:
+ /* If the array or any element copied so far was marked sensitive, erase the copies before freeing */
+ if (sensitive)
+ strv_free_erase(l);
+ else
+ strv_free(l);
+
+ return r;
+}
+
+static int json_variant_copy(JsonVariant **nv, JsonVariant *v) {
+ JsonVariantType t;
+ JsonVariant *c;
+ JsonValue value;
+ const void *source;
+ size_t k;
+
+ /* Allocates a new variant with reference count 1 that stands for the same value as 'v' and stores it
+ * in *nv. Returns 0 on success, -ENOMEM on allocation failure. */
+
+ assert(nv);
+ assert(v);
+
+ /* Let's copy the simple types literally, and the larger types by references */
+ t = json_variant_type(v);
+ switch (t) {
+ case JSON_VARIANT_INTEGER:
+ k = sizeof(int64_t);
+ value.integer = json_variant_integer(v);
+ source = &value;
+ break;
+
+ case JSON_VARIANT_UNSIGNED:
+ k = sizeof(uint64_t);
+ value.unsig = json_variant_unsigned(v);
+ source = &value;
+ break;
+
+ case JSON_VARIANT_REAL:
+ k = sizeof(double);
+ value.real = json_variant_real(v);
+ source = &value;
+ break;
+
+ case JSON_VARIANT_BOOLEAN:
+ k = sizeof(bool);
+ value.boolean = json_variant_boolean(v);
+ source = &value;
+ break;
+
+ case JSON_VARIANT_NULL:
+ k = 0;
+ source = NULL;
+ break;
+
+ case JSON_VARIANT_STRING:
+ source = json_variant_string(v);
+ k = strnlen(source, INLINE_STRING_MAX + 1);
+ if (k <= INLINE_STRING_MAX) {
+ /* Short enough to be copied inline; account for the trailing NUL */
+ k ++;
+ break;
+ }
+
+ /* Longer strings are copied by reference, like arrays and objects */
+ _fallthrough_;
+
+ default:
+ /* Everything else copy by reference */
+
+ c = malloc0(MAX(sizeof(JsonVariant),
+ offsetof(JsonVariant, reference) + sizeof(JsonVariant*)));
+ if (!c)
+ return -ENOMEM;
+
+ c->n_ref = 1;
+ c->type = t;
+ c->is_reference = true;
+ c->reference = json_variant_ref(json_variant_formalize(v));
+
+ /* NOTE(review): unlike the literal-copy path below, this path does not call
+ * json_variant_propagate_sensitive() — confirm this is intended. */
+
+ *nv = c;
+ return 0;
+ }
+
+ /* Literal copy: allocate room for the header plus 'k' bytes of payload, zero-initialized */
+ c = malloc0(MAX(sizeof(JsonVariant),
+ offsetof(JsonVariant, value) + k));
+ if (!c)
+ return -ENOMEM;
+
+ c->n_ref = 1;
+ c->type = t;
+
+ memcpy_safe(&c->value, source, k);
+
+ json_variant_propagate_sensitive(v, c);
+
+ *nv = c;
+ return 0;
+}
+
+static bool json_single_ref(JsonVariant *v) {
+
+        /* Tells whether the caller is the sole owner of the variant and hence may modify it in place.
+         * Embedded variants have no reference counter of their own, hence we walk up the chain of parents
+         * until we reach the variant that is actually reference counted. Variants that are not regular
+         * are never considered singly owned. */
+
+        while (json_variant_is_regular(v)) {
+                if (!v->is_embedded) {
+                        assert(v->n_ref > 0);
+                        return v->n_ref == 1;
+                }
+
+                v = v->parent;
+        }
+
+        return false;
+}
+
+static int json_variant_set_source(JsonVariant **v, JsonSource *source, unsigned line, unsigned column) {
+ JsonVariant *w;
+ int r;
+
+ assert(v);
+
+ /* Patch in source and line/column number. Tries to do this in-place if the caller is the sole
+ * referencer of the object. If not, allocates a new object, possibly a surrogate for the original
+ * one.
+ *
+ * Returns 0 if nothing had to be changed, 1 if the location was set (in place or on a copy that now
+ * replaces *v), or a negative error if the copy could not be allocated. */
+
+ if (!*v)
+ return 0;
+
+ /* Track the largest line/column seen for this source; print_source() derives its column widths
+ * from these */
+ if (source && line > source->max_line)
+ source->max_line = line;
+ if (source && column > source->max_column)
+ source->max_column = column;
+
+ if (!json_variant_is_regular(*v)) {
+
+ /* Non-regular variants carry no location; nothing to do unless one is to be attached */
+ if (!source && line == 0 && column == 0)
+ return 0;
+
+ } else {
+ if (json_source_equal((*v)->source, source) &&
+ (*v)->line == line &&
+ (*v)->column == column)
+ return 0;
+
+ if (json_single_ref(*v)) { /* Sole reference? */
+ json_source_unref((*v)->source);
+ (*v)->source = json_source_ref(source);
+ (*v)->line = line;
+ (*v)->column = column;
+ return 1;
+ }
+ }
+
+ /* Can't patch in place: create a private copy and attach the location to that */
+ r = json_variant_copy(&w, *v);
+ if (r < 0)
+ return r;
+
+ assert(json_variant_is_regular(w));
+ assert(!w->is_embedded);
+ assert(w->n_ref == 1);
+ assert(!w->source);
+
+ w->source = json_source_ref(source);
+ w->line = line;
+ w->column = column;
+
+ JSON_VARIANT_REPLACE(*v, w);
+
+ return 1;
+}
+
+static void inc_lines_columns(unsigned *line, unsigned *column, const char *s, size_t n) {
+        assert(line);
+        assert(column);
+        assert(s || n == 0);
+
+        /* Advances the line/column counters over the 'n' bytes at 's'. A newline starts a new line at
+         * column 1; every other character, whether a single ASCII byte or a multi-byte UTF-8 sequence,
+         * advances the column by one. */
+
+        while (n > 0) {
+                size_t step = 1;
+
+                if (*s == '\n') {
+                        (*line)++;
+                        *column = 1;
+                } else {
+                        bool ascii = (signed char) *s >= 0 && *s < 127; /* Process ASCII chars quickly */
+
+                        if (!ascii) {
+                                int w;
+
+                                w = utf8_encoded_valid_unichar(s, n);
+                                if (w < 0) /* count invalid unichars as normal characters */
+                                        w = 1;
+                                else if ((size_t) w > n) /* never read more than the specified number of characters */
+                                        w = (int) n;
+
+                                step = w;
+                        }
+
+                        (*column)++;
+                }
+
+                s += step;
+                n -= step;
+        }
+}
+
+static int unhex_ucs2(const char *c, uint16_t *ret) {
+        uint16_t value = 0;
+
+        assert(c);
+        assert(ret);
+
+        /* Decodes the four hex digits at 'c' into a 16-bit code unit, most significant digit first. We
+         * stop at the first character that is not a hex digit, without looking at the ones behind it.
+         * A result of zero is refused. Returns 0 on success, -EINVAL otherwise. */
+
+        for (size_t i = 0; i < 4; i++) {
+                int digit;
+
+                digit = unhexchar(c[i]);
+                if (digit < 0)
+                        return -EINVAL;
+
+                value = (uint16_t) ((value << 4) | (uint16_t) digit);
+        }
+
+        if (value == 0)
+                return -EINVAL;
+
+        *ret = value;
+
+        return 0;
+}
+
+static int json_parse_string(const char **p, char **ret) {
+ _cleanup_free_ char *s = NULL;
+ size_t n = 0;
+ const char *c;
+
+ /* Parses the JSON string literal at *p, which must start with the opening quote. On success the
+ * unescaped string is returned newly allocated in *ret, *p is advanced to right behind the closing
+ * quote and JSON_TOKEN_STRING is returned. Returns -EINVAL on malformed input (no opening quote,
+ * premature end, raw control characters, unknown escapes, broken surrogate pairs), -ENOMEM on
+ * allocation failure, or the negative result of utf8_encoded_valid_unichar(). */
+
+ assert(p);
+ assert(*p);
+ assert(ret);
+
+ c = *p;
+
+ if (*c != '"')
+ return -EINVAL;
+
+ c++;
+
+ for (;;) {
+ int len;
+
+ /* Check for EOF */
+ if (*c == 0)
+ return -EINVAL;
+
+ /* Check for control characters 0x00..0x1f */
+ if (*c > 0 && *c < ' ')
+ return -EINVAL;
+
+ /* Check for control character 0x7f */
+ if (*c == 0x7f)
+ return -EINVAL;
+
+ if (*c == '"') {
+ if (!s) {
+ s = strdup("");
+ if (!s)
+ return -ENOMEM;
+ } else
+ s[n] = 0; /* every allocation request below leaves room for this terminator */
+
+ *p = c + 1;
+
+ *ret = TAKE_PTR(s);
+ return JSON_TOKEN_STRING;
+ }
+
+ if (*c == '\\') {
+ char ch = 0;
+ c++;
+
+ if (*c == 0)
+ return -EINVAL;
+
+ if (IN_SET(*c, '"', '\\', '/'))
+ ch = *c;
+ else if (*c == 'b')
+ ch = '\b';
+ else if (*c == 'f')
+ ch = '\f';
+ else if (*c == 'n')
+ ch = '\n';
+ else if (*c == 'r')
+ ch = '\r';
+ else if (*c == 't')
+ ch = '\t';
+ else if (*c == 'u') {
+ char16_t x;
+ int r;
+
+ r = unhex_ucs2(c + 1, &x);
+ if (r < 0)
+ return r;
+
+ /* Skip the 'u' and the four hex digits */
+ c += 5;
+
+ if (!GREEDY_REALLOC(s, n + 5))
+ return -ENOMEM;
+
+ if (!utf16_is_surrogate(x))
+ n += utf8_encode_unichar(s + n, (char32_t) x);
+ else if (utf16_is_trailing_surrogate(x))
+ return -EINVAL;
+ else {
+ char16_t y;
+
+ /* A leading surrogate must be followed by a second \uXXXX escape carrying
+ * the trailing surrogate */
+ if (c[0] != '\\' || c[1] != 'u')
+ return -EINVAL;
+
+ r = unhex_ucs2(c + 2, &y);
+ if (r < 0)
+ return r;
+
+ c += 6;
+
+ if (!utf16_is_trailing_surrogate(y))
+ return -EINVAL;
+
+ n += utf8_encode_unichar(s + n, utf16_surrogate_pair_to_unichar(x, y));
+ }
+
+ continue;
+ } else
+ return -EINVAL;
+
+ if (!GREEDY_REALLOC(s, n + 2))
+ return -ENOMEM;
+
+ s[n++] = ch;
+ c ++;
+ continue;
+ }
+
+ /* Anything else is copied over verbatim, one validated UTF-8 character at a time */
+ len = utf8_encoded_valid_unichar(c, SIZE_MAX);
+ if (len < 0)
+ return len;
+
+ if (!GREEDY_REALLOC(s, n + len + 1))
+ return -ENOMEM;
+
+ memcpy(s + n, c, len);
+ n += len;
+ c += len;
+ }
+}
+
+static int json_parse_number(const char **p, JsonValue *ret) {
+ bool negative = false, exponent_negative = false, is_real = false;
+ double x = 0.0, y = 0.0, exponent = 0.0, shift = 1.0;
+ int64_t i = 0;
+ uint64_t u = 0;
+ const char *c;
+
+ /* Parses the JSON number at *p and advances *p to right behind it. The result is stored in the member
+ * of 'ret' matching the returned token: JSON_TOKEN_REAL (ret->real) if the number has a fraction or
+ * exponent, or if its integer part does not fit into 64 bits; JSON_TOKEN_INTEGER (ret->integer) for
+ * all other numbers with a leading minus sign; JSON_TOKEN_UNSIGNED (ret->unsig) otherwise. Returns
+ * -EINVAL if the text is not a valid number. */
+
+ assert(p);
+ assert(*p);
+ assert(ret);
+
+ c = *p;
+
+ if (*c == '-') {
+ negative = true;
+ c++;
+ }
+
+ if (*c == '0')
+ c++;
+ else {
+ /* Note that strchr() also matches the terminating NUL, hence the explicit check for it */
+ if (!strchr("123456789", *c) || *c == 0)
+ return -EINVAL;
+
+ do {
+ /* Accumulate the integer part exactly for as long as it fits; negative numbers are
+ * accumulated downwards in 'i', all others upwards in 'u'. On overflow switch to real. */
+ if (!is_real) {
+ if (negative) {
+
+ if (i < INT64_MIN / 10) /* overflow */
+ is_real = true;
+ else {
+ int64_t t = 10 * i;
+
+ if (t < INT64_MIN + (*c - '0')) /* overflow */
+ is_real = true;
+ else
+ i = t - (*c - '0');
+ }
+ } else {
+ if (u > UINT64_MAX / 10) /* overflow */
+ is_real = true;
+ else {
+ uint64_t t = 10 * u;
+
+ if (t > UINT64_MAX - (*c - '0')) /* overflow */
+ is_real = true;
+ else
+ u = t + (*c - '0');
+ }
+ }
+ }
+
+ /* The floating point accumulator is always maintained in parallel */
+ x = 10.0 * x + (*c - '0');
+
+ c++;
+ } while (strchr("0123456789", *c) && *c != 0);
+ }
+
+ if (*c == '.') {
+ is_real = true;
+ c++;
+
+ if (!strchr("0123456789", *c) || *c == 0)
+ return -EINVAL;
+
+ /* 'y' collects the fraction digits as an integer, 'shift' the power of ten to divide it by */
+ do {
+ y = 10.0 * y + (*c - '0');
+ shift = 10.0 * shift;
+ c++;
+ } while (strchr("0123456789", *c) && *c != 0);
+ }
+
+ if (IN_SET(*c, 'e', 'E')) {
+ is_real = true;
+ c++;
+
+ if (*c == '-') {
+ exponent_negative = true;
+ c++;
+ } else if (*c == '+')
+ c++;
+
+ if (!strchr("0123456789", *c) || *c == 0)
+ return -EINVAL;
+
+ do {
+ exponent = 10.0 * exponent + (*c - '0');
+ c++;
+ } while (strchr("0123456789", *c) && *c != 0);
+ }
+
+ *p = c;
+
+ if (is_real) {
+ ret->real = ((negative ? -1.0 : 1.0) * (x + (y / shift))) * exp10((exponent_negative ? -1.0 : 1.0) * exponent);
+ return JSON_TOKEN_REAL;
+ } else if (negative) {
+ ret->integer = i;
+ return JSON_TOKEN_INTEGER;
+ } else {
+ ret->unsig = u;
+ return JSON_TOKEN_UNSIGNED;
+ }
+}
+
+/* Reads the next token from *p.
+ *
+ * Leading whitespace is skipped first. The return value is one of the JSON_TOKEN_* constants, or a
+ * negative errno-style error (-EINVAL if the input cannot start a token in the current state, or the
+ * error of json_parse_string()/json_parse_number()). For string tokens the decoded text is returned in
+ * *ret_string, for number and boolean tokens the value is returned in *ret_value; outputs that do not
+ * apply are set to NULL / JSON_VALUE_NULL.
+ *
+ * *state carries the tokenizer state between calls and must start out as NULL; the first call then
+ * sets *line and *column to 1. On error *p and *state are not updated, but *line/*column have already
+ * been advanced over the skipped whitespace. */
+int json_tokenize(
+                const char **p,
+                char **ret_string,
+                JsonValue *ret_value,
+                unsigned *ret_line,   /* 'ret_line' returns the line at the beginning of this token */
+                unsigned *ret_column,
+                void **state,
+                unsigned *line,       /* 'line' is used as a line state, it always reflects the line we are at after the token was read */
+                unsigned *column) {
+
+        enum {
+                STATE_NULL,        /* fresh state, nothing tokenized yet */
+                STATE_VALUE,       /* a value, or an opening/closing bracket, may follow */
+                STATE_VALUE_POST,  /* a value just ended: only ':', ',', '}' or ']' may follow */
+        };
+
+        unsigned token_line, token_column;
+        const char *token_start, *c;
+        int cur_state, next_state, r;
+        size_t n_blank;
+
+        assert(p);
+        assert(*p);
+        assert(ret_string);
+        assert(ret_value);
+        assert(ret_line);
+        assert(ret_column);
+        assert(line);
+        assert(column);
+        assert(state);
+
+        cur_state = PTR_TO_INT(*state);
+        if (cur_state == STATE_NULL) {
+                /* First call on this state: positions start at 1:1 */
+                *line = 1;
+                *column = 1;
+                cur_state = STATE_VALUE;
+        }
+
+        /* Step over leading whitespace, keeping the position counters in sync */
+        n_blank = strspn(*p, WHITESPACE);
+        inc_lines_columns(line, column, *p, n_blank);
+        c = *p + n_blank;
+
+        /* This is where the token proper begins */
+        token_start = c;
+        token_line = *line;
+        token_column = *column;
+
+        if (*c == 0) {
+                /* End of input; note that the state is deliberately left untouched here */
+                *ret_string = NULL;
+                *ret_value = JSON_VALUE_NULL;
+                r = JSON_TOKEN_END;
+                goto finish;
+        }
+
+        switch (cur_state) {
+
+        case STATE_VALUE:
+
+                switch (*c) {
+
+                case '{':
+                        next_state = STATE_VALUE;
+                        r = JSON_TOKEN_OBJECT_OPEN;
+                        goto punctuation;
+
+                case '}':
+                        next_state = STATE_VALUE_POST;
+                        r = JSON_TOKEN_OBJECT_CLOSE;
+                        goto punctuation;
+
+                case '[':
+                        next_state = STATE_VALUE;
+                        r = JSON_TOKEN_ARRAY_OPEN;
+                        goto punctuation;
+
+                case ']':
+                        next_state = STATE_VALUE_POST;
+                        r = JSON_TOKEN_ARRAY_CLOSE;
+                        goto punctuation;
+
+                case '"':
+                        r = json_parse_string(&c, ret_string);
+                        if (r < 0)
+                                return r;
+
+                        *ret_value = JSON_VALUE_NULL;
+                        *state = INT_TO_PTR(STATE_VALUE_POST);
+                        goto finish;
+
+                case '-':
+                case '0':
+                case '1':
+                case '2':
+                case '3':
+                case '4':
+                case '5':
+                case '6':
+                case '7':
+                case '8':
+                case '9':
+                        r = json_parse_number(&c, ret_value);
+                        if (r < 0)
+                                return r;
+
+                        *ret_string = NULL;
+                        *state = INT_TO_PTR(STATE_VALUE_POST);
+                        goto finish;
+
+                default:
+                        break;
+                }
+
+                /* What remains are the three keyword literals */
+                if (startswith(c, "true")) {
+                        *ret_string = NULL;
+                        ret_value->boolean = true;
+                        c += 4;
+                        *state = INT_TO_PTR(STATE_VALUE_POST);
+                        r = JSON_TOKEN_BOOLEAN;
+                        goto finish;
+                }
+
+                if (startswith(c, "false")) {
+                        *ret_string = NULL;
+                        ret_value->boolean = false;
+                        c += 5;
+                        *state = INT_TO_PTR(STATE_VALUE_POST);
+                        r = JSON_TOKEN_BOOLEAN;
+                        goto finish;
+                }
+
+                if (startswith(c, "null")) {
+                        *ret_string = NULL;
+                        *ret_value = JSON_VALUE_NULL;
+                        c += 4;
+                        *state = INT_TO_PTR(STATE_VALUE_POST);
+                        r = JSON_TOKEN_NULL;
+                        goto finish;
+                }
+
+                return -EINVAL;
+
+        case STATE_VALUE_POST:
+
+                switch (*c) {
+
+                case ':':
+                        next_state = STATE_VALUE;
+                        r = JSON_TOKEN_COLON;
+                        goto punctuation;
+
+                case ',':
+                        next_state = STATE_VALUE;
+                        r = JSON_TOKEN_COMMA;
+                        goto punctuation;
+
+                case '}':
+                        next_state = STATE_VALUE_POST;
+                        r = JSON_TOKEN_OBJECT_CLOSE;
+                        goto punctuation;
+
+                case ']':
+                        next_state = STATE_VALUE_POST;
+                        r = JSON_TOKEN_ARRAY_CLOSE;
+                        goto punctuation;
+
+                default:
+                        return -EINVAL;
+                }
+
+        default:
+                assert_not_reached();
+        }
+
+punctuation:
+        /* Single-character tokens: consume the character, record the follow-up state, no payload */
+        c++;
+        *state = INT_TO_PTR(next_state);
+        *ret_string = NULL;
+        *ret_value = JSON_VALUE_NULL;
+
+finish:
+        /* Account for the characters of the token itself and report where it began */
+        inc_lines_columns(line, column, token_start, c - token_start);
+        *p = c;
+
+        *ret_line = token_line;
+        *ret_column = token_column;
+
+        return r;
+}
+
+typedef enum JsonExpect { /* What a JsonStack frame is prepared to accept next */
+ /* The following values are used by json_parse() */
+ EXPECT_TOPLEVEL, /* initial state of the bottom frame: exactly one value is expected (json_buildv() starts here as well) */
+ EXPECT_END, /* the top-level value has been seen (or opened); the parser only accepts the end of input now (also used by json_buildv() to stop) */
+ EXPECT_OBJECT_FIRST_KEY, /* right after '{': a key string or '}' */
+ EXPECT_OBJECT_NEXT_KEY, /* right after ',' inside an object: a key string */
+ EXPECT_OBJECT_COLON, /* right after a key: ':' */
+ EXPECT_OBJECT_VALUE, /* right after ':': a value (json_buildv() also enters this state after a key) */
+ EXPECT_OBJECT_COMMA, /* right after an object value: ',' or '}' */
+ EXPECT_ARRAY_FIRST_ELEMENT, /* right after '[': a value or ']' */
+ EXPECT_ARRAY_NEXT_ELEMENT, /* right after ',' inside an array: a value */
+ EXPECT_ARRAY_COMMA, /* right after an array element: ',' or ']' */
+
+ /* And these are used by json_build() */
+ EXPECT_ARRAY_ELEMENT, /* inside an array being built: another element or the end of the array */
+ EXPECT_OBJECT_KEY, /* inside an object being built: another key/pair command or the end of the object */
+} JsonExpect;
+
+typedef struct JsonStack { /* One nesting level while parsing or building: the bottom frame receives the top-level value, every open array/object gets a frame of its own */
+ JsonExpect expect; /* what may come next on this level */
+ JsonVariant **elements; /* variants collected on this level so far; for objects keys and values are stored alternately */
+ size_t n_elements; /* number of entries of 'elements' in use */
+ unsigned line_before; /* When parsing: line of the token that opened this level, reused as source line of the finished array/object */
+ unsigned column_before; /* When parsing: column of the token that opened this level */
+ size_t n_suppress; /* When building: if > 0, suppress this many subsequent elements. If == SIZE_MAX, suppress all subsequent elements */
+} JsonStack;
+
+static void json_stack_release(JsonStack *s) { /* Releases what a stack frame has collected; the frame structure itself is not freed here */
+ assert(s);
+
+ CLEANUP_ARRAY(s->elements, s->n_elements, json_variant_unref_many); /* NOTE(review): presumably hands s->elements/s->n_elements to json_variant_unref_many() when this scope ends and resets both fields - confirm against the CLEANUP_ARRAY definition */
+}
+
+/* Tells whether a value (a scalar, or the opening bracket of an array/object) is acceptable while the
+ * parser is in state 'e'. */
+static bool json_parse_value_allowed(JsonExpect e) {
+        return IN_SET(e,
+                      EXPECT_TOPLEVEL,
+                      EXPECT_OBJECT_VALUE,
+                      EXPECT_ARRAY_FIRST_ELEMENT,
+                      EXPECT_ARRAY_NEXT_ELEMENT);
+}
+
+/* Maps the state in which a value was accepted to the state the same stack frame is in afterwards. */
+static JsonExpect json_parse_expect_after_value(JsonExpect e) {
+        switch (e) {
+
+        case EXPECT_TOPLEVEL:
+                return EXPECT_END;
+
+        case EXPECT_OBJECT_VALUE:
+                return EXPECT_OBJECT_COMMA;
+
+        default:
+                assert(IN_SET(e, EXPECT_ARRAY_FIRST_ELEMENT, EXPECT_ARRAY_NEXT_ELEMENT));
+                return EXPECT_ARRAY_COMMA;
+        }
+}
+
+/* Parses one JSON value from *input into a JsonVariant tree.
+ *
+ * input        - in/out: text to parse; on success it is advanced to where parsing stopped
+ * source       - passed on to json_variant_set_source() for every variant created; may be NULL
+ * flags        - if JSON_PARSE_SENSITIVE is set every variant is marked via json_variant_sensitive()
+ * ret          - receives a reference to the parsed top-level variant on success
+ * line, column - optional position counters maintained by json_tokenize(); NULL is allowed
+ * continue_end - if true, stop as soon as the top-level value is complete and leave whatever follows
+ *                unread; if false, keep reading and insist that only the end of input follows
+ *
+ * Returns 0 on success, -EINVAL on a token that is not allowed in the current state, -ENOMEM on
+ * allocation failure, or the negative error of json_tokenize() or of a json_variant_new_*() call. */
+static int json_parse_internal(
+                const char **input,
+                JsonSource *source,
+                JsonParseFlags flags,
+                JsonVariant **ret,
+                unsigned *line,
+                unsigned *column,
+                bool continue_end) {
+
+        unsigned line_fallback = 0, column_fallback = 0;
+        void *tokenizer_state = NULL;
+        JsonStack *frames = NULL;
+        size_t depth = 1;
+        const char *p;
+        int r;
+
+        assert_return(input, -EINVAL);
+        assert_return(ret, -EINVAL);
+
+        p = *input;
+
+        if (!GREEDY_REALLOC(frames, depth))
+                return -ENOMEM;
+
+        /* The bottom frame collects the single top-level value */
+        frames[0] = (JsonStack) {
+                .expect = EXPECT_TOPLEVEL,
+        };
+
+        /* The tokenizer insists on position counters, so lend it ours if the caller has none */
+        if (!line)
+                line = &line_fallback;
+        if (!column)
+                column = &column_fallback;
+
+        for (;;) {
+                _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+                _cleanup_free_ char *text = NULL;
+                unsigned line_token, column_token;
+                JsonStack *top;
+                JsonValue value;
+                int token;
+
+                assert(depth > 0);
+                top = frames + depth - 1;
+
+                /* In "continue" mode we are finished once the top-level value is complete */
+                if (continue_end && top->expect == EXPECT_END)
+                        goto done;
+
+                token = json_tokenize(&p, &text, &value, &line_token, &column_token, &tokenizer_state, line, column);
+                if (token < 0) {
+                        r = token;
+                        goto finish;
+                }
+
+                switch (token) {
+
+                case JSON_TOKEN_END:
+                        if (top->expect != EXPECT_END) {
+                                r = -EINVAL;
+                                goto finish;
+                        }
+
+                        assert(top->n_elements == 1);
+                        assert(depth == 1);
+                        goto done;
+
+                case JSON_TOKEN_COLON:
+                        if (top->expect != EXPECT_OBJECT_COLON) {
+                                r = -EINVAL;
+                                goto finish;
+                        }
+
+                        top->expect = EXPECT_OBJECT_VALUE;
+                        break;
+
+                case JSON_TOKEN_COMMA:
+                        if (!IN_SET(top->expect, EXPECT_OBJECT_COMMA, EXPECT_ARRAY_COMMA)) {
+                                r = -EINVAL;
+                                goto finish;
+                        }
+
+                        top->expect = top->expect == EXPECT_OBJECT_COMMA ?
+                                EXPECT_OBJECT_NEXT_KEY :
+                                EXPECT_ARRAY_NEXT_ELEMENT;
+                        break;
+
+                case JSON_TOKEN_OBJECT_OPEN:
+                case JSON_TOKEN_ARRAY_OPEN:
+                        if (!json_parse_value_allowed(top->expect)) {
+                                r = -EINVAL;
+                                goto finish;
+                        }
+
+                        if (!GREEDY_REALLOC(frames, depth + 1)) {
+                                r = -ENOMEM;
+                                goto finish;
+                        }
+
+                        /* The array may have moved, look the parent frame up again */
+                        top = frames + depth - 1;
+
+                        /* Prepare the parent's state for when the child is closed again */
+                        top->expect = json_parse_expect_after_value(top->expect);
+
+                        frames[depth++] = (JsonStack) {
+                                .expect = token == JSON_TOKEN_OBJECT_OPEN ?
+                                        EXPECT_OBJECT_FIRST_KEY :
+                                        EXPECT_ARRAY_FIRST_ELEMENT,
+                                .line_before = line_token,
+                                .column_before = column_token,
+                        };
+
+                        top = frames + depth - 1;
+                        break;
+
+                case JSON_TOKEN_OBJECT_CLOSE:
+                case JSON_TOKEN_ARRAY_CLOSE: {
+                        bool is_object = token == JSON_TOKEN_OBJECT_CLOSE, permitted;
+
+                        /* A closing bracket may follow the opening one directly, or a complete member,
+                         * but never a comma. */
+                        if (is_object)
+                                permitted = IN_SET(top->expect, EXPECT_OBJECT_FIRST_KEY, EXPECT_OBJECT_COMMA);
+                        else
+                                permitted = IN_SET(top->expect, EXPECT_ARRAY_FIRST_ELEMENT, EXPECT_ARRAY_COMMA);
+
+                        if (!permitted) {
+                                r = -EINVAL;
+                                goto finish;
+                        }
+
+                        assert(depth > 1);
+
+                        if (is_object)
+                                r = json_variant_new_object(&v, top->elements, top->n_elements);
+                        else
+                                r = json_variant_new_array(&v, top->elements, top->n_elements);
+                        if (r < 0)
+                                goto finish;
+
+                        /* The container is attributed to the position of its opening bracket */
+                        line_token = top->line_before;
+                        column_token = top->column_before;
+
+                        /* Pop the child frame; the new variant gets appended to the parent below */
+                        json_stack_release(top);
+                        depth--, top--;
+                        break;
+                }
+
+                case JSON_TOKEN_STRING:
+                        /* Strings are special: they serve as object keys, too */
+                        if (!IN_SET(top->expect, EXPECT_OBJECT_FIRST_KEY, EXPECT_OBJECT_NEXT_KEY) &&
+                            !json_parse_value_allowed(top->expect)) {
+                                r = -EINVAL;
+                                goto finish;
+                        }
+
+                        r = json_variant_new_string(&v, text);
+                        if (r < 0)
+                                goto finish;
+
+                        if (IN_SET(top->expect, EXPECT_OBJECT_FIRST_KEY, EXPECT_OBJECT_NEXT_KEY))
+                                top->expect = EXPECT_OBJECT_COLON;
+                        else
+                                top->expect = json_parse_expect_after_value(top->expect);
+
+                        break;
+
+                case JSON_TOKEN_REAL:
+                case JSON_TOKEN_INTEGER:
+                case JSON_TOKEN_UNSIGNED:
+                case JSON_TOKEN_BOOLEAN:
+                case JSON_TOKEN_NULL:
+                        if (!json_parse_value_allowed(top->expect)) {
+                                r = -EINVAL;
+                                goto finish;
+                        }
+
+                        if (token == JSON_TOKEN_REAL)
+                                r = json_variant_new_real(&v, value.real);
+                        else if (token == JSON_TOKEN_INTEGER)
+                                r = json_variant_new_integer(&v, value.integer);
+                        else if (token == JSON_TOKEN_UNSIGNED)
+                                r = json_variant_new_unsigned(&v, value.unsig);
+                        else if (token == JSON_TOKEN_BOOLEAN)
+                                r = json_variant_new_boolean(&v, value.boolean);
+                        else
+                                r = json_variant_new_null(&v);
+                        if (r < 0)
+                                goto finish;
+
+                        top->expect = json_parse_expect_after_value(top->expect);
+                        break;
+
+                default:
+                        assert_not_reached();
+                }
+
+                /* Punctuation and opening brackets produce no variant */
+                if (!v)
+                        continue;
+
+                /* If we are asked to make this parsed object sensitive, then let's apply this
+                 * immediately after allocating each variant, so that when we abort half-way
+                 * everything we already allocated that is then freed is correctly marked. */
+                if (FLAGS_SET(flags, JSON_PARSE_SENSITIVE))
+                        json_variant_sensitive(v);
+
+                /* Failure to record the source position is deliberately ignored */
+                (void) json_variant_set_source(&v, source, line_token, column_token);
+
+                if (!GREEDY_REALLOC(top->elements, top->n_elements + 1)) {
+                        r = -ENOMEM;
+                        goto finish;
+                }
+
+                top->elements[top->n_elements++] = TAKE_PTR(v);
+        }
+
+done:
+        assert(depth == 1);
+        assert(frames[0].n_elements == 1);
+
+        /* Take our own reference, the frame's reference is dropped right below */
+        *ret = json_variant_ref(frames[0].elements[0]);
+        *input = p;
+        r = 0;
+
+finish:
+        for (size_t k = 0; k < depth; k++)
+                json_stack_release(frames + k);
+
+        free(frames);
+
+        return r;
+}
+
+/* Parses 'input' as one JSON value and requires that nothing but the end of input follows it.
+ *
+ * If 'source' is non-NULL a JsonSource is created from it and handed to the parser for the variants
+ * it creates. 'ret_line' and 'ret_column' are passed through to json_parse_internal() as its position
+ * counters and may be NULL. Returns -ENOMEM if the JsonSource cannot be allocated, otherwise the result
+ * of json_parse_internal(). */
+int json_parse_with_source(
+                const char *input,
+                const char *source,
+                JsonParseFlags flags,
+                JsonVariant **ret,
+                unsigned *ret_line,
+                unsigned *ret_column) {
+
+        _cleanup_(json_source_unrefp) JsonSource *src = NULL;
+
+        src = source ? json_source_new(source) : NULL;
+        if (source && !src)
+                return -ENOMEM;
+
+        /* continue_end is off: keep reading until the end of the input */
+        return json_parse_internal(&input, src, flags, ret, ret_line, ret_column, false);
+}
+
+/* Parses one JSON value starting at *p and stops right after it, without looking at what follows.
+ *
+ * On success *p is advanced past the parsed value, so that the caller can carry on from there. If
+ * 'source' is non-NULL a JsonSource is created from it and handed to the parser for the variants it
+ * creates. 'ret_line' and 'ret_column' are passed through to json_parse_internal() as its position
+ * counters and may be NULL. Returns -ENOMEM if the JsonSource cannot be allocated, otherwise the result
+ * of json_parse_internal(). */
+int json_parse_with_source_continue(
+                const char **p,
+                const char *source,
+                JsonParseFlags flags,
+                JsonVariant **ret,
+                unsigned *ret_line,
+                unsigned *ret_column) {
+
+        _cleanup_(json_source_unrefp) JsonSource *src = NULL;
+
+        src = source ? json_source_new(source) : NULL;
+        if (source && !src)
+                return -ENOMEM;
+
+        /* continue_end is on: return as soon as the top-level value is complete */
+        return json_parse_internal(p, src, flags, ret, ret_line, ret_column, true);
+}
+
+/* Reads a whole JSON document and parses it.
+ *
+ * The text is read from 'f' if that is non-NULL, otherwise from 'path' via read_full_file_full() with
+ * 'dir_fd' passed along. 'path', if given, also serves as the source name for the parsed variants, even
+ * when the data came from 'f'.
+ *
+ * Returns -EINVAL if neither 'f' nor 'path' is given, the read error if reading fails, -ENODATA if the
+ * input is empty, and otherwise the result of json_parse_with_source(). */
+int json_parse_file_at(
+                FILE *f,
+                int dir_fd,
+                const char *path,
+                JsonParseFlags flags,
+                JsonVariant **ret,
+                unsigned *ret_line,
+                unsigned *ret_column) {
+
+        _cleanup_free_ char *contents = NULL;
+        int r;
+
+        /* We need at least one place to read from */
+        if (!f && !path)
+                return -EINVAL;
+
+        /* An open stream takes precedence over the path */
+        r = f ? read_full_stream(f, &contents, NULL) :
+                read_full_file_full(dir_fd, path, UINT64_MAX, SIZE_MAX, 0, NULL, &contents, NULL);
+        if (r < 0)
+                return r;
+
+        if (isempty(contents))
+                return -ENODATA;
+
+        return json_parse_with_source(contents, path, flags, ret, ret_line, ret_column);
+}
+
+int json_buildv(JsonVariant **ret, va_list ap) {
+ JsonStack *stack = NULL;
+ size_t n_stack = 1;
+ const char *name = NULL;
+ int r;
+
+ assert_return(ret, -EINVAL);
+
+ if (!GREEDY_REALLOC(stack, n_stack))
+ return -ENOMEM;
+
+ stack[0] = (JsonStack) {
+ .expect = EXPECT_TOPLEVEL,
+ };
+
+ for (;;) {
+ _cleanup_(json_variant_unrefp) JsonVariant *add = NULL, *add_more = NULL;
+ size_t n_subtract = 0; /* how much to subtract from current->n_suppress, i.e. how many elements would
+ * have been added to the current variant */
+ JsonStack *current;
+ int command;
+
+ assert(n_stack > 0);
+ current = stack + n_stack - 1;
+
+ if (current->expect == EXPECT_END)
+ goto done;
+
+ command = va_arg(ap, int);
+
+ switch (command) {
+
+ case _JSON_BUILD_STRING: {
+ const char *p;
+
+ if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_ELEMENT)) {
+ r = -EINVAL;
+ goto finish;
+ }
+
+ p = va_arg(ap, const char *);
+
+ if (current->n_suppress == 0) {
+ r = json_variant_new_string(&add, p);
+ if (r < 0)
+ goto finish;
+ }
+
+ n_subtract = 1;
+
+ if (current->expect == EXPECT_TOPLEVEL)
+ current->expect = EXPECT_END;
+ else if (current->expect == EXPECT_OBJECT_VALUE)
+ current->expect = EXPECT_OBJECT_KEY;
+ else
+ assert(current->expect == EXPECT_ARRAY_ELEMENT);
+
+ break;
+ }
+
+ case _JSON_BUILD_INTEGER: {
+ int64_t j;
+
+ if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_ELEMENT)) {
+ r = -EINVAL;
+ goto finish;
+ }
+
+ j = va_arg(ap, int64_t);
+
+ if (current->n_suppress == 0) {
+ r = json_variant_new_integer(&add, j);
+ if (r < 0)
+ goto finish;
+ }
+
+ n_subtract = 1;
+
+ if (current->expect == EXPECT_TOPLEVEL)
+ current->expect = EXPECT_END;
+ else if (current->expect == EXPECT_OBJECT_VALUE)
+ current->expect = EXPECT_OBJECT_KEY;
+ else
+ assert(current->expect == EXPECT_ARRAY_ELEMENT);
+
+ break;
+ }
+
+ case _JSON_BUILD_UNSIGNED: {
+ uint64_t j;
+
+ if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_ELEMENT)) {
+ r = -EINVAL;
+ goto finish;
+ }
+
+ j = va_arg(ap, uint64_t);
+
+ if (current->n_suppress == 0) {
+ r = json_variant_new_unsigned(&add, j);
+ if (r < 0)
+ goto finish;
+ }
+
+ n_subtract = 1;
+
+ if (current->expect == EXPECT_TOPLEVEL)
+ current->expect = EXPECT_END;
+ else if (current->expect == EXPECT_OBJECT_VALUE)
+ current->expect = EXPECT_OBJECT_KEY;
+ else
+ assert(current->expect == EXPECT_ARRAY_ELEMENT);
+
+ break;
+ }
+
+ case _JSON_BUILD_REAL: {
+ double d;
+
+ if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_ELEMENT)) {
+ r = -EINVAL;
+ goto finish;
+ }
+
+ d = va_arg(ap, double);
+
+ if (current->n_suppress == 0) {
+ r = json_variant_new_real(&add, d);
+ if (r < 0)
+ goto finish;
+ }
+
+ n_subtract = 1;
+
+ if (current->expect == EXPECT_TOPLEVEL)
+ current->expect = EXPECT_END;
+ else if (current->expect == EXPECT_OBJECT_VALUE)
+ current->expect = EXPECT_OBJECT_KEY;
+ else
+ assert(current->expect == EXPECT_ARRAY_ELEMENT);
+
+ break;
+ }
+
+ case _JSON_BUILD_BOOLEAN: {
+ bool b;
+
+ if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_ELEMENT)) {
+ r = -EINVAL;
+ goto finish;
+ }
+
+ b = va_arg(ap, int);
+
+ if (current->n_suppress == 0) {
+ r = json_variant_new_boolean(&add, b);
+ if (r < 0)
+ goto finish;
+ }
+
+ n_subtract = 1;
+
+ if (current->expect == EXPECT_TOPLEVEL)
+ current->expect = EXPECT_END;
+ else if (current->expect == EXPECT_OBJECT_VALUE)
+ current->expect = EXPECT_OBJECT_KEY;
+ else
+ assert(current->expect == EXPECT_ARRAY_ELEMENT);
+
+ break;
+ }
+
+ case _JSON_BUILD_NULL:
+
+ if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_ELEMENT)) {
+ r = -EINVAL;
+ goto finish;
+ }
+
+ if (current->n_suppress == 0) {
+ r = json_variant_new_null(&add);
+ if (r < 0)
+ goto finish;
+ }
+
+ n_subtract = 1;
+
+ if (current->expect == EXPECT_TOPLEVEL)
+ current->expect = EXPECT_END;
+ else if (current->expect == EXPECT_OBJECT_VALUE)
+ current->expect = EXPECT_OBJECT_KEY;
+ else
+ assert(current->expect == EXPECT_ARRAY_ELEMENT);
+
+ break;
+
+ case _JSON_BUILD_VARIANT:
+
+ if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_ELEMENT)) {
+ r = -EINVAL;
+ goto finish;
+ }
+
+ /* Note that we don't care for current->n_suppress here, after all the variant is already
+ * allocated anyway... */
+ add = va_arg(ap, JsonVariant*);
+ if (!add)
+ add = JSON_VARIANT_MAGIC_NULL;
+ else
+ json_variant_ref(add);
+
+ n_subtract = 1;
+
+ if (current->expect == EXPECT_TOPLEVEL)
+ current->expect = EXPECT_END;
+ else if (current->expect == EXPECT_OBJECT_VALUE)
+ current->expect = EXPECT_OBJECT_KEY;
+ else
+ assert(current->expect == EXPECT_ARRAY_ELEMENT);
+
+ break;
+
+ case _JSON_BUILD_VARIANT_ARRAY: {
+ JsonVariant **array;
+ size_t n;
+
+ if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_ELEMENT)) {
+ r = -EINVAL;
+ goto finish;
+ }
+
+ array = va_arg(ap, JsonVariant**);
+ n = va_arg(ap, size_t);
+
+ if (current->n_suppress == 0) {
+ r = json_variant_new_array(&add, array, n);
+ if (r < 0)
+ goto finish;
+ }
+
+ n_subtract = 1;
+
+ if (current->expect == EXPECT_TOPLEVEL)
+ current->expect = EXPECT_END;
+ else if (current->expect == EXPECT_OBJECT_VALUE)
+ current->expect = EXPECT_OBJECT_KEY;
+ else
+ assert(current->expect == EXPECT_ARRAY_ELEMENT);
+
+ break;
+ }
+
+ case _JSON_BUILD_LITERAL: {
+ const char *l;
+
+ if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_ELEMENT)) {
+ r = -EINVAL;
+ goto finish;
+ }
+
+ l = va_arg(ap, const char *);
+
+ if (l) {
+ /* Note that we don't care for current->n_suppress here, we should generate parsing
+ * errors even in suppressed object properties */
+
+ r = json_parse(l, 0, &add, NULL, NULL);
+ if (r < 0)
+ goto finish;
+ } else
+ add = JSON_VARIANT_MAGIC_NULL;
+
+ n_subtract = 1;
+
+ if (current->expect == EXPECT_TOPLEVEL)
+ current->expect = EXPECT_END;
+ else if (current->expect == EXPECT_OBJECT_VALUE)
+ current->expect = EXPECT_OBJECT_KEY;
+ else
+ assert(current->expect == EXPECT_ARRAY_ELEMENT);
+
+ break;
+ }
+
+ case _JSON_BUILD_ARRAY_BEGIN:
+
+ if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_ELEMENT)) {
+ r = -EINVAL;
+ goto finish;
+ }
+
+ if (!GREEDY_REALLOC(stack, n_stack+1)) {
+ r = -ENOMEM;
+ goto finish;
+ }
+ current = stack + n_stack - 1;
+
+ if (current->expect == EXPECT_TOPLEVEL)
+ current->expect = EXPECT_END;
+ else if (current->expect == EXPECT_OBJECT_VALUE)
+ current->expect = EXPECT_OBJECT_KEY;
+ else
+ assert(current->expect == EXPECT_ARRAY_ELEMENT);
+
+ stack[n_stack++] = (JsonStack) {
+ .expect = EXPECT_ARRAY_ELEMENT,
+ .n_suppress = current->n_suppress != 0 ? SIZE_MAX : 0, /* if we shall suppress the
+ * new array, then we should
+ * also suppress all array
+ * members */
+ };
+
+ break;
+
+ case _JSON_BUILD_ARRAY_END:
+ if (current->expect != EXPECT_ARRAY_ELEMENT) {
+ r = -EINVAL;
+ goto finish;
+ }
+
+ assert(n_stack > 1);
+
+ if (current->n_suppress == 0) {
+ r = json_variant_new_array(&add, current->elements, current->n_elements);
+ if (r < 0)
+ goto finish;
+ }
+
+ n_subtract = 1;
+
+ json_stack_release(current);
+ n_stack--, current--;
+
+ break;
+
+ case _JSON_BUILD_STRV: {
+ char **l;
+
+ if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_ELEMENT)) {
+ r = -EINVAL;
+ goto finish;
+ }
+
+ l = va_arg(ap, char **);
+
+ if (current->n_suppress == 0) {
+ r = json_variant_new_array_strv(&add, l);
+ if (r < 0)
+ goto finish;
+ }
+
+ n_subtract = 1;
+
+ if (current->expect == EXPECT_TOPLEVEL)
+ current->expect = EXPECT_END;
+ else if (current->expect == EXPECT_OBJECT_VALUE)
+ current->expect = EXPECT_OBJECT_KEY;
+ else
+ assert(current->expect == EXPECT_ARRAY_ELEMENT);
+
+ break;
+ }
+
+ case _JSON_BUILD_STRV_ENV_PAIR: {
+ char **l;
+
+ if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_ELEMENT)) {
+ r = -EINVAL;
+ goto finish;
+ }
+
+ l = va_arg(ap, char **);
+
+ _cleanup_strv_free_ char **el = NULL;
+ STRV_FOREACH_PAIR(x, y, l) {
+ char *n = NULL;
+
+ n = strjoin(*x, "=", *y);
+ if (!n) {
+ r = -ENOMEM;
+ goto finish;
+ }
+
+ r = strv_consume(&el, n);
+ if (r < 0)
+ goto finish;
+ }
+
+ if (current->n_suppress == 0) {
+ r = json_variant_new_array_strv(&add, el);
+ if (r < 0)
+ goto finish;
+ }
+
+ n_subtract = 1;
+
+ if (current->expect == EXPECT_TOPLEVEL)
+ current->expect = EXPECT_END;
+ else if (current->expect == EXPECT_OBJECT_VALUE)
+ current->expect = EXPECT_OBJECT_KEY;
+ else
+ assert(current->expect == EXPECT_ARRAY_ELEMENT);
+
+ break;
+ }
+
+ case _JSON_BUILD_BASE64:
+ case _JSON_BUILD_BASE32HEX:
+ case _JSON_BUILD_HEX:
+ case _JSON_BUILD_OCTESCAPE: {
+ const void *p;
+ size_t n;
+
+ if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_ELEMENT)) {
+ r = -EINVAL;
+ goto finish;
+ }
+
+ p = va_arg(ap, const void *);
+ n = va_arg(ap, size_t);
+
+ if (current->n_suppress == 0) {
+ r = command == _JSON_BUILD_BASE64 ? json_variant_new_base64(&add, p, n) :
+ command == _JSON_BUILD_BASE32HEX ? json_variant_new_base32hex(&add, p, n) :
+ command == _JSON_BUILD_HEX ? json_variant_new_hex(&add, p, n) :
+ json_variant_new_octescape(&add, p, n);
+ if (r < 0)
+ goto finish;
+ }
+
+ n_subtract = 1;
+
+ if (current->expect == EXPECT_TOPLEVEL)
+ current->expect = EXPECT_END;
+ else if (current->expect == EXPECT_OBJECT_VALUE)
+ current->expect = EXPECT_OBJECT_KEY;
+ else
+ assert(current->expect == EXPECT_ARRAY_ELEMENT);
+
+ break;
+ }
+
+ case _JSON_BUILD_IOVEC_BASE64: {
+ const struct iovec *iov;
+
+ if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_ELEMENT)) {
+ r = -EINVAL;
+ goto finish;
+ }
+
+ iov = ASSERT_PTR(va_arg(ap, const struct iovec*));
+
+ if (current->n_suppress == 0) {
+ r = json_variant_new_base64(&add, iov->iov_base, iov->iov_len);
+ if (r < 0)
+ goto finish;
+ }
+
+ n_subtract = 1;
+
+ if (current->expect == EXPECT_TOPLEVEL)
+ current->expect = EXPECT_END;
+ else if (current->expect == EXPECT_OBJECT_VALUE)
+ current->expect = EXPECT_OBJECT_KEY;
+ else
+ assert(current->expect == EXPECT_ARRAY_ELEMENT);
+
+ break;
+ }
+
+ case _JSON_BUILD_ID128:
+ case _JSON_BUILD_UUID: {
+ const sd_id128_t *id;
+
+ if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_ELEMENT)) {
+ r = -EINVAL;
+ goto finish;
+ }
+
+ assert_se(id = va_arg(ap, sd_id128_t*));
+
+ if (current->n_suppress == 0) {
+ r = command == _JSON_BUILD_ID128 ?
+ json_variant_new_id128(&add, *id) :
+ json_variant_new_uuid(&add, *id);
+ if (r < 0)
+ goto finish;
+ }
+
+ n_subtract = 1;
+
+ if (current->expect == EXPECT_TOPLEVEL)
+ current->expect = EXPECT_END;
+ else if (current->expect == EXPECT_OBJECT_VALUE)
+ current->expect = EXPECT_OBJECT_KEY;
+ else
+ assert(current->expect == EXPECT_ARRAY_ELEMENT);
+
+ break;
+ }
+
+ case _JSON_BUILD_BYTE_ARRAY: {
+ const void *array;
+ size_t n;
+
+ if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_ELEMENT)) {
+ r = -EINVAL;
+ goto finish;
+ }
+
+ array = va_arg(ap, const void*);
+ n = va_arg(ap, size_t);
+
+ if (current->n_suppress == 0) {
+ r = json_variant_new_array_bytes(&add, array, n);
+ if (r < 0)
+ goto finish;
+ }
+
+ n_subtract = 1;
+
+ if (current->expect == EXPECT_TOPLEVEL)
+ current->expect = EXPECT_END;
+ else if (current->expect == EXPECT_OBJECT_VALUE)
+ current->expect = EXPECT_OBJECT_KEY;
+ else
+ assert(current->expect == EXPECT_ARRAY_ELEMENT);
+
+ break;
+ }
+
+ case _JSON_BUILD_HW_ADDR: {
+ const struct hw_addr_data *hw_addr;
+
+ if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_ELEMENT)) {
+ r = -EINVAL;
+ goto finish;
+ }
+
+ assert_se(hw_addr = va_arg(ap, struct hw_addr_data*));
+
+ if (current->n_suppress == 0) {
+ r = json_variant_new_array_bytes(&add, hw_addr->bytes, hw_addr->length);
+ if (r < 0)
+ goto finish;
+ }
+
+ n_subtract = 1;
+
+ if (current->expect == EXPECT_TOPLEVEL)
+ current->expect = EXPECT_END;
+ else if (current->expect == EXPECT_OBJECT_VALUE)
+ current->expect = EXPECT_OBJECT_KEY;
+ else
+ assert(current->expect == EXPECT_ARRAY_ELEMENT);
+
+ break;
+ }
+
+ case _JSON_BUILD_STRING_SET: {
+ Set *set;
+
+ if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_ELEMENT)) {
+ r = -EINVAL;
+ goto finish;
+ }
+
+ set = va_arg(ap, Set*);
+
+ if (current->n_suppress == 0) {
+ _cleanup_free_ char **sorted = NULL;
+
+ r = set_dump_sorted(set, (void ***) &sorted, NULL);
+ if (r < 0)
+ goto finish;
+
+ r = json_variant_new_array_strv(&add, sorted);
+ if (r < 0)
+ goto finish;
+ }
+
+ n_subtract = 1;
+
+ if (current->expect == EXPECT_TOPLEVEL)
+ current->expect = EXPECT_END;
+ else if (current->expect == EXPECT_OBJECT_VALUE)
+ current->expect = EXPECT_OBJECT_KEY;
+ else
+ assert(current->expect == EXPECT_ARRAY_ELEMENT);
+
+ break;
+ }
+
+ case _JSON_BUILD_CALLBACK: {
+ JsonBuildCallback cb;
+ void *userdata;
+
+ if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_ELEMENT)) {
+ r = -EINVAL;
+ goto finish;
+ }
+
+ cb = va_arg(ap, JsonBuildCallback);
+ userdata = va_arg(ap, void *);
+
+ if (current->n_suppress == 0) {
+ if (cb) {
+ r = cb(&add, name, userdata);
+ if (r < 0)
+ goto finish;
+ }
+
+ if (!add)
+ add = JSON_VARIANT_MAGIC_NULL;
+
+ name = NULL;
+ }
+
+ n_subtract = 1;
+
+ if (current->expect == EXPECT_TOPLEVEL)
+ current->expect = EXPECT_END;
+ else if (current->expect == EXPECT_OBJECT_VALUE)
+ current->expect = EXPECT_OBJECT_KEY;
+ else
+ assert(current->expect == EXPECT_ARRAY_ELEMENT);
+
+ break;
+ }
+
+ case _JSON_BUILD_OBJECT_BEGIN:
+
+ if (!IN_SET(current->expect, EXPECT_TOPLEVEL, EXPECT_OBJECT_VALUE, EXPECT_ARRAY_ELEMENT)) {
+ r = -EINVAL;
+ goto finish;
+ }
+
+ if (!GREEDY_REALLOC(stack, n_stack+1)) {
+ r = -ENOMEM;
+ goto finish;
+ }
+ current = stack + n_stack - 1;
+
+ if (current->expect == EXPECT_TOPLEVEL)
+ current->expect = EXPECT_END;
+ else if (current->expect == EXPECT_OBJECT_VALUE)
+ current->expect = EXPECT_OBJECT_KEY;
+ else
+ assert(current->expect == EXPECT_ARRAY_ELEMENT);
+
+ stack[n_stack++] = (JsonStack) {
+ .expect = EXPECT_OBJECT_KEY,
+ .n_suppress = current->n_suppress != 0 ? SIZE_MAX : 0, /* If we shall suppress the
+ * new object, then we should
+ * also suppress all object
+ * members. */
+ };
+
+ break;
+
+ case _JSON_BUILD_OBJECT_END:
+
+ if (current->expect != EXPECT_OBJECT_KEY) {
+ r = -EINVAL;
+ goto finish;
+ }
+
+ assert(n_stack > 1);
+
+ if (current->n_suppress == 0) {
+ r = json_variant_new_object(&add, current->elements, current->n_elements);
+ if (r < 0)
+ goto finish;
+ }
+
+ n_subtract = 1;
+
+ json_stack_release(current);
+ n_stack--, current--;
+
+ break;
+
+ case _JSON_BUILD_PAIR: {
+
+ if (current->expect != EXPECT_OBJECT_KEY) {
+ r = -EINVAL;
+ goto finish;
+ }
+
+ name = va_arg(ap, const char *);
+
+ if (current->n_suppress == 0) {
+ r = json_variant_new_string(&add, name);
+ if (r < 0)
+ goto finish;
+ }
+
+ n_subtract = 1;
+
+ current->expect = EXPECT_OBJECT_VALUE;
+ break;
+ }
+
+ case _JSON_BUILD_PAIR_CONDITION: {
+ bool b;
+
+ if (current->expect != EXPECT_OBJECT_KEY) {
+ r = -EINVAL;
+ goto finish;
+ }
+
+ b = va_arg(ap, int);
+ name = va_arg(ap, const char *);
+
+ if (b && current->n_suppress == 0) {
+ r = json_variant_new_string(&add, name);
+ if (r < 0)
+ goto finish;
+ }
+
+ n_subtract = 1; /* we generated one item */
+
+ if (!b && current->n_suppress != SIZE_MAX)
+ current->n_suppress += 2; /* Suppress this one and the next item */
+
+ current->expect = EXPECT_OBJECT_VALUE;
+ break;
+ }
+
+ case _JSON_BUILD_PAIR_UNSIGNED_NON_ZERO: {
+ const char *n;
+ uint64_t u;
+
+ if (current->expect != EXPECT_OBJECT_KEY) {
+ r = -EINVAL;
+ goto finish;
+ }
+
+ n = va_arg(ap, const char *);
+ u = va_arg(ap, uint64_t);
+
+ if (u != 0 && current->n_suppress == 0) {
+ r = json_variant_new_string(&add, n);
+ if (r < 0)
+ goto finish;
+
+ r = json_variant_new_unsigned(&add_more, u);
+ if (r < 0)
+ goto finish;
+ }
+
+ n_subtract = 2; /* we generated two item */
+
+ current->expect = EXPECT_OBJECT_KEY;
+ break;
+ }
+
+ case _JSON_BUILD_PAIR_FINITE_USEC: {
+ const char *n;
+ usec_t u;
+
+ if (current->expect != EXPECT_OBJECT_KEY) {
+ r = -EINVAL;
+ goto finish;
+ }
+
+ n = va_arg(ap, const char *);
+ u = va_arg(ap, usec_t);
+
+ if (u != USEC_INFINITY && current->n_suppress == 0) {
+ r = json_variant_new_string(&add, n);
+ if (r < 0)
+ goto finish;
+
+ r = json_variant_new_unsigned(&add_more, u);
+ if (r < 0)
+ goto finish;
+ }
+
+ n_subtract = 2; /* we generated two item */
+
+ current->expect = EXPECT_OBJECT_KEY;
+ break;
+ }
+
+ case _JSON_BUILD_PAIR_STRING_NON_EMPTY: {
+ const char *n, *s;
+
+ if (current->expect != EXPECT_OBJECT_KEY) {
+ r = -EINVAL;
+ goto finish;
+ }
+
+ n = va_arg(ap, const char *);
+ s = va_arg(ap, const char *);
+
+ if (!isempty(s) && current->n_suppress == 0) {
+ r = json_variant_new_string(&add, n);
+ if (r < 0)
+ goto finish;
+
+ r = json_variant_new_string(&add_more, s);
+ if (r < 0)
+ goto finish;
+ }
+
+ n_subtract = 2; /* we generated two item */
+
+ current->expect = EXPECT_OBJECT_KEY;
+ break;
+ }
+
+ case _JSON_BUILD_PAIR_STRV_NON_EMPTY: {
+ const char *n;
+ char **l;
+
+ if (current->expect != EXPECT_OBJECT_KEY) {
+ r = -EINVAL;
+ goto finish;
+ }
+
+ n = va_arg(ap, const char *);
+ l = va_arg(ap, char **);
+
+ if (!strv_isempty(l) && current->n_suppress == 0) {
+ r = json_variant_new_string(&add, n);
+ if (r < 0)
+ goto finish;
+
+ r = json_variant_new_array_strv(&add_more, l);
+ if (r < 0)
+ goto finish;
+ }
+
+ n_subtract = 2; /* we generated two item */
+
+ current->expect = EXPECT_OBJECT_KEY;
+ break;
+ }
+
+ case _JSON_BUILD_PAIR_VARIANT_NON_NULL: {
+ JsonVariant *v;
+ const char *n;
+
+ if (current->expect != EXPECT_OBJECT_KEY) {
+ r = -EINVAL;
+ goto finish;
+ }
+
+ n = va_arg(ap, const char *);
+ v = va_arg(ap, JsonVariant *);
+
+ if (v && !json_variant_is_null(v) && current->n_suppress == 0) {
+ r = json_variant_new_string(&add, n);
+ if (r < 0)
+ goto finish;
+
+ add_more = json_variant_ref(v);
+ }
+
+ n_subtract = 2; /* we generated two item */
+
+ current->expect = EXPECT_OBJECT_KEY;
+ break;
+ }
+
+ case _JSON_BUILD_PAIR_IN4_ADDR_NON_NULL: {
+ const struct in_addr *a;
+ const char *n;
+
+ if (current->expect != EXPECT_OBJECT_KEY) {
+ r = -EINVAL;
+ goto finish;
+ }
+
+ n = va_arg(ap, const char *);
+ a = va_arg(ap, const struct in_addr *);
+
+ if (a && in4_addr_is_set(a) && current->n_suppress == 0) {
+ r = json_variant_new_string(&add, n);
+ if (r < 0)
+ goto finish;
+
+ r = json_variant_new_array_bytes(&add_more, a, sizeof(struct in_addr));
+ if (r < 0)
+ goto finish;
+ }
+
+ n_subtract = 2; /* we generated two item */
+
+ current->expect = EXPECT_OBJECT_KEY;
+ break;
+ }
+
+ case _JSON_BUILD_PAIR_IN6_ADDR_NON_NULL: {
+ const struct in6_addr *a;
+ const char *n;
+
+ if (current->expect != EXPECT_OBJECT_KEY) {
+ r = -EINVAL;
+ goto finish;
+ }
+
+ n = va_arg(ap, const char *);
+ a = va_arg(ap, const struct in6_addr *);
+
+ if (a && in6_addr_is_set(a) && current->n_suppress == 0) {
+ r = json_variant_new_string(&add, n);
+ if (r < 0)
+ goto finish;
+
+ r = json_variant_new_array_bytes(&add_more, a, sizeof(struct in6_addr));
+ if (r < 0)
+ goto finish;
+ }
+
+ n_subtract = 2; /* we generated two item */
+
+ current->expect = EXPECT_OBJECT_KEY;
+ break;
+ }
+
+ case _JSON_BUILD_PAIR_IN_ADDR_NON_NULL: {
+ const union in_addr_union *a;
+ const char *n;
+ int f;
+
+ if (current->expect != EXPECT_OBJECT_KEY) {
+ r = -EINVAL;
+ goto finish;
+ }
+
+ n = va_arg(ap, const char *);
+ a = va_arg(ap, const union in_addr_union *);
+ f = va_arg(ap, int);
+
+ if (a && in_addr_is_set(f, a) && current->n_suppress == 0) {
+ r = json_variant_new_string(&add, n);
+ if (r < 0)
+ goto finish;
+
+ r = json_variant_new_array_bytes(&add_more, a->bytes, FAMILY_ADDRESS_SIZE(f));
+ if (r < 0)
+ goto finish;
+ }
+
+ n_subtract = 2; /* we generated two item */
+
+ current->expect = EXPECT_OBJECT_KEY;
+ break;
+ }
+
+ case _JSON_BUILD_PAIR_ETHER_ADDR_NON_NULL: {
+ const struct ether_addr *a;
+ const char *n;
+
+ if (current->expect != EXPECT_OBJECT_KEY) {
+ r = -EINVAL;
+ goto finish;
+ }
+
+ n = va_arg(ap, const char *);
+ a = va_arg(ap, const struct ether_addr *);
+
+ if (a && !ether_addr_is_null(a) && current->n_suppress == 0) {
+ r = json_variant_new_string(&add, n);
+ if (r < 0)
+ goto finish;
+
+ r = json_variant_new_array_bytes(&add_more, a->ether_addr_octet, sizeof(struct ether_addr));
+ if (r < 0)
+ goto finish;
+ }
+
+ n_subtract = 2; /* we generated two item */
+
+ current->expect = EXPECT_OBJECT_KEY;
+ break;
+ }
+
+ case _JSON_BUILD_PAIR_HW_ADDR_NON_NULL: {
+ const struct hw_addr_data *a;
+ const char *n;
+
+ if (current->expect != EXPECT_OBJECT_KEY) {
+ r = -EINVAL;
+ goto finish;
+ }
+
+ n = va_arg(ap, const char *);
+ a = va_arg(ap, const struct hw_addr_data *);
+
+ if (a && !hw_addr_is_null(a) && current->n_suppress == 0) {
+ r = json_variant_new_string(&add, n);
+ if (r < 0)
+ goto finish;
+
+ r = json_variant_new_array_bytes(&add_more, a->bytes, a->length);
+ if (r < 0)
+ goto finish;
+ }
+
+ n_subtract = 2; /* we generated two item */
+
+ current->expect = EXPECT_OBJECT_KEY;
+ break;
+ }
+ }
+
+ /* If variants were generated, add them to our current variant, but only if we are not supposed to suppress additions */
+ if (add && current->n_suppress == 0) {
+ if (!GREEDY_REALLOC(current->elements, current->n_elements + 1 + !!add_more)) {
+ r = -ENOMEM;
+ goto finish;
+ }
+
+ current->elements[current->n_elements++] = TAKE_PTR(add);
+ if (add_more)
+ current->elements[current->n_elements++] = TAKE_PTR(add_more);
+ }
+
+ /* If we are supposed to suppress items, let's subtract how many items were generated from
+ * that counter. Except if the counter is SIZE_MAX, i.e. we shall suppress an infinite number
+ * of elements on this stack level */
+ if (current->n_suppress != SIZE_MAX) {
+ if (current->n_suppress <= n_subtract) /* Saturated */
+ current->n_suppress = 0;
+ else
+ current->n_suppress -= n_subtract;
+ }
+ }
+
+done:
+ assert(n_stack == 1);
+ assert(stack[0].n_elements == 1);
+
+ *ret = json_variant_ref(stack[0].elements[0]);
+ r = 0;
+
+finish:
+ for (size_t i = 0; i < n_stack; i++)
+ json_stack_release(stack + i);
+
+ free(stack);
+
+ return r;
+}
+
+int json_build(JsonVariant **ret, ...) {
+        va_list args;
+        int result;
+
+        /* Variadic front-end for json_buildv(): wraps the trailing arguments in a va_list, hands them
+         * over together with 'ret', and passes json_buildv()'s return value through unchanged. */
+
+        va_start(args, ret);
+        result = json_buildv(ret, args);
+        va_end(args);
+
+        return result;
+}
+
+int json_log_internal(
+ JsonVariant *variant,
+ int level,
+ int error,
+ const char *file,
+ int line,
+ const char *func,
+ const char *format, ...) {
+ /* Formats the message given by 'format' and the variadic arguments into a stack buffer of LINE_MAX
+ * bytes (vsnprintf() is bounded by the buffer size and its result is ignored, i.e. overlong messages
+ * are cut short) and emits it through log_struct_internal(), tagged with
+ * SD_MESSAGE_INVALID_CONFIGURATION_STR. If 'variant' is non-NULL its source location is queried with
+ * json_variant_get_source(); if that fails its error is returned and nothing is logged. Otherwise
+ * the return value of log_struct_internal() is returned. */
+ PROTECT_ERRNO;
+
+ unsigned source_line, source_column;
+ char buffer[LINE_MAX];
+ const char *source;
+ va_list ap;
+ int r;
+ /* NOTE(review): errno is set from 'error' before formatting, presumably so that "%m" in 'format'
+ * resolves to that error; PROTECT_ERRNO above presumably restores the caller's errno on return.
+ * Confirm both against the macro definitions. */
+ errno = ERRNO_VALUE(error);
+
+ va_start(ap, format);
+ (void) vsnprintf(buffer, sizeof buffer, format, ap);
+ va_end(ap);
+
+ if (variant) {
+ r = json_variant_get_source(variant, &source, &source_line, &source_column);
+ if (r < 0)
+ return r;
+ } else {
+ source = NULL;
+ source_line = 0;
+ source_column = 0;
+ }
+ /* Pick the richest form the location data allows: source name plus line/column, line/column only
+ * (shown with a "(string)" prefix), or just the bare message. */
+ if (source && source_line > 0 && source_column > 0)
+ return log_struct_internal(
+ level,
+ error,
+ file, line, func,
+ "MESSAGE_ID=" SD_MESSAGE_INVALID_CONFIGURATION_STR,
+ "CONFIG_FILE=%s", source,
+ "CONFIG_LINE=%u", source_line,
+ "CONFIG_COLUMN=%u", source_column,
+ LOG_MESSAGE("%s:%u:%u: %s", source, source_line, source_column, buffer),
+ NULL);
+ else if (source_line > 0 && source_column > 0)
+ return log_struct_internal(
+ level,
+ error,
+ file, line, func,
+ "MESSAGE_ID=" SD_MESSAGE_INVALID_CONFIGURATION_STR,
+ "CONFIG_LINE=%u", source_line,
+ "CONFIG_COLUMN=%u", source_column,
+ LOG_MESSAGE("(string):%u:%u: %s", source_line, source_column, buffer),
+ NULL);
+ else
+ return log_struct_internal(
+ level,
+ error,
+ file, line, func,
+ "MESSAGE_ID=" SD_MESSAGE_INVALID_CONFIGURATION_STR,
+ LOG_MESSAGE("%s", buffer),
+ NULL);
+}
+
+static void *dispatch_userdata(const JsonDispatch *p, void *userdata) {
+
+        /* Works out the destination pointer for a dispatch table entry. If a base pointer was supplied,
+         * the entry's offset is applied to it byte-wise. If none was supplied, the offset itself is
+         * taken as the literal address. For the latter we do not simply compute NULL + offset, because
+         * UndefinedBehaviourSanitizer doesn't like pointer arithmetic based on NULL pointers. */
+
+        if (!userdata)
+                return SIZE_TO_PTR(p->offset);
+
+        uint8_t *base = userdata;
+        return base + p->offset;
+}
+
+int json_dispatch_full(
+ JsonVariant *v,
+ const JsonDispatch table[],
+ JsonDispatchCallback bad,
+ JsonDispatchFlags flags,
+ void *userdata,
+ const char **reterr_bad_field) {
+ size_t m;
+ int r, done = 0;
+ bool *found;
+ /* Maps the members of object variant 'v' onto the entries of 'table' (terminated by an entry whose
+ * .name is NULL). For each member the first entry with an equal name is used; an entry whose .name
+ * is POINTER_MAX matches any key. A matching entry gets its value type checked (unless .type is
+ * _JSON_VARIANT_TYPE_INVALID), is refused with -ENOTUNIQ if it was matched before, and then has
+ * its .callback (if set) invoked with the destination computed by dispatch_userdata(). Members
+ * without a matching entry are handed to 'bad' if that is set, and fail with -EADDRNOTAVAIL
+ * otherwise. At the end, entries flagged JSON_MANDATORY that were never matched yield -ENXIO.
+ *
+ * The effective flags of an entry are 'flags' ORed with the entry's own .flags. Wherever
+ * JSON_PERMISSIVE is in effect the failure is passed to json_log() and then skipped instead of
+ * being returned. When an error is returned and 'reterr_bad_field' is non-NULL, it is set to the
+ * name of the offending field (or to NULL if 'v' is not an object at all).
+ *
+ * Returns the number of members that were dispatched without failure, or a negative errno-style
+ * error. */
+ if (!json_variant_is_object(v)) {
+ json_log(v, flags, 0, "JSON variant is not an object.");
+
+ if (flags & JSON_PERMISSIVE)
+ return 0;
+
+ if (reterr_bad_field)
+ *reterr_bad_field = NULL;
+
+ return -EINVAL;
+ }
+ /* Count the table entries, so that we can keep one "already seen" marker per entry */
+ m = 0;
+ for (const JsonDispatch *p = table; p->name; p++)
+ m++;
+
+ found = newa0(bool, m);
+ /* The object's elements are walked pairwise: index i is the key, index i+1 the value for it */
+ size_t n = json_variant_elements(v);
+ for (size_t i = 0; i < n; i += 2) {
+ JsonVariant *key, *value;
+ const JsonDispatch *p;
+
+ assert_se(key = json_variant_by_index(v, i));
+ assert_se(value = json_variant_by_index(v, i+1));
+ /* Look for the first entry responsible for this key; if none is, p ends up on the terminator */
+ for (p = table; p->name; p++)
+ if (p->name == POINTER_MAX ||
+ streq_ptr(json_variant_string(key), p->name))
+ break;
+
+ if (p->name) { /* Found a matching entry! 🙂 */
+ JsonDispatchFlags merged_flags;
+
+ merged_flags = flags | p->flags;
+
+ if (p->type != _JSON_VARIANT_TYPE_INVALID &&
+ !json_variant_has_type(value, p->type)) {
+
+ json_log(value, merged_flags, 0,
+ "Object field '%s' has wrong type %s, expected %s.", json_variant_string(key),
+ json_variant_type_to_string(json_variant_type(value)), json_variant_type_to_string(p->type));
+
+ if (merged_flags & JSON_PERMISSIVE)
+ continue;
+
+ if (reterr_bad_field)
+ *reterr_bad_field = p->name;
+
+ return -EINVAL;
+ }
+
+ if (found[p-table]) {
+ json_log(value, merged_flags, 0, "Duplicate object field '%s'.", json_variant_string(key));
+
+ if (merged_flags & JSON_PERMISSIVE)
+ continue;
+
+ if (reterr_bad_field)
+ *reterr_bad_field = p->name;
+
+ return -ENOTUNIQ;
+ }
+
+ found[p-table] = true;
+
+ if (p->callback) {
+ r = p->callback(json_variant_string(key), value, merged_flags, dispatch_userdata(p, userdata));
+ if (r < 0) {
+ if (merged_flags & JSON_PERMISSIVE)
+ continue;
+
+ if (reterr_bad_field)
+ *reterr_bad_field = json_variant_string(key);
+
+ return r;
+ }
+ }
+
+ done ++;
+
+ } else { /* Didn't find a matching entry! ☹️ */
+
+ if (bad) {
+ r = bad(json_variant_string(key), value, flags, userdata);
+ if (r < 0) {
+ if (flags & JSON_PERMISSIVE)
+ continue;
+
+ if (reterr_bad_field)
+ *reterr_bad_field = json_variant_string(key);
+
+ return r;
+ } else
+ done ++;
+
+ } else {
+ json_log(value, flags, 0, "Unexpected object field '%s'.", json_variant_string(key));
+
+ if (flags & JSON_PERMISSIVE)
+ continue;
+
+ if (reterr_bad_field)
+ *reterr_bad_field = json_variant_string(key);
+
+ return -EADDRNOTAVAIL;
+ }
+ }
+ }
+ /* Finally, check that every entry marked as mandatory was actually seen in the object */
+ for (const JsonDispatch *p = table; p->name; p++) {
+ JsonDispatchFlags merged_flags = p->flags | flags;
+
+ if ((merged_flags & JSON_MANDATORY) && !found[p-table]) {
+ json_log(v, merged_flags, 0, "Missing object field '%s'.", p->name);
+
+ if ((merged_flags & JSON_PERMISSIVE))
+ continue;
+
+ if (reterr_bad_field)
+ *reterr_bad_field = p->name;
+
+ return -ENXIO;
+ }
+ }
+
+ return done;
+}
+
+int json_dispatch_boolean(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        bool *target = ASSERT_PTR(userdata);
+
+        /* Dispatch callback that stores a JSON boolean in the 'bool' userdata points to. Any other
+         * variant type is reported through json_log() with a synthetic EINVAL. */
+
+        assert(variant);
+
+        if (json_variant_is_boolean(variant)) {
+                *target = json_variant_boolean(variant);
+                return 0;
+        }
+
+        return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a boolean.", strna(name));
+}
+
+int json_dispatch_tristate(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        int *target = ASSERT_PTR(userdata);
+
+        /* Dispatch callback for tri-state values kept in an 'int': JSON null is stored as -1, a JSON
+         * boolean as its truth value. Everything else is reported through json_log() with a synthetic
+         * EINVAL. */
+
+        assert(variant);
+
+        if (json_variant_is_null(variant))
+                *target = -1;
+        else if (json_variant_is_boolean(variant))
+                *target = json_variant_boolean(variant);
+        else
+                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a boolean.", strna(name));
+
+        return 0;
+}
+
+int json_dispatch_int64(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        int64_t *target = ASSERT_PTR(userdata);
+
+        assert(variant);
+
+        /* Dispatch callback storing a signed 64-bit integer. To be compatible with less capable JSON
+         * implementations that cannot do 64-bit integers, a string that safe_atoi64() can parse is
+         * accepted too. A string that does not parse falls through to the regular type check below and
+         * is hence refused. */
+        if (json_variant_is_string(variant) && safe_atoi64(json_variant_string(variant), target) >= 0)
+                return 0;
+
+        if (json_variant_is_integer(variant)) {
+                *target = json_variant_integer(variant);
+                return 0;
+        }
+
+        return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an integer, nor one formatted as decimal string.", strna(name));
+}
+
+int json_dispatch_uint64(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        uint64_t *target = ASSERT_PTR(userdata);
+
+        assert(variant);
+
+        /* Dispatch callback storing an unsigned 64-bit integer. 64-bit values (unsigned ones in
+         * particular) are problematic in JSON, hence a string that safe_atou64() can parse is accepted
+         * as well. If that is not desired, set the .type field in JsonDispatch to JSON_UNSIGNED rather
+         * than _JSON_VARIANT_TYPE_INVALID, so that json_dispatch() already filters out the non-matching
+         * type. */
+
+        if (json_variant_is_string(variant) && safe_atou64(json_variant_string(variant), target) >= 0)
+                return 0;
+
+        if (json_variant_is_unsigned(variant)) {
+                *target = json_variant_unsigned(variant);
+                return 0;
+        }
+
+        return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an unsigned integer, nor one formatted as decimal string.", strna(name));
+}
+
+int json_dispatch_uint32(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        uint32_t *target = ASSERT_PTR(userdata);
+        uint64_t wide;
+        int r;
+
+        /* 32-bit flavour of json_dispatch_uint64(): parses into a 64-bit temporary first, then refuses
+         * values that do not fit with a synthetic ERANGE. */
+
+        assert(variant);
+
+        r = json_dispatch_uint64(name, variant, flags, &wide);
+        if (r < 0)
+                return r;
+
+        if (wide <= UINT32_MAX) {
+                *target = (uint32_t) wide;
+                return 0;
+        }
+
+        return json_log(variant, flags, SYNTHETIC_ERRNO(ERANGE), "JSON field '%s' out of bounds.", strna(name));
+}
+
+int json_dispatch_int32(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        int32_t *target = ASSERT_PTR(userdata);
+        int64_t wide;
+        int r;
+
+        /* 32-bit flavour of json_dispatch_int64(): parses into a 64-bit temporary first, then refuses
+         * values outside of INT32_MIN…INT32_MAX with a synthetic ERANGE. */
+
+        assert(variant);
+
+        r = json_dispatch_int64(name, variant, flags, &wide);
+        if (r < 0)
+                return r;
+
+        if (wide >= INT32_MIN && wide <= INT32_MAX) {
+                *target = (int32_t) wide;
+                return 0;
+        }
+
+        return json_log(variant, flags, SYNTHETIC_ERRNO(ERANGE), "JSON field '%s' out of bounds.", strna(name));
+}
+
+int json_dispatch_int16(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        int16_t *target = ASSERT_PTR(userdata);
+        int64_t wide;
+        int r;
+
+        /* 16-bit flavour of json_dispatch_int64(): parses into a 64-bit temporary first, then refuses
+         * values outside of INT16_MIN…INT16_MAX with a synthetic ERANGE. */
+
+        assert(variant);
+
+        r = json_dispatch_int64(name, variant, flags, &wide);
+        if (r < 0)
+                return r;
+
+        if (wide >= INT16_MIN && wide <= INT16_MAX) {
+                *target = (int16_t) wide;
+                return 0;
+        }
+
+        return json_log(variant, flags, SYNTHETIC_ERRNO(ERANGE), "JSON field '%s' out of bounds.", strna(name));
+}
+
+int json_dispatch_uint16(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        uint16_t *target = ASSERT_PTR(userdata);
+        uint64_t wide;
+        int r;
+
+        /* 16-bit flavour of json_dispatch_uint64(): parses into a 64-bit temporary first, then refuses
+         * values that do not fit with a synthetic ERANGE. */
+
+        assert(variant);
+
+        r = json_dispatch_uint64(name, variant, flags, &wide);
+        if (r < 0)
+                return r;
+
+        if (wide <= UINT16_MAX) {
+                *target = (uint16_t) wide;
+                return 0;
+        }
+
+        return json_log(variant, flags, SYNTHETIC_ERRNO(ERANGE), "JSON field '%s' out of bounds.", strna(name));
+}
+
+int json_dispatch_string(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        char **target = ASSERT_PTR(userdata);
+        const char *text;
+        int r;
+
+        /* Dispatch callback that stores a copy of a JSON string in the 'char*' userdata points to,
+         * replacing what was stored there before via free_and_strdup(). JSON null releases the stored
+         * string instead. With JSON_SAFE set the string additionally has to pass string_is_safe(). */
+
+        assert(variant);
+
+        if (json_variant_is_null(variant)) {
+                *target = mfree(*target);
+                return 0;
+        }
+
+        if (!json_variant_is_string(variant))
+                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a string.", strna(name));
+
+        text = json_variant_string(variant);
+
+        if ((flags & JSON_SAFE) && !string_is_safe(text))
+                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' contains unsafe characters, refusing.", strna(name));
+
+        r = free_and_strdup(target, text);
+        if (r >= 0)
+                return 0;
+
+        return json_log(variant, flags, r, "Failed to allocate string: %m");
+}
+
+int json_dispatch_const_string(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        const char **target = ASSERT_PTR(userdata);
+        const char *text;
+
+        /* Like json_dispatch_string(), but makes no copy: the pointer returned by json_variant_string()
+         * is stored as-is (NULL for JSON null), and whatever was stored before is simply overwritten.
+         * NOTE(review): the stored pointer presumably stays valid only as long as the variant does. */
+
+        assert(variant);
+
+        if (json_variant_is_null(variant)) {
+                *target = NULL;
+                return 0;
+        }
+
+        if (!json_variant_is_string(variant))
+                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a string.", strna(name));
+
+        text = json_variant_string(variant);
+
+        if ((flags & JSON_SAFE) && !string_is_safe(text))
+                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' contains unsafe characters, refusing.", strna(name));
+
+        *target = text;
+        return 0;
+}
+
+int json_dispatch_strv(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        _cleanup_strv_free_ char **l = NULL;
+        char ***s = ASSERT_PTR(userdata);
+        JsonVariant *e;
+        int r;
+
+        /* Dispatch callback that fills the string list (char**) userdata points to:
+         *
+         *   - JSON null drops the list that is currently stored
+         *   - a single JSON string is accepted in place of a single-item array
+         *   - anything else has to be an array that consists of strings only
+         *
+         * With JSON_SAFE set each string additionally has to pass string_is_safe(). The new list is
+         * put together in a temporary first and only swapped in at the very end, hence on failure the
+         * list stored so far is left untouched. Returns 0 on success, and otherwise the error produced
+         * by json_log() or log_oom(). */
+
+        assert(variant);
+
+        if (json_variant_is_null(variant)) {
+                *s = strv_free(*s);
+                return 0;
+        }
+
+        /* Let's be flexible here: accept a single string in place of a single-item array */
+        if (json_variant_is_string(variant)) {
+                if ((flags & JSON_SAFE) && !string_is_safe(json_variant_string(variant)))
+                        return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' contains unsafe characters, refusing.", strna(name));
+
+                l = strv_new(json_variant_string(variant));
+                if (!l)
+                        return log_oom();
+
+                strv_free_and_replace(*s, l);
+                return 0;
+        }
+
+        /* json_log() takes the dispatch flags first and the error second, like every other call in
+         * this file. Passing them the other way round makes the errno get interpreted as flags and the
+         * flags as error code. */
+        if (!json_variant_is_array(variant))
+                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an array.", strna(name));
+
+        JSON_VARIANT_ARRAY_FOREACH(e, variant) {
+                if (!json_variant_is_string(e))
+                        return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), "JSON array element is not a string.");
+
+                if ((flags & JSON_SAFE) && !string_is_safe(json_variant_string(e)))
+                        return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' contains unsafe characters, refusing.", strna(name));
+
+                r = strv_extend(&l, json_variant_string(e));
+                if (r < 0)
+                        return json_log(e, flags, r, "Failed to append array element: %m");
+        }
+
+        strv_free_and_replace(*s, l);
+        return 0;
+}
+
+int json_dispatch_variant(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        JsonVariant **target = ASSERT_PTR(userdata);
+
+        assert(variant);
+
+        /* Takes a new reference to the variant and installs it in *target via JSON_VARIANT_REPLACE().
+         * Never fails. */
+        JSON_VARIANT_REPLACE(*target, json_variant_ref(variant));
+
+        return 0;
+}
+
+int json_dispatch_variant_noref(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        JsonVariant **target = ASSERT_PTR(userdata);
+
+        assert(variant);
+
+        /* Merely copies the pointer: no reference is taken on the variant, and whatever *target held
+         * before is overwritten without being released. Never fails. */
+        *target = variant;
+
+        return 0;
+}
+
+int json_dispatch_uid_gid(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        uid_t *target = userdata;
+        uint64_t raw;
+
+        /* Dispatch callback for numeric user/group IDs. JSON null is stored as UID_INVALID. Otherwise
+         * the value has to be an unsigned integer that fits into 32 bits and passes uid_is_valid().
+         * The compile-time checks below establish that uid_t and gid_t are both 32 bits wide and agree
+         * in the signedness of their "invalid" marker, which is what allows one function to serve
+         * both. */
+
+        assert_cc(sizeof(uid_t) == sizeof(uint32_t));
+        assert_cc(sizeof(gid_t) == sizeof(uint32_t));
+
+        DISABLE_WARNING_TYPE_LIMITS;
+        assert_cc((UID_INVALID < (uid_t) 0) == (GID_INVALID < (gid_t) 0));
+        REENABLE_WARNING;
+
+        if (json_variant_is_null(variant)) {
+                *target = UID_INVALID;
+                return 0;
+        }
+
+        if (!json_variant_is_unsigned(variant))
+                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an integer.", strna(name));
+
+        raw = json_variant_unsigned(variant);
+        if (raw <= UINT32_MAX && uid_is_valid(raw)) {
+                *target = raw;
+                return 0;
+        }
+
+        return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a valid UID/GID.", strna(name));
+}
+
+int json_dispatch_user_group_name(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        char **target = userdata;
+        const char *candidate;
+        int r;
+
+        /* Dispatch callback that stores a copy of a user or group name in the 'char*' userdata points
+         * to. JSON null releases the stored string. Otherwise the value has to be a string accepted by
+         * valid_user_group_name(), which is invoked with VALID_USER_RELAX if JSON_RELAX is set in the
+         * flags, and with 0 otherwise. */
+
+        if (json_variant_is_null(variant)) {
+                *target = mfree(*target);
+                return 0;
+        }
+
+        if (!json_variant_is_string(variant))
+                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a string.", strna(name));
+
+        candidate = json_variant_string(variant);
+        if (!valid_user_group_name(candidate, FLAGS_SET(flags, JSON_RELAX) ? VALID_USER_RELAX : 0))
+                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a valid user/group name.", strna(name));
+
+        r = free_and_strdup(target, candidate);
+        if (r >= 0)
+                return 0;
+
+        return json_log(variant, flags, r, "Failed to allocate string: %m");
+}
+
+int json_dispatch_id128(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        sd_id128_t *uuid = userdata;
+        int r;
+
+        /* Dispatch callback that parses a JSON string with sd_id128_from_string() and stores the
+         * resulting 128-bit ID in the sd_id128_t userdata points to. JSON null is stored as
+         * SD_ID128_NULL. Returns 0 on success, and otherwise the error produced by json_log(). */
+
+        if (json_variant_is_null(variant)) {
+                *uuid = SD_ID128_NULL;
+                return 0;
+        }
+
+        if (!json_variant_is_string(variant))
+                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a string.", strna(name));
+
+        /* Note that the message says "UUID", not "UID": this is about 128-bit IDs, and must not be
+         * confused with the numeric user IDs json_dispatch_uid_gid() deals with. */
+        r = sd_id128_from_string(json_variant_string(variant), uuid);
+        if (r < 0)
+                return json_log(variant, flags, r, "JSON field '%s' is not a valid UUID.", strna(name));
+
+        return 0;
+}
+
+int json_dispatch_unsupported(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        int r;
+
+        /* Dispatch callback for fields that must not show up: the value is never looked at, the field
+         * is always reported through json_log() with a synthetic EINVAL. */
+        r = json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not allowed in this object.", strna(name));
+
+        return r;
+}
+
+int json_dispatch_unbase64_iovec(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        _cleanup_free_ void *decoded = NULL;
+        struct iovec *target = ASSERT_PTR(userdata);
+        size_t decoded_size;
+        int r;
+
+        /* Dispatch callback that decodes a Base64 JSON string via json_variant_unbase64() and hands
+         * the decoded buffer over to the 'struct iovec' userdata points to. The buffer the iovec
+         * referenced before is replaced via free_and_replace(), and iov_len is set to the decoded
+         * size. */
+
+        if (!json_variant_is_string(variant))
+                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a string.", strna(name));
+
+        r = json_variant_unbase64(variant, &decoded, &decoded_size);
+        if (r < 0)
+                return json_log(variant, flags, r, "JSON field '%s' is not valid Base64 data.", strna(name));
+
+        free_and_replace(target->iov_base, decoded);
+        target->iov_len = decoded_size;
+
+        return 0;
+}
+
+static int json_cmp_strings(const void *x, const void *y) {
+        JsonVariant *const *left = x, *const *right = y;
+
+        /* qsort() comparator for arrays of JsonVariant pointers. Only the variant each argument points
+         * at directly is looked at. If both are strings they are ordered by strcmp(), otherwise we
+         * fall back to comparing the two pointer values with CMP(). */
+
+        if (json_variant_is_string(*left) && json_variant_is_string(*right))
+                return strcmp(json_variant_string(*left), json_variant_string(*right));
+
+        return CMP(*left, *right);
+}
+
+int json_variant_sort(JsonVariant **v) {
+        _cleanup_free_ JsonVariant **pairs = NULL;
+        _cleanup_(json_variant_unrefp) JsonVariant *replacement = NULL;
+        size_t n_elements;
+        int r;
+
+        /* Sorts the key/value pairs in an object variant by key. This affects the top level only,
+         * nested objects are left as they are. Returns 0 if json_variant_is_sorted() reports there is
+         * nothing to do, 1 if *v was replaced by a sorted copy, and a negative errno-style error
+         * otherwise. */
+
+        assert(v);
+
+        if (json_variant_is_sorted(*v))
+                return 0;
+
+        if (!json_variant_is_object(*v))
+                return -EMEDIUMTYPE;
+
+        n_elements = json_variant_elements(*v);
+
+        pairs = new(JsonVariant*, n_elements);
+        if (!pairs)
+                return -ENOMEM;
+
+        for (size_t k = 0; k < n_elements; k++)
+                pairs[k] = json_variant_by_index(*v, k);
+
+        /* Each record handed to qsort() is one key/value pair, i.e. two pointers. The comparator only
+         * looks at the first one of them, i.e. at the key. */
+        qsort(pairs, n_elements/2, sizeof(JsonVariant*)*2, json_cmp_strings);
+
+        r = json_variant_new_object(&replacement, pairs, n_elements);
+        if (r < 0)
+                return r;
+
+        json_variant_propagate_sensitive(*v, replacement);
+
+        /* Check if this worked. This will fail if there are multiple identical keys used. */
+        if (!replacement->sorted)
+                return -ENOTUNIQ;
+
+        JSON_VARIANT_REPLACE(*v, TAKE_PTR(replacement));
+
+        return 1;
+}
+
+int json_variant_normalize(JsonVariant **v) {
+        _cleanup_free_ JsonVariant **a = NULL;
+        _cleanup_(json_variant_unrefp) JsonVariant *n = NULL;
+        size_t i, m;
+        int r;
+
+        /* Sorts the key/value pairs in an object variant anywhere down the tree in the specified
+         * variant. The elements of arrays are normalized one by one too, but arrays themselves keep
+         * their element order.
+         *
+         * Returns 0 if json_variant_is_normalized() reports there is nothing to do, 1 if *v was
+         * replaced by a normalized copy, and a negative errno-style error otherwise: -EMEDIUMTYPE if
+         * the variant is neither object nor array, -ENOMEM if the element table cannot be allocated,
+         * -ENOTUNIQ if the result is still not normalized, plus whatever the constructors or the
+         * recursion return. */
+
+        assert(v);
+
+        if (json_variant_is_normalized(*v))
+                return 0;
+
+        if (!json_variant_is_object(*v) && !json_variant_is_array(*v))
+                return -EMEDIUMTYPE;
+
+        m = json_variant_elements(*v);
+        a = new(JsonVariant*, m);
+        if (!a)
+                return -ENOMEM;
+
+        for (i = 0; i < m; ) {
+                a[i] = json_variant_ref(json_variant_by_index(*v, i));
+                i++; /* Count the reference right away, so that 'finish' drops it if the recursion fails */
+
+                r = json_variant_normalize(&a[i-1]);
+                if (r < 0)
+                        goto finish;
+        }
+
+        if (json_variant_is_object(*v)) {
+                /* Sort by key, treating each key/value pair as one record. This must only be done for
+                 * objects: the order of array elements carries meaning, and sorting them in pairs
+                 * would shuffle the array (by string value, or by pointer value for anything else). */
+                qsort(a, m/2, sizeof(JsonVariant*)*2, json_cmp_strings);
+
+                r = json_variant_new_object(&n, a, m);
+        } else {
+                assert(json_variant_is_array(*v));
+                r = json_variant_new_array(&n, a, m);
+        }
+        if (r < 0)
+                goto finish;
+
+        json_variant_propagate_sensitive(*v, n);
+
+        if (!n->normalized) { /* Let's see if normalization worked. It will fail if there are multiple
+                               * identical keys used in the same object anywhere, or if there are floating
+                               * point numbers used (see json_variant_is_normalized()) */
+                r = -ENOTUNIQ;
+                goto finish;
+        }
+
+        JSON_VARIANT_REPLACE(*v, TAKE_PTR(n));
+
+        r = 1;
+
+finish:
+        /* The new variant (if any) holds its own references, hence drop the ones we took above */
+        for (size_t j = 0; j < i; j++)
+                json_variant_unref(a[j]);
+
+        return r;
+}
+
+bool json_variant_is_normalized(JsonVariant *v) {
+        bool is_container;
+
+        /* For now, anything containing numbers not expressible as integers counts as non-normalized:
+         * we cannot sensibly compare them due to accuracy issues, nor even store them if they are too
+         * large. */
+        if (json_variant_is_real(v) && !json_variant_is_integer(v) && !json_variant_is_unsigned(v))
+                return false;
+
+        /* The concept only applies to variants that include other variants, i.e. objects and arrays.
+         * All others are normalized anyway. */
+        is_container = json_variant_is_object(v) || json_variant_is_array(v);
+        if (!is_container)
+                return true;
+
+        /* Empty objects/arrays don't include any other variant, hence are always normalized too. For
+         * everything else there's an explicit boolean we maintain. */
+        if (json_variant_elements(v) == 0)
+                return true;
+
+        return v->normalized;
+}
+
+bool json_variant_is_sorted(JsonVariant *v) {
+
+        /* Tells whether all key/value pairs of an object are properly sorted. The concept only applies
+         * to objects, not arrays: every non-object counts as sorted, and so does every object with at
+         * most one element. For all other objects the flag stored in the variant decides. */
+
+        if (json_variant_is_object(v) && json_variant_elements(v) > 1)
+                return v->sorted;
+
+        return true;
+}
+
+int json_variant_unbase64(JsonVariant *v, void **ret, size_t *ret_size) {
+        const char *encoded;
+
+        /* Decodes the Base64 text of a string variant with unbase64mem(), passing its result through.
+         * Anything that is not a string is refused with -EINVAL. */
+
+        if (!json_variant_is_string(v))
+                return -EINVAL;
+
+        encoded = json_variant_string(v);
+        return unbase64mem(encoded, SIZE_MAX, ret, ret_size);
+}
+
+int json_variant_unhex(JsonVariant *v, void **ret, size_t *ret_size) {
+        const char *encoded;
+
+        /* Decodes the hexadecimal text of a string variant with unhexmem(), passing its result
+         * through. Anything that is not a string is refused with -EINVAL. */
+
+        if (!json_variant_is_string(v))
+                return -EINVAL;
+
+        encoded = json_variant_string(v);
+        return unhexmem(encoded, SIZE_MAX, ret, ret_size);
+}
+
+static const char* const json_variant_type_table[_JSON_VARIANT_TYPE_MAX] = { /* Printable name of each JsonVariantType, indexed by the enum value */
+ [JSON_VARIANT_STRING] = "string",
+ [JSON_VARIANT_INTEGER] = "integer",
+ [JSON_VARIANT_UNSIGNED] = "unsigned",
+ [JSON_VARIANT_REAL] = "real",
+ [JSON_VARIANT_NUMBER] = "number",
+ [JSON_VARIANT_BOOLEAN] = "boolean",
+ [JSON_VARIANT_ARRAY] = "array",
+ [JSON_VARIANT_OBJECT] = "object",
+ [JSON_VARIANT_NULL] = "null",
+};
+/* Feeds the table above to DEFINE_STRING_TABLE_LOOKUP() for JsonVariantType. NOTE(review): presumably
+ * this is what provides the json_variant_type_to_string() that json_dispatch_full() calls; confirm
+ * against the macro definition. */
+DEFINE_STRING_TABLE_LOOKUP(json_variant_type, JsonVariantType);
diff --git a/src/shared/json.h b/src/shared/json.h
new file mode 100644
index 0000000..c40c234
--- /dev/null
+++ b/src/shared/json.h
@@ -0,0 +1,474 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <fcntl.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include "sd-id128.h"
+
+#include "ether-addr-util.h"
+#include "in-addr-util.h"
+#include "log.h"
+#include "macro.h"
+#include "string-util.h"
+#include "strv.h"
+#include "time-util.h"
+
+/*
+ In case you wonder why we have our own JSON implementation, here are a couple of reasons why this implementation has
+ benefits over various other implementations:
+
+ - We need support for 64-bit signed and unsigned integers, i.e. the full 64,5bit range of -9223372036854775808…18446744073709551615
+ - All our variants are immutable after creation
+ - Special values such as true, false, zero, null, empty strings, empty array, empty objects require zero dynamic memory
+ - Progressive parsing
+ - Our integer/real type implicitly converts, but only if that's safe and loss-lessly possible
+ - There's a "builder" for putting together objects easily in varargs function calls
+ - There's a "dispatcher" for mapping objects to C data structures
+ - Every variant optionally carries parsing location information, which simplifies debugging and parse log error generation
+ - Formatter has color, line, column support
+
+ Limitations:
+ - Doesn't allow embedded NUL in strings
+ - Can't store integers outside of the -9223372036854775808…18446744073709551615 range (it will use 'double' for
+ values outside this range, which is lossy)
+ - Can't store negative zero (will be treated identical to positive zero, and not retained across serialization)
+ - Can't store non-integer numbers that can't be stored in "double" losslessly
+ - Allows creation and parsing of objects with duplicate keys. The "dispatcher" will refuse them however. This means
+ we can parse and pass around such objects, but will carefully refuse them when we convert them into our own data.
+
+ (These limitations should be pretty much in line with those of other JSON implementations, in fact might be less
+ limiting in most cases even.)
+*/
+
+typedef struct JsonVariant JsonVariant;
+
+/* The kinds of value a JsonVariant can be queried for with json_variant_has_type(). */
+typedef enum JsonVariantType {
+ JSON_VARIANT_STRING,
+ JSON_VARIANT_INTEGER,
+ JSON_VARIANT_UNSIGNED,
+ JSON_VARIANT_REAL,
+ JSON_VARIANT_NUMBER, /* This is a pseudo-type: we can never create variants of this type, but we use it as wildcard check for the above three types */
+ JSON_VARIANT_BOOLEAN,
+ JSON_VARIANT_ARRAY,
+ JSON_VARIANT_OBJECT,
+ JSON_VARIANT_NULL,
+ _JSON_VARIANT_TYPE_MAX, /* number of types above; used as the size of the name table in json.c */
+ _JSON_VARIANT_TYPE_INVALID = -EINVAL,
+} JsonVariantType;
+
+int json_variant_new_stringn(JsonVariant **ret, const char *s, size_t n);
+int json_variant_new_base64(JsonVariant **ret, const void *p, size_t n);
+int json_variant_new_base32hex(JsonVariant **ret, const void *p, size_t n);
+int json_variant_new_hex(JsonVariant **ret, const void *p, size_t n);
+int json_variant_new_octescape(JsonVariant **ret, const void *p, size_t n);
+int json_variant_new_integer(JsonVariant **ret, int64_t i);
+int json_variant_new_unsigned(JsonVariant **ret, uint64_t u);
+int json_variant_new_real(JsonVariant **ret, double d);
+int json_variant_new_boolean(JsonVariant **ret, bool b);
+int json_variant_new_array(JsonVariant **ret, JsonVariant **array, size_t n);
+int json_variant_new_array_bytes(JsonVariant **ret, const void *p, size_t n);
+int json_variant_new_array_strv(JsonVariant **ret, char **l);
+int json_variant_new_object(JsonVariant **ret, JsonVariant **array, size_t n);
+int json_variant_new_null(JsonVariant **ret);
+int json_variant_new_id128(JsonVariant **ret, sd_id128_t id);
+int json_variant_new_uuid(JsonVariant **ret, sd_id128_t id);
+
+/* Convenience wrapper around json_variant_new_stringn() that passes SIZE_MAX as the length
+ * (presumably meaning "use the whole NUL-terminated string", confirm in json.c). */
+static inline int json_variant_new_string(JsonVariant **ret, const char *s) {
+ return json_variant_new_stringn(ret, s, SIZE_MAX);
+}
+
+JsonVariant *json_variant_ref(JsonVariant *v);
+JsonVariant *json_variant_unref(JsonVariant *v);
+void json_variant_unref_many(JsonVariant **array, size_t n);
+
+/* Replaces the variant stored in the lvalue 'v' by 'q': the reference previously held in 'v' is dropped
+ * with json_variant_unref() and 'q' is stored in its place. 'q' is evaluated into a temporary before the
+ * old reference is dropped, hence 'q' may be an expression that refers to 'v'. No additional reference
+ * is taken on 'q'. */
+#define JSON_VARIANT_REPLACE(v, q) \
+ do { \
+ typeof(v)* _v = &(v); \
+ typeof(q) _q = (q); \
+ json_variant_unref(*_v); \
+ *_v = _q; \
+ } while(0)
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(JsonVariant *, json_variant_unref);
+
+const char *json_variant_string(JsonVariant *v);
+int64_t json_variant_integer(JsonVariant *v);
+uint64_t json_variant_unsigned(JsonVariant *v);
+double json_variant_real(JsonVariant *v);
+bool json_variant_boolean(JsonVariant *v);
+
+JsonVariantType json_variant_type(JsonVariant *v);
+bool json_variant_has_type(JsonVariant *v, JsonVariantType type);
+
+/* Type-check shortcuts: each of the following helpers simply calls json_variant_has_type() with the
+ * JSON_VARIANT_* constant matching its name. */
+static inline bool json_variant_is_string(JsonVariant *v) {
+ return json_variant_has_type(v, JSON_VARIANT_STRING);
+}
+
+static inline bool json_variant_is_integer(JsonVariant *v) {
+ return json_variant_has_type(v, JSON_VARIANT_INTEGER);
+}
+
+static inline bool json_variant_is_unsigned(JsonVariant *v) {
+ return json_variant_has_type(v, JSON_VARIANT_UNSIGNED);
+}
+
+static inline bool json_variant_is_real(JsonVariant *v) {
+ return json_variant_has_type(v, JSON_VARIANT_REAL);
+}
+
+/* JSON_VARIANT_NUMBER is the wildcard pseudo-type for integer, unsigned and real variants */
+static inline bool json_variant_is_number(JsonVariant *v) {
+ return json_variant_has_type(v, JSON_VARIANT_NUMBER);
+}
+
+static inline bool json_variant_is_boolean(JsonVariant *v) {
+ return json_variant_has_type(v, JSON_VARIANT_BOOLEAN);
+}
+
+static inline bool json_variant_is_array(JsonVariant *v) {
+ return json_variant_has_type(v, JSON_VARIANT_ARRAY);
+}
+
+static inline bool json_variant_is_object(JsonVariant *v) {
+ return json_variant_has_type(v, JSON_VARIANT_OBJECT);
+}
+
+static inline bool json_variant_is_null(JsonVariant *v) {
+ return json_variant_has_type(v, JSON_VARIANT_NULL);
+}
+
+bool json_variant_is_negative(JsonVariant *v);
+bool json_variant_is_blank_object(JsonVariant *v);
+bool json_variant_is_blank_array(JsonVariant *v);
+bool json_variant_is_normalized(JsonVariant *v);
+bool json_variant_is_sorted(JsonVariant *v);
+
+size_t json_variant_elements(JsonVariant *v);
+JsonVariant *json_variant_by_index(JsonVariant *v, size_t index);
+JsonVariant *json_variant_by_key(JsonVariant *v, const char *key);
+JsonVariant *json_variant_by_key_full(JsonVariant *v, const char *key, JsonVariant **ret_key);
+
+bool json_variant_equal(JsonVariant *a, JsonVariant *b);
+
+void json_variant_sensitive(JsonVariant *v);
+bool json_variant_is_sensitive(JsonVariant *v);
+
+/* Loop state of the two FOREACH macros below: the variant being iterated and the index of the element
+ * the next iteration will look at. */
+struct json_variant_foreach_state {
+ JsonVariant *variant;
+ size_t idx;
+};
+
+/* Iterates over the elements of array 'v', assigning each one to 'i' in turn. 'v' is evaluated once,
+ * when the loop state is initialized. If 'v' is not an array the loop body is never entered. */
+#define _JSON_VARIANT_ARRAY_FOREACH(i, v, state) \
+ for (struct json_variant_foreach_state state = { (v), 0 }; \
+ json_variant_is_array(state.variant) && \
+ state.idx < json_variant_elements(state.variant) && \
+ ({ i = json_variant_by_index(state.variant, state.idx); \
+ true; }); \
+ state.idx++)
+#define JSON_VARIANT_ARRAY_FOREACH(i, v) \
+ _JSON_VARIANT_ARRAY_FOREACH(i, v, UNIQ_T(state, UNIQ))
+
+/* Iterates over the fields of object 'v'. The elements are visited pairwise (the index advances by two
+ * per iteration): 'k' receives the string of the element at the current index, i.e. the key, and 'e'
+ * the element following it, i.e. the value. If 'v' is not an object the loop body is never entered. */
+#define _JSON_VARIANT_OBJECT_FOREACH(k, e, v, state) \
+ for (struct json_variant_foreach_state state = { (v), 0 }; \
+ json_variant_is_object(state.variant) && \
+ state.idx < json_variant_elements(state.variant) && \
+ ({ k = json_variant_string(json_variant_by_index(state.variant, state.idx)); \
+ e = json_variant_by_index(state.variant, state.idx + 1); \
+ true; }); \
+ state.idx += 2)
+#define JSON_VARIANT_OBJECT_FOREACH(k, e, v) \
+ _JSON_VARIANT_OBJECT_FOREACH(k, e, v, UNIQ_T(state, UNIQ))
+
+int json_variant_get_source(JsonVariant *v, const char **ret_source, unsigned *ret_line, unsigned *ret_column);
+
+typedef enum JsonFormatFlags {
+ JSON_FORMAT_NEWLINE = 1 << 0, /* suffix with newline */
+ JSON_FORMAT_PRETTY = 1 << 1, /* add internal whitespace to appeal to human readers */
+ JSON_FORMAT_PRETTY_AUTO = 1 << 2, /* same, but only if connected to a tty (and JSON_FORMAT_NEWLINE otherwise) */
+ JSON_FORMAT_COLOR = 1 << 3, /* insert ANSI color sequences */
+ JSON_FORMAT_COLOR_AUTO = 1 << 4, /* insert ANSI color sequences if colors_enabled() says so */
+ JSON_FORMAT_SOURCE = 1 << 5, /* prefix with source filename/line/column */
+ JSON_FORMAT_SSE = 1 << 6, /* prefix/suffix with W3C server-sent events */
+ JSON_FORMAT_SEQ = 1 << 7, /* prefix/suffix with RFC 7464 application/json-seq */
+ JSON_FORMAT_FLUSH = 1 << 8, /* call fflush() after dumping JSON */
+ JSON_FORMAT_EMPTY_ARRAY = 1 << 9, /* output "[]" for empty input */
+ JSON_FORMAT_OFF = 1 << 10, /* make json_variant_format() fail with -ENOEXEC */
+} JsonFormatFlags;
+
+int json_variant_format(JsonVariant *v, JsonFormatFlags flags, char **ret);
+int json_variant_dump(JsonVariant *v, JsonFormatFlags flags, FILE *f, const char *prefix);
+
+int json_variant_filter(JsonVariant **v, char **to_remove);
+
+int json_variant_set_field(JsonVariant **v, const char *field, JsonVariant *value);
+int json_variant_set_fieldb(JsonVariant **v, const char *field, ...);
+int json_variant_set_field_string(JsonVariant **v, const char *field, const char *value);
+int json_variant_set_field_integer(JsonVariant **v, const char *field, int64_t value);
+int json_variant_set_field_unsigned(JsonVariant **v, const char *field, uint64_t value);
+int json_variant_set_field_boolean(JsonVariant **v, const char *field, bool b);
+int json_variant_set_field_strv(JsonVariant **v, const char *field, char **l);
+
+static inline int json_variant_set_field_non_null(JsonVariant **v, const char *field, JsonVariant *value) {
+        /* Like json_variant_set_field(), but does nothing and reports success (0) when 'value' is NULL or
+         * a JSON null variant. */
+        if (!value || json_variant_is_null(value))
+                return 0;
+
+        return json_variant_set_field(v, field, value);
+}
+
+JsonVariant *json_variant_find(JsonVariant *haystack, JsonVariant *needle);
+
+int json_variant_append_array(JsonVariant **v, JsonVariant *element);
+int json_variant_append_arrayb(JsonVariant **v, ...);
+int json_variant_append_array_nodup(JsonVariant **v, JsonVariant *element);
+
+int json_variant_merge_object(JsonVariant **v, JsonVariant *m);
+int json_variant_merge_objectb(JsonVariant **v, ...);
+
+int json_variant_strv(JsonVariant *v, char ***ret);
+
+int json_variant_sort(JsonVariant **v);
+int json_variant_normalize(JsonVariant **v);
+
+typedef enum JsonParseFlags {
+ JSON_PARSE_SENSITIVE = 1 << 0, /* mark variant as "sensitive", i.e. something containing secret key material or such */
+} JsonParseFlags;
+
+int json_parse_with_source(const char *string, const char *source, JsonParseFlags flags, JsonVariant **ret, unsigned *ret_line, unsigned *ret_column);
+int json_parse_with_source_continue(const char **p, const char *source, JsonParseFlags flags, JsonVariant **ret, unsigned *ret_line, unsigned *ret_column);
+
+/* Same as json_parse_with_source(), but without a source name (NULL is passed for 'source') */
+static inline int json_parse(const char *string, JsonParseFlags flags, JsonVariant **ret, unsigned *ret_line, unsigned *ret_column) {
+ return json_parse_with_source(string, NULL, flags, ret, ret_line, ret_column);
+}
+/* Same as json_parse_with_source_continue(), but without a source name (NULL is passed for 'source') */
+static inline int json_parse_continue(const char **p, JsonParseFlags flags, JsonVariant **ret, unsigned *ret_line, unsigned *ret_column) {
+ return json_parse_with_source_continue(p, NULL, flags, ret, ret_line, ret_column);
+}
+
+int json_parse_file_at(FILE *f, int dir_fd, const char *path, JsonParseFlags flags, JsonVariant **ret, unsigned *ret_line, unsigned *ret_column);
+
+/* Same as json_parse_file_at(), with AT_FDCWD passed as the directory file descriptor */
+static inline int json_parse_file(FILE *f, const char *path, JsonParseFlags flags, JsonVariant **ret, unsigned *ret_line, unsigned *ret_column) {
+ return json_parse_file_at(f, AT_FDCWD, path, flags, ret, ret_line, ret_column);
+}
+
+enum {
+ _JSON_BUILD_STRING,
+ _JSON_BUILD_INTEGER,
+ _JSON_BUILD_UNSIGNED,
+ _JSON_BUILD_REAL,
+ _JSON_BUILD_BOOLEAN,
+ _JSON_BUILD_ARRAY_BEGIN,
+ _JSON_BUILD_ARRAY_END,
+ _JSON_BUILD_OBJECT_BEGIN,
+ _JSON_BUILD_OBJECT_END,
+ _JSON_BUILD_PAIR,
+ _JSON_BUILD_PAIR_CONDITION,
+ _JSON_BUILD_NULL,
+ _JSON_BUILD_VARIANT,
+ _JSON_BUILD_VARIANT_ARRAY,
+ _JSON_BUILD_LITERAL,
+ _JSON_BUILD_STRV,
+ _JSON_BUILD_STRV_ENV_PAIR,
+ _JSON_BUILD_BASE64,
+ _JSON_BUILD_IOVEC_BASE64,
+ _JSON_BUILD_BASE32HEX,
+ _JSON_BUILD_HEX,
+ _JSON_BUILD_OCTESCAPE,
+ _JSON_BUILD_ID128,
+ _JSON_BUILD_UUID,
+ _JSON_BUILD_BYTE_ARRAY,
+ _JSON_BUILD_HW_ADDR,
+ _JSON_BUILD_STRING_SET,
+ _JSON_BUILD_CALLBACK,
+ _JSON_BUILD_PAIR_UNSIGNED_NON_ZERO,
+ _JSON_BUILD_PAIR_FINITE_USEC,
+ _JSON_BUILD_PAIR_STRING_NON_EMPTY,
+ _JSON_BUILD_PAIR_STRV_NON_EMPTY,
+ _JSON_BUILD_PAIR_VARIANT_NON_NULL,
+ _JSON_BUILD_PAIR_VARIANT_ARRAY_NON_EMPTY,
+ _JSON_BUILD_PAIR_IN4_ADDR_NON_NULL,
+ _JSON_BUILD_PAIR_IN6_ADDR_NON_NULL,
+ _JSON_BUILD_PAIR_IN_ADDR_NON_NULL,
+ _JSON_BUILD_PAIR_ETHER_ADDR_NON_NULL,
+ _JSON_BUILD_PAIR_HW_ADDR_NON_NULL,
+ _JSON_BUILD_MAX,
+};
+
+typedef int (*JsonBuildCallback)(JsonVariant **ret, const char *name, void *userdata);
+
+#define JSON_BUILD_STRING(s) _JSON_BUILD_STRING, (const char*) { s }
+#define JSON_BUILD_INTEGER(i) _JSON_BUILD_INTEGER, (int64_t) { i }
+#define JSON_BUILD_UNSIGNED(u) _JSON_BUILD_UNSIGNED, (uint64_t) { u }
+#define JSON_BUILD_REAL(d) _JSON_BUILD_REAL, (double) { d }
+#define JSON_BUILD_BOOLEAN(b) _JSON_BUILD_BOOLEAN, (bool) { b }
+#define JSON_BUILD_ARRAY(...) _JSON_BUILD_ARRAY_BEGIN, __VA_ARGS__, _JSON_BUILD_ARRAY_END
+#define JSON_BUILD_EMPTY_ARRAY _JSON_BUILD_ARRAY_BEGIN, _JSON_BUILD_ARRAY_END
+#define JSON_BUILD_OBJECT(...) _JSON_BUILD_OBJECT_BEGIN, __VA_ARGS__, _JSON_BUILD_OBJECT_END
+#define JSON_BUILD_EMPTY_OBJECT _JSON_BUILD_OBJECT_BEGIN, _JSON_BUILD_OBJECT_END
+#define JSON_BUILD_PAIR(n, ...) _JSON_BUILD_PAIR, (const char*) { n }, __VA_ARGS__
+#define JSON_BUILD_PAIR_CONDITION(c, n, ...) _JSON_BUILD_PAIR_CONDITION, (bool) { c }, (const char*) { n }, __VA_ARGS__
+#define JSON_BUILD_NULL _JSON_BUILD_NULL
+#define JSON_BUILD_VARIANT(v) _JSON_BUILD_VARIANT, (JsonVariant*) { v }
+#define JSON_BUILD_VARIANT_ARRAY(v, n) _JSON_BUILD_VARIANT_ARRAY, (JsonVariant **) { v }, (size_t) { n }
+#define JSON_BUILD_LITERAL(l) _JSON_BUILD_LITERAL, (const char*) { l }
+#define JSON_BUILD_STRV(l) _JSON_BUILD_STRV, (char**) { l }
+#define JSON_BUILD_STRV_ENV_PAIR(l) _JSON_BUILD_STRV_ENV_PAIR, (char**) { l }
+#define JSON_BUILD_BASE64(p, n) _JSON_BUILD_BASE64, (const void*) { p }, (size_t) { n }
+#define JSON_BUILD_IOVEC_BASE64(iov) _JSON_BUILD_IOVEC_BASE64, (const struct iovec*) { iov }
+#define JSON_BUILD_BASE32HEX(p, n) _JSON_BUILD_BASE32HEX, (const void*) { p }, (size_t) { n }
+#define JSON_BUILD_HEX(p, n) _JSON_BUILD_HEX, (const void*) { p }, (size_t) { n }
+#define JSON_BUILD_OCTESCAPE(p, n) _JSON_BUILD_OCTESCAPE, (const void*) { p }, (size_t) { n }
+#define JSON_BUILD_ID128(id) _JSON_BUILD_ID128, (const sd_id128_t*) { &(id) }
+#define JSON_BUILD_UUID(id) _JSON_BUILD_UUID, (const sd_id128_t*) { &(id) }
+#define JSON_BUILD_BYTE_ARRAY(v, n) _JSON_BUILD_BYTE_ARRAY, (const void*) { v }, (size_t) { n }
+#define JSON_BUILD_CONST_STRING(s) _JSON_BUILD_VARIANT, JSON_VARIANT_STRING_CONST(s)
+#define JSON_BUILD_IN4_ADDR(v) JSON_BUILD_BYTE_ARRAY((const struct in_addr*) { v }, sizeof(struct in_addr))
+#define JSON_BUILD_IN6_ADDR(v) JSON_BUILD_BYTE_ARRAY((const struct in6_addr*) { v }, sizeof(struct in6_addr))
+#define JSON_BUILD_IN_ADDR(v, f) JSON_BUILD_BYTE_ARRAY(((const union in_addr_union*) { v })->bytes, FAMILY_ADDRESS_SIZE_SAFE(f))
+#define JSON_BUILD_ETHER_ADDR(v) JSON_BUILD_BYTE_ARRAY(((const struct ether_addr*) { v })->ether_addr_octet, sizeof(struct ether_addr))
+#define JSON_BUILD_HW_ADDR(v) _JSON_BUILD_HW_ADDR, (const struct hw_addr_data*) { v }
+#define JSON_BUILD_STRING_SET(s) _JSON_BUILD_STRING_SET, (Set *) { s }
+#define JSON_BUILD_CALLBACK(c, u) _JSON_BUILD_CALLBACK, (JsonBuildCallback) { c }, (void*) { u }
+
+#define JSON_BUILD_PAIR_STRING(name, s) JSON_BUILD_PAIR(name, JSON_BUILD_STRING(s))
+#define JSON_BUILD_PAIR_INTEGER(name, i) JSON_BUILD_PAIR(name, JSON_BUILD_INTEGER(i))
+#define JSON_BUILD_PAIR_UNSIGNED(name, u) JSON_BUILD_PAIR(name, JSON_BUILD_UNSIGNED(u))
+#define JSON_BUILD_PAIR_REAL(name, d) JSON_BUILD_PAIR(name, JSON_BUILD_REAL(d))
+#define JSON_BUILD_PAIR_BOOLEAN(name, b) JSON_BUILD_PAIR(name, JSON_BUILD_BOOLEAN(b))
+#define JSON_BUILD_PAIR_ARRAY(name, ...) JSON_BUILD_PAIR(name, JSON_BUILD_ARRAY(__VA_ARGS__))
+#define JSON_BUILD_PAIR_EMPTY_ARRAY(name) JSON_BUILD_PAIR(name, JSON_BUILD_EMPTY_ARRAY)
+#define JSON_BUILD_PAIR_OBJECT(name, ...) JSON_BUILD_PAIR(name, JSON_BUILD_OBJECT(__VA_ARGS__))
+#define JSON_BUILD_PAIR_EMPTY_OBJECT(name) JSON_BUILD_PAIR(name, JSON_BUILD_EMPTY_OBJECT)
+#define JSON_BUILD_PAIR_NULL(name) JSON_BUILD_PAIR(name, JSON_BUILD_NULL)
+#define JSON_BUILD_PAIR_VARIANT(name, v) JSON_BUILD_PAIR(name, JSON_BUILD_VARIANT(v))
+#define JSON_BUILD_PAIR_VARIANT_ARRAY(name, v, n) JSON_BUILD_PAIR(name, JSON_BUILD_VARIANT_ARRAY(v, n))
+#define JSON_BUILD_PAIR_LITERAL(name, l) JSON_BUILD_PAIR(name, JSON_BUILD_LITERAL(l))
+#define JSON_BUILD_PAIR_STRV(name, l) JSON_BUILD_PAIR(name, JSON_BUILD_STRV(l))
+#define JSON_BUILD_PAIR_BASE64(name, p, n) JSON_BUILD_PAIR(name, JSON_BUILD_BASE64(p, n))
+#define JSON_BUILD_PAIR_IOVEC_BASE64(name, iov) JSON_BUILD_PAIR(name, JSON_BUILD_IOVEC_BASE64(iov))
+#define JSON_BUILD_PAIR_HEX(name, p, n) JSON_BUILD_PAIR(name, JSON_BUILD_HEX(p, n))
+#define JSON_BUILD_PAIR_ID128(name, id) JSON_BUILD_PAIR(name, JSON_BUILD_ID128(id))
+#define JSON_BUILD_PAIR_UUID(name, id) JSON_BUILD_PAIR(name, JSON_BUILD_UUID(id))
+#define JSON_BUILD_PAIR_BYTE_ARRAY(name, v, n) JSON_BUILD_PAIR(name, JSON_BUILD_BYTE_ARRAY(v, n))
+#define JSON_BUILD_PAIR_IN4_ADDR(name, v) JSON_BUILD_PAIR(name, JSON_BUILD_IN4_ADDR(v))
+#define JSON_BUILD_PAIR_IN6_ADDR(name, v) JSON_BUILD_PAIR(name, JSON_BUILD_IN6_ADDR(v))
+#define JSON_BUILD_PAIR_IN_ADDR(name, v, f) JSON_BUILD_PAIR(name, JSON_BUILD_IN_ADDR(v, f))
+#define JSON_BUILD_PAIR_ETHER_ADDR(name, v) JSON_BUILD_PAIR(name, JSON_BUILD_ETHER_ADDR(v))
+#define JSON_BUILD_PAIR_HW_ADDR(name, v) JSON_BUILD_PAIR(name, JSON_BUILD_HW_ADDR(v))
+#define JSON_BUILD_PAIR_STRING_SET(name, s) JSON_BUILD_PAIR(name, JSON_BUILD_STRING_SET(s))
+#define JSON_BUILD_PAIR_CALLBACK(name, c, u) JSON_BUILD_PAIR(name, JSON_BUILD_CALLBACK(c, u))
+
+#define JSON_BUILD_PAIR_UNSIGNED_NON_ZERO(name, u) _JSON_BUILD_PAIR_UNSIGNED_NON_ZERO, (const char*) { name }, (uint64_t) { u }
+#define JSON_BUILD_PAIR_FINITE_USEC(name, u) _JSON_BUILD_PAIR_FINITE_USEC, (const char*) { name }, (usec_t) { u }
+#define JSON_BUILD_PAIR_STRING_NON_EMPTY(name, s) _JSON_BUILD_PAIR_STRING_NON_EMPTY, (const char*) { name }, (const char*) { s }
+#define JSON_BUILD_PAIR_STRV_NON_EMPTY(name, l) _JSON_BUILD_PAIR_STRV_NON_EMPTY, (const char*) { name }, (char**) { l }
+#define JSON_BUILD_PAIR_VARIANT_NON_NULL(name, v) _JSON_BUILD_PAIR_VARIANT_NON_NULL, (const char*) { name }, (JsonVariant*) { v }
+#define JSON_BUILD_PAIR_IN4_ADDR_NON_NULL(name, v) _JSON_BUILD_PAIR_IN4_ADDR_NON_NULL, (const char*) { name }, (const struct in_addr*) { v }
+#define JSON_BUILD_PAIR_IN6_ADDR_NON_NULL(name, v) _JSON_BUILD_PAIR_IN6_ADDR_NON_NULL, (const char*) { name }, (const struct in6_addr*) { v }
+#define JSON_BUILD_PAIR_IN_ADDR_NON_NULL(name, v, f) _JSON_BUILD_PAIR_IN_ADDR_NON_NULL, (const char*) { name }, (const union in_addr_union*) { v }, (int) { f }
+#define JSON_BUILD_PAIR_ETHER_ADDR_NON_NULL(name, v) _JSON_BUILD_PAIR_ETHER_ADDR_NON_NULL, (const char*) { name }, (const struct ether_addr*) { v }
+#define JSON_BUILD_PAIR_HW_ADDR_NON_NULL(name, v) _JSON_BUILD_PAIR_HW_ADDR_NON_NULL, (const char*) { name }, (const struct hw_addr_data*) { v }
+
+int json_build(JsonVariant **ret, ...);
+int json_buildv(JsonVariant **ret, va_list ap);
+
+/* A bitmask of flags used by the dispatch logic. Note that this is a combined bit mask, that is generated from the bit
+ * mask originally passed into json_dispatch(), the individual bitmask associated with the static JsonDispatch callout
+ * entry, as well as the bitmask specified for json_log() calls */
+typedef enum JsonDispatchFlags {
+ /* The following five may be set in JsonDispatch's .flags field or the json_dispatch() flags parameter */
+ JSON_PERMISSIVE = 1 << 0, /* Shall parsing errors be tolerated rather than considered fatal for this property? */
+ JSON_MANDATORY = 1 << 1, /* Should existence of this property be mandatory? */
+ JSON_LOG = 1 << 2, /* Should the parser log about errors? */
+ JSON_SAFE = 1 << 3, /* Don't accept "unsafe" strings in json_dispatch_string() + json_dispatch_const_string() */
+ JSON_RELAX = 1 << 4, /* Use relaxed user name checking in json_dispatch_user_group_name */
+
+ /* The following two may be passed into json_log() in addition to those above */
+ JSON_DEBUG = 1 << 5, /* Indicates that this log message is a debug message */
+ JSON_WARNING = 1 << 6, /* Indicates that this log message is a warning message */
+} JsonDispatchFlags;
+
+typedef int (*JsonDispatchCallback)(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata);
+
+typedef struct JsonDispatch {
+ const char *name;
+ JsonVariantType type;
+ JsonDispatchCallback callback;
+ size_t offset;
+ JsonDispatchFlags flags;
+} JsonDispatch;
+
+int json_dispatch_full(JsonVariant *v, const JsonDispatch table[], JsonDispatchCallback bad, JsonDispatchFlags flags, void *userdata, const char **reterr_bad_field);
+
+/* Shortcut for json_dispatch_full() that passes NULL for both the 'bad' callback and 'reterr_bad_field' */
+static inline int json_dispatch(JsonVariant *v, const JsonDispatch table[], JsonDispatchFlags flags, void *userdata) {
+ return json_dispatch_full(v, table, NULL, flags, userdata, NULL);
+}
+
+int json_dispatch_string(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata);
+int json_dispatch_const_string(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata);
+int json_dispatch_strv(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata);
+int json_dispatch_boolean(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata);
+int json_dispatch_tristate(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata);
+int json_dispatch_variant(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata);
+int json_dispatch_variant_noref(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata);
+int json_dispatch_int64(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata);
+int json_dispatch_uint64(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata);
+int json_dispatch_uint32(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata);
+int json_dispatch_int32(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata);
+int json_dispatch_uint16(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata);
+int json_dispatch_int16(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata);
+int json_dispatch_uid_gid(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata);
+int json_dispatch_user_group_name(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata);
+int json_dispatch_id128(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata);
+int json_dispatch_unsupported(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata);
+int json_dispatch_unbase64_iovec(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata);
+
+assert_cc(sizeof(uint32_t) == sizeof(unsigned));
+#define json_dispatch_uint json_dispatch_uint32
+
+assert_cc(sizeof(int32_t) == sizeof(int));
+#define json_dispatch_int json_dispatch_int32
+
+static inline int json_dispatch_level(JsonDispatchFlags flags) {
+
+        /* Translates a set of dispatch flags into the syslog priority a message shall be logged at.
+         *
+         * If logging was not requested (JSON_LOG unset) we never go above LOG_DEBUG, and a message that is
+         * explicitly marked with JSON_DEBUG is a debug message too. In other words: only "JSON_LOG set and
+         * JSON_DEBUG unset" may yield a higher priority. */
+        if ((flags & (JSON_LOG|JSON_DEBUG)) != JSON_LOG)
+                return LOG_DEBUG;
+
+        /* Permissive mode and messages explicitly marked as warnings are logged at LOG_WARNING, everything
+         * else is an error. */
+        return (flags & (JSON_PERMISSIVE|JSON_WARNING)) ? LOG_WARNING : LOG_ERR;
+}
+
+int json_log_internal(JsonVariant *variant, int level, int error, const char *file, int line, const char *func, const char *format, ...) _printf_(7, 8);
+
+/* Logs a message about 'variant'. The log level is derived from 'flags' via json_dispatch_level().
+ * json_log_internal() is only invoked if the current maximum log level permits that level; otherwise
+ * the message is not formatted at all and the expression evaluates to -ERRNO_VALUE(error). 'error' is
+ * evaluated exactly once. */
+#define json_log(variant, flags, error, ...) \
+ ({ \
+ int _level = json_dispatch_level(flags), _e = (error); \
+ (log_get_max_level() >= LOG_PRI(_level)) \
+ ? json_log_internal(variant, _level, _e, PROJECT_FILE, __LINE__, __func__, __VA_ARGS__) \
+ : -ERRNO_VALUE(_e); \
+ })
+
+/* Logs "Out of memory." via json_log(), using a synthetic ENOMEM as error */
+#define json_log_oom(variant, flags) \
+ json_log(variant, flags, SYNTHETIC_ERRNO(ENOMEM), "Out of memory.")
+
+/* Turns the string literal 'x' into a JsonVariant* without allocating memory: the literal is placed in a
+ * static array aligned to 2 bytes, and the returned "pointer" is the address of that array plus one,
+ * i.e. an odd address. NOTE(review): presumably json.c recognizes such odd pointers as constant string
+ * variants; the consumer is not part of this header, confirm there. */
+#define JSON_VARIANT_STRING_CONST(x) _JSON_VARIANT_STRING_CONST(UNIQ, (x))
+
+#define _JSON_VARIANT_STRING_CONST(xq, x) \
+ ({ \
+ _align_(2) static const char UNIQ_T(json_string_const, xq)[] = (x); \
+ assert((((uintptr_t) UNIQ_T(json_string_const, xq)) & 1) == 0); \
+ (JsonVariant*) ((uintptr_t) UNIQ_T(json_string_const, xq) + 1); \
+ })
+
+int json_variant_unbase64(JsonVariant *v, void **ret, size_t *ret_size);
+int json_variant_unhex(JsonVariant *v, void **ret, size_t *ret_size);
+
+const char *json_variant_type_to_string(JsonVariantType t);
+JsonVariantType json_variant_type_from_string(const char *s);
diff --git a/src/shared/kbd-util.c b/src/shared/kbd-util.c
new file mode 100644
index 0000000..2f2d161
--- /dev/null
+++ b/src/shared/kbd-util.c
@@ -0,0 +1,155 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "errno-util.h"
+#include "kbd-util.h"
+#include "log.h"
+#include "nulstr-util.h"
+#include "path-util.h"
+#include "recurse-dir.h"
+#include "set.h"
+#include "string-util.h"
+#include "strv.h"
+#include "utf8.h"
+
+/* Context passed to keymap_recurse_dir_callback(). The callback checks 'keymap_name' first and only
+ * uses 'keymaps' when no name is set. */
+struct recurse_dir_userdata {
+ const char *keymap_name; /* if non-NULL: the keymap name to search for (set by keymap_exists()) */
+ Set *keymaps; /* otherwise: set that collects all keymap names found (set by get_keymaps()) */
+};
+
+static int keymap_recurse_dir_callback(
+                RecurseDirEvent event,
+                const char *path,
+                int dir_fd,
+                int inode_fd,
+                const struct dirent *de,
+                const struct statx *sx,
+                void *userdata) {
+
+        /* Directory walk callback shared by get_keymaps() and keymap_exists(). The keymap name is the file
+         * name with its ".map" or ".map.gz" suffix removed.
+         *
+         * Lookup mode ('keymap_name' set): returns 1 as soon as a file with the requested name is seen.
+         * Collect mode ('keymap_name' unset): adds every valid keymap name to the 'keymaps' set. */
+
+        struct recurse_dir_userdata *ctx = userdata;
+        _cleanup_free_ char *keymap = NULL;
+        const char *suffix;
+        int r;
+
+        assert(de);
+
+        /* Only directory entries that are regular files or symlinks can be keymaps */
+        if (event != RECURSE_DIR_ENTRY || !IN_SET(de->d_type, DT_REG, DT_LNK))
+                return RECURSE_DIR_CONTINUE;
+
+        suffix = endswith(de->d_name, ".map");
+        if (!suffix)
+                suffix = endswith(de->d_name, ".map.gz");
+        if (!suffix)
+                return RECURSE_DIR_CONTINUE;
+
+        keymap = strndup(de->d_name, suffix - de->d_name);
+        if (!keymap)
+                return -ENOMEM;
+
+        if (ctx->keymap_name) {
+                if (streq(keymap, ctx->keymap_name))
+                        return 1;
+
+                return RECURSE_DIR_CONTINUE;
+        }
+
+        assert(ctx->keymaps);
+
+        if (!keymap_is_valid(keymap))
+                return 0;
+
+        /* set_consume() takes over the string, hence hand it over with TAKE_PTR() */
+        r = set_consume(ctx->keymaps, TAKE_PTR(keymap));
+        return r < 0 ? r : RECURSE_DIR_CONTINUE;
+}
+
+/* Collects the names of all keymaps found below the KBD_KEYMAP_DIRS directories and returns them in
+ * '*ret' as a sorted string list, whose ownership passes to the caller. Returns 0 on success, -ENOENT
+ * if no keymap was found at all, -ENOMEM on allocation failure, or the error of a directory scan if
+ * ERRNO_IS_NEG_RESOURCE() matches it. Other scan errors are only logged at debug level. */
+int get_keymaps(char ***ret) {
+ _cleanup_set_free_free_ Set *keymaps = NULL;
+ int r;
+
+ keymaps = set_new(&string_hash_ops);
+ if (!keymaps)
+ return -ENOMEM;
+
+ NULSTR_FOREACH(dir, KBD_KEYMAP_DIRS) {
+ r = recurse_dir_at(
+ AT_FDCWD,
+ dir,
+ /* statx_mask= */ 0,
+ /* n_depth_max= */ UINT_MAX,
+ RECURSE_DIR_IGNORE_DOT|RECURSE_DIR_ENSURE_TYPE,
+ keymap_recurse_dir_callback,
+ &(struct recurse_dir_userdata) {
+ .keymaps = keymaps,
+ });
+ /* A missing directory is not a problem, simply try the next one */
+ if (r == -ENOENT)
+ continue;
+ if (ERRNO_IS_NEG_RESOURCE(r))
+ return log_warning_errno(r, "Failed to read keymap list from %s: %m", dir);
+ if (r < 0)
+ log_debug_errno(r, "Failed to read keymap list from %s, ignoring: %m", dir);
+ }
+
+ _cleanup_strv_free_ char **l = set_get_strv(keymaps);
+ if (!l)
+ return -ENOMEM;
+
+ keymaps = set_free(keymaps); /* If we got the strv above, then do a set_free() rather than
+ * set_free_free() since the entries of the set are now owned by the
+ * strv */
+
+ if (strv_isempty(l))
+ return -ENOENT;
+
+ strv_sort(l);
+
+ *ret = TAKE_PTR(l);
+ return 0;
+}
+
+bool keymap_is_valid(const char *name) {
+        /* A keymap name is accepted if it is non-empty, shorter than 128 bytes, valid UTF-8, acceptable
+         * to filename_is_valid() and to string_is_safe(). The checks are evaluated in this order and
+         * evaluation stops at the first one that fails. */
+        return !isempty(name) &&
+                strlen(name) < 128 &&
+                utf8_is_valid(name) &&
+                filename_is_valid(name) &&
+                string_is_safe(name);
+}
+
+int keymap_exists(const char *name) {
+        /* Checks whether a keymap called 'name' exists below any of the KBD_KEYMAP_DIRS directories.
+         * Returns true if it was found and false if not. Returns -EINVAL if the name is not a valid
+         * keymap name, or the error of a directory scan if ERRNO_IS_NEG_RESOURCE() matches it. Other
+         * scan errors are logged at debug level and otherwise ignored. */
+
+        struct recurse_dir_userdata lookup = {
+                .keymap_name = name,
+        };
+        int r;
+
+        if (!keymap_is_valid(name))
+                return -EINVAL;
+
+        NULSTR_FOREACH(dir, KBD_KEYMAP_DIRS) {
+                r = recurse_dir_at(
+                                AT_FDCWD,
+                                dir,
+                                /* statx_mask= */ 0,
+                                /* n_depth_max= */ UINT_MAX,
+                                RECURSE_DIR_IGNORE_DOT|RECURSE_DIR_ENSURE_TYPE,
+                                keymap_recurse_dir_callback,
+                                &lookup);
+                if (r > 0) /* the callback reported a match */
+                        return true;
+                if (ERRNO_IS_NEG_RESOURCE(r))
+                        return r;
+                if (r >= 0 || r == -ENOENT) /* nothing found here, or directory missing: try the next */
+                        continue;
+
+                log_debug_errno(r, "Failed to read keymap list from %s, ignoring: %m", dir);
+        }
+
+        return false;
+}
diff --git a/src/shared/kbd-util.h b/src/shared/kbd-util.h
new file mode 100644
index 0000000..aca0dee
--- /dev/null
+++ b/src/shared/kbd-util.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#define KBD_KEYMAP_DIRS \
+ "/usr/share/keymaps/\0" \
+ "/usr/share/kbd/keymaps/\0" \
+ "/usr/lib/kbd/keymaps/\0"
+
+int get_keymaps(char ***l);
+bool keymap_is_valid(const char *name);
+int keymap_exists(const char *name);
diff --git a/src/shared/kernel-image.c b/src/shared/kernel-image.c
new file mode 100644
index 0000000..7dc9e01
--- /dev/null
+++ b/src/shared/kernel-image.c
@@ -0,0 +1,178 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "fd-util.h"
+#include "fileio.h"
+#include "env-file.h"
+#include "kernel-image.h"
+#include "os-util.h"
+#include "parse-util.h"
+#include "pe-binary.h"
+#include "string-table.h"
+
+#define PE_SECTION_READ_MAX (16U*1024U)
+
+/* Names of the KernelImageType values, indexed by enum value. The macro below is expected to provide
+ * kernel_image_type_to_string() as declared in kernel-image.h (NOTE(review): inferred from the macro
+ * and prototype names, confirm against string-table.h). */
+static const char * const kernel_image_type_table[_KERNEL_IMAGE_TYPE_MAX] = {
+ [KERNEL_IMAGE_TYPE_UNKNOWN] = "unknown",
+ [KERNEL_IMAGE_TYPE_UKI] = "uki",
+ [KERNEL_IMAGE_TYPE_PE] = "pe",
+};
+
+DEFINE_STRING_TABLE_LOOKUP_TO_STRING(kernel_image_type, KernelImageType);
+
+/* Extracts a human-readable OS name from the ".osrel" PE section of a kernel image.
+ *
+ * At most PE_SECTION_READ_MAX bytes of the section are read and parsed as an environment file.
+ * PRETTY_NAME= is preferred, then NAME=, and if neither is set the literal "Linux" is used (the same
+ * logic as os_release_pretty_name()).
+ *
+ * Returns 0 on success, with '*ret' set to a newly allocated string, or to NULL if the image carries no
+ * ".osrel" section. Returns a negative errno-style error, which has been logged, on failure. */
+static int uki_read_pretty_name(
+                int fd,
+                const PeHeader *pe_header,
+                const IMAGE_SECTION_HEADER *sections,
+                char **ret) {
+
+        _cleanup_free_ char *pname = NULL, *name = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        _cleanup_free_ void *osrel = NULL;
+        size_t osrel_size;
+        int r;
+
+        assert(fd >= 0);
+        assert(pe_header);
+        assert(sections || le16toh(pe_header->pe.NumberOfSections) == 0);
+        assert(ret);
+
+        r = pe_read_section_data(
+                        fd,
+                        pe_header,
+                        sections,
+                        ".osrel",
+                        /* max_size=*/ PE_SECTION_READ_MAX,
+                        &osrel,
+                        &osrel_size);
+        if (r == -ENXIO) { /* Section not found */
+                *ret = NULL;
+                return 0;
+        }
+        /* On any other failure 'osrel' and 'osrel_size' have not been filled in, hence bail out here
+         * instead of passing a NULL buffer and an uninitialized size to fmemopen() below. */
+        if (r < 0)
+                return log_error_errno(r, "Failed to read .osrel section of kernel image: %m");
+
+        f = fmemopen(osrel, osrel_size, "r");
+        if (!f)
+                return log_error_errno(errno, "Failed to open embedded os-release file: %m");
+
+        r = parse_env_file(
+                        f, NULL,
+                        "PRETTY_NAME", &pname,
+                        "NAME", &name);
+        if (r < 0)
+                return log_error_errno(r, "Failed to parse embedded os-release file: %m");
+
+        /* follow the same logic as os_release_pretty_name() */
+        if (!isempty(pname))
+                *ret = TAKE_PTR(pname);
+        else if (!isempty(name))
+                *ret = TAKE_PTR(name);
+        else {
+                char *n = strdup("Linux");
+                if (!n)
+                        return log_oom();
+
+                *ret = n;
+        }
+
+        return 0;
+}
+
+/* Reads the ".cmdline" and ".uname" PE sections and the pretty OS name of a UKI. Each of the three
+ * output parameters is optional; data is only read for those that are non-NULL. A missing section
+ * (-ENXIO) is not an error, the corresponding output is then set to NULL. The outputs are only written
+ * once all requested reads have succeeded. Returns 0 on success, negative errno-style error otherwise. */
+static int inspect_uki(
+ int fd,
+ const PeHeader *pe_header,
+ const IMAGE_SECTION_HEADER *sections,
+ char **ret_cmdline,
+ char **ret_uname,
+ char **ret_pretty_name) {
+
+ _cleanup_free_ char *cmdline = NULL, *uname = NULL, *pname = NULL;
+ int r;
+
+ assert(fd >= 0);
+ assert(sections || le16toh(pe_header->pe.NumberOfSections) == 0);
+
+ if (ret_cmdline) {
+ r = pe_read_section_data(fd, pe_header, sections, ".cmdline", PE_SECTION_READ_MAX, (void**) &cmdline, NULL);
+ if (r < 0 && r != -ENXIO) /* If the section doesn't exist, that's fine */
+ return r;
+ }
+
+ if (ret_uname) {
+ r = pe_read_section_data(fd, pe_header, sections, ".uname", PE_SECTION_READ_MAX, (void**) &uname, NULL);
+ if (r < 0 && r != -ENXIO) /* If the section doesn't exist, that's fine */
+ return r;
+ }
+
+ if (ret_pretty_name) {
+ r = uki_read_pretty_name(fd, pe_header, sections, &pname);
+ if (r < 0)
+ return r;
+ }
+
+ /* Everything was read successfully, now hand the results over to the caller */
+ if (ret_cmdline)
+ *ret_cmdline = TAKE_PTR(cmdline);
+ if (ret_uname)
+ *ret_uname = TAKE_PTR(uname);
+ if (ret_pretty_name)
+ *ret_pretty_name = TAKE_PTR(pname);
+
+ return 0;
+}
+
+/* Opens 'filename' relative to 'dir_fd' and classifies it:
+ *
+ * - KERNEL_IMAGE_TYPE_UKI if pe_is_uki() accepts it; the command line, uname and pretty name are then
+ * read via inspect_uki()
+ * - KERNEL_IMAGE_TYPE_PE if the PE headers and sections could be loaded but it is not a UKI
+ * - KERNEL_IMAGE_TYPE_UNKNOWN if loading the PE headers or sections failed with -EBADMSG
+ *
+ * All four output parameters are optional. For the latter two types the three string outputs are set
+ * to NULL. Returns 0 on success, negative errno-style error otherwise. */
+int inspect_kernel(
+ int dir_fd,
+ const char *filename,
+ KernelImageType *ret_type,
+ char **ret_cmdline,
+ char **ret_uname,
+ char **ret_pretty_name) {
+
+ _cleanup_free_ IMAGE_SECTION_HEADER *sections = NULL;
+ _cleanup_free_ IMAGE_DOS_HEADER *dos_header = NULL;
+ KernelImageType t = KERNEL_IMAGE_TYPE_UNKNOWN;
+ _cleanup_free_ PeHeader *pe_header = NULL;
+ _cleanup_close_ int fd = -EBADF;
+ int r;
+
+ assert(dir_fd >= 0 || dir_fd == AT_FDCWD);
+ assert(filename);
+
+ fd = openat(dir_fd, filename, O_RDONLY|O_CLOEXEC);
+ if (fd < 0)
+ return log_error_errno(errno, "Failed to open kernel image file '%s': %m", filename);
+
+ r = pe_load_headers(fd, &dos_header, &pe_header);
+ if (r == -EBADMSG) /* not a valid PE file */
+ goto not_uki;
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse kernel image file '%s': %m", filename);
+
+ r = pe_load_sections(fd, dos_header, pe_header, &sections);
+ if (r == -EBADMSG) /* not a valid PE file */
+ goto not_uki;
+ if (r < 0)
+ return log_error_errno(r, "Failed to load PE sections from kernel image file '%s': %m", filename);
+
+ if (pe_is_uki(pe_header, sections)) {
+ r = inspect_uki(fd, pe_header, sections, ret_cmdline, ret_uname, ret_pretty_name);
+ if (r < 0)
+ return r;
+
+ t = KERNEL_IMAGE_TYPE_UKI;
+ goto done; /* inspect_uki() already filled in the string outputs, don't reset them below */
+ } else
+ t = KERNEL_IMAGE_TYPE_PE;
+
+not_uki:
+ /* Reached with t == KERNEL_IMAGE_TYPE_UNKNOWN (via goto) or KERNEL_IMAGE_TYPE_PE (fall-through) */
+ if (ret_cmdline)
+ *ret_cmdline = NULL;
+ if (ret_uname)
+ *ret_uname = NULL;
+ if (ret_pretty_name)
+ *ret_pretty_name = NULL;
+
+done:
+ if (ret_type)
+ *ret_type = t;
+
+ return 0;
+}
diff --git a/src/shared/kernel-image.h b/src/shared/kernel-image.h
new file mode 100644
index 0000000..41b2c08
--- /dev/null
+++ b/src/shared/kernel-image.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <errno.h>
+
+#include "macro.h"
+
+/* Classification of a kernel image file, as reported by inspect_kernel() via its ret_type parameter. */
+typedef enum KernelImageType {
+ KERNEL_IMAGE_TYPE_UNKNOWN, /* reported when the file could not be parsed as a PE file */
+ KERNEL_IMAGE_TYPE_UKI, /* a PE file that pe_is_uki() recognizes */
+ KERNEL_IMAGE_TYPE_PE, /* a PE file that is not a UKI */
+ _KERNEL_IMAGE_TYPE_MAX,
+ _KERNEL_IMAGE_TYPE_INVALID = -EINVAL,
+} KernelImageType;
+
+const char* kernel_image_type_to_string(KernelImageType t) _const_;
+
+int inspect_kernel(
+ int dir_fd,
+ const char *filename,
+ KernelImageType *ret_type,
+ char **ret_cmdline,
+ char **ret_uname,
+ char **ret_pretty_name);
diff --git a/src/shared/keyring-util.c b/src/shared/keyring-util.c
new file mode 100644
index 0000000..fadd90e
--- /dev/null
+++ b/src/shared/keyring-util.c
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "keyring-util.h"
+#include "memory-util.h"
+#include "missing_syscall.h"
+
+/* Reads the payload of the key 'serial' via KEYCTL_READ into a newly allocated buffer. The buffer starts
+ * out at 100 bytes; whenever the kernel reports a size larger than the buffer, the read is retried with
+ * a buffer of exactly the reported size. The returned data is followed by one extra NUL byte that is not
+ * counted in *ret_size. Buffers that are not handed to the caller are erased before being freed.
+ *
+ * Both 'ret' and 'ret_size' are optional. Returns 0 on success, -ENOMEM or -errno on failure. */
+int keyring_read(key_serial_t serial, void **ret, size_t *ret_size) {
+
+        for (size_t capacity = 100;;) {
+                _cleanup_(erase_and_freep) uint8_t *data = NULL;
+                long len;
+
+                data = new(uint8_t, capacity + 1);
+                if (!data)
+                        return -ENOMEM;
+
+                len = keyctl(KEYCTL_READ, (unsigned long) serial, (unsigned long) data, (unsigned long) capacity, 0);
+                if (len < 0)
+                        return -errno;
+
+                if ((size_t) len > capacity) {
+                        /* Too small, try again with the size we were just told */
+                        capacity = (size_t) len;
+                        continue;
+                }
+
+                data[len] = 0; /* NUL terminate, just in case */
+
+                if (ret)
+                        *ret = TAKE_PTR(data);
+                if (ret_size)
+                        *ret_size = len;
+
+                return 0;
+        }
+}
diff --git a/src/shared/keyring-util.h b/src/shared/keyring-util.h
new file mode 100644
index 0000000..c8c53f1
--- /dev/null
+++ b/src/shared/keyring-util.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <sys/types.h>
+
+#include "missing_keyctl.h"
+
+/* Like TAKE_PTR() but for key_serial_t, resetting them to -1 */
+#define TAKE_KEY_SERIAL(key_serial) TAKE_GENERIC(key_serial, key_serial_t, -1)
+
+int keyring_read(key_serial_t serial, void **ret, size_t *ret_size);
diff --git a/src/shared/killall.c b/src/shared/killall.c
new file mode 100644
index 0000000..917b773
--- /dev/null
+++ b/src/shared/killall.c
@@ -0,0 +1,319 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+/***
+ Copyright © 2010 ProFUSION embedded systems
+***/
+
+#include <errno.h>
+#include <signal.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "constants.h"
+#include "dirent-util.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "format-util.h"
+#include "initrd-util.h"
+#include "killall.h"
+#include "parse-util.h"
+#include "process-util.h"
+#include "set.h"
+#include "stdio-util.h"
+#include "string-util.h"
+#include "terminal-util.h"
+
+/* Checks whether the first byte of the procfs "cmdline" file of the specified process is '@'. Also
+ * returns true if that file cannot be opened, so that callers treat such a process like a marked one. */
+static bool argv_has_at(pid_t pid) {
+ _cleanup_fclose_ FILE *f = NULL;
+ const char *p;
+ char c = 0;
+
+ p = procfs_file_alloca(pid, "cmdline");
+ f = fopen(p, "re");
+ if (!f) {
+ log_debug_errno(errno, "Failed to open %s, ignoring: %m", p);
+ return true; /* not really, but has the desired effect */
+ }
+
+ /* Try to read the first character of the command line. If the cmdline is empty (which might be the case for
+ * kernel threads but potentially also other stuff), this line won't do anything, but we don't care much, as
+ * actual kernel threads are already filtered out above. */
+ (void) fread(&c, 1, 1, f);
+
+ /* Processes with argv[0][0] = '@' we ignore from the killing spree.
+ *
+ * https://systemd.io/ROOT_STORAGE_DAEMONS */
+ return c == '@';
+}
+
+/* Returns true if the cgroup the specified process is in has the "survive_final_kill_signal" xattr set to
+ * a true value, looking in the "user." namespace first and in "trusted." if the former is absent. Any
+ * failure (including failure to determine the cgroup path) results in false. */
+static bool is_survivor_cgroup(const PidRef *pid) {
+ _cleanup_free_ char *cgroup_path = NULL;
+ int r;
+
+ assert(pidref_is_set(pid));
+
+ r = cg_pidref_get_path(/* root= */ NULL, pid, &cgroup_path);
+ if (r < 0) {
+ log_warning_errno(r, "Failed to get cgroup path of process " PID_FMT ", ignoring: %m", pid->pid);
+ return false;
+ }
+
+ r = cg_get_xattr_bool(cgroup_path, "user.survive_final_kill_signal");
+ /* user xattr support was added to kernel v5.7, try with the trusted namespace as a fallback */
+ if (ERRNO_IS_NEG_XATTR_ABSENT(r))
+ r = cg_get_xattr_bool(cgroup_path, "trusted.survive_final_kill_signal");
+ /* A missing xattr is the normal case and not worth a log message; anything else is logged at debug level */
+ if (r < 0 && !ERRNO_IS_NEG_XATTR_ABSENT(r))
+ log_debug_errno(r,
+ "Failed to get survive_final_kill_signal xattr of %s, ignoring: %m",
+ cgroup_path);
+
+ return r > 0;
+}
+
+/* Decides whether the specified process shall be exempted from being signalled. Returns true (i.e. "leave
+ * it alone") for PID 1, for kernel threads (or if that cannot be determined), for processes in a cgroup
+ * accepted by is_survivor_cgroup(), for processes whose UID cannot be determined, and for root-owned
+ * processes for which argv_has_at() returns true. Returns false for everything else.
+ *
+ * If 'warn_rootfs' is true, a notice is logged for processes excluded due to the argv_has_at() check if
+ * pid_from_same_root_fs() returns a positive value for them. */
+static bool ignore_proc(const PidRef *pid, bool warn_rootfs) {
+ uid_t uid;
+ int r;
+
+ assert(pidref_is_set(pid));
+
+ /* We are PID 1, let's not commit suicide */
+ if (pid->pid == 1)
+ return true;
+
+ /* Ignore kernel threads */
+ r = pidref_is_kernel_thread(pid);
+ if (r != 0)
+ return true; /* also ignore processes where we can't determine this */
+
+ /* Ignore processes that are part of a cgroup marked with the user.survive_final_kill_signal xattr */
+ if (is_survivor_cgroup(pid))
+ return true;
+
+ r = pidref_get_uid(pid, &uid);
+ if (r < 0)
+ return true; /* not really, but better safe than sorry */
+
+ /* Non-root processes otherwise are always subject to be killed */
+ if (uid != 0)
+ return false;
+
+ if (!argv_has_at(pid->pid))
+ return false;
+
+ if (warn_rootfs &&
+ pid_from_same_root_fs(pid->pid) > 0) {
+
+ _cleanup_free_ char *comm = NULL;
+
+ /* Best effort: if this fails, strna() below substitutes a placeholder for the NULL name */
+ (void) pidref_get_comm(pid, &comm);
+
+ log_notice("Process " PID_FMT " (%s) has been marked to be excluded from killing. It is "
+ "running from the root file system, and thus likely to block re-mounting of the "
+ "root file system to read-only. Please consider moving it into an initrd file "
+ "system instead.", pid->pid, strna(comm));
+ }
+
+ return true;
+}
+
+/* Logs a single warning listing all PIDs in 'pids', each followed by the process name in parentheses if
+ * it can be determined. Logs nothing if the resulting list is empty. */
+static void log_children_no_yet_killed(Set *pids) {
+ _cleanup_free_ char *lst_child = NULL;
+ void *p;
+ int r;
+
+ SET_FOREACH(p, pids) {
+ _cleanup_free_ char *s = NULL;
+
+ /* Every entry is prefixed with ", ", the prefix of the first entry is skipped when logging below */
+ if (pid_get_comm(PTR_TO_PID(p), &s) >= 0)
+ r = strextendf(&lst_child, ", " PID_FMT " (%s)", PTR_TO_PID(p), s);
+ else
+ r = strextendf(&lst_child, ", " PID_FMT, PTR_TO_PID(p));
+ if (r < 0)
+ return (void) log_oom_warning();
+ }
+
+ if (isempty(lst_child))
+ return;
+
+ log_warning("Waiting for process: %s", lst_child + 2);
+}
+
+/* Waits until all processes in 'pids' are gone or 'timeout' has elapsed, whichever comes first. Processes
+ * are removed from 'pids' as soon as they are reaped via waitpid() or found to be gone via kill(pid, 0).
+ * Between checks the function sleeps in sigtimedwait() on 'mask'. Once during the wait (after 10s, or
+ * after half the timeout if the timeout ends earlier than that) the processes still being waited for
+ * are logged.
+ *
+ * Returns the number of processes left in 'pids' when the timeout is hit, 0 if none are left, or a
+ * negative errno-style error if waitpid() or sigtimedwait() fail unexpectedly. */
+static int wait_for_children(Set *pids, sigset_t *mask, usec_t timeout) {
+ usec_t until, date_log_child, n;
+
+ assert(mask);
+
+ /* Returns the number of children remaining in the pids set: this corresponds to the number
+ * of processes still "alive" after the timeout */
+
+ if (set_isempty(pids))
+ return 0;
+
+ n = now(CLOCK_MONOTONIC);
+ until = usec_add(n, timeout);
+ date_log_child = usec_add(n, 10u * USEC_PER_SEC);
+ if (date_log_child > until)
+ date_log_child = usec_add(n, timeout / 2u);
+
+ for (;;) {
+ struct timespec ts;
+ int k;
+ void *p;
+
+ /* First, let the kernel inform us about killed
+ * children. Most processes will probably be our
+ * children, but some are not (might be our
+ * grandchildren instead...). */
+ for (;;) {
+ pid_t pid;
+
+ pid = waitpid(-1, NULL, WNOHANG);
+ if (pid == 0)
+ break;
+ if (pid < 0) {
+ if (errno == ECHILD)
+ break;
+
+ return log_error_errno(errno, "waitpid() failed: %m");
+ }
+
+ (void) set_remove(pids, PID_TO_PTR(pid));
+ }
+
+ /* Now explicitly check who might be remaining, who
+ * might not be our child. */
+ SET_FOREACH(p, pids) {
+
+ /* kill(pid, 0) sends no signal, but it tells
+ * us whether the process still exists. */
+ if (kill(PTR_TO_PID(p), 0) == 0)
+ continue;
+
+ /* Only ESRCH means the process is gone, on any other error we keep waiting for it */
+ if (errno != ESRCH)
+ continue;
+
+ set_remove(pids, p);
+ }
+
+ if (set_isempty(pids))
+ return 0;
+
+ n = now(CLOCK_MONOTONIC);
+ if (date_log_child > 0 && n >= date_log_child) {
+ log_children_no_yet_killed(pids);
+ /* Log the children not yet killed only once */
+ date_log_child = 0;
+ }
+
+ if (n >= until)
+ return set_size(pids);
+
+ /* Sleep until the next deadline: the logging deadline if still pending, or the final timeout */
+ if (date_log_child > 0)
+ timespec_store(&ts, MIN(until - n, date_log_child - n));
+ else
+ timespec_store(&ts, until - n);
+
+ k = sigtimedwait(mask, NULL, &ts);
+ if (k != SIGCHLD) {
+
+ /* EAGAIN is how sigtimedwait() reports that the timeout elapsed, which is expected here */
+ if (k < 0 && errno != EAGAIN)
+ return log_error_errno(errno, "sigtimedwait() failed: %m");
+
+ if (k >= 0)
+ log_warning("sigtimedwait() returned unexpected signal.");
+ }
+ }
+}
+
+/* Sends the specified signal to all processes enumerated from /proc, except for those excluded by
+ * ignore_proc(). The PID of every process the signal was sent to successfully is added to 'pids', if
+ * that is non-NULL. If 'send_sighup' is true, SIGHUP is additionally sent to each processed entry that
+ * has a controlling TTY.
+ *
+ * Returns the number of processes to which the specified signal was sent, or a negative errno-style
+ * error if /proc could not be opened or enumerated. */
+static int killall(int sig, Set *pids, bool send_sighup) {
+        _cleanup_closedir_ DIR *dir = NULL;
+        int n_killed = 0, r;
+
+        r = proc_dir_open(&dir);
+        if (r < 0)
+                return log_warning_errno(r, "opendir(/proc) failed: %m");
+
+        for (;;) {
+                _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
+
+                r = proc_dir_read_pidref(dir, &pidref);
+                if (r < 0)
+                        return log_warning_errno(r, "Failed to enumerate /proc: %m");
+                if (r == 0)
+                        break;
+
+                /* Only warn about processes running off the root fs on the final SIGKILL round, and
+                 * only when we are not in the initrd. */
+                if (ignore_proc(&pidref, sig == SIGKILL && !in_initrd()))
+                        continue;
+
+                if (sig == SIGKILL) {
+                        _cleanup_free_ char *s = NULL;
+
+                        (void) pidref_get_comm(&pidref, &s);
+                        log_notice("Sending SIGKILL to PID "PID_FMT" (%s).", pidref.pid, strna(s));
+                }
+
+                r = pidref_kill(&pidref, sig);
+                if (r < 0) {
+                        /* pidref_kill() reports the error in its return value, hence log that rather
+                         * than whatever errno happens to contain at this point. A process that is
+                         * already gone (ESRCH) is not worth a message. */
+                        if (r != -ESRCH)
+                                log_warning_errno(r, "Could not kill " PID_FMT ", ignoring: %m", pidref.pid);
+                } else {
+                        n_killed++;
+                        if (pids) {
+                                r = set_put(pids, PID_TO_PTR(pidref.pid));
+                                if (r < 0)
+                                        (void) log_oom_warning();
+                        }
+                }
+
+                if (send_sighup) {
+                        /* Optionally, also send a SIGHUP signal, but only if the process has a controlling
+                         * tty. This is useful to allow handling of shells which ignore SIGTERM but react to
+                         * SIGHUP. We do not send this to processes that have no controlling TTY since we
+                         * don't want to trigger reloads of daemon processes. Also we make sure to only send
+                         * this after SIGTERM so that SIGTERM is always first in the queue. */
+
+                        if (get_ctty_devnr(pidref.pid, NULL) >= 0)
+                                /* it's OK if the process is gone, just ignore the result */
+                                (void) pidref_kill(&pidref, SIGHUP);
+                }
+        }
+
+        return n_killed;
+}
+
+/* Sends 'sig' (and optionally SIGHUP, see killall()) to all processes not excluded by ignore_proc(), and
+ * optionally waits up to 'timeout' for them to go away. SIGCHLD is blocked for the duration of the call
+ * and the previous signal mask is restored before returning. */
+int broadcast_signal(int sig, bool wait_for_exit, bool send_sighup, usec_t timeout) {
+ int n_children_left;
+ sigset_t mask, oldmask;
+ _cleanup_set_free_ Set *pids = NULL;
+
+ /* Send the specified signal to all remaining processes, if not excluded by ignore_proc().
+ * Return:
+ * - The number of processes still "alive" after the timeout (that should have been killed)
+ * if the function needs to wait for the end of the processes (wait_for_exit).
+ * - Otherwise, the number of processes to which the specified signal was sent */
+
+ /* The result is not checked: killall() only records PIDs if the set is non-NULL */
+ if (wait_for_exit)
+ pids = set_new(NULL);
+
+ /* Block SIGCHLD, so that wait_for_children() can pick it up synchronously via sigtimedwait() */
+ assert_se(sigemptyset(&mask) == 0);
+ assert_se(sigaddset(&mask, SIGCHLD) == 0);
+ assert_se(sigprocmask(SIG_BLOCK, &mask, &oldmask) == 0);
+
+ /* NOTE(review): presumably everything is stopped here so that the set of processes does not change
+ * while killall() enumerates /proc — confirm */
+ if (kill(-1, SIGSTOP) < 0 && errno != ESRCH)
+ log_warning_errno(errno, "kill(-1, SIGSTOP) failed: %m");
+
+ n_children_left = killall(sig, pids, send_sighup);
+
+ if (kill(-1, SIGCONT) < 0 && errno != ESRCH)
+ log_warning_errno(errno, "kill(-1, SIGCONT) failed: %m");
+
+ if (wait_for_exit && n_children_left > 0)
+ n_children_left = wait_for_children(pids, &mask, timeout);
+
+ assert_se(sigprocmask(SIG_SETMASK, &oldmask, NULL) == 0);
+
+ return n_children_left;
+}
diff --git a/src/shared/killall.h b/src/shared/killall.h
new file mode 100644
index 0000000..d8ef96f
--- /dev/null
+++ b/src/shared/killall.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "time-util.h"
+
+int broadcast_signal(int sig, bool wait_for_exit, bool send_sighup, usec_t timeout);
diff --git a/src/shared/label-util.c b/src/shared/label-util.c
new file mode 100644
index 0000000..308fbff
--- /dev/null
+++ b/src/shared/label-util.c
@@ -0,0 +1,141 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "btrfs-util.h"
+#include "fs-util.h"
+#include "label.h"
+#include "label-util.h"
+#include "macro.h"
+#include "selinux-util.h"
+#include "smack-util.h"
+
+/* Applies both the SELinux and the SMACK label fix-up to an inode.
+ *
+ * If both atfd and inode_path are specified, the path is taken relative to atfd, which must be an fd to
+ * a directory. If only atfd is specified (and inode_path is NULL), we operate on the inode atfd refers
+ * to. If atfd is AT_FDCWD we operate on the inode the path refers to.
+ *
+ * Returns -EBADF for a negative atfd other than AT_FDCWD, and -EINVAL if neither a usable atfd nor an
+ * inode path is given. Otherwise both fix-ups are always attempted; the SELinux error takes precedence
+ * over the SMACK one, and 0 is returned if both succeed. */
+int label_fix_full(
+                int atfd,
+                const char *inode_path, /* path of inode to apply label to */
+                const char *label_path, /* path to use as database lookup key in label database (typically same as inode_path, but not always) */
+                LabelFixFlags flags) {
+
+        int selinux_result, smack_result;
+
+        if (atfd < 0) {
+                if (atfd != AT_FDCWD)
+                        return -EBADF;
+
+                /* We need at least one of atfd and an inode path */
+                if (!inode_path)
+                        return -EINVAL;
+        }
+
+        selinux_result = mac_selinux_fix_full(atfd, inode_path, label_path, flags);
+        smack_result = mac_smack_fix_full(atfd, inode_path, label_path, flags);
+
+        if (selinux_result < 0)
+                return selinux_result;
+
+        return smack_result < 0 ? smack_result : 0;
+}
+
+/* Creates the symlink 'new_path' pointing to 'old_path', with the SELinux file creation context prepared
+ * for it beforehand and cleared again afterwards, and then applies the SMACK fix-up to the new symlink.
+ * Returns a negative errno-style error if any step fails, otherwise the result of mac_smack_fix(). */
+int symlink_label(const char *old_path, const char *new_path) {
+        int r;
+
+        assert(old_path);
+        assert(new_path);
+
+        r = mac_selinux_create_file_prepare(new_path, S_IFLNK);
+        if (r < 0)
+                return r;
+
+        /* Capture the result first, then clear the creation context regardless of the outcome */
+        r = RET_NERRNO(symlink(old_path, new_path));
+        mac_selinux_create_file_clear();
+
+        if (r >= 0)
+                r = mac_smack_fix(new_path, 0);
+
+        return r;
+}
+
+/* Like symlinkat_atomic_full() relative to AT_FDCWD, but with the SELinux file creation context prepared
+ * for 'to' beforehand and cleared again afterwards, followed by the SMACK fix-up of the new symlink.
+ * Returns a negative errno-style error if any step fails, otherwise the result of mac_smack_fix(). */
+int symlink_atomic_full_label(const char *from, const char *to, bool make_relative) {
+        int r;
+
+        assert(from);
+        assert(to);
+
+        r = mac_selinux_create_file_prepare(to, S_IFLNK);
+        if (r < 0)
+                return r;
+
+        /* Clear the creation context regardless of whether creating the symlink worked */
+        r = symlinkat_atomic_full(from, AT_FDCWD, to, make_relative);
+        mac_selinux_create_file_clear();
+
+        if (r >= 0)
+                r = mac_smack_fix(to, 0);
+
+        return r;
+}
+
+/* Like mknod(), but with the SELinux file creation context prepared for 'pathname' beforehand and cleared
+ * again afterwards, followed by the SMACK fix-up of the new node. Returns a negative errno-style error
+ * if any step fails, otherwise the result of mac_smack_fix(). */
+int mknod_label(const char *pathname, mode_t mode, dev_t dev) {
+        int r;
+
+        assert(pathname);
+
+        r = mac_selinux_create_file_prepare(pathname, mode);
+        if (r < 0)
+                return r;
+
+        /* Capture the result first, then clear the creation context regardless of the outcome */
+        r = RET_NERRNO(mknod(pathname, mode, dev));
+        mac_selinux_create_file_clear();
+
+        if (r >= 0)
+                r = mac_smack_fix(pathname, 0);
+
+        return r;
+}
+
+/* Like btrfs_subvol_make() relative to AT_FDCWD, but with the SELinux file creation context prepared for
+ * a directory at 'path' beforehand and cleared again afterwards, followed by the SMACK fix-up. Returns a
+ * negative errno-style error if any step fails, otherwise the result of mac_smack_fix(). */
+int btrfs_subvol_make_label(const char *path) {
+        int r;
+
+        assert(path);
+
+        r = mac_selinux_create_file_prepare(path, S_IFDIR);
+        if (r < 0)
+                return r;
+
+        /* Clear the creation context regardless of whether creating the subvolume worked */
+        r = btrfs_subvol_make(AT_FDCWD, path);
+        mac_selinux_create_file_clear();
+
+        if (r >= 0)
+                r = mac_smack_fix(path, 0);
+
+        return r;
+}
+
+/* Common implementation of mac_init() and mac_init_lazy(): initializes SELinux support (lazily if 'lazy'
+ * is true) and then SMACK support. Asserts that SELinux and SMACK are not both in use. Returns the
+ * SELinux error if that initialization fails, otherwise the result of mac_smack_init(). */
+static int init_internal(bool lazy) {
+        int r;
+
+        assert(!(mac_selinux_use() && mac_smack_use()));
+
+        r = lazy ? mac_selinux_init_lazy() : mac_selinux_init();
+        if (r < 0)
+                return r;
+
+        return mac_smack_init();
+}
+
+int mac_init_lazy(void) {
+ return init_internal(/* lazy=*/ true);
+}
+
+int mac_init(void) {
+ return init_internal(/* lazy=*/ false);
+}
diff --git a/src/shared/label-util.h b/src/shared/label-util.h
new file mode 100644
index 0000000..7fb98c7
--- /dev/null
+++ b/src/shared/label-util.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <fcntl.h>
+#include <stdbool.h>
+#include <sys/types.h>
+
+typedef enum LabelFixFlags {
+ LABEL_IGNORE_ENOENT = 1 << 0,
+ LABEL_IGNORE_EROFS = 1 << 1,
+} LabelFixFlags;
+
+int label_fix_full(int atfd, const char *inode_path, const char *label_path, LabelFixFlags flags);
+
+static inline int label_fix(const char *path, LabelFixFlags flags) {
+ return label_fix_full(AT_FDCWD, path, path, flags);
+}
+
+int symlink_label(const char *old_path, const char *new_path);
+int symlink_atomic_full_label(const char *from, const char *to, bool make_relative);
+static inline int symlink_atomic_label(const char *from, const char *to) {
+ return symlink_atomic_full_label(from, to, false);
+}
+int mknod_label(const char *pathname, mode_t mode, dev_t dev);
+
+int btrfs_subvol_make_label(const char *path);
+
+int mac_init(void);
+int mac_init_lazy(void);
diff --git a/src/shared/libcrypt-util.c b/src/shared/libcrypt-util.c
new file mode 100644
index 0000000..81e6f17
--- /dev/null
+++ b/src/shared/libcrypt-util.c
@@ -0,0 +1,211 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#if HAVE_CRYPT_H
+/* libxcrypt is a replacement for glibc's libcrypt, and libcrypt might be
+ * removed from glibc at some point. As part of the removal, defines for
+ * crypt(3) are dropped from unistd.h, and we must include crypt.h instead.
+ *
+ * Newer versions of glibc (v2.0+) already ship crypt.h with a definition
+ * of crypt(3) as well, so we simply include it if it is present. MariaDB,
+ * MySQL, PostgreSQL, Perl and some other wide-spread packages do it the
+ * same way since ages without any problems.
+ */
+# include <crypt.h>
+#else
+# include <unistd.h>
+#endif
+
+#include <errno.h>
+#include <stdlib.h>
+
+#include "alloc-util.h"
+#include "errno-util.h"
+#include "libcrypt-util.h"
+#include "log.h"
+#include "macro.h"
+#include "memory-util.h"
+#include "missing_stdlib.h"
+#include "random-util.h"
+#include "string-util.h"
+#include "strv.h"
+
+/* Generates a salt/setting string suitable for passing to crypt_ra()/crypt_r() and returns it as a newly
+ * allocated string in *ret. With crypt_gensalt_ra() available, the hash prefix is taken from
+ * $SYSTEMD_CRYPT_PREFIX if set, else from crypt_preferred_method() if available, else "$6$". Without
+ * it, a "$6$" salt is generated from 16 random bytes. Returns 0 on success, negative errno-style error
+ * on failure. */
+int make_salt(char **ret) {
+
+#if HAVE_CRYPT_GENSALT_RA
+ const char *e;
+ char *salt;
+
+ /* If we have crypt_gensalt_ra() we default to the "preferred method" (i.e. usually yescrypt).
+ * crypt_gensalt_ra() is usually provided by libxcrypt. */
+
+ e = secure_getenv("SYSTEMD_CRYPT_PREFIX");
+ if (!e)
+#if HAVE_CRYPT_PREFERRED_METHOD
+ e = crypt_preferred_method();
+#else
+ e = "$6$";
+#endif
+
+ log_debug("Generating salt for hash prefix: %s", e);
+
+ salt = crypt_gensalt_ra(e, 0, NULL, 0);
+ if (!salt)
+ return -errno;
+
+ *ret = salt;
+ return 0;
+#else
+ /* If crypt_gensalt_ra() is not available, we use SHA512 and generate the salt on our own. */
+
+ static const char table[] =
+ "abcdefghijklmnopqrstuvwxyz"
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ "0123456789"
+ "./";
+
+ uint8_t raw[16];
+ char *salt, *j;
+ size_t i;
+ int r;
+
+ /* This is a bit like crypt_gensalt_ra(), but doesn't require libcrypt, and doesn't do anything but
+ * SHA512, i.e. is legacy-free and minimizes our deps. */
+
+ /* 64 characters plus the trailing NUL: each random byte masked with 63 below indexes one of them */
+ assert_cc(sizeof(table) == 64U + 1U);
+
+ log_debug("Generating fallback salt for hash prefix: $6$");
+
+ /* Insist on the best randomness by using crypto_random_bytes(), this is about keeping passwords secret after all. */
+ r = crypto_random_bytes(raw, sizeof(raw));
+ if (r < 0)
+ return r;
+
+ /* "$6$" prefix + one character per random byte + "$" suffix + NUL */
+ salt = new(char, 3+sizeof(raw)+1+1);
+ if (!salt)
+ return -ENOMEM;
+
+ /* We only bother with SHA512 hashed passwords, the rest is legacy, and we don't do legacy. */
+ j = stpcpy(salt, "$6$");
+ for (i = 0; i < sizeof(raw); i++)
+ j[i] = table[raw[i] & 63];
+ j[i++] = '$';
+ j[i] = 0;
+
+ *ret = salt;
+ return 0;
+#endif
+}
+
+#if HAVE_CRYPT_RA
+# define CRYPT_RA_NAME "crypt_ra"
+#else
+# define CRYPT_RA_NAME "crypt_r"
+
+/* Provide a poor man's fallback that uses a fixed size buffer. */
+
+/* Fallback for crypt_ra() implemented on top of crypt_r(). On first use (i.e. if *data is NULL) a zeroed
+ * struct crypt_data is allocated and stored in *data, with its size stored in *size; later calls reuse
+ * it. Returns the hashed string on success and NULL on failure. If the allocation fails errno is set to
+ * ENOMEM. */
+static char* systemd_crypt_ra(const char *phrase, const char *setting, void **data, int *size) {
+        assert(data);
+        assert(size);
+
+        /* We allocate the buffer because crypt(3) says: struct crypt_data may be quite large (32kB in this
+         * implementation of libcrypt; over 128kB in some other implementations). This is large enough that
+         * it may be unwise to allocate it on the stack. */
+
+        if (!*data) {
+                *data = new0(struct crypt_data, 1);
+                if (!*data) {
+                        /* errno carries positive error codes, and test_password_one() compares it
+                         * against ENOMEM to tell OOM apart from an unusable hash. */
+                        errno = ENOMEM;
+                        return NULL;
+                }
+
+                *size = (int) (sizeof(struct crypt_data));
+        }
+
+        char *t = crypt_r(phrase, setting, *data);
+        if (!t)
+                return NULL;
+
+        /* crypt_r may return a pointer to an invalid hashed password on error. Our callers expect NULL on
+         * error, so let's just return that. */
+        if (t[0] == '*')
+                return NULL;
+
+        return t;
+}
+
+#define crypt_ra systemd_crypt_ra
+
+#endif
+
+/* Hashes 'password' with a salt freshly generated by make_salt() and returns a newly allocated copy of
+ * the resulting hash in *ret.
+ *
+ * 'cd_data' and 'cd_size' must either both be specified or both be NULL. If specified they are passed
+ * on to crypt_ra() as its data buffer and size. If NULL a local buffer is used, which is erased and
+ * freed on return.
+ *
+ * Returns 0 on success, negative errno-style error on failure. */
+int hash_password_full(const char *password, void **cd_data, int *cd_size, char **ret) {
+ _cleanup_free_ char *salt = NULL;
+ _cleanup_(erase_and_freep) void *_cd_data = NULL;
+ char *p;
+ int r, _cd_size = 0;
+
+ assert(!!cd_data == !!cd_size);
+
+ r = make_salt(&salt);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to generate salt: %m");
+
+ /* Reset errno, so that we can detect below whether the call set it on failure */
+ errno = 0;
+ p = crypt_ra(password, salt, cd_data ?: &_cd_data, cd_size ?: &_cd_size);
+ if (!p)
+ return log_debug_errno(errno_or_else(SYNTHETIC_ERRNO(EINVAL)),
+ CRYPT_RA_NAME "() failed: %m");
+
+ /* Hand out a copy that the caller owns, independent of the buffer passed to crypt_ra() above */
+ p = strdup(p);
+ if (!p)
+ return -ENOMEM;
+
+ *ret = p;
+ return 0;
+}
+
+/* Returns false if the specified string is certainly not a hashed UNIX password, true otherwise.
+ * crypt(5) lists various hashing methods; only strings documented to have a different meaning are
+ * rejected here, i.e. NULL, "x" and "*", the latter two also when preceded by a locking prefix.
+ *
+ * Locked passwords, i.e. strings starting with "!", are accepted, including just "!", i.e. the locked
+ * empty password. See also fc58c0c7bf7e4f525b916e3e5be0de2307fef04e. */
+bool looks_like_hashed_password(const char *s) {
+        const char *unlocked;
+
+        if (!s)
+                return false;
+
+        /* Skip (possibly duplicated) locking prefix */
+        unlocked = s + strspn(s, "!");
+
+        return !STR_IN_SET(unlocked, "x", "*");
+}
+
+/* Checks whether 'password' hashes to 'hashed_password', using 'hashed_password' itself as the setting
+ * for crypt_ra(). Returns > 0 on match, 0 on mismatch or if the hash could not be computed, and -ENOMEM
+ * if crypt_ra() failed with errno set to ENOMEM. The scratch buffer is erased and freed on return. */
+int test_password_one(const char *hashed_password, const char *password) {
+        _cleanup_(erase_and_freep) void *cd_data = NULL;
+        const char *computed;
+        int cd_size = 0;
+
+        errno = 0;
+        computed = crypt_ra(password, hashed_password, &cd_data, &cd_size);
+        if (computed)
+                return streq(computed, hashed_password);
+
+        /* An unknown or unavailable hashing method or a too short string simply means "no match"; only
+         * memory exhaustion is propagated as an error. */
+        return errno == ENOMEM ? -ENOMEM : 0;
+}
+
+/* Checks 'password' against each entry of the string list 'hashed_password' in order, via
+ * test_password_one(). Returns true on the first match, false if no entry matches, and stops early with
+ * a negative errno-style error if checking an entry fails. */
+int test_password_many(char **hashed_password, const char *password) {
+
+        STRV_FOREACH(hpw, hashed_password) {
+                int r = test_password_one(*hpw, password);
+
+                /* Anything but a clean mismatch ends the search: errors are propagated as is, a match
+                 * is normalized to true */
+                if (r != 0)
+                        return r < 0 ? r : true;
+        }
+
+        return false;
+}
diff --git a/src/shared/libcrypt-util.h b/src/shared/libcrypt-util.h
new file mode 100644
index 0000000..5b9b945
--- /dev/null
+++ b/src/shared/libcrypt-util.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+int make_salt(char **ret);
+int hash_password_full(const char *password, void **cd_data, int *cd_size, char **ret);
+static inline int hash_password(const char *password, char **ret) {
+ return hash_password_full(password, NULL, NULL, ret);
+}
+bool looks_like_hashed_password(const char *s);
+int test_password_one(const char *hashed_password, const char *password);
+int test_password_many(char **hashed_password, const char *password);
diff --git a/src/shared/libfido2-util.c b/src/shared/libfido2-util.c
new file mode 100644
index 0000000..1cc3afe
--- /dev/null
+++ b/src/shared/libfido2-util.c
@@ -0,0 +1,1296 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "libfido2-util.h"
+
+#if HAVE_LIBFIDO2
+#include "alloc-util.h"
+#include "ask-password-api.h"
+#include "dlfcn-util.h"
+#include "format-table.h"
+#include "glyph-util.h"
+#include "log.h"
+#include "memory-util.h"
+#include "random-util.h"
+#include "strv.h"
+#include "unistd.h"
+
+static void *libfido2_dl = NULL;
+
+int (*sym_fido_assert_allow_cred)(fido_assert_t *, const unsigned char *, size_t) = NULL;
+void (*sym_fido_assert_free)(fido_assert_t **) = NULL;
+size_t (*sym_fido_assert_hmac_secret_len)(const fido_assert_t *, size_t) = NULL;
+const unsigned char* (*sym_fido_assert_hmac_secret_ptr)(const fido_assert_t *, size_t) = NULL;
+fido_assert_t* (*sym_fido_assert_new)(void) = NULL;
+int (*sym_fido_assert_set_clientdata_hash)(fido_assert_t *, const unsigned char *, size_t) = NULL;
+int (*sym_fido_assert_set_extensions)(fido_assert_t *, int) = NULL;
+int (*sym_fido_assert_set_hmac_salt)(fido_assert_t *, const unsigned char *, size_t) = NULL;
+int (*sym_fido_assert_set_rp)(fido_assert_t *, const char *) = NULL;
+int (*sym_fido_assert_set_up)(fido_assert_t *, fido_opt_t) = NULL;
+int (*sym_fido_assert_set_uv)(fido_assert_t *, fido_opt_t) = NULL;
+size_t (*sym_fido_cbor_info_extensions_len)(const fido_cbor_info_t *) = NULL;
+char **(*sym_fido_cbor_info_extensions_ptr)(const fido_cbor_info_t *) = NULL;
+void (*sym_fido_cbor_info_free)(fido_cbor_info_t **) = NULL;
+fido_cbor_info_t* (*sym_fido_cbor_info_new)(void) = NULL;
+size_t (*sym_fido_cbor_info_options_len)(const fido_cbor_info_t *) = NULL;
+char** (*sym_fido_cbor_info_options_name_ptr)(const fido_cbor_info_t *) = NULL;
+const bool* (*sym_fido_cbor_info_options_value_ptr)(const fido_cbor_info_t *) = NULL;
+void (*sym_fido_cred_free)(fido_cred_t **) = NULL;
+size_t (*sym_fido_cred_id_len)(const fido_cred_t *) = NULL;
+const unsigned char* (*sym_fido_cred_id_ptr)(const fido_cred_t *) = NULL;
+fido_cred_t* (*sym_fido_cred_new)(void) = NULL;
+int (*sym_fido_cred_set_clientdata_hash)(fido_cred_t *, const unsigned char *, size_t) = NULL;
+int (*sym_fido_cred_set_extensions)(fido_cred_t *, int) = NULL;
+int (*sym_fido_cred_set_rk)(fido_cred_t *, fido_opt_t) = NULL;
+int (*sym_fido_cred_set_rp)(fido_cred_t *, const char *, const char *) = NULL;
+int (*sym_fido_cred_set_type)(fido_cred_t *, int) = NULL;
+int (*sym_fido_cred_set_user)(fido_cred_t *, const unsigned char *, size_t, const char *, const char *, const char *) = NULL;
+int (*sym_fido_cred_set_uv)(fido_cred_t *, fido_opt_t) = NULL;
+void (*sym_fido_dev_free)(fido_dev_t **) = NULL;
+int (*sym_fido_dev_get_assert)(fido_dev_t *, fido_assert_t *, const char *) = NULL;
+int (*sym_fido_dev_get_cbor_info)(fido_dev_t *, fido_cbor_info_t *) = NULL;
+void (*sym_fido_dev_info_free)(fido_dev_info_t **, size_t) = NULL;
+int (*sym_fido_dev_info_manifest)(fido_dev_info_t *, size_t, size_t *) = NULL;
+const char* (*sym_fido_dev_info_manufacturer_string)(const fido_dev_info_t *) = NULL;
+const char* (*sym_fido_dev_info_product_string)(const fido_dev_info_t *) = NULL;
+fido_dev_info_t* (*sym_fido_dev_info_new)(size_t) = NULL;
+const char* (*sym_fido_dev_info_path)(const fido_dev_info_t *) = NULL;
+const fido_dev_info_t* (*sym_fido_dev_info_ptr)(const fido_dev_info_t *, size_t) = NULL;
+bool (*sym_fido_dev_is_fido2)(const fido_dev_t *) = NULL;
+int (*sym_fido_dev_make_cred)(fido_dev_t *, fido_cred_t *, const char *) = NULL;
+fido_dev_t* (*sym_fido_dev_new)(void) = NULL;
+int (*sym_fido_dev_open)(fido_dev_t *, const char *) = NULL;
+int (*sym_fido_dev_close)(fido_dev_t *) = NULL;
+void (*sym_fido_init)(int) = NULL;
+void (*sym_fido_set_log_handler)(fido_log_handler_t *) = NULL;
+const char* (*sym_fido_strerr)(int) = NULL;
+
+static void fido_log_propagate_handler(const char *s) {
+ log_debug("libfido2: %s", strempty(s));
+}
+
+int dlopen_libfido2(void) {
+ int r;
+
+ r = dlopen_many_sym_or_warn(
+ &libfido2_dl, "libfido2.so.1", LOG_DEBUG,
+ DLSYM_ARG(fido_assert_allow_cred),
+ DLSYM_ARG(fido_assert_free),
+ DLSYM_ARG(fido_assert_hmac_secret_len),
+ DLSYM_ARG(fido_assert_hmac_secret_ptr),
+ DLSYM_ARG(fido_assert_new),
+ DLSYM_ARG(fido_assert_set_clientdata_hash),
+ DLSYM_ARG(fido_assert_set_extensions),
+ DLSYM_ARG(fido_assert_set_hmac_salt),
+ DLSYM_ARG(fido_assert_set_rp),
+ DLSYM_ARG(fido_assert_set_up),
+ DLSYM_ARG(fido_assert_set_uv),
+ DLSYM_ARG(fido_cbor_info_extensions_len),
+ DLSYM_ARG(fido_cbor_info_extensions_ptr),
+ DLSYM_ARG(fido_cbor_info_free),
+ DLSYM_ARG(fido_cbor_info_new),
+ DLSYM_ARG(fido_cbor_info_options_len),
+ DLSYM_ARG(fido_cbor_info_options_name_ptr),
+ DLSYM_ARG(fido_cbor_info_options_value_ptr),
+ DLSYM_ARG(fido_cred_free),
+ DLSYM_ARG(fido_cred_id_len),
+ DLSYM_ARG(fido_cred_id_ptr),
+ DLSYM_ARG(fido_cred_new),
+ DLSYM_ARG(fido_cred_set_clientdata_hash),
+ DLSYM_ARG(fido_cred_set_extensions),
+ DLSYM_ARG(fido_cred_set_rk),
+ DLSYM_ARG(fido_cred_set_rp),
+ DLSYM_ARG(fido_cred_set_type),
+ DLSYM_ARG(fido_cred_set_user),
+ DLSYM_ARG(fido_cred_set_uv),
+ DLSYM_ARG(fido_dev_free),
+ DLSYM_ARG(fido_dev_get_assert),
+ DLSYM_ARG(fido_dev_get_cbor_info),
+ DLSYM_ARG(fido_dev_info_free),
+ DLSYM_ARG(fido_dev_info_manifest),
+ DLSYM_ARG(fido_dev_info_manufacturer_string),
+ DLSYM_ARG(fido_dev_info_new),
+ DLSYM_ARG(fido_dev_info_path),
+ DLSYM_ARG(fido_dev_info_product_string),
+ DLSYM_ARG(fido_dev_info_ptr),
+ DLSYM_ARG(fido_dev_is_fido2),
+ DLSYM_ARG(fido_dev_make_cred),
+ DLSYM_ARG(fido_dev_new),
+ DLSYM_ARG(fido_dev_open),
+ DLSYM_ARG(fido_dev_close),
+ DLSYM_ARG(fido_init),
+ DLSYM_ARG(fido_set_log_handler),
+ DLSYM_ARG(fido_strerr));
+ if (r < 0)
+ return r;
+
+ sym_fido_init(FIDO_DEBUG);
+ sym_fido_set_log_handler(fido_log_propagate_handler);
+
+ return 0;
+}
+
+/* Verifies that the opened device 'd' is a FIDO2 device implementing the "hmac-secret" extension, and
+ * reports the values of its "rk", "clientPin", "up" and "uv" options via the optional ret_* parameters.
+ * 'path' is used in log messages only.
+ *
+ * Returns 0 on success, -ENODEV (logged at 'log_level') if the device is not FIDO2 or lacks the
+ * extension, -EIO if the device info cannot be queried, or the result of log_oom() on allocation
+ * failure. */
+static int verify_features(
+ fido_dev_t *d,
+ const char *path,
+ int log_level, /* the log level to use when device is not FIDO2 with hmac-secret */
+ bool *ret_has_rk,
+ bool *ret_has_client_pin,
+ bool *ret_has_up,
+ bool *ret_has_uv) {
+
+ _cleanup_(fido_cbor_info_free_wrapper) fido_cbor_info_t *di = NULL;
+ bool found_extension = false;
+ char **e, **o;
+ const bool *b;
+ bool has_rk = false, has_client_pin = false, has_up = true, has_uv = false; /* Defaults are per table in 5.4 in FIDO2 spec */
+ size_t n;
+ int r;
+
+ assert(d);
+ assert(path);
+
+ if (!sym_fido_dev_is_fido2(d))
+ return log_full_errno(log_level, SYNTHETIC_ERRNO(ENODEV),
+ "Specified device %s is not a FIDO2 device.", path);
+
+ di = sym_fido_cbor_info_new();
+ if (!di)
+ return log_oom();
+
+ r = sym_fido_dev_get_cbor_info(d, di);
+ if (r != FIDO_OK)
+ return log_error_errno(SYNTHETIC_ERRNO(EIO),
+ "Failed to get CBOR device info for %s: %s", path, sym_fido_strerr(r));
+
+ /* Walk the list of extensions the device announces, looking for the one we require */
+ e = sym_fido_cbor_info_extensions_ptr(di);
+ n = sym_fido_cbor_info_extensions_len(di);
+ for (size_t i = 0; i < n; i++) {
+ log_debug("FIDO2 device implements extension: %s", e[i]);
+ if (streq(e[i], "hmac-secret"))
+ found_extension = true;
+ }
+
+ /* Options come as two parallel arrays: names in o[], values in b[]. Values the device reports
+ * override the defaults set above. */
+ o = sym_fido_cbor_info_options_name_ptr(di);
+ b = sym_fido_cbor_info_options_value_ptr(di);
+ n = sym_fido_cbor_info_options_len(di);
+ for (size_t i = 0; i < n; i++) {
+ log_debug("FIDO2 device implements option %s: %s", o[i], yes_no(b[i]));
+ if (streq(o[i], "rk"))
+ has_rk = b[i];
+ if (streq(o[i], "clientPin"))
+ has_client_pin = b[i];
+ if (streq(o[i], "up"))
+ has_up = b[i];
+ if (streq(o[i], "uv"))
+ has_uv = b[i];
+ }
+
+ if (!found_extension)
+ return log_full_errno(log_level,
+ SYNTHETIC_ERRNO(ENODEV),
+ "Specified device %s is a FIDO2 device, but does not support the required HMAC-SECRET extension.", path);
+
+ log_debug("Has rk ('Resident Key') support: %s\n"
+ "Has clientPin support: %s\n"
+ "Has up ('User Presence') support: %s\n"
+ "Has uv ('User Verification') support: %s\n",
+ yes_no(has_rk),
+ yes_no(has_client_pin),
+ yes_no(has_up),
+ yes_no(has_uv));
+
+ if (ret_has_rk)
+ *ret_has_rk = has_rk;
+ if (ret_has_client_pin)
+ *ret_has_client_pin = has_client_pin;
+ if (ret_has_up)
+ *ret_has_up = has_up;
+ if (ret_has_uv)
+ *ret_has_uv = has_uv;
+
+ return 0;
+}
+
+/* Configures the assertion object 'a' with the relying party ID 'rp_id', an all-zero 32 byte client data
+ * hash, and the credential ID 'cid' (of 'cid_size' bytes) as allowed credential. Returns 0 on success;
+ * if any of the libfido2 calls fails an error is logged and -EIO is returned. */
+static int fido2_assert_set_basic_properties(
+ fido_assert_t *a,
+ const char *rp_id,
+ const void *cid,
+ size_t cid_size) {
+ int r;
+
+ assert(a);
+ assert(rp_id);
+ assert(cid);
+ assert(cid_size > 0);
+
+ r = sym_fido_assert_set_rp(a, rp_id);
+ if (r != FIDO_OK)
+ return log_error_errno(SYNTHETIC_ERRNO(EIO),
+ "Failed to set FIDO2 assertion ID: %s", sym_fido_strerr(r));
+
+ /* The client data hash passed here is a compound literal of 32 zero bytes */
+ r = sym_fido_assert_set_clientdata_hash(a, (const unsigned char[32]) {}, 32);
+ if (r != FIDO_OK)
+ return log_error_errno(SYNTHETIC_ERRNO(EIO),
+ "Failed to set FIDO2 assertion client data hash: %s", sym_fido_strerr(r));
+
+ r = sym_fido_assert_allow_cred(a, cid, cid_size);
+ if (r != FIDO_OK)
+ return log_error_errno(SYNTHETIC_ERRNO(EIO),
+ "Failed to add FIDO2 assertion credential ID: %s", sym_fido_strerr(r));
+
+ return 0;
+}
+
+/* Translates the libfido2 result code 'r' of an assertion request into our own error codes. Returns 0
+ * for FIDO_OK. For every other code an error is logged and a distinct negative errno-style value is
+ * returned, so that callers can tell the failure reasons apart; codes without a dedicated case are
+ * mapped to -EIO. */
+static int fido2_common_assert_error_handle(int r) {
+ switch (r) {
+ case FIDO_OK:
+ return 0;
+ case FIDO_ERR_NO_CREDENTIALS:
+ return log_error_errno(SYNTHETIC_ERRNO(EBADSLT),
+ "Wrong security token; needed credentials not present on token.");
+ case FIDO_ERR_PIN_REQUIRED:
+ return log_error_errno(SYNTHETIC_ERRNO(ENOANO),
+ "Security token requires PIN.");
+ case FIDO_ERR_PIN_AUTH_BLOCKED:
+ return log_error_errno(SYNTHETIC_ERRNO(EOWNERDEAD),
+ "PIN of security token is blocked, please remove/reinsert token.");
+ /* Only handled if the libfido2 headers we are built against define this code */
+#ifdef FIDO_ERR_UV_BLOCKED
+ case FIDO_ERR_UV_BLOCKED:
+ return log_error_errno(SYNTHETIC_ERRNO(EOWNERDEAD),
+ "Verification of security token is blocked, please remove/reinsert token.");
+#endif
+ case FIDO_ERR_PIN_INVALID:
+ return log_error_errno(SYNTHETIC_ERRNO(ENOLCK),
+ "PIN of security token incorrect.");
+ case FIDO_ERR_UP_REQUIRED:
+ return log_error_errno(SYNTHETIC_ERRNO(EMEDIUMTYPE),
+ "User presence required.");
+ case FIDO_ERR_ACTION_TIMEOUT:
+ return log_error_errno(SYNTHETIC_ERRNO(ENOSTR),
+ "Token action timeout. (User didn't interact with token quickly enough.)");
+ default:
+ return log_error_errno(SYNTHETIC_ERRNO(EIO),
+ "Failed to ask token for assertion: %s", sym_fido_strerr(r));
+ }
+}
+
+/* Pre-flight check: asks the token at 'path' whether it holds credential 'cid' for relying party
+ * 'rp_id', without attempting the real unlock.
+ *
+ * Returns > 0 if the credential is there (or if that cannot be determined up front because user
+ * verification would be needed), 0 if it is not there or the device is no FIDO2/hmac-secret token,
+ * and a negative errno-style value on any other failure. */
+static int fido2_is_cred_in_specific_token(
+                const char *path,
+                const char *rp_id,
+                const void *cid,
+                size_t cid_size,
+                Fido2EnrollFlags flags) {
+
+        _cleanup_(fido_dev_free_wrapper) fido_dev_t *dev = NULL;
+        _cleanup_(fido_assert_free_wrapper) fido_assert_t *assertion = NULL;
+        bool supports_up = false, supports_uv = false;
+        int r;
+
+        assert(path);
+        assert(rp_id);
+        assert(cid);
+        assert(cid_size);
+
+        dev = sym_fido_dev_new();
+        if (!dev)
+                return log_oom();
+
+        r = sym_fido_dev_open(dev, path);
+        if (r != FIDO_OK)
+                return log_error_errno(SYNTHETIC_ERRNO(EIO),
+                                       "Failed to open FIDO2 device %s: %s", path, sym_fido_strerr(r));
+
+        r = verify_features(dev, path, LOG_ERR, NULL, NULL, &supports_up, &supports_uv);
+        if (r == -ENODEV) {
+                /* Not a FIDO2 device or lacking HMAC-SECRET extension */
+                log_debug_errno(r, "%s is not a FIDO2 device, or it lacks the hmac-secret extension", path);
+                return false;
+        }
+        if (r < 0)
+                return r;
+
+        assertion = sym_fido_assert_new();
+        if (!assertion)
+                return log_oom();
+
+        r = fido2_assert_set_basic_properties(assertion, rp_id, cid, cid_size);
+        if (r < 0)
+                return r;
+
+        /* FIDO2 devices may not support pre-flight requests with UV, at least not without user
+         * interaction [1]. Hence report "present" right away and let the caller go ahead with trying
+         * the unlock directly.
+         * Reference:
+         * 1: https://fidoalliance.org/specs/fido-v2.1-ps-20210615/fido-client-to-authenticator-protocol-v2.1-ps-20210615.html#sctn-getAssert-authnr-alg
+         *    See section 7.4 */
+        if (supports_uv && FLAGS_SET(flags, FIDO2ENROLL_UV)) {
+                log_debug("Pre-flight requests with UV are unsupported, device: %s", path);
+                return true;
+        }
+
+        /* According to the CTAP 2.1 specification, a pre-flight request sets the "up" option to false,
+         * optionally with pinUvAuthParam in the assertion [1]. An authenticator that doesn't support
+         * user presence may however answer CTAP2_ERR_UNSUPPORTED_OPTION as soon as the "up" option is
+         * present at all [2], so we simply omit the option in that case.
+         * Reference:
+         * 1: https://fidoalliance.org/specs/fido-v2.1-ps-20210615/fido-client-to-authenticator-protocol-v2.1-ps-20210615.html#pre-flight
+         * 2: https://fidoalliance.org/specs/fido-v2.0-ps-20190130/fido-client-to-authenticator-protocol-v2.0-ps-20190130.html#authenticatorGetAssertion (in step 5)
+         */
+        r = sym_fido_assert_set_up(assertion, supports_up ? FIDO_OPT_FALSE : FIDO_OPT_OMIT);
+        if (r != FIDO_OK)
+                return log_error_errno(SYNTHETIC_ERRNO(EIO),
+                                       "Failed to set assertion user presence: %s", sym_fido_strerr(r));
+
+        r = sym_fido_dev_get_assert(dev, assertion, NULL);
+        if (r == FIDO_OK)
+                return true;
+        if (r == FIDO_ERR_NO_CREDENTIALS)
+                return false;
+
+        /* Everything else is a real error, let the common handler log and translate it. */
+        return fido2_common_assert_error_handle(r);
+}
+
+/* Asks the token at 'path' to compute the HMAC secret for 'salt' using credential 'cid' of relying
+ * party 'rp_id'.
+ *
+ * pins      → candidate PINs, tried in order while the token answers "PIN invalid"; may be empty/NULL
+ * required  → which of PIN/UP/UV must be used, plus the *_IF_NEEDED and UV_OMIT compat flags
+ * ret_hmac / ret_hmac_size → on success receive a newly allocated copy of the secret and its size
+ *
+ * Returns 0 on success. On failure a negative errno-style value is returned after logging: EHWPOISON
+ * if the token lacks a feature that 'required' demands, EINVAL if the token's requests contradict
+ * what it advertises or what was already supplied, otherwise whatever
+ * fido2_common_assert_error_handle() maps the final libfido2 result to. */
+static int fido2_use_hmac_hash_specific_token(
+                const char *path,
+                const char *rp_id,
+                const void *salt,
+                size_t salt_size,
+                const void *cid,
+                size_t cid_size,
+                char **pins,
+                Fido2EnrollFlags required, /* client pin/user presence required */
+                void **ret_hmac,
+                size_t *ret_hmac_size) {
+
+        _cleanup_(fido_assert_free_wrapper) fido_assert_t *a = NULL;
+        _cleanup_(fido_dev_free_wrapper) fido_dev_t *d = NULL;
+        _cleanup_(erase_and_freep) void *hmac_copy = NULL;
+        bool has_up, has_client_pin, has_uv;
+        size_t hmac_size;
+        const void *hmac;
+        int r;
+
+        assert(path);
+        assert(rp_id);
+        assert(salt);
+        assert(cid);
+        assert(ret_hmac);
+        assert(ret_hmac_size);
+
+        d = sym_fido_dev_new();
+        if (!d)
+                return log_oom();
+
+        r = sym_fido_dev_open(d, path);
+        if (r != FIDO_OK)
+                return log_error_errno(SYNTHETIC_ERRNO(EIO),
+                                       "Failed to open FIDO2 device %s: %s", path, sym_fido_strerr(r));
+
+        r = verify_features(d, path, LOG_ERR, NULL, &has_client_pin, &has_up, &has_uv);
+        if (r < 0)
+                return r;
+
+        /* When unlocking we are strict: a feature that was required at enrollment time but is not
+         * offered by this token is a hard failure. */
+        if (!has_client_pin && FLAGS_SET(required, FIDO2ENROLL_PIN))
+                return log_error_errno(SYNTHETIC_ERRNO(EHWPOISON),
+                                       "PIN required to unlock, but FIDO2 device %s does not support it.",
+                                       path);
+
+        if (!has_up && FLAGS_SET(required, FIDO2ENROLL_UP))
+                return log_error_errno(SYNTHETIC_ERRNO(EHWPOISON),
+                                       "User presence test required to unlock, but FIDO2 device %s does not support it.",
+                                       path);
+
+        if (!has_uv && FLAGS_SET(required, FIDO2ENROLL_UV))
+                return log_error_errno(SYNTHETIC_ERRNO(EHWPOISON),
+                                       "User verification required to unlock, but FIDO2 device %s does not support it.",
+                                       path);
+
+        a = sym_fido_assert_new();
+        if (!a)
+                return log_oom();
+
+        r = sym_fido_assert_set_extensions(a, FIDO_EXT_HMAC_SECRET);
+        if (r != FIDO_OK)
+                return log_error_errno(SYNTHETIC_ERRNO(EIO),
+                                       "Failed to enable HMAC-SECRET extension on FIDO2 assertion: %s", sym_fido_strerr(r));
+
+        r = sym_fido_assert_set_hmac_salt(a, salt, salt_size);
+        if (r != FIDO_OK)
+                return log_error_errno(SYNTHETIC_ERRNO(EIO),
+                                       "Failed to set salt on FIDO2 assertion: %s", sym_fido_strerr(r));
+
+        r = fido2_assert_set_basic_properties(a, rp_id, cid, cid_size);
+        if (r < 0)
+                return r;
+
+        log_info("Asking FIDO2 token for authentication.");
+
+        /* The "up" option is only touched if the token advertises it. */
+        if (has_up) {
+                r = sym_fido_assert_set_up(a, FLAGS_SET(required, FIDO2ENROLL_UP) ? FIDO_OPT_TRUE : FIDO_OPT_FALSE);
+                if (r != FIDO_OK)
+                        return log_error_errno(SYNTHETIC_ERRNO(EIO),
+                                               "Failed to %s FIDO2 user presence test: %s",
+                                               enable_disable(FLAGS_SET(required, FIDO2ENROLL_UP)),
+                                               sym_fido_strerr(r));
+
+                if (FLAGS_SET(required, FIDO2ENROLL_UP))
+                        log_notice("%s%sPlease confirm presence on security token to unlock.",
+                                   emoji_enabled() ? special_glyph(SPECIAL_GLYPH_TOUCH) : "",
+                                   emoji_enabled() ? " " : "");
+        }
+
+        /* Likewise for "uv", unless the caller asked us to leave it untouched. */
+        if (has_uv && !FLAGS_SET(required, FIDO2ENROLL_UV_OMIT)) {
+                r = sym_fido_assert_set_uv(a, FLAGS_SET(required, FIDO2ENROLL_UV) ? FIDO_OPT_TRUE : FIDO_OPT_FALSE);
+                if (r != FIDO_OK)
+                        return log_error_errno(SYNTHETIC_ERRNO(EIO),
+                                               "Failed to %s FIDO2 user verification: %s",
+                                               enable_disable(FLAGS_SET(required, FIDO2ENROLL_UV)),
+                                               sym_fido_strerr(r));
+
+                if (FLAGS_SET(required, FIDO2ENROLL_UV))
+                        log_notice("%s%sPlease verify user on security token to unlock.",
+                                   emoji_enabled() ? special_glyph(SPECIAL_GLYPH_TOUCH) : "",
+                                   emoji_enabled() ? " " : "");
+        }
+
+        for (;;) {
+                bool retry_with_up = false, retry_with_pin = false;
+
+                if (FLAGS_SET(required, FIDO2ENROLL_PIN)) {
+                        /* OK, we need a pin, try with all pins in turn */
+                        if (strv_isempty(pins))
+                                r = FIDO_ERR_PIN_REQUIRED;
+                        else
+                                STRV_FOREACH(i, pins) {
+                                        r = sym_fido_dev_get_assert(d, a, *i);
+                                        if (r != FIDO_ERR_PIN_INVALID)
+                                                break;
+                                }
+
+                } else
+                        r = sym_fido_dev_get_assert(d, a, NULL);
+
+                /* In some conditions, where a PIN or UP is required we might accept that. Let's check the
+                 * conditions and if so try immediately again. */
+
+                switch (r) {
+
+                case FIDO_ERR_UP_REQUIRED:
+                        /* So the token asked for "up". Try to turn it on, for compat with systemd 248 and try again. */
+
+                        if (!has_up)
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Token asks for user presence test but doesn't advertise 'up' feature.");
+
+                        if (FLAGS_SET(required, FIDO2ENROLL_UP))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Token asks for user presence test but was already enabled.");
+
+                        if (FLAGS_SET(required, FIDO2ENROLL_UP_IF_NEEDED)) {
+                                log_notice("%s%sPlease confirm presence on security token to unlock.",
+                                           emoji_enabled() ? special_glyph(SPECIAL_GLYPH_TOUCH) : "",
+                                           emoji_enabled() ? " " : "");
+                                retry_with_up = true;
+                        }
+
+                        break;
+
+                case FIDO_ERR_UNSUPPORTED_OPTION:
+                        /* AuthenTrend ATKey.Pro returns this instead of FIDO_ERR_UP_REQUIRED, let's handle
+                         * it gracefully (also see below.) */
+
+                        if (has_up && (required & (FIDO2ENROLL_UP|FIDO2ENROLL_UP_IF_NEEDED)) == FIDO2ENROLL_UP_IF_NEEDED) {
+                                log_notice("%s%sGot unsupported option error when user presence test is turned off. Trying with user presence test turned on.",
+                                           emoji_enabled() ? special_glyph(SPECIAL_GLYPH_TOUCH) : "",
+                                           emoji_enabled() ? " " : "");
+                                retry_with_up = true;
+                        }
+
+                        break;
+
+                case FIDO_ERR_PIN_REQUIRED:
+                        /* A pin was requested. Maybe supply one, if we are configured to do so on request */
+
+                        if (!has_client_pin)
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Token asks for PIN but doesn't advertise 'clientPin' feature.");
+
+                        if (FLAGS_SET(required, FIDO2ENROLL_PIN) && !strv_isempty(pins))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Token asks for PIN but one was already supplied.");
+
+                        if ((required & (FIDO2ENROLL_PIN|FIDO2ENROLL_PIN_IF_NEEDED)) == FIDO2ENROLL_PIN_IF_NEEDED) {
+                                /* If a PIN so far wasn't specified but is requested by the device, and
+                                 * FIDO2ENROLL_PIN_IF_NEEDED is set, then provide it */
+                                log_debug("Retrying assertion with PIN.");
+                                retry_with_pin = true;
+                        }
+
+                        break;
+
+                default:
+                        break;
+                }
+
+                /* Nothing we could change for another attempt: leave the loop with 'r' holding the final
+                 * libfido2 result, which is evaluated below. */
+                if (!retry_with_up && !retry_with_pin)
+                        break;
+
+                if (retry_with_up) {
+                        r = sym_fido_assert_set_up(a, FIDO_OPT_TRUE);
+                        if (r != FIDO_OK)
+                                return log_error_errno(SYNTHETIC_ERRNO(EIO),
+                                                       "Failed to enable FIDO2 user presence test: %s", sym_fido_strerr(r));
+
+                        /* Remember that "up" is on now, so that a second request for it is an error. */
+                        required |= FIDO2ENROLL_UP;
+                }
+
+                if (retry_with_pin)
+                        required |= FIDO2ENROLL_PIN;
+        }
+
+        r = fido2_common_assert_error_handle(r);
+        if (r < 0)
+                return r;
+
+        hmac = sym_fido_assert_hmac_secret_ptr(a, 0);
+        if (!hmac)
+                return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to retrieve HMAC secret.");
+
+        hmac_size = sym_fido_assert_hmac_secret_len(a, 0);
+
+        /* Return a private copy, since 'a' is released when we leave this function. */
+        hmac_copy = memdup(hmac, hmac_size);
+        if (!hmac_copy)
+                return log_oom();
+
+        *ret_hmac = TAKE_PTR(hmac_copy);
+        *ret_hmac_size = hmac_size;
+        return 0;
+}
+
+/* Returns the short name of a COSE credential algorithm, or NULL for anything we don't know.
+ * COSE_ECDH_ES256 is not usable with fido_cred_set_type() thus it's not listed here. */
+static const char *fido2_algorithm_to_string(int alg) {
+        if (alg == COSE_ES256)
+                return "es256";
+        if (alg == COSE_RS256)
+                return "rs256";
+        if (alg == COSE_EDDSA)
+                return "eddsa";
+
+        return NULL;
+}
+
+/* Obtains the HMAC secret for 'salt' from a FIDO2 token holding credential 'cid'.
+ *
+ * If 'device' is given only that token is consulted. Otherwise the attached tokens are enumerated and
+ * each one is first asked (pre-flight) whether it has the credential; the first one that does is used,
+ * unless it turns out to be the wrong one after all (-EBADSLT/-ENODEV), in which case the search goes on.
+ *
+ * Returns what fido2_use_hmac_hash_specific_token() returns for the token used, -EBADSLT if 'device'
+ * doesn't carry the credential, -EAGAIN if no suitable token was found, or another negative
+ * errno-style value on failure. */
+int fido2_use_hmac_hash(
+                const char *device,
+                const char *rp_id,
+                const void *salt,
+                size_t salt_size,
+                const void *cid,
+                size_t cid_size,
+                char **pins,
+                Fido2EnrollFlags required, /* client pin/user presence required */
+                void **ret_hmac,
+                size_t *ret_hmac_size) {
+
+        size_t capacity = 64, n_found = 0;
+        fido_dev_info_t *list = NULL;
+        int r;
+
+        r = dlopen_libfido2();
+        if (r < 0)
+                return log_error_errno(r, "FIDO2 support is not installed.");
+
+        if (device) {
+                r = fido2_is_cred_in_specific_token(device, rp_id, cid, cid_size, required);
+                if (r < 0)
+                        return log_error_errno(r, "Token returned error during pre-flight: %m");
+                if (r == 0)
+                        /* The caller is expected to attempt other key slots in this case,
+                         * therefore, do not spam the console with error logs here. */
+                        return log_debug_errno(SYNTHETIC_ERRNO(EBADSLT),
+                                               "The credential is not in the token %s.", device);
+
+                return fido2_use_hmac_hash_specific_token(device, rp_id, salt, salt_size, cid, cid_size, pins, required, ret_hmac, ret_hmac_size);
+        }
+
+        list = sym_fido_dev_info_new(capacity);
+        if (!list)
+                return log_oom();
+
+        r = sym_fido_dev_info_manifest(list, capacity, &n_found);
+        if (r == FIDO_ERR_INTERNAL) {
+                /* The library returns FIDO_ERR_INTERNAL when no devices are found. I wish it wouldn't. */
+                r = log_debug_errno(SYNTHETIC_ERRNO(EAGAIN), "Got FIDO_ERR_INTERNAL, assuming no devices.");
+                goto out;
+        }
+        if (r != FIDO_OK) {
+                r = log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to enumerate FIDO2 devices: %s", sym_fido_strerr(r));
+                goto out;
+        }
+
+        for (size_t idx = 0; idx < n_found; idx++) {
+                const fido_dev_info_t *info = sym_fido_dev_info_ptr(list, idx);
+                if (!info) {
+                        r = log_error_errno(SYNTHETIC_ERRNO(EIO),
+                                            "Failed to get device information for FIDO device %zu.", idx);
+                        goto out;
+                }
+
+                const char *node = sym_fido_dev_info_path(info);
+                if (!node) {
+                        r = log_error_errno(SYNTHETIC_ERRNO(EIO),
+                                            "Failed to query FIDO device path.");
+                        goto out;
+                }
+
+                r = fido2_is_cred_in_specific_token(node, rp_id, cid, cid_size, required);
+                if (r < 0) {
+                        log_error_errno(r, "Token returned error during pre-flight: %m");
+                        goto out;
+                }
+                if (r == 0) {
+                        log_debug("The credential is not in the token %s, skipping.", node);
+                        continue;
+                }
+
+                r = fido2_use_hmac_hash_specific_token(node, rp_id, salt, salt_size, cid, cid_size, pins, required, ret_hmac, ret_hmac_size);
+                if (IN_SET(r,
+                           -EBADSLT, /* device doesn't understand our credential hash */
+                           -ENODEV   /* device is not a FIDO2 device with HMAC-SECRET */))
+                        continue;
+
+                /* Success, or an error that isn't just "wrong token": we are done either way. */
+                goto out;
+        }
+
+        /* Ran out of candidates. */
+        r = -EAGAIN;
+
+out:
+        sym_fido_dev_info_free(&list, capacity);
+        return r;
+}
+
+#define FIDO2_SALT_SIZE 32
+
+int fido2_generate_hmac_hash(
+ const char *device,
+ const char *rp_id,
+ const char *rp_name,
+ const void *user_id, size_t user_id_len,
+ const char *user_name,
+ const char *user_display_name,
+ const char *user_icon,
+ const char *askpw_icon_name,
+ Fido2EnrollFlags lock_with,
+ int cred_alg,
+ void **ret_cid, size_t *ret_cid_size,
+ void **ret_salt, size_t *ret_salt_size,
+ void **ret_secret, size_t *ret_secret_size,
+ char **ret_usedpin,
+ Fido2EnrollFlags *ret_locked_with) {
+ /* Enrolls a new credential on the token at 'device' and derives a secret from it: a random salt is
+ * generated, a credential with the hmac-secret extension is created on the token, and then an
+ * assertion for that credential is requested in order to obtain the HMAC secret for the salt.
+ *
+ * On success 0 is returned and newly allocated copies of the credential ID, the salt and the
+ * secret are passed to the caller via the mandatory ret_cid/ret_salt/ret_secret parameters (plus
+ * their sizes). ret_usedpin (optional) receives the PIN that was entered, which stays NULL if
+ * the token never asked for one; ret_locked_with (optional) receives the PIN/UP/UV flags that
+ * were effectively used, which may differ from the requested 'lock_with'.
+ *
+ * On failure the problem is logged and a negative errno-style value is returned. */
+ _cleanup_(erase_and_freep) void *salt = NULL, *secret_copy = NULL;
+ _cleanup_(fido_assert_free_wrapper) fido_assert_t *a = NULL;
+ _cleanup_(fido_cred_free_wrapper) fido_cred_t *c = NULL;
+ _cleanup_(fido_dev_free_wrapper) fido_dev_t *d = NULL;
+ _cleanup_(erase_and_freep) char *used_pin = NULL;
+ bool has_rk, has_client_pin, has_up, has_uv;
+ _cleanup_free_ char *cid_copy = NULL;
+ size_t cid_size, secret_size;
+ const void *cid, *secret;
+ int r;
+
+ assert(device);
+ assert(ret_cid);
+ assert(ret_cid_size);
+ assert(ret_salt);
+ assert(ret_salt_size);
+ assert(ret_secret);
+ assert(ret_secret_size);
+
+ /* Construction is like this: we generate a salt of 32 bytes. We then ask the FIDO2 device to
+ * HMAC-SHA256 it for us with its internal key. The result is the key used by LUKS and account
+ * authentication. LUKS and UNIX password auth all do their own salting before hashing, so that FIDO2
+ * device never sees the volume key.
+ *
+ * S = HMAC-SHA256(I, D)
+ *
+ * with: S → LUKS/account authentication key (never stored)
+ * I → internal key on FIDO2 device (stored in the FIDO2 device)
+ * D → salt we generate here (stored in the privileged part of the JSON record)
+ *
+ */
+ /* NOTE(review): the assert(device) right below repeats the one at the top of the function. */
+ assert(device);
+ assert((lock_with & ~(FIDO2ENROLL_PIN|FIDO2ENROLL_UP|FIDO2ENROLL_UV)) == 0); /* only these three flags may be requested here */
+
+ r = dlopen_libfido2();
+ if (r < 0)
+ return log_error_errno(r, "FIDO2 token support is not installed.");
+ /* Step 1: generate the random salt (D in the formula above). */
+ salt = malloc(FIDO2_SALT_SIZE);
+ if (!salt)
+ return log_oom();
+
+ r = crypto_random_bytes(salt, FIDO2_SALT_SIZE);
+ if (r < 0)
+ return log_error_errno(r, "Failed to generate salt: %m");
+ /* Step 2: open the token and find out which features it offers. */
+ d = sym_fido_dev_new();
+ if (!d)
+ return log_oom();
+
+ r = sym_fido_dev_open(d, device);
+ if (r != FIDO_OK)
+ return log_error_errno(SYNTHETIC_ERRNO(EIO),
+ "Failed to open FIDO2 device %s: %s", device, sym_fido_strerr(r));
+
+ r = verify_features(d, device, LOG_ERR, &has_rk, &has_client_pin, &has_up, &has_uv);
+ if (r < 0)
+ return r;
+
+ /* While enrolling degrade gracefully if the requested feature set isn't available, but let the user know */
+ if (!has_client_pin && FLAGS_SET(lock_with, FIDO2ENROLL_PIN)) {
+ log_notice("Requested to lock with PIN, but FIDO2 device %s does not support it, disabling.", device);
+ lock_with &= ~FIDO2ENROLL_PIN;
+ }
+
+ if (!has_up && FLAGS_SET(lock_with, FIDO2ENROLL_UP)) {
+ log_notice("Locking with user presence test requested, but FIDO2 device %s does not support it, disabling.", device);
+ lock_with &= ~FIDO2ENROLL_UP;
+ }
+
+ if (!has_uv && FLAGS_SET(lock_with, FIDO2ENROLL_UV)) {
+ log_notice("Locking with user verification test requested, but FIDO2 device %s does not support it, disabling.", device);
+ lock_with &= ~FIDO2ENROLL_UV;
+ }
+ /* Step 3: assemble the credential creation request. */
+ c = sym_fido_cred_new();
+ if (!c)
+ return log_oom();
+
+ r = sym_fido_cred_set_extensions(c, FIDO_EXT_HMAC_SECRET);
+ if (r != FIDO_OK)
+ return log_error_errno(SYNTHETIC_ERRNO(EIO),
+ "Failed to enable HMAC-SECRET extension on FIDO2 credential: %s", sym_fido_strerr(r));
+
+ r = sym_fido_cred_set_rp(c, rp_id, rp_name);
+ if (r != FIDO_OK)
+ return log_error_errno(SYNTHETIC_ERRNO(EIO),
+ "Failed to set FIDO2 credential relying party ID/name: %s", sym_fido_strerr(r));
+
+ r = sym_fido_cred_set_type(c, cred_alg);
+ if (r != FIDO_OK)
+ return log_error_errno(SYNTHETIC_ERRNO(EIO),
+ "Failed to set FIDO2 credential type to %s: %s", fido2_algorithm_to_string(cred_alg), sym_fido_strerr(r)); /* NOTE(review): fido2_algorithm_to_string() returns NULL for unknown algorithms — confirm callers only pass known ones */
+
+ r = sym_fido_cred_set_user(
+ c,
+ user_id, user_id_len,
+ user_name,
+ user_display_name,
+ user_icon);
+ if (r != FIDO_OK)
+ return log_error_errno(SYNTHETIC_ERRNO(EIO),
+ "Failed to set FIDO2 credential user data: %s", sym_fido_strerr(r));
+
+ r = sym_fido_cred_set_clientdata_hash(c, (const unsigned char[32]) {}, 32); /* all-zero 32 byte hash */
+ if (r != FIDO_OK)
+ return log_error_errno(SYNTHETIC_ERRNO(EIO),
+ "Failed to set FIDO2 client data hash: %s", sym_fido_strerr(r));
+
+ if (has_rk) {
+ r = sym_fido_cred_set_rk(c, FIDO_OPT_FALSE);
+ if (r != FIDO_OK)
+ return log_error_errno(SYNTHETIC_ERRNO(EIO),
+ "Failed to turn off FIDO2 resident key option of credential: %s", sym_fido_strerr(r));
+ }
+
+ if (has_uv) {
+ r = sym_fido_cred_set_uv(c, FIDO_OPT_FALSE);
+ if (r != FIDO_OK)
+ return log_error_errno(SYNTHETIC_ERRNO(EIO),
+ "Failed to turn off FIDO2 user verification option of credential: %s", sym_fido_strerr(r));
+ }
+
+ /* As per specification "up" is assumed to be implicit when making credentials, hence we don't
+ * explicitly enable/disable it here */
+
+ log_info("Initializing FIDO2 credential on security token.");
+
+ if (has_uv || has_up)
+ log_notice("%s%s(Hint: This might require confirmation of user presence on security token.)",
+ emoji_enabled() ? special_glyph(SPECIAL_GLYPH_TOUCH) : "",
+ emoji_enabled() ? " " : "");
+ /* Step 4: create the credential. The first attempt is made without a PIN; only if the token
+ * answers that it needs one, the user is asked for it (repeatedly, as long as it is wrong). */
+ r = sym_fido_dev_make_cred(d, c, NULL);
+ if (r == FIDO_ERR_PIN_REQUIRED) {
+
+ if (!has_client_pin)
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+ "Token asks for PIN but doesn't advertise 'clientPin' feature.");
+
+ for (;;) {
+ _cleanup_strv_free_erase_ char **pin = NULL;
+
+ r = ask_password_auto("Please enter security token PIN:", askpw_icon_name, NULL, "fido2-pin", "fido2-pin", USEC_INFINITY, 0, &pin);
+ if (r < 0)
+ return log_error_errno(r, "Failed to acquire user PIN: %m");
+ /* Start out as "invalid", so that a list consisting only of empty PINs makes us ask again. */
+ r = FIDO_ERR_PIN_INVALID;
+ STRV_FOREACH(i, pin) {
+ if (isempty(*i)) {
+ log_notice("PIN may not be empty.");
+ continue;
+ }
+
+ r = sym_fido_dev_make_cred(d, c, *i);
+ if (r == FIDO_OK) {
+ used_pin = strdup(*i); /* remembered for the assertion below and for ret_usedpin */
+ if (!used_pin)
+ return log_oom();
+ break;
+ }
+ if (r != FIDO_ERR_PIN_INVALID)
+ break;
+ }
+
+ if (r != FIDO_ERR_PIN_INVALID)
+ break;
+
+ log_notice("PIN incorrect, please try again.");
+ }
+ }
+ if (r == FIDO_ERR_PIN_AUTH_BLOCKED)
+ return log_notice_errno(SYNTHETIC_ERRNO(EPERM),
+ "Token PIN is currently blocked, please remove and reinsert token.");
+#ifdef FIDO_ERR_UV_BLOCKED
+ if (r == FIDO_ERR_UV_BLOCKED)
+ return log_notice_errno(SYNTHETIC_ERRNO(EPERM),
+ "Token verification is currently blocked, please remove and reinsert token.");
+#endif
+ if (r == FIDO_ERR_ACTION_TIMEOUT)
+ return log_error_errno(SYNTHETIC_ERRNO(ENOSTR),
+ "Token action timeout. (User didn't interact with token quickly enough.)");
+ if (r == FIDO_ERR_UNSUPPORTED_ALGORITHM)
+ return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "Token doesn't support credential algorithm %s.", fido2_algorithm_to_string(cred_alg));
+ if (r != FIDO_OK)
+ return log_error_errno(SYNTHETIC_ERRNO(EIO),
+ "Failed to generate FIDO2 credential: %s", sym_fido_strerr(r));
+ /* Step 5: the credential exists now, query its ID. It is copied further down before it is returned. */
+ cid = sym_fido_cred_id_ptr(c);
+ if (!cid)
+ return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to get FIDO2 credential ID.");
+
+ cid_size = sym_fido_cred_id_len(c);
+ /* Step 6: prepare an assertion for the new credential, in order to get the HMAC of our salt. */
+ a = sym_fido_assert_new();
+ if (!a)
+ return log_oom();
+
+ r = sym_fido_assert_set_extensions(a, FIDO_EXT_HMAC_SECRET);
+ if (r != FIDO_OK)
+ return log_error_errno(SYNTHETIC_ERRNO(EIO),
+ "Failed to enable HMAC-SECRET extension on FIDO2 assertion: %s", sym_fido_strerr(r));
+
+ r = sym_fido_assert_set_hmac_salt(a, salt, FIDO2_SALT_SIZE);
+ if (r != FIDO_OK)
+ return log_error_errno(SYNTHETIC_ERRNO(EIO),
+ "Failed to set salt on FIDO2 assertion: %s", sym_fido_strerr(r));
+
+ r = fido2_assert_set_basic_properties(a, rp_id, cid, cid_size);
+ if (r < 0)
+ return r;
+
+ log_info("Generating secret key on FIDO2 security token.");
+
+ if (has_up) {
+ r = sym_fido_assert_set_up(a, FLAGS_SET(lock_with, FIDO2ENROLL_UP) ? FIDO_OPT_TRUE : FIDO_OPT_FALSE);
+ if (r != FIDO_OK)
+ return log_error_errno(SYNTHETIC_ERRNO(EIO),
+ "Failed to %s FIDO2 user presence test: %s",
+ enable_disable(FLAGS_SET(lock_with, FIDO2ENROLL_UP)),
+ sym_fido_strerr(r));
+
+ if (FLAGS_SET(lock_with, FIDO2ENROLL_UP))
+ log_notice("%s%sIn order to allow secret key generation, please confirm presence on security token.",
+ emoji_enabled() ? special_glyph(SPECIAL_GLYPH_TOUCH) : "",
+ emoji_enabled() ? " " : "");
+ }
+
+ if (has_uv) {
+ r = sym_fido_assert_set_uv(a, FLAGS_SET(lock_with, FIDO2ENROLL_UV) ? FIDO_OPT_TRUE : FIDO_OPT_FALSE);
+ if (r != FIDO_OK)
+ return log_error_errno(SYNTHETIC_ERRNO(EIO),
+ "Failed to %s FIDO user verification: %s",
+ enable_disable(FLAGS_SET(lock_with, FIDO2ENROLL_UV)),
+ sym_fido_strerr(r));
+
+ if (FLAGS_SET(lock_with, FIDO2ENROLL_UV))
+ log_notice("%s%sIn order to allow secret key generation, please verify user on security token.",
+ emoji_enabled() ? special_glyph(SPECIAL_GLYPH_TOUCH) : "",
+ emoji_enabled() ? " " : "");
+ }
+ /* Step 7: request the assertion. If the token insists on UP or a PIN that wasn't enabled, turn
+ * the respective option on, record that in lock_with and try again. */
+ for (;;) {
+ bool retry_with_up = false, retry_with_pin = false;
+
+ r = sym_fido_dev_get_assert(d, a, FLAGS_SET(lock_with, FIDO2ENROLL_PIN) ? used_pin : NULL);
+
+ switch (r) {
+
+ case FIDO_ERR_UP_REQUIRED:
+ /* If the token asks for "up" when we turn off, then this might be a feature that
+ * isn't optional. Let's enable it */
+
+ if (!has_up)
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+ "Token asks for user presence test but doesn't advertise 'up' feature.");
+
+ if (FLAGS_SET(lock_with, FIDO2ENROLL_UP))
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+ "Token asks for user presence test but was already enabled.");
+
+ log_notice("%s%sLocking without user presence test requested, but FIDO2 device %s requires it, enabling.",
+ emoji_enabled() ? special_glyph(SPECIAL_GLYPH_TOUCH) : "",
+ emoji_enabled() ? " " : "",
+ device);
+
+ retry_with_up = true;
+ break;
+
+ case FIDO_ERR_UNSUPPORTED_OPTION:
+ /* AuthenTrend ATKey.Pro says it supports "up", but if we disable it it will fail
+ * with FIDO_ERR_UNSUPPORTED_OPTION, probably because it isn't actually
+ * optional. Let's see if turning it on works. This is very similar to the
+ * FIDO_ERR_UP_REQUIRED case, but since the error is so vague we implement it
+ * slightly more defensively. */
+
+ if (has_up && !FLAGS_SET(lock_with, FIDO2ENROLL_UP)) {
+ log_notice("%s%sGot unsupported option error when user presence test is turned off. Trying with user presence test turned on.",
+ emoji_enabled() ? special_glyph(SPECIAL_GLYPH_TOUCH) : "",
+ emoji_enabled() ? " " : "");
+ retry_with_up = true;
+ }
+
+ break;
+
+ case FIDO_ERR_PIN_REQUIRED:
+ if (!has_client_pin)
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+ "Token asks for client PIN check but doesn't advertise 'clientPin' feature.");
+
+ if (FLAGS_SET(lock_with, FIDO2ENROLL_PIN))
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+ "Token asks for user client PIN check but was already enabled.");
+
+ log_debug("Token requires PIN for assertion, enabling.");
+ retry_with_pin = true;
+ break;
+
+ default:
+ break;
+ }
+ /* No adjustment possible: leave the loop, 'r' holds the final libfido2 result. */
+ if (!retry_with_up && !retry_with_pin)
+ break;
+
+ if (retry_with_up) {
+ r = sym_fido_assert_set_up(a, FIDO_OPT_TRUE);
+ if (r != FIDO_OK)
+ return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to enable FIDO2 user presence test: %s", sym_fido_strerr(r));
+
+ lock_with |= FIDO2ENROLL_UP;
+ }
+
+ if (retry_with_pin)
+ lock_with |= FIDO2ENROLL_PIN; /* the next iteration passes used_pin to the token */
+ }
+ /* Evaluate the result the retry loop ended with. */
+ if (r == FIDO_ERR_ACTION_TIMEOUT)
+ return log_error_errno(SYNTHETIC_ERRNO(ENOSTR),
+ "Token action timeout. (User didn't interact with token quickly enough.)");
+ if (r != FIDO_OK)
+ return log_error_errno(SYNTHETIC_ERRNO(EIO),
+ "Failed to ask token for assertion: %s", sym_fido_strerr(r));
+
+ secret = sym_fido_assert_hmac_secret_ptr(a, 0);
+ if (!secret)
+ return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to retrieve HMAC secret.");
+
+ secret_size = sym_fido_assert_hmac_secret_len(a, 0);
+ /* Make private copies of secret and credential ID, as 'a' and 'c' are released on return. */
+ secret_copy = memdup(secret, secret_size);
+ if (!secret_copy)
+ return log_oom();
+
+ cid_copy = memdup(cid, cid_size);
+ if (!cid_copy)
+ return log_oom();
+ /* Success: pass ownership of all buffers to the caller. */
+ *ret_cid = TAKE_PTR(cid_copy);
+ *ret_cid_size = cid_size;
+ *ret_salt = TAKE_PTR(salt);
+ *ret_salt_size = FIDO2_SALT_SIZE;
+ *ret_secret = TAKE_PTR(secret_copy);
+ *ret_secret_size = secret_size;
+
+ if (ret_usedpin)
+ *ret_usedpin = TAKE_PTR(used_pin);
+
+ if (ret_locked_with)
+ *ret_locked_with = lock_with;
+
+ return 0;
+}
+#endif
+
+#if HAVE_LIBFIDO2
+/* Opens the device node at 'path' and checks whether it is a FIDO2 token implementing the
+ * 'hmac-secret' extension.
+ *
+ * Returns > 0 if so, 0 if verify_features() reports -ENODEV for it, and a negative errno-style value
+ * (logged) if the device cannot be opened or inspected. A NULL path — which callers may hand in when
+ * libfido2 could not provide a path for an enumerated device — is refused with an EIO-based error
+ * instead of being passed on to libfido2 and the log format strings. */
+static int check_device_is_fido2_with_hmac_secret(const char *path) {
+        _cleanup_(fido_dev_free_wrapper) fido_dev_t *d = NULL;
+        int r;
+
+        if (!path)
+                return log_error_errno(SYNTHETIC_ERRNO(EIO),
+                                       "Failed to query FIDO device path.");
+
+        d = sym_fido_dev_new();
+        if (!d)
+                return log_oom();
+
+        r = sym_fido_dev_open(d, path);
+        if (r != FIDO_OK)
+                return log_error_errno(SYNTHETIC_ERRNO(EIO),
+                                       "Failed to open FIDO2 device %s: %s", path, sym_fido_strerr(r));
+
+        /* Only the verdict is of interest here, none of the individual feature flags. */
+        r = verify_features(d, path, LOG_DEBUG, NULL, NULL, NULL, NULL);
+        if (r == -ENODEV) /* Not a FIDO2 device, or not implementing 'hmac-secret' */
+                return false;
+        if (r < 0)
+                return r;
+
+        return true;
+}
+#endif
+
+/* Prints a table (path, manufacturer, product) of all attached FIDO2 tokens that implement the
+ * 'hmac-secret' extension to stdout. If no FIDO devices are attached at all a message is logged
+ * instead.
+ *
+ * Returns 0 on success and a negative errno-style value (logged) on failure. In builds without
+ * libfido2 support an EOPNOTSUPP-based error is returned. */
+int fido2_list_devices(void) {
+#if HAVE_LIBFIDO2
+        _cleanup_(table_unrefp) Table *t = NULL;
+        size_t allocated = 64, found = 0;
+        fido_dev_info_t *di = NULL;
+        int r;
+
+        r = dlopen_libfido2();
+        if (r < 0)
+                return log_error_errno(r, "FIDO2 token support is not installed.");
+
+        di = sym_fido_dev_info_new(allocated);
+        if (!di)
+                return log_oom();
+
+        r = sym_fido_dev_info_manifest(di, allocated, &found);
+        if (r == FIDO_ERR_INTERNAL || (r == FIDO_OK && found == 0)) {
+                /* The library returns FIDO_ERR_INTERNAL when no devices are found. I wish it wouldn't. */
+                log_info("No FIDO2 devices found.");
+                r = 0;
+                goto finish;
+        }
+        if (r != FIDO_OK) {
+                r = log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to enumerate FIDO2 devices: %s", sym_fido_strerr(r));
+                goto finish;
+        }
+
+        t = table_new("path", "manufacturer", "product");
+        if (!t) {
+                r = log_oom();
+                goto finish;
+        }
+
+        for (size_t i = 0; i < found; i++) {
+                const fido_dev_info_t *entry;
+                const char *path;
+
+                entry = sym_fido_dev_info_ptr(di, i);
+                if (!entry) {
+                        r = log_error_errno(SYNTHETIC_ERRNO(EIO),
+                                            "Failed to get device information for FIDO device %zu.", i);
+                        goto finish;
+                }
+
+                /* Query the path once and make sure we actually got one before it is used for probing
+                 * the device and as a table cell. */
+                path = sym_fido_dev_info_path(entry);
+                if (!path) {
+                        r = log_error_errno(SYNTHETIC_ERRNO(EIO),
+                                            "Failed to query FIDO device path.");
+                        goto finish;
+                }
+
+                r = check_device_is_fido2_with_hmac_secret(path);
+                if (r < 0)
+                        goto finish;
+                if (!r) /* Not usable for our purposes, leave it out of the listing */
+                        continue;
+
+                r = table_add_many(
+                                t,
+                                TABLE_PATH, path,
+                                TABLE_STRING, sym_fido_dev_info_manufacturer_string(entry),
+                                TABLE_STRING, sym_fido_dev_info_product_string(entry));
+                if (r < 0) {
+                        table_log_add_error(r);
+                        goto finish;
+                }
+        }
+
+        r = table_print(t, stdout);
+        if (r < 0) {
+                log_error_errno(r, "Failed to show device table: %m");
+                goto finish;
+        }
+
+        r = 0;
+
+finish:
+        /* 'di' carries no cleanup attribute, hence every exit after its allocation passes through here. */
+        sym_fido_dev_info_free(&di, allocated);
+        return r;
+#else
+        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                               "FIDO2 tokens not supported on this build.");
+#endif
+}
+
+/* Picks the FIDO2 token to use when the user didn't name one: this only succeeds if exactly one FIDO
+ * device is attached and it implements FIDO2 with the 'hmac-secret' extension.
+ *
+ * ret → on success receives a newly allocated copy of the device path
+ *
+ * Returns 0 on success, otherwise a logged, negative errno-style value: based on ENODEV if no device
+ * is attached, ENOTUNIQ if there is more than one, EOPNOTSUPP if the device is unsuitable (or libfido2
+ * support is not compiled in), EIO if libfido2 fails to provide information. */
+int fido2_find_device_auto(char **ret) {
+#if HAVE_LIBFIDO2
+        _cleanup_free_ char *copy = NULL;
+        size_t di_size = 64, found = 0;
+        const fido_dev_info_t *entry;
+        fido_dev_info_t *di = NULL;
+        const char *path;
+        int r;
+
+        assert(ret);
+
+        r = dlopen_libfido2();
+        if (r < 0)
+                return log_error_errno(r, "FIDO2 token support is not installed.");
+
+        di = sym_fido_dev_info_new(di_size);
+        if (!di)
+                return log_oom();
+
+        r = sym_fido_dev_info_manifest(di, di_size, &found);
+        if (r == FIDO_ERR_INTERNAL || (r == FIDO_OK && found == 0)) {
+                /* The library returns FIDO_ERR_INTERNAL when no devices are found. I wish it wouldn't. */
+                r = log_error_errno(SYNTHETIC_ERRNO(ENODEV), "No FIDO devices found.");
+                goto finish;
+        }
+        if (r != FIDO_OK) {
+                r = log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to enumerate FIDO devices: %s", sym_fido_strerr(r));
+                goto finish;
+        }
+        if (found > 1) {
+                r = log_error_errno(SYNTHETIC_ERRNO(ENOTUNIQ), "More than one FIDO device found.");
+                goto finish;
+        }
+
+        entry = sym_fido_dev_info_ptr(di, 0);
+        if (!entry) {
+                r = log_error_errno(SYNTHETIC_ERRNO(EIO),
+                                    "Failed to get device information for FIDO device 0.");
+                goto finish;
+        }
+
+        /* Validate the path before it is used for anything, in particular before the device is
+         * opened for the feature check below. */
+        path = sym_fido_dev_info_path(entry);
+        if (!path) {
+                r = log_error_errno(SYNTHETIC_ERRNO(EIO),
+                                    "Failed to query FIDO device path.");
+                goto finish;
+        }
+
+        r = check_device_is_fido2_with_hmac_secret(path);
+        if (r < 0)
+                goto finish;
+        if (!r) {
+                r = log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "FIDO device discovered does not implement FIDO2 with 'hmac-secret' extension.");
+                goto finish;
+        }
+
+        /* Duplicate the path, since the enumeration result is released at 'finish'. */
+        copy = strdup(path);
+        if (!copy) {
+                r = log_oom();
+                goto finish;
+        }
+
+        *ret = TAKE_PTR(copy);
+        r = 0;
+
+finish:
+        sym_fido_dev_info_free(&di, di_size);
+        return r;
+#else
+        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                               "FIDO2 tokens not supported on this build.");
+#endif
+}
+
+/* Tells whether a FIDO2 device is around. With 'device' set this is merely an existence check on
+ * that path; without it the attached devices are enumerated and counted. */
+int fido2_have_device(const char *device) {
+#if HAVE_LIBFIDO2
+        size_t capacity = 64, n_found = 0;
+        fido_dev_info_t *list = NULL;
+        int r;
+
+        /* Return == 0 if no devices are found, > 0 if at least one is found */
+
+        r = dlopen_libfido2();
+        if (r < 0)
+                return log_error_errno(r, "FIDO2 support is not installed.");
+
+        if (device) {
+                if (access(device, F_OK) >= 0)
+                        return 1;
+
+                if (errno == ENOENT)
+                        return 0;
+
+                return log_error_errno(errno, "Failed to determine whether device '%s' exists: %m", device);
+        }
+
+        list = sym_fido_dev_info_new(capacity);
+        if (!list)
+                return log_oom();
+
+        r = sym_fido_dev_info_manifest(list, capacity, &n_found);
+        if (r == FIDO_OK)
+                r = n_found;
+        else if (r == FIDO_ERR_INTERNAL)
+                /* The library returns FIDO_ERR_INTERNAL when no devices are found. I wish it wouldn't. */
+                r = 0;
+        else
+                r = log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to enumerate FIDO2 devices: %s", sym_fido_strerr(r));
+
+        sym_fido_dev_info_free(&list, capacity);
+        return r;
+#else
+        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                               "FIDO2 tokens not supported on this build.");
+#endif
+}
+
+#if HAVE_LIBFIDO2
+/* Parses the textual name of a credential algorithm ("es256", "rs256" or "eddsa") into the matching
+ * COSE_* constant. The result is stored in *ret if ret is non-NULL, so that passing NULL merely
+ * validates the string. Returns 0 on success, -EINVAL if the name is not recognized. */
+int parse_fido2_algorithm(const char *s, int *ret) {
+        static const struct {
+                const char *name;
+                int alg;
+        } known[] = {
+                { "es256", COSE_ES256 },
+                { "rs256", COSE_RS256 },
+                { "eddsa", COSE_EDDSA },
+        };
+
+        assert(s);
+
+        for (size_t k = 0; k < sizeof(known) / sizeof(known[0]); k++) {
+                if (!streq(s, known[k].name))
+                        continue;
+
+                if (ret)
+                        *ret = known[k].alg;
+                return 0;
+        }
+
+        return -EINVAL;
+}
+#endif
diff --git a/src/shared/libfido2-util.h b/src/shared/libfido2-util.h
new file mode 100644
index 0000000..4cfc95f
--- /dev/null
+++ b/src/shared/libfido2-util.h
@@ -0,0 +1,131 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "macro.h"
+
+typedef enum Fido2EnrollFlags {
+ FIDO2ENROLL_PIN = 1 << 0,
+ FIDO2ENROLL_UP = 1 << 1, /* User presence (ie: touching token) */
+ FIDO2ENROLL_UV = 1 << 2, /* User verification (ie: fingerprint) */
+ FIDO2ENROLL_PIN_IF_NEEDED = 1 << 3, /* If auth doesn't work without PIN ask for one, as in systemd 248 */
+ FIDO2ENROLL_UP_IF_NEEDED = 1 << 4, /* If auth doesn't work without UP, enable it, as in systemd 248 */
+ FIDO2ENROLL_UV_OMIT = 1 << 5, /* Leave "uv" untouched, as in systemd 248 */
+ _FIDO2ENROLL_TYPE_MAX,
+ _FIDO2ENROLL_TYPE_INVALID = -EINVAL,
+} Fido2EnrollFlags;
+
+#if HAVE_LIBFIDO2
+#include <fido.h>
+
+extern int (*sym_fido_assert_allow_cred)(fido_assert_t *, const unsigned char *, size_t);
+extern void (*sym_fido_assert_free)(fido_assert_t **);
+extern size_t (*sym_fido_assert_hmac_secret_len)(const fido_assert_t *, size_t);
+extern const unsigned char* (*sym_fido_assert_hmac_secret_ptr)(const fido_assert_t *, size_t);
+extern fido_assert_t* (*sym_fido_assert_new)(void);
+extern int (*sym_fido_assert_set_clientdata_hash)(fido_assert_t *, const unsigned char *, size_t);
+extern int (*sym_fido_assert_set_extensions)(fido_assert_t *, int);
+extern int (*sym_fido_assert_set_hmac_salt)(fido_assert_t *, const unsigned char *, size_t);
+extern int (*sym_fido_assert_set_rp)(fido_assert_t *, const char *);
+extern int (*sym_fido_assert_set_up)(fido_assert_t *, fido_opt_t);
+extern int (*sym_fido_assert_set_uv)(fido_assert_t *, fido_opt_t);
+extern size_t (*sym_fido_cbor_info_extensions_len)(const fido_cbor_info_t *);
+extern char **(*sym_fido_cbor_info_extensions_ptr)(const fido_cbor_info_t *);
+extern void (*sym_fido_cbor_info_free)(fido_cbor_info_t **);
+extern fido_cbor_info_t* (*sym_fido_cbor_info_new)(void);
+extern size_t (*sym_fido_cbor_info_options_len)(const fido_cbor_info_t *);
+extern char** (*sym_fido_cbor_info_options_name_ptr)(const fido_cbor_info_t *);
+extern const bool* (*sym_fido_cbor_info_options_value_ptr)(const fido_cbor_info_t *);
+extern void (*sym_fido_cred_free)(fido_cred_t **);
+extern size_t (*sym_fido_cred_id_len)(const fido_cred_t *);
+extern const unsigned char* (*sym_fido_cred_id_ptr)(const fido_cred_t *);
+extern fido_cred_t* (*sym_fido_cred_new)(void);
+extern int (*sym_fido_cred_set_clientdata_hash)(fido_cred_t *, const unsigned char *, size_t);
+extern int (*sym_fido_cred_set_extensions)(fido_cred_t *, int);
+extern int (*sym_fido_cred_set_rk)(fido_cred_t *, fido_opt_t);
+extern int (*sym_fido_cred_set_rp)(fido_cred_t *, const char *, const char *);
+extern int (*sym_fido_cred_set_type)(fido_cred_t *, int);
+extern int (*sym_fido_cred_set_user)(fido_cred_t *, const unsigned char *, size_t, const char *, const char *, const char *);
+extern int (*sym_fido_cred_set_uv)(fido_cred_t *, fido_opt_t);
+extern void (*sym_fido_dev_free)(fido_dev_t **);
+extern int (*sym_fido_dev_get_assert)(fido_dev_t *, fido_assert_t *, const char *);
+extern int (*sym_fido_dev_get_cbor_info)(fido_dev_t *, fido_cbor_info_t *);
+extern void (*sym_fido_dev_info_free)(fido_dev_info_t **, size_t);
+extern int (*sym_fido_dev_info_manifest)(fido_dev_info_t *, size_t, size_t *);
+extern const char* (*sym_fido_dev_info_manufacturer_string)(const fido_dev_info_t *);
+extern const char* (*sym_fido_dev_info_product_string)(const fido_dev_info_t *);
+extern fido_dev_info_t* (*sym_fido_dev_info_new)(size_t);
+extern const char* (*sym_fido_dev_info_path)(const fido_dev_info_t *);
+extern const fido_dev_info_t* (*sym_fido_dev_info_ptr)(const fido_dev_info_t *, size_t);
+extern bool (*sym_fido_dev_is_fido2)(const fido_dev_t *);
+extern int (*sym_fido_dev_make_cred)(fido_dev_t *, fido_cred_t *, const char *);
+extern fido_dev_t* (*sym_fido_dev_new)(void);
+extern int (*sym_fido_dev_open)(fido_dev_t *, const char *);
+extern int (*sym_fido_dev_close)(fido_dev_t *);
+extern void (*sym_fido_init)(int);
+extern void (*sym_fido_set_log_handler)(fido_log_handler_t *);
+extern const char* (*sym_fido_strerr)(int);
+
+int dlopen_libfido2(void);
+
+static inline void fido_cbor_info_free_wrapper(fido_cbor_info_t **p) {
+ if (*p)
+ sym_fido_cbor_info_free(p);
+}
+
+static inline void fido_assert_free_wrapper(fido_assert_t **p) {
+ if (*p)
+ sym_fido_assert_free(p);
+}
+
+static inline void fido_dev_free_wrapper(fido_dev_t **p) {
+ if (*p) {
+ sym_fido_dev_close(*p);
+ sym_fido_dev_free(p);
+ }
+}
+
+static inline void fido_cred_free_wrapper(fido_cred_t **p) {
+ if (*p)
+ sym_fido_cred_free(p);
+}
+
+int fido2_use_hmac_hash(
+ const char *device,
+ const char *rp_id,
+ const void *salt,
+ size_t salt_size,
+ const void *cid,
+ size_t cid_size,
+ char **pins,
+ Fido2EnrollFlags required,
+ void **ret_hmac,
+ size_t *ret_hmac_size);
+
+int fido2_generate_hmac_hash(
+ const char *device,
+ const char *rp_id,
+ const char *rp_name,
+ const void *user_id, size_t user_id_len,
+ const char *user_name,
+ const char *user_display_name,
+ const char *user_icon,
+ const char *askpw_icon_name,
+ Fido2EnrollFlags lock_with,
+ int cred_alg,
+ void **ret_cid, size_t *ret_cid_size,
+ void **ret_salt, size_t *ret_salt_size,
+ void **ret_secret, size_t *ret_secret_size,
+ char **ret_usedpin,
+ Fido2EnrollFlags *ret_locked_with);
+
+int parse_fido2_algorithm(const char *s, int *ret);
+#else
+static inline int parse_fido2_algorithm(const char *s, int *ret) {
+ return -EOPNOTSUPP;
+}
+#endif
+
+int fido2_list_devices(void);
+int fido2_find_device_auto(char **ret);
+
+int fido2_have_device(const char *device);
diff --git a/src/shared/libmount-util.c b/src/shared/libmount-util.c
new file mode 100644
index 0000000..3818904
--- /dev/null
+++ b/src/shared/libmount-util.c
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <stdio.h>
+
+#include "libmount-util.h"
+
+int libmount_parse(
+ const char *path,
+ FILE *source,
+ struct libmnt_table **ret_table,
+ struct libmnt_iter **ret_iter) {
+
+ _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL;
+ _cleanup_(mnt_free_iterp) struct libmnt_iter *iter = NULL;
+ int r;
+
+ /* Older libmount seems to require this. */
+ assert(!source || path);
+
+ table = mnt_new_table();
+ iter = mnt_new_iter(MNT_ITER_FORWARD);
+ if (!table || !iter)
+ return -ENOMEM;
+
+ /* If source or path are specified, we use one of the functions which ignore utab.
+ * Only if both are empty, we use mnt_table_parse_mtab(). */
+
+ if (source)
+ r = mnt_table_parse_stream(table, source, path);
+ else if (path)
+ r = mnt_table_parse_file(table, path);
+ else
+ r = mnt_table_parse_mtab(table, NULL);
+ if (r < 0)
+ return r;
+
+ *ret_table = TAKE_PTR(table);
+ *ret_iter = TAKE_PTR(iter);
+ return 0;
+}
+
+int libmount_is_leaf(
+ struct libmnt_table *table,
+ struct libmnt_fs *fs) {
+ int r;
+
+ _cleanup_(mnt_free_iterp) struct libmnt_iter *iter_children = NULL;
+ iter_children = mnt_new_iter(MNT_ITER_FORWARD);
+ if (!iter_children)
+ return log_oom();
+
+ /* We care only whether it exists, it is unused */
+ _unused_ struct libmnt_fs *child;
+ r = mnt_table_next_child_fs(table, iter_children, fs, &child);
+ if (r < 0)
+ return r;
+
+ return r == 1;
+}
diff --git a/src/shared/libmount-util.h b/src/shared/libmount-util.h
new file mode 100644
index 0000000..2f789e7
--- /dev/null
+++ b/src/shared/libmount-util.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+/* This needs to be after sys/mount.h */
+#include <libmount.h>
+
+#include "macro.h"
+
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(struct libmnt_table*, mnt_free_table, NULL);
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(struct libmnt_iter*, mnt_free_iter, NULL);
+
+int libmount_parse(
+ const char *path,
+ FILE *source,
+ struct libmnt_table **ret_table,
+ struct libmnt_iter **ret_iter);
+
+int libmount_is_leaf(
+ struct libmnt_table *table,
+ struct libmnt_fs *fs);
diff --git a/src/shared/libshared.sym b/src/shared/libshared.sym
new file mode 100644
index 0000000..6a7495a
--- /dev/null
+++ b/src/shared/libshared.sym
@@ -0,0 +1,3 @@
+SD_SHARED {
+ global: *;
+};
diff --git a/src/shared/linux/README b/src/shared/linux/README
new file mode 100644
index 0000000..34fc09b
--- /dev/null
+++ b/src/shared/linux/README
@@ -0,0 +1,9 @@
+The files in this directory are copied from kernel-6.2, and the following modifications are applied:
+- auto_dev-ioctl.h: set AUTOFS_DEV_IOCTL_VERSION_MINOR to 0
+- auto_dev-ioctl.h: define AUTOFS_IOCTL if not defined
+- auto_dev-ioctl.h: the fake flexible array is replaced with a real flexible array member
+- bpf_insn.h: This is imported from samples/bpf/bpf_insn.h
+- bpf_insn.h: BPF_JMP_A() macro is also imported from include/linux/filter.h
+- dm-ioctl.h: set DM_VERSION_MINOR to 27
+- ethtool.h: define __KERNEL_DIV_ROUND_UP if not defined
+- ethtool.h: add casts in ethtool_cmd_speed()
diff --git a/src/shared/linux/auto_dev-ioctl.h b/src/shared/linux/auto_dev-ioctl.h
new file mode 100644
index 0000000..c6b7e11
--- /dev/null
+++ b/src/shared/linux/auto_dev-ioctl.h
@@ -0,0 +1,220 @@
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+/*
+ * Copyright 2008 Red Hat, Inc. All rights reserved.
+ * Copyright 2008 Ian Kent <raven@themaw.net>
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ */
+
+#ifndef _LINUX_AUTO_DEV_IOCTL_H
+#define _LINUX_AUTO_DEV_IOCTL_H
+
+#include <linux/auto_fs.h>
+#include <linux/string.h>
+
+#define AUTOFS_DEVICE_NAME "autofs"
+
+#define AUTOFS_DEV_IOCTL_VERSION_MAJOR 1
+#define AUTOFS_DEV_IOCTL_VERSION_MINOR 0
+
+#define AUTOFS_DEV_IOCTL_SIZE sizeof(struct autofs_dev_ioctl)
+
+/*
+ * An ioctl interface for autofs mount point control.
+ */
+
+struct args_protover {
+ __u32 version;
+};
+
+struct args_protosubver {
+ __u32 sub_version;
+};
+
+struct args_openmount {
+ __u32 devid;
+};
+
+struct args_ready {
+ __u32 token;
+};
+
+struct args_fail {
+ __u32 token;
+ __s32 status;
+};
+
+struct args_setpipefd {
+ __s32 pipefd;
+};
+
+struct args_timeout {
+ __u64 timeout;
+};
+
+struct args_requester {
+ __u32 uid;
+ __u32 gid;
+};
+
+struct args_expire {
+ __u32 how;
+};
+
+struct args_askumount {
+ __u32 may_umount;
+};
+
+struct args_ismountpoint {
+ union {
+ struct args_in {
+ __u32 type;
+ } in;
+ struct args_out {
+ __u32 devid;
+ __u32 magic;
+ } out;
+ };
+};
+
+/*
+ * All the ioctls use this structure.
+ * When sending a path size must account for the total length
+ * of the chunk of memory otherwise it is the size of the
+ * structure.
+ */
+
+struct autofs_dev_ioctl {
+ __u32 ver_major;
+ __u32 ver_minor;
+ __u32 size; /* total size of data passed in
+ * including this struct */
+ __s32 ioctlfd; /* automount command fd */
+
+ /* Command parameters */
+
+ union {
+ struct args_protover protover;
+ struct args_protosubver protosubver;
+ struct args_openmount openmount;
+ struct args_ready ready;
+ struct args_fail fail;
+ struct args_setpipefd setpipefd;
+ struct args_timeout timeout;
+ struct args_requester requester;
+ struct args_expire expire;
+ struct args_askumount askumount;
+ struct args_ismountpoint ismountpoint;
+ };
+
+ char path[];
+};
+
+static __inline__ void init_autofs_dev_ioctl(struct autofs_dev_ioctl *in)
+{
+ memset(in, 0, AUTOFS_DEV_IOCTL_SIZE);
+ in->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR;
+ in->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR;
+ in->size = AUTOFS_DEV_IOCTL_SIZE;
+ in->ioctlfd = -1;
+}
+
+enum {
+ /* Get various version info */
+ AUTOFS_DEV_IOCTL_VERSION_CMD = 0x71,
+ AUTOFS_DEV_IOCTL_PROTOVER_CMD,
+ AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD,
+
+ /* Open mount ioctl fd */
+ AUTOFS_DEV_IOCTL_OPENMOUNT_CMD,
+
+ /* Close mount ioctl fd */
+ AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD,
+
+ /* Mount/expire status returns */
+ AUTOFS_DEV_IOCTL_READY_CMD,
+ AUTOFS_DEV_IOCTL_FAIL_CMD,
+
+ /* Activate/deactivate autofs mount */
+ AUTOFS_DEV_IOCTL_SETPIPEFD_CMD,
+ AUTOFS_DEV_IOCTL_CATATONIC_CMD,
+
+ /* Expiry timeout */
+ AUTOFS_DEV_IOCTL_TIMEOUT_CMD,
+
+ /* Get mount last requesting uid and gid */
+ AUTOFS_DEV_IOCTL_REQUESTER_CMD,
+
+ /* Check for eligible expire candidates */
+ AUTOFS_DEV_IOCTL_EXPIRE_CMD,
+
+ /* Request busy status */
+ AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD,
+
+ /* Check if path is a mountpoint */
+ AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD,
+};
+
+#ifndef AUTOFS_IOCTL
+#define AUTOFS_IOCTL 0x93
+#endif
+
+#define AUTOFS_DEV_IOCTL_VERSION \
+ _IOWR(AUTOFS_IOCTL, \
+ AUTOFS_DEV_IOCTL_VERSION_CMD, struct autofs_dev_ioctl)
+
+#define AUTOFS_DEV_IOCTL_PROTOVER \
+ _IOWR(AUTOFS_IOCTL, \
+ AUTOFS_DEV_IOCTL_PROTOVER_CMD, struct autofs_dev_ioctl)
+
+#define AUTOFS_DEV_IOCTL_PROTOSUBVER \
+ _IOWR(AUTOFS_IOCTL, \
+ AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD, struct autofs_dev_ioctl)
+
+#define AUTOFS_DEV_IOCTL_OPENMOUNT \
+ _IOWR(AUTOFS_IOCTL, \
+ AUTOFS_DEV_IOCTL_OPENMOUNT_CMD, struct autofs_dev_ioctl)
+
+#define AUTOFS_DEV_IOCTL_CLOSEMOUNT \
+ _IOWR(AUTOFS_IOCTL, \
+ AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD, struct autofs_dev_ioctl)
+
+#define AUTOFS_DEV_IOCTL_READY \
+ _IOWR(AUTOFS_IOCTL, \
+ AUTOFS_DEV_IOCTL_READY_CMD, struct autofs_dev_ioctl)
+
+#define AUTOFS_DEV_IOCTL_FAIL \
+ _IOWR(AUTOFS_IOCTL, \
+ AUTOFS_DEV_IOCTL_FAIL_CMD, struct autofs_dev_ioctl)
+
+#define AUTOFS_DEV_IOCTL_SETPIPEFD \
+ _IOWR(AUTOFS_IOCTL, \
+ AUTOFS_DEV_IOCTL_SETPIPEFD_CMD, struct autofs_dev_ioctl)
+
+#define AUTOFS_DEV_IOCTL_CATATONIC \
+ _IOWR(AUTOFS_IOCTL, \
+ AUTOFS_DEV_IOCTL_CATATONIC_CMD, struct autofs_dev_ioctl)
+
+#define AUTOFS_DEV_IOCTL_TIMEOUT \
+ _IOWR(AUTOFS_IOCTL, \
+ AUTOFS_DEV_IOCTL_TIMEOUT_CMD, struct autofs_dev_ioctl)
+
+#define AUTOFS_DEV_IOCTL_REQUESTER \
+ _IOWR(AUTOFS_IOCTL, \
+ AUTOFS_DEV_IOCTL_REQUESTER_CMD, struct autofs_dev_ioctl)
+
+#define AUTOFS_DEV_IOCTL_EXPIRE \
+ _IOWR(AUTOFS_IOCTL, \
+ AUTOFS_DEV_IOCTL_EXPIRE_CMD, struct autofs_dev_ioctl)
+
+#define AUTOFS_DEV_IOCTL_ASKUMOUNT \
+ _IOWR(AUTOFS_IOCTL, \
+ AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD, struct autofs_dev_ioctl)
+
+#define AUTOFS_DEV_IOCTL_ISMOUNTPOINT \
+ _IOWR(AUTOFS_IOCTL, \
+ AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD, struct autofs_dev_ioctl)
+
+#endif /* _LINUX_AUTO_DEV_IOCTL_H */
diff --git a/src/shared/linux/bpf.h b/src/shared/linux/bpf.h
new file mode 100644
index 0000000..9f8af5e
--- /dev/null
+++ b/src/shared/linux/bpf.h
@@ -0,0 +1,7053 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#ifndef __LINUX_BPF_H__
+#define __LINUX_BPF_H__
+
+#include <linux/types.h>
+#include <linux/bpf_common.h>
+
+/* Extended instruction set based on top of classic BPF */
+
+/* instruction classes */
+#define BPF_JMP32 0x06 /* jmp mode in word width */
+#define BPF_ALU64 0x07 /* alu mode in double word width */
+
+/* ld/ldx fields */
+#define BPF_DW 0x18 /* double word (64-bit) */
+#define BPF_ATOMIC 0xc0 /* atomic memory ops - op type in immediate */
+#define BPF_XADD 0xc0 /* exclusive add - legacy name */
+
+/* alu/jmp fields */
+#define BPF_MOV 0xb0 /* mov reg to reg */
+#define BPF_ARSH 0xc0 /* sign extending arithmetic shift right */
+
+/* change endianness of a register */
+#define BPF_END 0xd0 /* flags for endianness conversion: */
+#define BPF_TO_LE 0x00 /* convert to little-endian */
+#define BPF_TO_BE 0x08 /* convert to big-endian */
+#define BPF_FROM_LE BPF_TO_LE
+#define BPF_FROM_BE BPF_TO_BE
+
+/* jmp encodings */
+#define BPF_JNE 0x50 /* jump != */
+#define BPF_JLT 0xa0 /* LT is unsigned, '<' */
+#define BPF_JLE 0xb0 /* LE is unsigned, '<=' */
+#define BPF_JSGT 0x60 /* SGT is signed '>', GT in x86 */
+#define BPF_JSGE 0x70 /* SGE is signed '>=', GE in x86 */
+#define BPF_JSLT 0xc0 /* SLT is signed, '<' */
+#define BPF_JSLE 0xd0 /* SLE is signed, '<=' */
+#define BPF_CALL 0x80 /* function call */
+#define BPF_EXIT 0x90 /* function return */
+
+/* atomic op type fields (stored in immediate) */
+#define BPF_FETCH 0x01 /* not an opcode on its own, used to build others */
+#define BPF_XCHG (0xe0 | BPF_FETCH) /* atomic exchange */
+#define BPF_CMPXCHG (0xf0 | BPF_FETCH) /* atomic compare-and-write */
+
+/* Register numbers */
+enum {
+ BPF_REG_0 = 0,
+ BPF_REG_1,
+ BPF_REG_2,
+ BPF_REG_3,
+ BPF_REG_4,
+ BPF_REG_5,
+ BPF_REG_6,
+ BPF_REG_7,
+ BPF_REG_8,
+ BPF_REG_9,
+ BPF_REG_10,
+ __MAX_BPF_REG,
+};
+
+/* BPF has 10 general purpose 64-bit registers and stack frame. */
+#define MAX_BPF_REG __MAX_BPF_REG
+
+struct bpf_insn {
+ __u8 code; /* opcode */
+ __u8 dst_reg:4; /* dest register */
+ __u8 src_reg:4; /* source register */
+ __s16 off; /* signed offset */
+ __s32 imm; /* signed immediate constant */
+};
+
+/* Key of a BPF_MAP_TYPE_LPM_TRIE entry */
+struct bpf_lpm_trie_key {
+ __u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */
+ __u8 data[0]; /* Arbitrary size */
+};
+
+struct bpf_cgroup_storage_key {
+ __u64 cgroup_inode_id; /* cgroup inode id */
+ __u32 attach_type; /* program attach type (enum bpf_attach_type) */
+};
+
+enum bpf_cgroup_iter_order {
+ BPF_CGROUP_ITER_ORDER_UNSPEC = 0,
+ BPF_CGROUP_ITER_SELF_ONLY, /* process only a single object. */
+ BPF_CGROUP_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */
+ BPF_CGROUP_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */
+ BPF_CGROUP_ITER_ANCESTORS_UP, /* walk ancestors upward. */
+};
+
+union bpf_iter_link_info {
+ struct {
+ __u32 map_fd;
+ } map;
+ struct {
+ enum bpf_cgroup_iter_order order;
+
+ /* At most one of cgroup_fd and cgroup_id can be non-zero. If
+ * both are zero, the walk starts from the default cgroup v2
+ * root. For walking v1 hierarchy, one should always explicitly
+ * specify cgroup_fd.
+ */
+ __u32 cgroup_fd;
+ __u64 cgroup_id;
+ } cgroup;
+ /* Parameters of task iterators. */
+ struct {
+ __u32 tid;
+ __u32 pid;
+ __u32 pid_fd;
+ } task;
+};
+
+/* BPF syscall commands, see bpf(2) man-page for more details. */
+/**
+ * DOC: eBPF Syscall Preamble
+ *
+ * The operation to be performed by the **bpf**\ () system call is determined
+ * by the *cmd* argument. Each operation takes an accompanying argument,
+ * provided via *attr*, which is a pointer to a union of type *bpf_attr* (see
+ * below). The size argument is the size of the union pointed to by *attr*.
+ */
+/**
+ * DOC: eBPF Syscall Commands
+ *
+ * BPF_MAP_CREATE
+ * Description
+ * Create a map and return a file descriptor that refers to the
+ * map. The close-on-exec file descriptor flag (see **fcntl**\ (2))
+ * is automatically enabled for the new file descriptor.
+ *
+ * Applying **close**\ (2) to the file descriptor returned by
+ * **BPF_MAP_CREATE** will delete the map (but see NOTES).
+ *
+ * Return
+ * A new file descriptor (a nonnegative integer), or -1 if an
+ * error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_MAP_LOOKUP_ELEM
+ * Description
+ * Look up an element with a given *key* in the map referred to
+ * by the file descriptor *map_fd*.
+ *
+ * The *flags* argument may be specified as one of the
+ * following:
+ *
+ * **BPF_F_LOCK**
+ * Look up the value of a spin-locked map without
+ * returning the lock. This must be specified if the
+ * elements contain a spinlock.
+ *
+ * Return
+ * Returns zero on success. On error, -1 is returned and *errno*
+ * is set appropriately.
+ *
+ * BPF_MAP_UPDATE_ELEM
+ * Description
+ * Create or update an element (key/value pair) in a specified map.
+ *
+ * The *flags* argument should be specified as one of the
+ * following:
+ *
+ * **BPF_ANY**
+ * Create a new element or update an existing element.
+ * **BPF_NOEXIST**
+ * Create a new element only if it did not exist.
+ * **BPF_EXIST**
+ * Update an existing element.
+ * **BPF_F_LOCK**
+ * Update a spin_lock-ed map element.
+ *
+ * Return
+ * Returns zero on success. On error, -1 is returned and *errno*
+ * is set appropriately.
+ *
+ * May set *errno* to **EINVAL**, **EPERM**, **ENOMEM**,
+ * **E2BIG**, **EEXIST**, or **ENOENT**.
+ *
+ * **E2BIG**
+ * The number of elements in the map reached the
+ * *max_entries* limit specified at map creation time.
+ * **EEXIST**
+ * If *flags* specifies **BPF_NOEXIST** and the element
+ * with *key* already exists in the map.
+ * **ENOENT**
+ * If *flags* specifies **BPF_EXIST** and the element with
+ * *key* does not exist in the map.
+ *
+ * BPF_MAP_DELETE_ELEM
+ * Description
+ * Look up and delete an element by key in a specified map.
+ *
+ * Return
+ * Returns zero on success. On error, -1 is returned and *errno*
+ * is set appropriately.
+ *
+ * BPF_MAP_GET_NEXT_KEY
+ * Description
+ * Look up an element by key in a specified map and return the key
+ * of the next element. Can be used to iterate over all elements
+ * in the map.
+ *
+ * Return
+ * Returns zero on success. On error, -1 is returned and *errno*
+ * is set appropriately.
+ *
+ * The following cases can be used to iterate over all elements of
+ * the map:
+ *
+ * * If *key* is not found, the operation returns zero and sets
+ * the *next_key* pointer to the key of the first element.
+ * * If *key* is found, the operation returns zero and sets the
+ * *next_key* pointer to the key of the next element.
+ * * If *key* is the last element, returns -1 and *errno* is set
+ * to **ENOENT**.
+ *
+ * May set *errno* to **ENOMEM**, **EFAULT**, **EPERM**, or
+ * **EINVAL** on error.
+ *
+ * BPF_PROG_LOAD
+ * Description
+ * Verify and load an eBPF program, returning a new file
+ * descriptor associated with the program.
+ *
+ * Applying **close**\ (2) to the file descriptor returned by
+ * **BPF_PROG_LOAD** will unload the eBPF program (but see NOTES).
+ *
+ * The close-on-exec file descriptor flag (see **fcntl**\ (2)) is
+ * automatically enabled for the new file descriptor.
+ *
+ * Return
+ * A new file descriptor (a nonnegative integer), or -1 if an
+ * error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_OBJ_PIN
+ * Description
+ * Pin an eBPF program or map referred by the specified *bpf_fd*
+ * to the provided *pathname* on the filesystem.
+ *
+ * The *pathname* argument must not contain a dot (".").
+ *
+ * On success, *pathname* retains a reference to the eBPF object,
+ * preventing deallocation of the object when the original
+ * *bpf_fd* is closed. This allows the eBPF object to live beyond
+ * **close**\ (\ *bpf_fd*\ ), and hence the lifetime of the parent
+ * process.
+ *
+ * Applying **unlink**\ (2) or similar calls to the *pathname*
+ * unpins the object from the filesystem, removing the reference.
+ * If no other file descriptors or filesystem nodes refer to the
+ * same object, it will be deallocated (see NOTES).
+ *
+ * The filesystem type for the parent directory of *pathname* must
+ * be **BPF_FS_MAGIC**.
+ *
+ * Return
+ * Returns zero on success. On error, -1 is returned and *errno*
+ * is set appropriately.
+ *
+ * BPF_OBJ_GET
+ * Description
+ * Open a file descriptor for the eBPF object pinned to the
+ * specified *pathname*.
+ *
+ * Return
+ * A new file descriptor (a nonnegative integer), or -1 if an
+ * error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_PROG_ATTACH
+ * Description
+ * Attach an eBPF program to a *target_fd* at the specified
+ * *attach_type* hook.
+ *
+ * The *attach_type* specifies the eBPF attachment point to
+ * attach the program to, and must be one of *bpf_attach_type*
+ * (see below).
+ *
+ * The *attach_bpf_fd* must be a valid file descriptor for a
+ * loaded eBPF program of a cgroup, flow dissector, LIRC, sockmap
+ * or sock_ops type corresponding to the specified *attach_type*.
+ *
+ * The *target_fd* must be a valid file descriptor for a kernel
+ * object which depends on the attach type of *attach_bpf_fd*:
+ *
+ * **BPF_PROG_TYPE_CGROUP_DEVICE**,
+ * **BPF_PROG_TYPE_CGROUP_SKB**,
+ * **BPF_PROG_TYPE_CGROUP_SOCK**,
+ * **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**,
+ * **BPF_PROG_TYPE_CGROUP_SOCKOPT**,
+ * **BPF_PROG_TYPE_CGROUP_SYSCTL**,
+ * **BPF_PROG_TYPE_SOCK_OPS**
+ *
+ * Control Group v2 hierarchy with the eBPF controller
+ * enabled. Requires the kernel to be compiled with
+ * **CONFIG_CGROUP_BPF**.
+ *
+ * **BPF_PROG_TYPE_FLOW_DISSECTOR**
+ *
+ * Network namespace (eg /proc/self/ns/net).
+ *
+ * **BPF_PROG_TYPE_LIRC_MODE2**
+ *
+ * LIRC device path (eg /dev/lircN). Requires the kernel
+ * to be compiled with **CONFIG_BPF_LIRC_MODE2**.
+ *
+ * **BPF_PROG_TYPE_SK_SKB**,
+ * **BPF_PROG_TYPE_SK_MSG**
+ *
+ * eBPF map of socket type (eg **BPF_MAP_TYPE_SOCKHASH**).
+ *
+ * Return
+ * Returns zero on success. On error, -1 is returned and *errno*
+ * is set appropriately.
+ *
+ * BPF_PROG_DETACH
+ * Description
+ * Detach the eBPF program associated with the *target_fd* at the
+ * hook specified by *attach_type*. The program must have been
+ * previously attached using **BPF_PROG_ATTACH**.
+ *
+ * Return
+ * Returns zero on success. On error, -1 is returned and *errno*
+ * is set appropriately.
+ *
+ * BPF_PROG_TEST_RUN
+ * Description
+ * Run the eBPF program associated with the *prog_fd* a *repeat*
+ * number of times against a provided program context *ctx_in* and
+ * data *data_in*, and return the modified program context
+ * *ctx_out*, *data_out* (for example, packet data), result of the
+ * execution *retval*, and *duration* of the test run.
+ *
+ * The sizes of the buffers provided as input and output
+ * parameters *ctx_in*, *ctx_out*, *data_in*, and *data_out* must
+ * be provided in the corresponding variables *ctx_size_in*,
+ * *ctx_size_out*, *data_size_in*, and/or *data_size_out*. If any
+ * of these parameters are not provided (ie set to NULL), the
+ * corresponding size field must be zero.
+ *
+ * Some program types have particular requirements:
+ *
+ * **BPF_PROG_TYPE_SK_LOOKUP**
+ * *data_in* and *data_out* must be NULL.
+ *
+ * **BPF_PROG_TYPE_RAW_TRACEPOINT**,
+ * **BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE**
+ *
+ * *ctx_out*, *data_in* and *data_out* must be NULL.
+ * *repeat* must be zero.
+ *
+ * BPF_PROG_RUN is an alias for BPF_PROG_TEST_RUN.
+ *
+ * Return
+ * Returns zero on success. On error, -1 is returned and *errno*
+ * is set appropriately.
+ *
+ * **ENOSPC**
+ * Either *data_size_out* or *ctx_size_out* is too small.
+ * **ENOTSUPP**
+ * This command is not supported by the program type of
+ * the program referred to by *prog_fd*.
+ *
+ * BPF_PROG_GET_NEXT_ID
+ * Description
+ * Fetch the next eBPF program currently loaded into the kernel.
+ *
+ * Looks for the eBPF program with an id greater than *start_id*
+ * and updates *next_id* on success. If no other eBPF programs
+ * remain with ids higher than *start_id*, returns -1 and sets
+ * *errno* to **ENOENT**.
+ *
+ * Return
+ * Returns zero on success. On error, or when no id remains, -1
+ * is returned and *errno* is set appropriately.
+ *
+ * BPF_MAP_GET_NEXT_ID
+ * Description
+ * Fetch the next eBPF map currently loaded into the kernel.
+ *
+ * Looks for the eBPF map with an id greater than *start_id*
+ * and updates *next_id* on success. If no other eBPF maps
+ * remain with ids higher than *start_id*, returns -1 and sets
+ * *errno* to **ENOENT**.
+ *
+ * Return
+ * Returns zero on success. On error, or when no id remains, -1
+ * is returned and *errno* is set appropriately.
+ *
+ * BPF_PROG_GET_FD_BY_ID
+ * Description
+ * Open a file descriptor for the eBPF program corresponding to
+ * *prog_id*.
+ *
+ * Return
+ * A new file descriptor (a nonnegative integer), or -1 if an
+ * error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_MAP_GET_FD_BY_ID
+ * Description
+ * Open a file descriptor for the eBPF map corresponding to
+ * *map_id*.
+ *
+ * Return
+ * A new file descriptor (a nonnegative integer), or -1 if an
+ * error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_OBJ_GET_INFO_BY_FD
+ * Description
+ * Obtain information about the eBPF object corresponding to
+ * *bpf_fd*.
+ *
+ * Populates up to *info_len* bytes of *info*, which will be in
+ * one of the following formats depending on the eBPF object type
+ * of *bpf_fd*:
+ *
+ * * **struct bpf_prog_info**
+ * * **struct bpf_map_info**
+ * * **struct bpf_btf_info**
+ * * **struct bpf_link_info**
+ *
+ * Return
+ * Returns zero on success. On error, -1 is returned and *errno*
+ * is set appropriately.
+ *
+ * BPF_PROG_QUERY
+ * Description
+ * Obtain information about eBPF programs associated with the
+ * specified *attach_type* hook.
+ *
+ * The *target_fd* must be a valid file descriptor for a kernel
+ * object which depends on the attach type of *attach_bpf_fd*:
+ *
+ * **BPF_PROG_TYPE_CGROUP_DEVICE**,
+ * **BPF_PROG_TYPE_CGROUP_SKB**,
+ * **BPF_PROG_TYPE_CGROUP_SOCK**,
+ * **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**,
+ * **BPF_PROG_TYPE_CGROUP_SOCKOPT**,
+ * **BPF_PROG_TYPE_CGROUP_SYSCTL**,
+ * **BPF_PROG_TYPE_SOCK_OPS**
+ *
+ * Control Group v2 hierarchy with the eBPF controller
+ * enabled. Requires the kernel to be compiled with
+ * **CONFIG_CGROUP_BPF**.
+ *
+ * **BPF_PROG_TYPE_FLOW_DISSECTOR**
+ *
+ * Network namespace (eg /proc/self/ns/net).
+ *
+ * **BPF_PROG_TYPE_LIRC_MODE2**
+ *
+ * LIRC device path (eg /dev/lircN). Requires the kernel
+ * to be compiled with **CONFIG_BPF_LIRC_MODE2**.
+ *
+ * **BPF_PROG_QUERY** always fetches the number of programs
+ * attached and the *attach_flags* which were used to attach those
+ * programs. Additionally, if *prog_ids* is nonzero and the number
+ * of attached programs is less than *prog_cnt*, populates
+ * *prog_ids* with the eBPF program ids of the programs attached
+ * at *target_fd*.
+ *
+ * The following flags may alter the result:
+ *
+ * **BPF_F_QUERY_EFFECTIVE**
+ * Only return information regarding programs which are
+ * currently effective at the specified *target_fd*.
+ *
+ * Return
+ * Returns zero on success. On error, -1 is returned and *errno*
+ * is set appropriately.
+ *
+ * BPF_RAW_TRACEPOINT_OPEN
+ * Description
+ * Attach an eBPF program to a tracepoint *name* to access kernel
+ * internal arguments of the tracepoint in their raw form.
+ *
+ * The *prog_fd* must be a valid file descriptor associated with
+ * a loaded eBPF program of type **BPF_PROG_TYPE_RAW_TRACEPOINT**.
+ *
+ * No ABI guarantees are made about the content of tracepoint
+ * arguments exposed to the corresponding eBPF program.
+ *
+ * Applying **close**\ (2) to the file descriptor returned by
+ * **BPF_RAW_TRACEPOINT_OPEN** will detach the program (but see NOTES).
+ *
+ * Return
+ * A new file descriptor (a nonnegative integer), or -1 if an
+ * error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_BTF_LOAD
+ * Description
+ * Verify and load BPF Type Format (BTF) metadata into the kernel,
+ * returning a new file descriptor associated with the metadata.
+ * BTF is described in more detail at
+ * https://www.kernel.org/doc/html/latest/bpf/btf.html.
+ *
+ * The *btf* parameter must point to valid memory providing
+ * *btf_size* bytes of BTF binary metadata.
+ *
+ * The returned file descriptor can be passed to other **bpf**\ ()
+ * subcommands such as **BPF_PROG_LOAD** or **BPF_MAP_CREATE** to
+ * associate the BTF with those objects.
+ *
+ * Similar to **BPF_PROG_LOAD**, **BPF_BTF_LOAD** has optional
+ * parameters to specify a *btf_log_buf*, *btf_log_size* and
+ * *btf_log_level* which allow the kernel to return freeform log
+ * output regarding the BTF verification process.
+ *
+ * Return
+ * A new file descriptor (a nonnegative integer), or -1 if an
+ * error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_BTF_GET_FD_BY_ID
+ * Description
+ * Open a file descriptor for the BPF Type Format (BTF)
+ * corresponding to *btf_id*.
+ *
+ * Return
+ * A new file descriptor (a nonnegative integer), or -1 if an
+ * error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_TASK_FD_QUERY
+ * Description
+ * Obtain information about eBPF programs associated with the
+ * target process identified by *pid* and *fd*.
+ *
+ * If the *pid* and *fd* are associated with a tracepoint, kprobe
+ * or uprobe perf event, then the *prog_id* and *fd_type* will
+ * be populated with the eBPF program id and file descriptor type
+ * of type **bpf_task_fd_type**. If associated with a kprobe or
+ * uprobe, the *probe_offset* and *probe_addr* will also be
+ * populated. Optionally, if *buf* is provided, then up to
+ * *buf_len* bytes of *buf* will be populated with the name of
+ * the tracepoint, kprobe or uprobe.
+ *
+ * The resulting *prog_id* may be introspected in deeper detail
+ * using **BPF_PROG_GET_FD_BY_ID** and **BPF_OBJ_GET_INFO_BY_FD**.
+ *
+ * Return
+ * Returns zero on success. On error, -1 is returned and *errno*
+ * is set appropriately.
+ *
+ * BPF_MAP_LOOKUP_AND_DELETE_ELEM
+ * Description
+ * Look up an element with the given *key* in the map referred to
+ * by the file descriptor *fd*, and if found, delete the element.
+ *
+ * For **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map
+ * types, the *flags* argument needs to be set to 0, but for other
+ * map types, it may be specified as:
+ *
+ * **BPF_F_LOCK**
+ * Look up and delete the value of a spin-locked map
+ * without returning the lock. This must be specified if
+ * the elements contain a spinlock.
+ *
+ * The **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map types
+ * implement this command as a "pop" operation, deleting the top
+ * element rather than one corresponding to *key*.
+ * The *key* and *key_len* parameters should be zeroed when
+ * issuing this operation for these map types.
+ *
+ * This command is only valid for the following map types:
+ * * **BPF_MAP_TYPE_QUEUE**
+ * * **BPF_MAP_TYPE_STACK**
+ * * **BPF_MAP_TYPE_HASH**
+ * * **BPF_MAP_TYPE_PERCPU_HASH**
+ * * **BPF_MAP_TYPE_LRU_HASH**
+ * * **BPF_MAP_TYPE_LRU_PERCPU_HASH**
+ *
+ * Return
+ * Returns zero on success. On error, -1 is returned and *errno*
+ * is set appropriately.
+ *
+ * BPF_MAP_FREEZE
+ * Description
+ * Freeze the permissions of the specified map.
+ *
+ * Write permissions may be frozen by passing zero *flags*.
+ * Upon success, no future syscall invocations may alter the
+ * map state of *map_fd*. Write operations from eBPF programs
+ * are still possible for a frozen map.
+ *
+ * Not supported for maps of type **BPF_MAP_TYPE_STRUCT_OPS**.
+ *
+ * Return
+ * Returns zero on success. On error, -1 is returned and *errno*
+ * is set appropriately.
+ *
+ * BPF_BTF_GET_NEXT_ID
+ * Description
+ * Fetch the next BPF Type Format (BTF) object currently loaded
+ * into the kernel.
+ *
+ * Looks for the BTF object with an id greater than *start_id*
+ * and updates *next_id* on success. If no other BTF objects
+ * remain with ids higher than *start_id*, returns -1 and sets
+ * *errno* to **ENOENT**.
+ *
+ * Return
+ * Returns zero on success. On error, or when no id remains, -1
+ * is returned and *errno* is set appropriately.
+ *
+ * BPF_MAP_LOOKUP_BATCH
+ * Description
+ * Iterate and fetch multiple elements in a map.
+ *
+ * Two opaque values are used to manage batch operations,
+ * *in_batch* and *out_batch*. Initially, *in_batch* must be set
+ * to NULL to begin the batched operation. After each subsequent
+ * **BPF_MAP_LOOKUP_BATCH**, the caller should pass the resultant
+ * *out_batch* as the *in_batch* for the next operation to
+ * continue iteration from the current point.
+ *
+ * The *keys* and *values* are output parameters which must point
+ * to memory large enough to hold *count* items based on the key
+ * and value size of the map *map_fd*. The *keys* buffer must be
+ * of *key_size* * *count*. The *values* buffer must be of
+ * *value_size* * *count*.
+ *
+ * The *elem_flags* argument may be specified as one of the
+ * following:
+ *
+ * **BPF_F_LOCK**
+ * Look up the value of a spin-locked map without
+ * returning the lock. This must be specified if the
+ * elements contain a spinlock.
+ *
+ * On success, *count* elements from the map are copied into the
+ * user buffer, with the keys copied into *keys* and the values
+ * copied into the corresponding indices in *values*.
+ *
+ * If an error is returned and *errno* is not **EFAULT**, *count*
+ * is set to the number of successfully processed elements.
+ *
+ * Return
+ * Returns zero on success. On error, -1 is returned and *errno*
+ * is set appropriately.
+ *
+ * May set *errno* to **ENOSPC** to indicate that *keys* or
+ * *values* is too small to dump an entire bucket during
+ * iteration of a hash-based map type.
+ *
+ * BPF_MAP_LOOKUP_AND_DELETE_BATCH
+ * Description
+ * Iterate and delete all elements in a map.
+ *
+ * This operation has the same behavior as
+ * **BPF_MAP_LOOKUP_BATCH** with two exceptions:
+ *
+ * * Every element that is successfully returned is also deleted
+ * from the map. This is at least *count* elements. Note that
+ * *count* is both an input and an output parameter.
+ * * Upon returning with *errno* set to **EFAULT**, up to
+ * *count* elements may be deleted without returning the keys
+ * and values of the deleted elements.
+ *
+ * Return
+ * Returns zero on success. On error, -1 is returned and *errno*
+ * is set appropriately.
+ *
+ * BPF_MAP_UPDATE_BATCH
+ * Description
+ * Update multiple elements in a map by *key*.
+ *
+ * The *keys* and *values* are input parameters which must point
+ * to memory large enough to hold *count* items based on the key
+ * and value size of the map *map_fd*. The *keys* buffer must be
+ * of *key_size* * *count*. The *values* buffer must be of
+ * *value_size* * *count*.
+ *
+ * Each element specified in *keys* is sequentially updated to the
+ * value in the corresponding index in *values*. The *in_batch*
+ * and *out_batch* parameters are ignored and should be zeroed.
+ *
+ * The *elem_flags* argument should be specified as one of the
+ * following:
+ *
+ * **BPF_ANY**
+ * Create new elements or update existing elements.
+ * **BPF_NOEXIST**
+ * Create new elements only if they do not exist.
+ * **BPF_EXIST**
+ * Update existing elements.
+ * **BPF_F_LOCK**
+ * Update spin_lock-ed map elements. This must be
+ * specified if the map value contains a spinlock.
+ *
+ * On success, *count* elements from the map are updated.
+ *
+ * If an error is returned and *errno* is not **EFAULT**, *count*
+ * is set to the number of successfully processed elements.
+ *
+ * Return
+ * Returns zero on success. On error, -1 is returned and *errno*
+ * is set appropriately.
+ *
+ * May set *errno* to **EINVAL**, **EPERM**, **ENOMEM**, or
+ * **E2BIG**. **E2BIG** indicates that the number of elements in
+ * the map reached the *max_entries* limit specified at map
+ * creation time.
+ *
+ * May set *errno* to one of the following error codes under
+ * specific circumstances:
+ *
+ * **EEXIST**
+ * If *elem_flags* specifies **BPF_NOEXIST** and the element
+ * with *key* already exists in the map.
+ * **ENOENT**
+ * If *elem_flags* specifies **BPF_EXIST** and the element with
+ * *key* does not exist in the map.
+ *
+ * BPF_MAP_DELETE_BATCH
+ * Description
+ * Delete multiple elements in a map by *key*.
+ *
+ * The *keys* parameter is an input parameter which must point
+ * to memory large enough to hold *count* items based on the key
+ * size of the map *map_fd*, that is, *key_size* * *count*.
+ *
+ * Each element specified in *keys* is sequentially deleted. The
+ * *in_batch*, *out_batch*, and *values* parameters are ignored
+ * and should be zeroed.
+ *
+ * The *elem_flags* argument may be specified as one of the
+ * following:
+ *
+ * **BPF_F_LOCK**
+ * Look up the value of a spin-locked map without
+ * returning the lock. This must be specified if the
+ * elements contain a spinlock.
+ *
+ * On success, *count* elements from the map are deleted.
+ *
+ * If an error is returned and *errno* is not **EFAULT**, *count*
+ * is set to the number of successfully processed elements. If
+ * *errno* is **EFAULT**, up to *count* elements may have been
+ * deleted.
+ *
+ * Return
+ * Returns zero on success. On error, -1 is returned and *errno*
+ * is set appropriately.
+ *
+ * BPF_LINK_CREATE
+ * Description
+ * Attach an eBPF program to a *target_fd* at the specified
+ * *attach_type* hook and return a file descriptor handle for
+ * managing the link.
+ *
+ * Return
+ * A new file descriptor (a nonnegative integer), or -1 if an
+ * error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_LINK_UPDATE
+ * Description
+ * Update the eBPF program in the specified *link_fd* to
+ * *new_prog_fd*.
+ *
+ * Return
+ * Returns zero on success. On error, -1 is returned and *errno*
+ * is set appropriately.
+ *
+ * BPF_LINK_GET_FD_BY_ID
+ * Description
+ * Open a file descriptor for the eBPF Link corresponding to
+ * *link_id*.
+ *
+ * Return
+ * A new file descriptor (a nonnegative integer), or -1 if an
+ * error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_LINK_GET_NEXT_ID
+ * Description
+ * Fetch the next eBPF link currently loaded into the kernel.
+ *
+ * Looks for the eBPF link with an id greater than *start_id*
+ * and updates *next_id* on success. If no other eBPF links
+ * remain with ids higher than *start_id*, returns -1 and sets
+ * *errno* to **ENOENT**.
+ *
+ * Return
+ * Returns zero on success. On error, or when no id remains, -1
+ * is returned and *errno* is set appropriately.
+ *
+ * BPF_ENABLE_STATS
+ * Description
+ * Enable eBPF runtime statistics gathering.
+ *
+ * Runtime statistics gathering for the eBPF runtime is disabled
+ * by default to minimize the corresponding performance overhead.
+ * This command enables statistics globally.
+ *
+ * Multiple programs may independently enable statistics.
+ * After gathering the desired statistics, eBPF runtime statistics
+ * may be disabled again by calling **close**\ (2) for the file
+ * descriptor returned by this function. Statistics will only be
+ * disabled system-wide when all outstanding file descriptors
+ * returned by prior calls for this subcommand are closed.
+ *
+ * Return
+ * A new file descriptor (a nonnegative integer), or -1 if an
+ * error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_ITER_CREATE
+ * Description
+ * Create an iterator on top of the specified *link_fd* (as
+ * previously created using **BPF_LINK_CREATE**) and return a
+ * file descriptor that can be used to trigger the iteration.
+ *
+ * If the resulting file descriptor is pinned to the filesystem
+ * using **BPF_OBJ_PIN**, then subsequent **read**\ (2) syscalls
+ * for that path will trigger the iterator to read kernel state
+ * using the eBPF program attached to *link_fd*.
+ *
+ * Return
+ * A new file descriptor (a nonnegative integer), or -1 if an
+ * error occurred (in which case, *errno* is set appropriately).
+ *
+ * BPF_LINK_DETACH
+ * Description
+ * Forcefully detach the specified *link_fd* from its
+ * corresponding attachment point.
+ *
+ * Return
+ * Returns zero on success. On error, -1 is returned and *errno*
+ * is set appropriately.
+ *
+ * BPF_PROG_BIND_MAP
+ * Description
+ * Bind a map to the lifetime of an eBPF program.
+ *
+ * The map identified by *map_fd* is bound to the program
+ * identified by *prog_fd* and only released when *prog_fd* is
+ * released. This may be used in cases where metadata should be
+ * associated with a program which otherwise does not contain any
+ * references to the map (for example, embedded in the eBPF
+ * program instructions).
+ *
+ * Return
+ * Returns zero on success. On error, -1 is returned and *errno*
+ * is set appropriately.
+ *
+ * NOTES
+ * eBPF objects (maps and programs) can be shared between processes.
+ *
+ * * After **fork**\ (2), the child inherits file descriptors
+ * referring to the same eBPF objects.
+ * * File descriptors referring to eBPF objects can be transferred over
+ * **unix**\ (7) domain sockets.
+ * * File descriptors referring to eBPF objects can be duplicated in the
+ * usual way, using **dup**\ (2) and similar calls.
+ * * File descriptors referring to eBPF objects can be pinned to the
+ * filesystem using the **BPF_OBJ_PIN** command of **bpf**\ (2).
+ *
+ * An eBPF object is deallocated only after all file descriptors referring
+ * to the object have been closed and no references remain pinned to the
+ * filesystem or attached (for example, bound to a program or device).
+ */
+enum bpf_cmd { /* bpf() subcommand numbers; enumerators without '=' are numbered by position, starting at 0 */
+ BPF_MAP_CREATE,
+ BPF_MAP_LOOKUP_ELEM,
+ BPF_MAP_UPDATE_ELEM,
+ BPF_MAP_DELETE_ELEM,
+ BPF_MAP_GET_NEXT_KEY,
+ BPF_PROG_LOAD,
+ BPF_OBJ_PIN,
+ BPF_OBJ_GET,
+ BPF_PROG_ATTACH,
+ BPF_PROG_DETACH,
+ BPF_PROG_TEST_RUN,
+ BPF_PROG_RUN = BPF_PROG_TEST_RUN, /* alias: same value, so BPF_PROG_GET_NEXT_ID is still BPF_PROG_TEST_RUN + 1 */
+ BPF_PROG_GET_NEXT_ID,
+ BPF_MAP_GET_NEXT_ID,
+ BPF_PROG_GET_FD_BY_ID,
+ BPF_MAP_GET_FD_BY_ID,
+ BPF_OBJ_GET_INFO_BY_FD,
+ BPF_PROG_QUERY,
+ BPF_RAW_TRACEPOINT_OPEN,
+ BPF_BTF_LOAD,
+ BPF_BTF_GET_FD_BY_ID,
+ BPF_TASK_FD_QUERY,
+ BPF_MAP_LOOKUP_AND_DELETE_ELEM,
+ BPF_MAP_FREEZE,
+ BPF_BTF_GET_NEXT_ID,
+ BPF_MAP_LOOKUP_BATCH,
+ BPF_MAP_LOOKUP_AND_DELETE_BATCH,
+ BPF_MAP_UPDATE_BATCH,
+ BPF_MAP_DELETE_BATCH,
+ BPF_LINK_CREATE,
+ BPF_LINK_UPDATE,
+ BPF_LINK_GET_FD_BY_ID,
+ BPF_LINK_GET_NEXT_ID,
+ BPF_ENABLE_STATS,
+ BPF_ITER_CREATE,
+ BPF_LINK_DETACH,
+ BPF_PROG_BIND_MAP,
+};
+
+enum bpf_map_type { /* map kinds; enumerators without '=' are numbered by position, starting at 0 */
+ BPF_MAP_TYPE_UNSPEC,
+ BPF_MAP_TYPE_HASH,
+ BPF_MAP_TYPE_ARRAY,
+ BPF_MAP_TYPE_PROG_ARRAY,
+ BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+ BPF_MAP_TYPE_PERCPU_HASH,
+ BPF_MAP_TYPE_PERCPU_ARRAY,
+ BPF_MAP_TYPE_STACK_TRACE,
+ BPF_MAP_TYPE_CGROUP_ARRAY,
+ BPF_MAP_TYPE_LRU_HASH,
+ BPF_MAP_TYPE_LRU_PERCPU_HASH,
+ BPF_MAP_TYPE_LPM_TRIE,
+ BPF_MAP_TYPE_ARRAY_OF_MAPS,
+ BPF_MAP_TYPE_HASH_OF_MAPS,
+ BPF_MAP_TYPE_DEVMAP,
+ BPF_MAP_TYPE_SOCKMAP,
+ BPF_MAP_TYPE_CPUMAP,
+ BPF_MAP_TYPE_XSKMAP,
+ BPF_MAP_TYPE_SOCKHASH,
+ BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED,
+ /* BPF_MAP_TYPE_CGROUP_STORAGE is available to bpf programs attaching
+ * to a cgroup. The newer BPF_MAP_TYPE_CGRP_STORAGE is available to
+ * both cgroup-attached and other progs and supports all functionality
+ * provided by BPF_MAP_TYPE_CGROUP_STORAGE. So mark
+ * BPF_MAP_TYPE_CGROUP_STORAGE deprecated.
+ */
+ BPF_MAP_TYPE_CGROUP_STORAGE = BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED, /* old name, same value; numbering below is unaffected */
+ BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
+ BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
+ BPF_MAP_TYPE_QUEUE,
+ BPF_MAP_TYPE_STACK,
+ BPF_MAP_TYPE_SK_STORAGE,
+ BPF_MAP_TYPE_DEVMAP_HASH,
+ BPF_MAP_TYPE_STRUCT_OPS,
+ BPF_MAP_TYPE_RINGBUF,
+ BPF_MAP_TYPE_INODE_STORAGE,
+ BPF_MAP_TYPE_TASK_STORAGE,
+ BPF_MAP_TYPE_BLOOM_FILTER,
+ BPF_MAP_TYPE_USER_RINGBUF,
+ BPF_MAP_TYPE_CGRP_STORAGE,
+};
+
+/* Note that tracing related programs such as
+ * BPF_PROG_TYPE_{KPROBE,TRACEPOINT,PERF_EVENT,RAW_TRACEPOINT}
+ * are not subject to a stable API since kernel internal data
+ * structures can change from release to release and may
+ * therefore break existing tracing BPF programs. Tracing BPF
+ * programs correspond to /a/ specific kernel which is to be
+ * analyzed, and not /a/ specific kernel /and/ all future ones.
+ */
+enum bpf_prog_type { /* program kinds; every enumerator is numbered by position, starting at 0 */
+ BPF_PROG_TYPE_UNSPEC,
+ BPF_PROG_TYPE_SOCKET_FILTER,
+ BPF_PROG_TYPE_KPROBE,
+ BPF_PROG_TYPE_SCHED_CLS,
+ BPF_PROG_TYPE_SCHED_ACT,
+ BPF_PROG_TYPE_TRACEPOINT,
+ BPF_PROG_TYPE_XDP,
+ BPF_PROG_TYPE_PERF_EVENT,
+ BPF_PROG_TYPE_CGROUP_SKB,
+ BPF_PROG_TYPE_CGROUP_SOCK,
+ BPF_PROG_TYPE_LWT_IN,
+ BPF_PROG_TYPE_LWT_OUT,
+ BPF_PROG_TYPE_LWT_XMIT,
+ BPF_PROG_TYPE_SOCK_OPS,
+ BPF_PROG_TYPE_SK_SKB,
+ BPF_PROG_TYPE_CGROUP_DEVICE,
+ BPF_PROG_TYPE_SK_MSG,
+ BPF_PROG_TYPE_RAW_TRACEPOINT,
+ BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+ BPF_PROG_TYPE_LWT_SEG6LOCAL,
+ BPF_PROG_TYPE_LIRC_MODE2,
+ BPF_PROG_TYPE_SK_REUSEPORT,
+ BPF_PROG_TYPE_FLOW_DISSECTOR,
+ BPF_PROG_TYPE_CGROUP_SYSCTL,
+ BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE,
+ BPF_PROG_TYPE_CGROUP_SOCKOPT,
+ BPF_PROG_TYPE_TRACING,
+ BPF_PROG_TYPE_STRUCT_OPS,
+ BPF_PROG_TYPE_EXT,
+ BPF_PROG_TYPE_LSM,
+ BPF_PROG_TYPE_SK_LOOKUP,
+ BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
+};
+
+enum bpf_attach_type { /* attach points; every enumerator is numbered by position, starting at 0 */
+ BPF_CGROUP_INET_INGRESS,
+ BPF_CGROUP_INET_EGRESS,
+ BPF_CGROUP_INET_SOCK_CREATE,
+ BPF_CGROUP_SOCK_OPS,
+ BPF_SK_SKB_STREAM_PARSER,
+ BPF_SK_SKB_STREAM_VERDICT,
+ BPF_CGROUP_DEVICE,
+ BPF_SK_MSG_VERDICT,
+ BPF_CGROUP_INET4_BIND,
+ BPF_CGROUP_INET6_BIND,
+ BPF_CGROUP_INET4_CONNECT,
+ BPF_CGROUP_INET6_CONNECT,
+ BPF_CGROUP_INET4_POST_BIND,
+ BPF_CGROUP_INET6_POST_BIND,
+ BPF_CGROUP_UDP4_SENDMSG,
+ BPF_CGROUP_UDP6_SENDMSG,
+ BPF_LIRC_MODE2,
+ BPF_FLOW_DISSECTOR,
+ BPF_CGROUP_SYSCTL,
+ BPF_CGROUP_UDP4_RECVMSG,
+ BPF_CGROUP_UDP6_RECVMSG,
+ BPF_CGROUP_GETSOCKOPT,
+ BPF_CGROUP_SETSOCKOPT,
+ BPF_TRACE_RAW_TP,
+ BPF_TRACE_FENTRY,
+ BPF_TRACE_FEXIT,
+ BPF_MODIFY_RETURN,
+ BPF_LSM_MAC,
+ BPF_TRACE_ITER,
+ BPF_CGROUP_INET4_GETPEERNAME,
+ BPF_CGROUP_INET6_GETPEERNAME,
+ BPF_CGROUP_INET4_GETSOCKNAME,
+ BPF_CGROUP_INET6_GETSOCKNAME,
+ BPF_XDP_DEVMAP,
+ BPF_CGROUP_INET_SOCK_RELEASE,
+ BPF_XDP_CPUMAP,
+ BPF_SK_LOOKUP,
+ BPF_XDP,
+ BPF_SK_SKB_VERDICT,
+ BPF_SK_REUSEPORT_SELECT,
+ BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
+ BPF_PERF_EVENT,
+ BPF_TRACE_KPROBE_MULTI,
+ BPF_LSM_CGROUP,
+ __MAX_BPF_ATTACH_TYPE /* last enumerator: equals the number of entries above; presumably a bound, not an attach type -- confirm */
+};
+
+#define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE
+
+enum bpf_link_type { /* link kinds, explicitly numbered 0..9 */
+ BPF_LINK_TYPE_UNSPEC = 0,
+ BPF_LINK_TYPE_RAW_TRACEPOINT = 1,
+ BPF_LINK_TYPE_TRACING = 2,
+ BPF_LINK_TYPE_CGROUP = 3,
+ BPF_LINK_TYPE_ITER = 4,
+ BPF_LINK_TYPE_NETNS = 5,
+ BPF_LINK_TYPE_XDP = 6,
+ BPF_LINK_TYPE_PERF_EVENT = 7,
+ BPF_LINK_TYPE_KPROBE_MULTI = 8,
+ BPF_LINK_TYPE_STRUCT_OPS = 9,
+
+ MAX_BPF_LINK_TYPE, /* implicitly 10: one past the highest link type listed above */
+};
+
+/* cgroup-bpf attach flags used in BPF_PROG_ATTACH command
+ *
+ * NONE(default): No further bpf programs allowed in the subtree.
+ *
+ * BPF_F_ALLOW_OVERRIDE: If a sub-cgroup installs some bpf program,
+ * the program in this cgroup yields to sub-cgroup program.
+ *
+ * BPF_F_ALLOW_MULTI: If a sub-cgroup installs some bpf program,
+ * that cgroup program gets run in addition to the program in this cgroup.
+ *
+ * Only one program is allowed to be attached to a cgroup with
+ * NONE or BPF_F_ALLOW_OVERRIDE flag.
+ * Attaching another program on top of NONE or BPF_F_ALLOW_OVERRIDE will
+ * release old program and attach the new one. Attach flags have to match.
+ *
+ * Multiple programs are allowed to be attached to a cgroup with
+ * BPF_F_ALLOW_MULTI flag. They are executed in FIFO order
+ * (those that were attached first, run first)
+ * The programs of sub-cgroup are executed first, then programs of
+ * this cgroup and then programs of parent cgroup.
+ * When a child program makes a decision (like picking TCP CA or sock bind)
+ * the parent program has a chance to override it.
+ *
+ * With BPF_F_ALLOW_MULTI a new program is added to the end of the list of
+ * programs for a cgroup. Though it's possible to replace an old program at
+ * any position by also specifying BPF_F_REPLACE flag and position itself in
+ * replace_bpf_fd attribute. Old program at this position will be released.
+ *
+ * A cgroup with MULTI or OVERRIDE flag allows any attach flags in sub-cgroups.
+ * A cgroup with NONE doesn't allow any programs in sub-cgroups.
+ * Ex1:
+ * cgrp1 (MULTI progs A, B) ->
+ * cgrp2 (OVERRIDE prog C) ->
+ * cgrp3 (MULTI prog D) ->
+ * cgrp4 (OVERRIDE prog E) ->
+ * cgrp5 (NONE prog F)
+ * the event in cgrp5 triggers execution of F,D,A,B in that order.
+ * if prog F is detached, the execution is E,D,A,B
+ * if prog F and D are detached, the execution is E,A,B
+ * if prog F, E and D are detached, the execution is C,A,B
+ *
+ * All eligible programs are executed regardless of return code from
+ * earlier programs.
+ */
+#define BPF_F_ALLOW_OVERRIDE (1U << 0)
+#define BPF_F_ALLOW_MULTI (1U << 1)
+#define BPF_F_REPLACE (1U << 2)
+
+/* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the
+ * verifier will perform strict alignment checking as if the kernel
+ * has been built with CONFIG_EFFICIENT_UNALIGNED_ACCESS not set,
+ * and NET_IP_ALIGN defined to 2.
+ */
+#define BPF_F_STRICT_ALIGNMENT (1U << 0)
+
+/* If BPF_F_ANY_ALIGNMENT is used in BPF_PROG_LOAD command, the
+ * verifier will allow any alignment whatsoever. On platforms
+ * with strict alignment requirements for loads and stores (such
+ * as sparc and mips) the verifier validates that all loads and
+ * stores provably follow this requirement. This flag turns that
+ * checking and enforcement off.
+ *
+ * It is mostly used for testing when we want to validate the
+ * context and memory access aspects of the verifier, but because
+ * of an unaligned access the alignment check would trigger before
+ * the one we are interested in.
+ */
+#define BPF_F_ANY_ALIGNMENT (1U << 1)
+
+/* BPF_F_TEST_RND_HI32 is used in BPF_PROG_LOAD command for testing purpose.
+ * Verifier does sub-register def/use analysis and identifies instructions whose
+ * def only matters for low 32-bit, high 32-bit is never referenced later
+ * through implicit zero extension. Therefore verifier notifies JIT back-ends
+ * that it is safe to ignore clearing high 32-bit for these instructions. This
+ * saves some back-ends a lot of code-gen. However such optimization is not
+ * necessary on some arches, for example x86_64, arm64 etc, whose JIT back-ends
+ * hence haven't used verifier's analysis result. But, we really want to have a
+ * way to be able to verify the correctness of the described optimization on
+ * x86_64 on which testsuites are frequently exercised.
+ *
+ * So, this flag is introduced. Once it is set, verifier will randomize high
+ * 32-bit for those instructions which have been identified as safe to ignore them.
+ * Then, if verifier is not doing correct analysis, such randomization will
+ * regress tests to expose bugs.
+ */
+#define BPF_F_TEST_RND_HI32 (1U << 2)
+
+/* The verifier internal test flag. Behavior is undefined */
+#define BPF_F_TEST_STATE_FREQ (1U << 3)
+
+/* If BPF_F_SLEEPABLE is used in BPF_PROG_LOAD command, the verifier will
+ * restrict map and helper usage for such programs. Sleepable BPF programs can
+ * only be attached to hooks where kernel execution context allows sleeping.
+ * Such programs are allowed to use helpers that may sleep like
+ * bpf_copy_from_user().
+ */
+#define BPF_F_SLEEPABLE (1U << 4)
+
+/* If BPF_F_XDP_HAS_FRAGS is used in BPF_PROG_LOAD command, the loaded program
+ * fully supports xdp frags.
+ */
+#define BPF_F_XDP_HAS_FRAGS (1U << 5)
+
+/* link_create.kprobe_multi.flags used in LINK_CREATE command for
+ * BPF_TRACE_KPROBE_MULTI attach type to create return probe.
+ */
+#define BPF_F_KPROBE_MULTI_RETURN (1U << 0)
+
+/* When BPF ldimm64's insn[0].src_reg != 0 then this can have
+ * the following extensions:
+ *
+ * insn[0].src_reg: BPF_PSEUDO_MAP_[FD|IDX]
+ * insn[0].imm: map fd or fd_idx
+ * insn[1].imm: 0
+ * insn[0].off: 0
+ * insn[1].off: 0
+ * ldimm64 rewrite: address of map
+ * verifier type: CONST_PTR_TO_MAP
+ */
+#define BPF_PSEUDO_MAP_FD 1
+#define BPF_PSEUDO_MAP_IDX 5
+
+/* insn[0].src_reg: BPF_PSEUDO_MAP_[IDX_]VALUE
+ * insn[0].imm: map fd or fd_idx
+ * insn[1].imm: offset into value
+ * insn[0].off: 0
+ * insn[1].off: 0
+ * ldimm64 rewrite: address of map[0]+offset
+ * verifier type: PTR_TO_MAP_VALUE
+ */
+#define BPF_PSEUDO_MAP_VALUE 2
+#define BPF_PSEUDO_MAP_IDX_VALUE 6
+
+/* insn[0].src_reg: BPF_PSEUDO_BTF_ID
+ * insn[0].imm: kernel btf id of VAR
+ * insn[1].imm: 0
+ * insn[0].off: 0
+ * insn[1].off: 0
+ * ldimm64 rewrite: address of the kernel variable
+ * verifier type: PTR_TO_BTF_ID or PTR_TO_MEM, depending on whether the var
+ * is struct/union.
+ */
+#define BPF_PSEUDO_BTF_ID 3
+/* insn[0].src_reg: BPF_PSEUDO_FUNC
+ * insn[0].imm: insn offset to the func
+ * insn[1].imm: 0
+ * insn[0].off: 0
+ * insn[1].off: 0
+ * ldimm64 rewrite: address of the function
+ * verifier type: PTR_TO_FUNC.
+ */
+#define BPF_PSEUDO_FUNC 4
+
+/* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative
+ * offset to another bpf function
+ */
+#define BPF_PSEUDO_CALL 1
+/* when bpf_call->src_reg == BPF_PSEUDO_KFUNC_CALL,
+ * bpf_call->imm == btf_id of a BTF_KIND_FUNC in the running kernel
+ */
+#define BPF_PSEUDO_KFUNC_CALL 2
+
+/* flags for BPF_MAP_UPDATE_ELEM command (BPF_F_LOCK is also used by map lookups) */
+enum {
+ BPF_ANY = 0, /* create new element or update existing */
+ BPF_NOEXIST = 1, /* create new element if it didn't exist */
+ BPF_EXIST = 2, /* update existing element */
+ BPF_F_LOCK = 4, /* spin_lock-ed map_lookup/map_update; a separate bit from the 0/1/2 values above */
+};
+
+/* flags for BPF_MAP_CREATE command; each value below is a distinct single bit (1U << n) */
+enum {
+ BPF_F_NO_PREALLOC = (1U << 0),
+/* Instead of having one common LRU list in the
+ * BPF_MAP_TYPE_LRU_[PERCPU_]HASH map, use a percpu LRU list
+ * which can scale and perform better.
+ * Note, the LRU nodes (including free nodes) cannot be moved
+ * across different LRU lists.
+ */
+ BPF_F_NO_COMMON_LRU = (1U << 1),
+/* Specify numa node during map creation */
+ BPF_F_NUMA_NODE = (1U << 2),
+
+/* Flags for accessing BPF object from syscall side. */
+ BPF_F_RDONLY = (1U << 3),
+ BPF_F_WRONLY = (1U << 4),
+
+/* Flag for stack_map, store build_id+offset instead of pointer */
+ BPF_F_STACK_BUILD_ID = (1U << 5),
+
+/* Zero-initialize hash function seed. This should only be used for testing. */
+ BPF_F_ZERO_SEED = (1U << 6),
+
+/* Flags for accessing BPF object from program side. */
+ BPF_F_RDONLY_PROG = (1U << 7),
+ BPF_F_WRONLY_PROG = (1U << 8),
+
+/* Clone map from listener for newly accepted socket */
+ BPF_F_CLONE = (1U << 9),
+
+/* Enable memory-mapping BPF map */
+ BPF_F_MMAPABLE = (1U << 10),
+
+/* Share perf_event among processes */
+ BPF_F_PRESERVE_ELEMS = (1U << 11),
+
+/* Create a map that is suitable to be an inner map with dynamic max entries */
+ BPF_F_INNER_MAP = (1U << 12),
+};
+
+/* Flags for BPF_PROG_QUERY. */
+
+/* Query effective (directly attached + inherited from ancestor cgroups)
+ * programs that will be executed for events within a cgroup.
+ * attach_flags is always returned as 0 when this flag is set.
+ */
+#define BPF_F_QUERY_EFFECTIVE (1U << 0)
+
+/* Flags for BPF_PROG_TEST_RUN */
+
+/* If set, run the test on the cpu specified by bpf_attr.test.cpu */
+#define BPF_F_TEST_RUN_ON_CPU (1U << 0)
+/* If set, XDP frames will be transmitted after processing */
+#define BPF_F_TEST_XDP_LIVE_FRAMES (1U << 1)
+
+/* type for BPF_ENABLE_STATS; only one stats type is defined here */
+enum bpf_stats_type {
+ /* enabled run_time_ns and run_cnt */
+ BPF_STATS_RUN_TIME = 0,
+};
+
+enum bpf_stack_build_id_status { /* the three states named by the comments below, numbered 0..2 */
+ /* user space needs an empty entry to identify the end of a trace */
+ BPF_STACK_BUILD_ID_EMPTY = 0,
+ /* with valid build_id and offset */
+ BPF_STACK_BUILD_ID_VALID = 1,
+ /* couldn't get build_id, fallback to ip */
+ BPF_STACK_BUILD_ID_IP = 2,
+};
+
+#define BPF_BUILD_ID_SIZE 20
+struct bpf_stack_build_id {
+ __s32 status; /* presumably one of enum bpf_stack_build_id_status -- confirm */
+ unsigned char build_id[BPF_BUILD_ID_SIZE]; /* BPF_BUILD_ID_SIZE (20) bytes */
+ union { /* both members are __u64 and share the same storage */
+ __u64 offset; /* the status enum comments pair this with a valid build_id */
+ __u64 ip; /* the status enum comments name this as the fallback when no build_id was obtained */
+ };
+};
+
+#define BPF_OBJ_NAME_LEN 16U
+
+union bpf_attr {
+ struct { /* anonymous struct used by BPF_MAP_CREATE command */
+ __u32 map_type; /* one of enum bpf_map_type */
+ __u32 key_size; /* size of key in bytes */
+ __u32 value_size; /* size of value in bytes */
+ __u32 max_entries; /* max number of entries in a map */
+ __u32 map_flags; /* BPF_MAP_CREATE related
+ * flags defined above.
+ */
+ __u32 inner_map_fd; /* fd pointing to the inner map */
+ __u32 numa_node; /* numa node (effective only if
+ * BPF_F_NUMA_NODE is set).
+ */
+ char map_name[BPF_OBJ_NAME_LEN];
+ __u32 map_ifindex; /* ifindex of netdev to create on */
+ __u32 btf_fd; /* fd pointing to a BTF type data */
+ __u32 btf_key_type_id; /* BTF type_id of the key */
+ __u32 btf_value_type_id; /* BTF type_id of the value */
+ __u32 btf_vmlinux_value_type_id;/* BTF type_id of a kernel-
+ * struct stored as the
+ * map value
+ */
+ /* Any per-map-type extra fields
+ *
+ * BPF_MAP_TYPE_BLOOM_FILTER - the lowest 4 bits indicate the
+ * number of hash functions (if 0, the bloom filter will default
+ * to using 5 hash functions).
+ */
+ __u64 map_extra;
+ };
+
+ struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
+ __u32 map_fd;
+ __aligned_u64 key;
+ union {
+ __aligned_u64 value;
+ __aligned_u64 next_key;
+ };
+ __u64 flags;
+ };
+
+ struct { /* struct used by BPF_MAP_*_BATCH commands */
+ __aligned_u64 in_batch; /* start batch,
+ * NULL to start from beginning
+ */
+ __aligned_u64 out_batch; /* output: next start batch */
+ __aligned_u64 keys;
+ __aligned_u64 values;
+ __u32 count; /* input/output:
+ * input: # of key/value
+ * elements
+ * output: # of filled elements
+ */
+ __u32 map_fd;
+ __u64 elem_flags;
+ __u64 flags;
+ } batch;
+
+ struct { /* anonymous struct used by BPF_PROG_LOAD command */
+ __u32 prog_type; /* one of enum bpf_prog_type */
+ __u32 insn_cnt;
+ __aligned_u64 insns;
+ __aligned_u64 license;
+ __u32 log_level; /* verbosity level of verifier */
+ __u32 log_size; /* size of user buffer */
+ __aligned_u64 log_buf; /* user supplied buffer */
+ __u32 kern_version; /* not used */
+ __u32 prog_flags;
+ char prog_name[BPF_OBJ_NAME_LEN];
+ __u32 prog_ifindex; /* ifindex of netdev to prep for */
+ /* For some prog types expected attach type must be known at
+ * load time to verify attach type specific parts of prog
+ * (context accesses, allowed helpers, etc).
+ */
+ __u32 expected_attach_type;
+ __u32 prog_btf_fd; /* fd pointing to BTF type data */
+ __u32 func_info_rec_size; /* userspace bpf_func_info size */
+ __aligned_u64 func_info; /* func info */
+ __u32 func_info_cnt; /* number of bpf_func_info records */
+ __u32 line_info_rec_size; /* userspace bpf_line_info size */
+ __aligned_u64 line_info; /* line info */
+ __u32 line_info_cnt; /* number of bpf_line_info records */
+ __u32 attach_btf_id; /* in-kernel BTF type id to attach to */
+ union {
+ /* valid prog_fd to attach to bpf prog */
+ __u32 attach_prog_fd;
+ /* or valid module BTF object fd or 0 to attach to vmlinux */
+ __u32 attach_btf_obj_fd;
+ };
+ __u32 core_relo_cnt; /* number of bpf_core_relo */
+ __aligned_u64 fd_array; /* array of FDs */
+ __aligned_u64 core_relos;
+ __u32 core_relo_rec_size; /* sizeof(struct bpf_core_relo) */
+ };
+
+ struct { /* anonymous struct used by BPF_OBJ_* commands */
+ __aligned_u64 pathname;
+ __u32 bpf_fd;
+ __u32 file_flags;
+ };
+
+ struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */
+ __u32 target_fd; /* container object to attach to */
+ __u32 attach_bpf_fd; /* eBPF program to attach */
+ __u32 attach_type;
+ __u32 attach_flags;
+ __u32 replace_bpf_fd; /* previously attached eBPF
+ * program to replace if
+ * BPF_F_REPLACE is used
+ */
+ };
+
+ struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */
+ __u32 prog_fd;
+ __u32 retval;
+ __u32 data_size_in; /* input: len of data_in */
+ __u32 data_size_out; /* input/output: len of data_out
+ * returns ENOSPC if data_out
+ * is too small.
+ */
+ __aligned_u64 data_in;
+ __aligned_u64 data_out;
+ __u32 repeat;
+ __u32 duration;
+ __u32 ctx_size_in; /* input: len of ctx_in */
+ __u32 ctx_size_out; /* input/output: len of ctx_out
+ * returns ENOSPC if ctx_out
+ * is too small.
+ */
+ __aligned_u64 ctx_in;
+ __aligned_u64 ctx_out;
+ __u32 flags;
+ __u32 cpu;
+ __u32 batch_size;
+ } test;
+
+ struct { /* anonymous struct used by BPF_*_GET_*_ID */
+ union {
+ __u32 start_id;
+ __u32 prog_id;
+ __u32 map_id;
+ __u32 btf_id;
+ __u32 link_id;
+ };
+ __u32 next_id;
+ __u32 open_flags;
+ };
+
+ struct { /* anonymous struct used by BPF_OBJ_GET_INFO_BY_FD */
+ __u32 bpf_fd;
+ __u32 info_len;
+ __aligned_u64 info;
+ } info;
+
+ struct { /* anonymous struct used by BPF_PROG_QUERY command */
+ __u32 target_fd; /* container object to query */
+ __u32 attach_type;
+ __u32 query_flags;
+ __u32 attach_flags;
+ __aligned_u64 prog_ids;
+ __u32 prog_cnt;
+ /* output: per-program attach_flags.
+ * not allowed to be set during effective query.
+ */
+ __aligned_u64 prog_attach_flags;
+ } query;
+
+ struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */
+ __u64 name;
+ __u32 prog_fd;
+ } raw_tracepoint;
+
+ struct { /* anonymous struct for BPF_BTF_LOAD */
+ __aligned_u64 btf;
+ __aligned_u64 btf_log_buf;
+ __u32 btf_size;
+ __u32 btf_log_size;
+ __u32 btf_log_level;
+ };
+
+ struct {
+ __u32 pid; /* input: pid */
+ __u32 fd; /* input: fd */
+ __u32 flags; /* input: flags */
+ __u32 buf_len; /* input/output: buf len */
+ __aligned_u64 buf; /* input/output:
+ * tp_name for tracepoint
+ * symbol for kprobe
+ * filename for uprobe
+ */
+ __u32 prog_id; /* output: prog_id */
+ __u32 fd_type; /* output: BPF_FD_TYPE_* */
+ __u64 probe_offset; /* output: probe_offset */
+ __u64 probe_addr; /* output: probe_addr */
+ } task_fd_query;
+
+ struct { /* struct used by BPF_LINK_CREATE command */
+ __u32 prog_fd; /* eBPF program to attach */
+ union {
+ __u32 target_fd; /* object to attach to */
+ __u32 target_ifindex; /* target ifindex */
+ };
+ __u32 attach_type; /* attach type */
+ __u32 flags; /* extra flags */
+ union {
+ __u32 target_btf_id; /* btf_id of target to attach to */
+ struct {
+ __aligned_u64 iter_info; /* extra bpf_iter_link_info */
+ __u32 iter_info_len; /* iter_info length */
+ };
+ struct {
+ /* black box user-provided value passed through
+ * to BPF program at the execution time and
+ * accessible through bpf_get_attach_cookie() BPF helper
+ */
+ __u64 bpf_cookie;
+ } perf_event;
+ struct {
+ __u32 flags;
+ __u32 cnt;
+ __aligned_u64 syms;
+ __aligned_u64 addrs;
+ __aligned_u64 cookies;
+ } kprobe_multi;
+ struct {
+ /* this is overlaid with the target_btf_id above. */
+ __u32 target_btf_id;
+ /* black box user-provided value passed through
+ * to BPF program at the execution time and
+ * accessible through bpf_get_attach_cookie() BPF helper
+ */
+ __u64 cookie;
+ } tracing;
+ };
+ } link_create;
+
+ struct { /* struct used by BPF_LINK_UPDATE command */
+ __u32 link_fd; /* link fd */
+ /* new program fd to update link with */
+ __u32 new_prog_fd;
+ __u32 flags; /* extra flags */
+ /* expected link's program fd; is specified only if
+ * BPF_F_REPLACE flag is set in flags */
+ __u32 old_prog_fd;
+ } link_update;
+
+ struct {
+ __u32 link_fd;
+ } link_detach;
+
+ struct { /* struct used by BPF_ENABLE_STATS command */
+ __u32 type;
+ } enable_stats;
+
+ struct { /* struct used by BPF_ITER_CREATE command */
+ __u32 link_fd;
+ __u32 flags;
+ } iter_create;
+
+ struct { /* struct used by BPF_PROG_BIND_MAP command */
+ __u32 prog_fd;
+ __u32 map_fd;
+ __u32 flags; /* extra flags */
+ } prog_bind_map;
+
+} __attribute__((aligned(8)));
+
+/* The description below is an attempt at providing documentation to eBPF
+ * developers about the multiple available eBPF helper functions. It can be
+ * parsed and used to produce a manual page. The workflow is the following,
+ * and requires the rst2man utility:
+ *
+ * $ ./scripts/bpf_doc.py \
+ * --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst
+ * $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7
+ * $ man /tmp/bpf-helpers.7
+ *
+ * Note that in order to produce this external documentation, some RST
+ * formatting is used in the descriptions to get "bold" and "italics" in
+ * manual pages. Also note that the few trailing white spaces are
+ * intentional, removing them would break paragraphs for rst2man.
+ *
+ * Start of BPF helper function descriptions:
+ *
+ * void *bpf_map_lookup_elem(struct bpf_map *map, const void *key)
+ * Description
+ * Perform a lookup in *map* for an entry associated to *key*.
+ * Return
+ * Map value associated to *key*, or **NULL** if no entry was
+ * found.
+ *
+ * long bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags)
+ * Description
+ * Add or update the value of the entry associated to *key* in
+ * *map* with *value*. *flags* is one of:
+ *
+ * **BPF_NOEXIST**
+ * The entry for *key* must not exist in the map.
+ * **BPF_EXIST**
+ * The entry for *key* must already exist in the map.
+ * **BPF_ANY**
+ * No condition on the existence of the entry for *key*.
+ *
+ * Flag value **BPF_NOEXIST** cannot be used for maps of types
+ * **BPF_MAP_TYPE_ARRAY** or **BPF_MAP_TYPE_PERCPU_ARRAY** (all
+ * elements always exist), the helper would return an error.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_map_delete_elem(struct bpf_map *map, const void *key)
+ * Description
+ * Delete entry with *key* from *map*.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_probe_read(void *dst, u32 size, const void *unsafe_ptr)
+ * Description
+ * For tracing programs, safely attempt to read *size* bytes from
+ * kernel space address *unsafe_ptr* and store the data in *dst*.
+ *
+ * Generally, use **bpf_probe_read_user**\ () or
+ * **bpf_probe_read_kernel**\ () instead.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * u64 bpf_ktime_get_ns(void)
+ * Description
+ * Return the time elapsed since system boot, in nanoseconds.
+ * Does not include time the system was suspended.
+ * See: **clock_gettime**\ (**CLOCK_MONOTONIC**)
+ * Return
+ * Current *ktime*.
+ *
+ * long bpf_trace_printk(const char *fmt, u32 fmt_size, ...)
+ * Description
+ * This helper is a "printk()-like" facility for debugging. It
+ * prints a message defined by format *fmt* (of size *fmt_size*)
+ * to file *\/sys/kernel/debug/tracing/trace* from DebugFS, if
+ * available. It can take up to three additional **u64**
+ * arguments (as with all eBPF helpers, the total number of arguments is
+ * limited to five).
+ *
+ * Each time the helper is called, it appends a line to the trace.
+ * Lines are discarded while *\/sys/kernel/debug/tracing/trace* is
+ * open, use *\/sys/kernel/debug/tracing/trace_pipe* to avoid this.
+ * The format of the trace is customizable, and the exact output
+ * one will get depends on the options set in
+ * *\/sys/kernel/debug/tracing/trace_options* (see also the
+ * *README* file under the same directory). However, it usually
+ * defaults to something like:
+ *
+ * ::
+ *
+ * telnet-470 [001] .N.. 419421.045894: 0x00000001: <formatted msg>
+ *
+ * In the above:
+ *
+ * * ``telnet`` is the name of the current task.
+ * * ``470`` is the PID of the current task.
+ * * ``001`` is the CPU number on which the task is
+ * running.
+ * * In ``.N..``, each character refers to a set of
+ * options (whether irqs are enabled, scheduling
+ * options, whether hard/softirqs are running, level of
+ * preempt_disabled respectively). **N** means that
+ * **TIF_NEED_RESCHED** and **PREEMPT_NEED_RESCHED**
+ * are set.
+ * * ``419421.045894`` is a timestamp.
+ * * ``0x00000001`` is a fake value used by BPF for the
+ * instruction pointer register.
+ * * ``<formatted msg>`` is the message formatted with
+ * *fmt*.
+ *
+ * The conversion specifiers supported by *fmt* are similar, but
+ * more limited than for printk(). They are **%d**, **%i**,
+ * **%u**, **%x**, **%ld**, **%li**, **%lu**, **%lx**, **%lld**,
+ * **%lli**, **%llu**, **%llx**, **%p**, **%s**. No modifier (size
+ * of field, padding with zeroes, etc.) is available, and the
+ * helper will return **-EINVAL** (but print nothing) if it
+ * encounters an unknown specifier.
+ *
+ * Also, note that **bpf_trace_printk**\ () is slow, and should
+ * only be used for debugging purposes. For this reason, a notice
+ * block (spanning several lines) is printed to kernel logs and
+ * states that the helper should not be used "for production use"
+ * the first time this helper is used (or more precisely, when
+ * **trace_printk**\ () buffers are allocated). For passing values
+ * to user space, perf events should be preferred.
+ * Return
+ * The number of bytes written to the buffer, or a negative error
+ * in case of failure.
+ *
+ * u32 bpf_get_prandom_u32(void)
+ * Description
+ * Get a pseudo-random number.
+ *
+ * From a security point of view, this helper uses its own
+ * pseudo-random internal state, and cannot be used to infer the
+ * seed of other random functions in the kernel. However, it is
+ * essential to note that the generator used by the helper is not
+ * cryptographically secure.
+ * Return
+ * A random 32-bit unsigned value.
+ *
+ * u32 bpf_get_smp_processor_id(void)
+ * Description
+ * Get the SMP (symmetric multiprocessing) processor id. Note that
+ * all programs run with migration disabled, which means that the
+ * SMP processor id is stable during all the execution of the
+ * program.
+ * Return
+ * The SMP id of the processor running the program.
+ *
+ * long bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags)
+ * Description
+ * Store *len* bytes from address *from* into the packet
+ * associated to *skb*, at *offset*. *flags* are a combination of
+ * **BPF_F_RECOMPUTE_CSUM** (automatically recompute the
+ * checksum for the packet after storing the bytes) and
+ * **BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\
+ * **->swhash** and *skb*\ **->l4hash** to 0).
+ *
+ * A call to this helper is susceptible to change the underlying
+ * packet buffer. Therefore, at load time, all checks on pointers
+ * previously done by the verifier are invalidated and must be
+ * performed again, if the helper is used in combination with
+ * direct packet access.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size)
+ * Description
+ * Recompute the layer 3 (e.g. IP) checksum for the packet
+ * associated to *skb*. Computation is incremental, so the helper
+ * must know the former value of the header field that was
+ * modified (*from*), the new value of this field (*to*), and the
+ * number of bytes (2 or 4) for this field, stored in *size*.
+ * Alternatively, it is possible to store the difference between
+ * the previous and the new values of the header field in *to*, by
+ * setting *from* and *size* to 0. For both methods, *offset*
+ * indicates the location of the IP checksum within the packet.
+ *
+ * This helper works in combination with **bpf_csum_diff**\ (),
+ * which does not update the checksum in-place, but offers more
+ * flexibility and can handle sizes larger than 2 or 4 for the
+ * checksum to update.
+ *
+ * A call to this helper is susceptible to change the underlying
+ * packet buffer. Therefore, at load time, all checks on pointers
+ * previously done by the verifier are invalidated and must be
+ * performed again, if the helper is used in combination with
+ * direct packet access.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags)
+ * Description
+ * Recompute the layer 4 (e.g. TCP, UDP or ICMP) checksum for the
+ * packet associated to *skb*. Computation is incremental, so the
+ * helper must know the former value of the header field that was
+ * modified (*from*), the new value of this field (*to*), and the
+ * number of bytes (2 or 4) for this field, stored on the lowest
+ * four bits of *flags*. Alternatively, it is possible to store
+ * the difference between the previous and the new values of the
+ * header field in *to*, by setting *from* and the four lowest
+ * bits of *flags* to 0. For both methods, *offset* indicates the
+ * location of the layer 4 checksum within the packet. In addition to
+ * the size of the field, *flags* can be added (bitwise OR) actual
+ * flags. With **BPF_F_MARK_MANGLED_0**, a null checksum is left
+ * untouched (unless **BPF_F_MARK_ENFORCE** is added as well), and
+ * for updates resulting in a null checksum the value is set to
+ * **CSUM_MANGLED_0** instead. Flag **BPF_F_PSEUDO_HDR** indicates
+ * the checksum is to be computed against a pseudo-header.
+ *
+ * This helper works in combination with **bpf_csum_diff**\ (),
+ * which does not update the checksum in-place, but offers more
+ * flexibility and can handle sizes larger than 2 or 4 for the
+ * checksum to update.
+ *
+ * A call to this helper is susceptible to change the underlying
+ * packet buffer. Therefore, at load time, all checks on pointers
+ * previously done by the verifier are invalidated and must be
+ * performed again, if the helper is used in combination with
+ * direct packet access.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index)
+ * Description
+ * This special helper is used to trigger a "tail call", or in
+ * other words, to jump into another eBPF program. The same stack
+ * frame is used (but values on stack and in registers for the
+ * caller are not accessible to the callee). This mechanism allows
+ * for program chaining, either for raising the maximum number of
+ * available eBPF instructions, or to execute given programs in
+ * conditional blocks. For security reasons, there is an upper
+ * limit to the number of successive tail calls that can be
+ * performed.
+ *
+ * Upon call of this helper, the program attempts to jump into a
+ * program referenced at index *index* in *prog_array_map*, a
+ * special map of type **BPF_MAP_TYPE_PROG_ARRAY**, and passes
+ * *ctx*, a pointer to the context.
+ *
+ * If the call succeeds, the kernel immediately runs the first
+ * instruction of the new program. This is not a function call,
+ * and it never returns to the previous program. If the call
+ * fails, then the helper has no effect, and the caller continues
+ * to run its subsequent instructions. A call can fail if the
+ * destination program for the jump does not exist (i.e. *index*
+ * is superior to the number of entries in *prog_array_map*), or
+ * if the maximum number of tail calls has been reached for this
+ * chain of programs. This limit is defined in the kernel by the
+ * macro **MAX_TAIL_CALL_CNT** (not accessible to user space),
+ * which is currently set to 33.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags)
+ * Description
+ * Clone and redirect the packet associated to *skb* to another
+ * net device of index *ifindex*. Both ingress and egress
+ * interfaces can be used for redirection. The **BPF_F_INGRESS**
+ * value in *flags* is used to make the distinction (ingress path
+ * is selected if the flag is present, egress path otherwise).
+ * This is the only flag supported for now.
+ *
+ * In comparison with **bpf_redirect**\ () helper,
+ * **bpf_clone_redirect**\ () has the associated cost of
+ * duplicating the packet buffer, but this can be executed out of
+ * the eBPF program. Conversely, **bpf_redirect**\ () is more
+ * efficient, but it is handled through an action code where the
+ * redirection happens only after the eBPF program has returned.
+ *
+ * A call to this helper is susceptible to change the underlying
+ * packet buffer. Therefore, at load time, all checks on pointers
+ * previously done by the verifier are invalidated and must be
+ * performed again, if the helper is used in combination with
+ * direct packet access.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * u64 bpf_get_current_pid_tgid(void)
+ * Description
+ * Get the current pid and tgid.
+ * Return
+ * A 64-bit integer containing the current tgid and pid, and
+ * created as such:
+ * *current_task*\ **->tgid << 32 \|**
+ * *current_task*\ **->pid**.
+ *
+ * u64 bpf_get_current_uid_gid(void)
+ * Description
+ * Get the current uid and gid.
+ * Return
+ * A 64-bit integer containing the current GID and UID, and
+ * created as such: *current_gid* **<< 32 \|** *current_uid*.
+ *
+ * long bpf_get_current_comm(void *buf, u32 size_of_buf)
+ * Description
+ * Copy the **comm** attribute of the current task into *buf* of
+ * *size_of_buf*. The **comm** attribute contains the name of
+ * the executable (excluding the path) for the current task. The
+ * *size_of_buf* must be strictly positive. On success, the
+ * helper makes sure that the *buf* is NUL-terminated. On failure,
+ * it is filled with zeroes.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * u32 bpf_get_cgroup_classid(struct sk_buff *skb)
+ * Description
+ * Retrieve the classid for the current task, i.e. for the net_cls
+ * cgroup to which *skb* belongs.
+ *
+ * This helper can be used on TC egress path, but not on ingress.
+ *
+ * The net_cls cgroup provides an interface to tag network packets
+ * based on a user-provided identifier for all traffic coming from
+ * the tasks belonging to the related cgroup. See also the related
+ * kernel documentation, available from the Linux sources in file
+ * *Documentation/admin-guide/cgroup-v1/net_cls.rst*.
+ *
+ * The Linux kernel has two versions for cgroups: there are
+ * cgroups v1 and cgroups v2. Both are available to users, who can
+ * use a mixture of them, but note that the net_cls cgroup is for
+ * cgroup v1 only. This makes it incompatible with BPF programs
+ * run on cgroups, which is a cgroup-v2-only feature (a socket can
+ * only hold data for one version of cgroups at a time).
+ *
+ * This helper is only available if the kernel was compiled with
+ * the **CONFIG_CGROUP_NET_CLASSID** configuration option set to
+ * "**y**" or to "**m**".
+ * Return
+ * The classid, or 0 for the default unconfigured classid.
+ *
+ * long bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
+ * Description
+ * Push a *vlan_tci* (VLAN tag control information) of protocol
+ * *vlan_proto* to the packet associated to *skb*, then update
+ * the checksum. Note that if *vlan_proto* is different from
+ * **ETH_P_8021Q** and **ETH_P_8021AD**, it is considered to
+ * be **ETH_P_8021Q**.
+ *
+ * A call to this helper is susceptible to change the underlying
+ * packet buffer. Therefore, at load time, all checks on pointers
+ * previously done by the verifier are invalidated and must be
+ * performed again, if the helper is used in combination with
+ * direct packet access.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_skb_vlan_pop(struct sk_buff *skb)
+ * Description
+ * Pop a VLAN header from the packet associated to *skb*.
+ *
+ * A call to this helper is susceptible to change the underlying
+ * packet buffer. Therefore, at load time, all checks on pointers
+ * previously done by the verifier are invalidated and must be
+ * performed again, if the helper is used in combination with
+ * direct packet access.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags)
+ * Description
+ * Get tunnel metadata. This helper takes a pointer *key* to an
+ * empty **struct bpf_tunnel_key** of **size**, that will be
+ * filled with tunnel metadata for the packet associated to *skb*.
+ * The *flags* can be set to **BPF_F_TUNINFO_IPV6**, which
+ * indicates that the tunnel is based on IPv6 protocol instead of
+ * IPv4.
+ *
+ * The **struct bpf_tunnel_key** is an object that generalizes the
+ * principal parameters used by various tunneling protocols into a
+ * single struct. This way, it can be used to easily make a
+ * decision based on the contents of the encapsulation header,
+ * "summarized" in this struct. In particular, it holds the IP
+ * address of the remote end (IPv4 or IPv6, depending on the case)
+ * in *key*\ **->remote_ipv4** or *key*\ **->remote_ipv6**. Also,
+ * this struct exposes the *key*\ **->tunnel_id**, which is
+ * generally mapped to a VNI (Virtual Network Identifier), making
+ * it programmable together with the **bpf_skb_set_tunnel_key**\
+ * () helper.
+ *
+ * Let's imagine that the following code is part of a program
+ * attached to the TC ingress interface, on one end of a GRE
+ * tunnel, and is supposed to filter out all messages coming from
+ * remote ends with IPv4 address other than 10.0.0.1:
+ *
+ * ::
+ *
+ * int ret;
+ * struct bpf_tunnel_key key = {};
+ *
+ * ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
+ * if (ret < 0)
+ * return TC_ACT_SHOT; // drop packet
+ *
+ * if (key.remote_ipv4 != 0x0a000001)
+ * return TC_ACT_SHOT; // drop packet
+ *
+ * return TC_ACT_OK; // accept packet
+ *
+ * This interface can also be used with all encapsulation devices
+ * that can operate in "collect metadata" mode: instead of having
+ * one network device per specific configuration, the "collect
+ * metadata" mode only requires a single device where the
+ * configuration can be extracted from this helper.
+ *
+ * This can be used together with various tunnels such as VXLAN,
+ * Geneve, GRE or IP in IP (IPIP).
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags)
+ * Description
+ * Populate tunnel metadata for packet associated to *skb*. The
+ * tunnel metadata is set to the contents of *key*, of *size*. The
+ * *flags* can be set to a combination of the following values:
+ *
+ * **BPF_F_TUNINFO_IPV6**
+ * Indicate that the tunnel is based on IPv6 protocol
+ * instead of IPv4.
+ * **BPF_F_ZERO_CSUM_TX**
+ * For IPv4 packets, add a flag to tunnel metadata
+ * indicating that checksum computation should be skipped
+ * and checksum set to zeroes.
+ * **BPF_F_DONT_FRAGMENT**
+ * Add a flag to tunnel metadata indicating that the
+ * packet should not be fragmented.
+ * **BPF_F_SEQ_NUMBER**
+ * Add a flag to tunnel metadata indicating that a
+ * sequence number should be added to tunnel header before
+ * sending the packet. This flag was added for GRE
+ * encapsulation, but might be used with other protocols
+ * as well in the future.
+ *
+ * Here is a typical usage on the transmit path:
+ *
+ * ::
+ *
+ * struct bpf_tunnel_key key;
+ * populate key ...
+ * bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0);
+ * bpf_clone_redirect(skb, vxlan_dev_ifindex, 0);
+ *
+ * See also the description of the **bpf_skb_get_tunnel_key**\ ()
+ * helper for additional information.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * u64 bpf_perf_event_read(struct bpf_map *map, u64 flags)
+ * Description
+ * Read the value of a perf event counter. This helper relies on a
+ * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of
+ * the perf event counter is selected when *map* is updated with
+ * perf event file descriptors. The *map* is an array whose size
+ * is the number of available CPUs, and each cell contains a value
+ * relative to one CPU. The value to retrieve is indicated by
+ * *flags*, that contains the index of the CPU to look up, masked
+ * with **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to
+ * **BPF_F_CURRENT_CPU** to indicate that the value for the
+ * current CPU should be retrieved.
+ *
+ * Note that before Linux 4.13, only hardware perf events can be
+ * retrieved.
+ *
+ * Also, be aware that the newer helper
+ * **bpf_perf_event_read_value**\ () is recommended over
+ * **bpf_perf_event_read**\ () in general. The latter has some ABI
+ * quirks where error and counter value are used as a return code
+ * (which is wrong to do since ranges may overlap). This issue is
+ * fixed with **bpf_perf_event_read_value**\ (), which at the same
+ * time provides more features over the **bpf_perf_event_read**\
+ * () interface. Please refer to the description of
+ * **bpf_perf_event_read_value**\ () for details.
+ * Return
+ * The value of the perf event counter read from the map, or a
+ * negative error code in case of failure.
+ *
+ * long bpf_redirect(u32 ifindex, u64 flags)
+ * Description
+ * Redirect the packet to another net device of index *ifindex*.
+ * This helper is somewhat similar to **bpf_clone_redirect**\
+ * (), except that the packet is not cloned, which provides
+ * increased performance.
+ *
+ * Except for XDP, both ingress and egress interfaces can be used
+ * for redirection. The **BPF_F_INGRESS** value in *flags* is used
+ * to make the distinction (ingress path is selected if the flag
+ * is present, egress path otherwise). Currently, XDP only
+ * supports redirection to the egress interface, and accepts no
+ * flag at all.
+ *
+ * The same effect can also be attained with the more generic
+ * **bpf_redirect_map**\ (), which uses a BPF map to store the
+ * redirect target instead of providing it directly to the helper.
+ * Return
+ * For XDP, the helper returns **XDP_REDIRECT** on success or
+ * **XDP_ABORTED** on error. For other program types, the values
+ * are **TC_ACT_REDIRECT** on success or **TC_ACT_SHOT** on
+ * error.
+ *
+ * u32 bpf_get_route_realm(struct sk_buff *skb)
+ * Description
+ * Retrieve the realm of the route, that is to say the
+ * **tclassid** field of the destination for the *skb*. The
+ * identifier retrieved is a user-provided tag, similar to the
+ * one used with the net_cls cgroup (see description for
+ * **bpf_get_cgroup_classid**\ () helper), but here this tag is
+ * held by a route (a destination entry), not by a task.
+ *
+ * Retrieving this identifier works with the clsact TC egress hook
+ * (see also **tc-bpf(8)**), or alternatively on conventional
+ * classful egress qdiscs, but not on TC ingress path. In case of
+ * clsact TC egress hook, this has the advantage that, internally,
+ * the destination entry has not been dropped yet in the transmit
+ * path. Therefore, the destination entry does not need to be
+ * artificially held via **netif_keep_dst**\ () for a classful
+ * qdisc until the *skb* is freed.
+ *
+ * This helper is available only if the kernel was compiled with
+ * **CONFIG_IP_ROUTE_CLASSID** configuration option.
+ * Return
+ * The realm of the route for the packet associated to *skb*, or 0
+ * if none was found.
+ *
+ * long bpf_perf_event_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
+ * Description
+ * Write raw *data* blob into a special BPF perf event held by
+ * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf
+ * event must have the following attributes: **PERF_SAMPLE_RAW**
+ * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and
+ * **PERF_COUNT_SW_BPF_OUTPUT** as **config**.
+ *
+ * The *flags* are used to indicate the index in *map* for which
+ * the value must be put, masked with **BPF_F_INDEX_MASK**.
+ * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU**
+ * to indicate that the index of the current CPU core should be
+ * used.
+ *
+ * The value to write, of *size*, is passed through eBPF stack and
+ * pointed by *data*.
+ *
+ * The context of the program *ctx* also needs to be passed to the
+ * helper.
+ *
+ * In user space, a program willing to read the values needs to
+ * call **perf_event_open**\ () on the perf event (either for
+ * one or for all CPUs) and to store the file descriptor into the
+ * *map*. This must be done before the eBPF program can send data
+ * into it. An example is available in file
+ * *samples/bpf/trace_output_user.c* in the Linux kernel source
+ * tree (the eBPF program counterpart is in
+ * *samples/bpf/trace_output_kern.c*).
+ *
+ * **bpf_perf_event_output**\ () achieves better performance
+ * than **bpf_trace_printk**\ () for sharing data with user
+ * space, and is much better suited for streaming data from eBPF
+ * programs.
+ *
+ * Note that this helper is not restricted to tracing use cases
+ * and can be used with programs attached to TC or XDP as well,
+ * where it allows for passing data to user space listeners. Data
+ * can be:
+ *
+ * * Only custom structs,
+ * * Only the packet payload, or
+ * * A combination of both.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_skb_load_bytes(const void *skb, u32 offset, void *to, u32 len)
+ * Description
+ * This helper was provided as an easy way to load data from a
+ * packet. It can be used to load *len* bytes from *offset* from
+ * the packet associated to *skb*, into the buffer pointed by
+ * *to*.
+ *
+ * Since Linux 4.7, usage of this helper has mostly been replaced
+ * by "direct packet access", enabling packet data to be
+ * manipulated with *skb*\ **->data** and *skb*\ **->data_end**
+ * pointing respectively to the first byte of packet data and to
+ * the byte after the last byte of packet data. However, it
+ * remains useful if one wishes to read large quantities of data
+ * at once from a packet into the eBPF stack.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_get_stackid(void *ctx, struct bpf_map *map, u64 flags)
+ * Description
+ * Walk a user or a kernel stack and return its id. To achieve
+ * this, the helper needs *ctx*, which is a pointer to the context
+ * on which the tracing program is executed, and a pointer to a
+ * *map* of type **BPF_MAP_TYPE_STACK_TRACE**.
+ *
+ * The last argument, *flags*, holds the number of stack frames to
+ * skip (from 0 to 255), masked with
+ * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
+ * a combination of the following flags:
+ *
+ * **BPF_F_USER_STACK**
+ * Collect a user space stack instead of a kernel stack.
+ * **BPF_F_FAST_STACK_CMP**
+ * Compare stacks by hash only.
+ * **BPF_F_REUSE_STACKID**
+ * If two different stacks hash into the same *stackid*,
+ * discard the old one.
+ *
+ * The stack id retrieved is a 32 bit long integer handle which
+ * can be further combined with other data (including other stack
+ * ids) and used as a key into maps. This can be useful for
+ * generating a variety of graphs (such as flame graphs or off-cpu
+ * graphs).
+ *
+ * For walking a stack, this helper is an improvement over
+ * **bpf_probe_read**\ (), which can be used with unrolled loops
+ * but is not efficient and consumes a lot of eBPF instructions.
+ * Instead, **bpf_get_stackid**\ () can collect up to
+ * **PERF_MAX_STACK_DEPTH** both kernel and user frames. Note that
+ * this limit can be controlled with the **sysctl** program, and
+ * that it should be manually increased in order to profile long
+ * user stacks (such as stacks for Java programs). To do so, use:
+ *
+ * ::
+ *
+ * # sysctl kernel.perf_event_max_stack=<new value>
+ * Return
+ * The positive or null stack id on success, or a negative error
+ * in case of failure.
+ *
+ * s64 bpf_csum_diff(__be32 *from, u32 from_size, __be32 *to, u32 to_size, __wsum seed)
+ * Description
+ * Compute a checksum difference, from the raw buffer pointed by
+ * *from*, of length *from_size* (that must be a multiple of 4),
+ * towards the raw buffer pointed by *to*, of size *to_size*
+ * (same remark). An optional *seed* can be added to the value
+ * (this can be cascaded, the seed may come from a previous call
+ * to the helper).
+ *
+ * This is flexible enough to be used in several ways:
+ *
+ * * With *from_size* == 0, *to_size* > 0 and *seed* set to
+ * checksum, it can be used when pushing new data.
+ * * With *from_size* > 0, *to_size* == 0 and *seed* set to
+ * checksum, it can be used when removing data from a packet.
+ * * With *from_size* > 0, *to_size* > 0 and *seed* set to 0, it
+ * can be used to compute a diff. Note that *from_size* and
+ * *to_size* do not need to be equal.
+ *
+ * This helper can be used in combination with
+ * **bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ (), to
+ * which one can feed in the difference computed with
+ * **bpf_csum_diff**\ ().
+ * Return
+ * The checksum result, or a negative error code in case of
+ * failure.
+ *
+ * long bpf_skb_get_tunnel_opt(struct sk_buff *skb, void *opt, u32 size)
+ * Description
+ * Retrieve tunnel options metadata for the packet associated to
+ * *skb*, and store the raw tunnel option data to the buffer *opt*
+ * of *size*.
+ *
+ * This helper can be used with encapsulation devices that can
+ * operate in "collect metadata" mode (please refer to the related
+ * note in the description of **bpf_skb_get_tunnel_key**\ () for
+ * more details). A particular example where this can be used is
+ * in combination with the Geneve encapsulation protocol, where it
+ * allows for pushing (with **bpf_skb_set_tunnel_opt**\ () helper)
+ * and retrieving arbitrary TLVs (Type-Length-Value headers) from
+ * the eBPF program. This allows for full customization of these
+ * headers.
+ * Return
+ * The size of the option data retrieved.
+ *
+ * long bpf_skb_set_tunnel_opt(struct sk_buff *skb, void *opt, u32 size)
+ * Description
+ * Set tunnel options metadata for the packet associated to *skb*
+ * to the option data contained in the raw buffer *opt* of *size*.
+ *
+ * See also the description of the **bpf_skb_get_tunnel_opt**\ ()
+ * helper for additional information.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags)
+ * Description
+ * Change the protocol of the *skb* to *proto*. Currently
+ * supported are transition from IPv4 to IPv6, and from IPv6 to
+ * IPv4. The helper takes care of the groundwork for the
+ * transition, including resizing the socket buffer. The eBPF
+ * program is expected to fill the new headers, if any, via
+ * **bpf_skb_store_bytes**\ () and to recompute the checksums with
+ * **bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\
+ * (). The main case for this helper is to perform NAT64
+ * operations out of an eBPF program.
+ *
+ * Internally, the GSO type is marked as dodgy so that headers are
+ * checked and segments are recalculated by the GSO/GRO engine.
+ * The size for GSO target is adapted as well.
+ *
+ * All values for *flags* are reserved for future usage, and must
+ * be left at zero.
+ *
+ * A call to this helper is susceptible to change the underlying
+ * packet buffer. Therefore, at load time, all checks on pointers
+ * previously done by the verifier are invalidated and must be
+ * performed again, if the helper is used in combination with
+ * direct packet access.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_skb_change_type(struct sk_buff *skb, u32 type)
+ * Description
+ * Change the packet type for the packet associated to *skb*. This
+ * comes down to setting *skb*\ **->pkt_type** to *type*, except
+ * the eBPF program does not have a write access to *skb*\
+ * **->pkt_type** beside this helper. Using a helper here allows
+ * for graceful handling of errors.
+ *
+ * The major use case is to change incoming *skb*\ s to
+ * **PACKET_HOST** in a programmatic way instead of having to
+ * recirculate via **redirect**\ (..., **BPF_F_INGRESS**), for
+ * example.
+ *
+ * Note that *type* only allows certain values. At this time, they
+ * are:
+ *
+ * **PACKET_HOST**
+ * Packet is for us.
+ * **PACKET_BROADCAST**
+ * Send packet to all.
+ * **PACKET_MULTICAST**
+ * Send packet to group.
+ * **PACKET_OTHERHOST**
+ * Send packet to someone else.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index)
+ * Description
+ * Check whether *skb* is a descendant of the cgroup2 held by
+ * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*.
+ * Return
+ * The return value depends on the result of the test, and can be:
+ *
+ * * 0, if the *skb* failed the cgroup2 descendant test.
+ * * 1, if the *skb* succeeded the cgroup2 descendant test.
+ * * A negative error code, if an error occurred.
+ *
+ * u32 bpf_get_hash_recalc(struct sk_buff *skb)
+ * Description
+ * Retrieve the hash of the packet, *skb*\ **->hash**. If it is
+ * not set, in particular if the hash was cleared due to mangling,
+ * recompute this hash. Later accesses to the hash can be done
+ * directly with *skb*\ **->hash**.
+ *
+ * Calling **bpf_set_hash_invalid**\ (), changing a packet
+ * prototype with **bpf_skb_change_proto**\ (), or calling
+ * **bpf_skb_store_bytes**\ () with the
+ * **BPF_F_INVALIDATE_HASH** are actions susceptible to clear
+ * the hash and to trigger a new computation for the next call to
+ * **bpf_get_hash_recalc**\ ().
+ * Return
+ * The 32-bit hash.
+ *
+ * u64 bpf_get_current_task(void)
+ * Description
+ * Get the current task.
+ * Return
+ * A pointer to the current task struct.
+ *
+ * long bpf_probe_write_user(void *dst, const void *src, u32 len)
+ * Description
+ * Attempt in a safe way to write *len* bytes from the buffer
+ * *src* to *dst* in memory. It only works for threads that are in
+ * user context, and *dst* must be a valid user space address.
+ *
+ * This helper should not be used to implement any kind of
+ * security mechanism because of TOC-TOU attacks, but rather to
+ * debug, divert, and manipulate execution of semi-cooperative
+ * processes.
+ *
+ * Keep in mind that this feature is meant for experiments, and it
+ * has a risk of crashing the system and running programs.
+ * Therefore, when an eBPF program using this helper is attached,
+ * a warning including PID and process name is printed to kernel
+ * logs.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_current_task_under_cgroup(struct bpf_map *map, u32 index)
+ * Description
+ * Check whether the probe is being run in the context of a given
+ * subset of the cgroup2 hierarchy. The cgroup2 to test is held by
+ * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*.
+ * Return
+ * The return value depends on the result of the test, and can be:
+ *
+ * * 1, if current task belongs to the cgroup2.
+ * * 0, if current task does not belong to the cgroup2.
+ * * A negative error code, if an error occurred.
+ *
+ * long bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags)
+ * Description
+ * Resize (trim or grow) the packet associated to *skb* to the
+ * new *len*. The *flags* are reserved for future usage, and must
+ * be left at zero.
+ *
+ * The basic idea is that the helper performs the needed work to
+ * change the size of the packet, then the eBPF program rewrites
+ * the rest via helpers like **bpf_skb_store_bytes**\ (),
+ * **bpf_l3_csum_replace**\ (), **bpf_l4_csum_replace**\ ()
+ * and others. This helper is a slow path utility intended for
+ * replies with control messages. And because it is targeted for
+ * slow path, the helper itself can afford to be slow: it
+ * implicitly linearizes, unclones and drops offloads from the
+ * *skb*.
+ *
+ * A call to this helper is susceptible to change the underlying
+ * packet buffer. Therefore, at load time, all checks on pointers
+ * previously done by the verifier are invalidated and must be
+ * performed again, if the helper is used in combination with
+ * direct packet access.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_skb_pull_data(struct sk_buff *skb, u32 len)
+ * Description
+ * Pull in non-linear data in case the *skb* is non-linear and not
+ * all of *len* are part of the linear section. Make *len* bytes
+ * from *skb* readable and writable. If a zero value is passed for
+ * *len*, then all bytes in the linear part of *skb* will be made
+ * readable and writable.
+ *
+ * This helper is only needed for reading and writing with direct
+ * packet access.
+ *
+ * For direct packet access, testing that offsets to access
+ * are within packet boundaries (test on *skb*\ **->data_end**) is
+ * susceptible to fail if offsets are invalid, or if the requested
+ * data is in non-linear parts of the *skb*. On failure the
+ * program can just bail out, or in the case of a non-linear
+ * buffer, use a helper to make the data available. The
+ * **bpf_skb_load_bytes**\ () helper is a first solution to access
+ * the data. Another one consists in using **bpf_skb_pull_data**\ ()
+ * to pull in once the non-linear parts, then retesting and
+ * eventually access the data.
+ *
+ * At the same time, this also makes sure the *skb* is uncloned,
+ * which is a necessary condition for direct write. As this needs
+ * to be an invariant for the write part only, the verifier
+ * detects writes and adds a prologue that is calling
+ * **bpf_skb_pull_data**\ () to effectively unclone the *skb* from
+ * the very beginning in case it is indeed cloned.
+ *
+ * A call to this helper is susceptible to change the underlying
+ * packet buffer. Therefore, at load time, all checks on pointers
+ * previously done by the verifier are invalidated and must be
+ * performed again, if the helper is used in combination with
+ * direct packet access.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * s64 bpf_csum_update(struct sk_buff *skb, __wsum csum)
+ * Description
+ * Add the checksum *csum* into *skb*\ **->csum** in case the
+ * driver has supplied a checksum for the entire packet into that
+ * field. Return an error otherwise. This helper is intended to be
+ * used in combination with **bpf_csum_diff**\ (), in particular
+ * when the checksum needs to be updated after data has been
+ * written into the packet through direct packet access.
+ * Return
+ * The checksum on success, or a negative error code in case of
+ * failure.
+ *
+ * void bpf_set_hash_invalid(struct sk_buff *skb)
+ * Description
+ * Invalidate the current *skb*\ **->hash**. It can be used after
+ * mangling on headers through direct packet access, in order to
+ * indicate that the hash is outdated and to trigger a
+ * recalculation the next time the kernel tries to access this
+ * hash or when the **bpf_get_hash_recalc**\ () helper is called.
+ * Return
+ * void.
+ *
+ * long bpf_get_numa_node_id(void)
+ * Description
+ * Return the id of the current NUMA node. The primary use case
+ * for this helper is the selection of sockets for the local NUMA
+ * node, when the program is attached to sockets using the
+ * **SO_ATTACH_REUSEPORT_EBPF** option (see also **socket(7)**),
+ * but the helper is also available to other eBPF program types,
+ * similarly to **bpf_get_smp_processor_id**\ ().
+ * Return
+ * The id of current NUMA node.
+ *
+ * long bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags)
+ * Description
+ * Grows headroom of packet associated to *skb* and adjusts the
+ * offset of the MAC header accordingly, adding *len* bytes of
+ * space. It automatically extends and reallocates memory as
+ * required.
+ *
+ * This helper can be used on a layer 3 *skb* to push a MAC header
+ * for redirection into a layer 2 device.
+ *
+ * All values for *flags* are reserved for future usage, and must
+ * be left at zero.
+ *
+ * A call to this helper is susceptible to change the underlying
+ * packet buffer. Therefore, at load time, all checks on pointers
+ * previously done by the verifier are invalidated and must be
+ * performed again, if the helper is used in combination with
+ * direct packet access.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta)
+ * Description
+ * Adjust (move) *xdp_md*\ **->data** by *delta* bytes. Note that
+ * it is possible to use a negative value for *delta*. This helper
+ * can be used to prepare the packet for pushing or popping
+ * headers.
+ *
+ * A call to this helper is susceptible to change the underlying
+ * packet buffer. Therefore, at load time, all checks on pointers
+ * previously done by the verifier are invalidated and must be
+ * performed again, if the helper is used in combination with
+ * direct packet access.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr)
+ * Description
+ * Copy a NUL terminated string from an unsafe kernel address
+ * *unsafe_ptr* to *dst*. See **bpf_probe_read_kernel_str**\ () for
+ * more details.
+ *
+ * Generally, use **bpf_probe_read_user_str**\ () or
+ * **bpf_probe_read_kernel_str**\ () instead.
+ * Return
+ * On success, the strictly positive length of the string,
+ * including the trailing NUL character. On error, a negative
+ * value.
+ *
+ * u64 bpf_get_socket_cookie(struct sk_buff *skb)
+ * Description
+ * If the **struct sk_buff** pointed by *skb* has a known socket,
+ * retrieve the cookie (generated by the kernel) of this socket.
+ * If no cookie has been set yet, generate a new cookie. Once
+ * generated, the socket cookie remains stable for the life of the
+ * socket. This helper can be useful for monitoring per socket
+ * networking traffic statistics as it provides a global socket
+ * identifier that can be assumed unique.
+ * Return
+ * An 8-byte long unique number on success, or 0 if the socket
+ * field is missing inside *skb*.
+ *
+ * u64 bpf_get_socket_cookie(struct bpf_sock_addr *ctx)
+ * Description
+ * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts
+ * *skb*, but gets socket from **struct bpf_sock_addr** context.
+ * Return
+ * An 8-byte long unique number.
+ *
+ * u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx)
+ * Description
+ * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts
+ * *skb*, but gets socket from **struct bpf_sock_ops** context.
+ * Return
+ * An 8-byte long unique number.
+ *
+ * u64 bpf_get_socket_cookie(struct sock *sk)
+ * Description
+ * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts
+ * *sk*, but gets socket from a BTF **struct sock**. This helper
+ * also works for sleepable programs.
+ * Return
+ * An 8-byte long unique number or 0 if *sk* is NULL.
+ *
+ * u32 bpf_get_socket_uid(struct sk_buff *skb)
+ * Description
+ * Get the owner UID of the socket associated to *skb*.
+ * Return
+ * The owner UID of the socket associated to *skb*. If the socket
+ * is **NULL**, or if it is not a full socket (i.e. if it is a
+ * time-wait or a request socket instead), **overflowuid** value
+ * is returned (note that **overflowuid** might also be the actual
+ * UID value for the socket).
+ *
+ * long bpf_set_hash(struct sk_buff *skb, u32 hash)
+ * Description
+ * Set the full hash for *skb* (set the field *skb*\ **->hash**)
+ * to value *hash*.
+ * Return
+ * 0
+ *
+ * long bpf_setsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen)
+ * Description
+ * Emulate a call to **setsockopt()** on the socket associated to
+ * *bpf_socket*, which must be a full socket. The *level* at
+ * which the option resides and the name *optname* of the option
+ * must be specified, see **setsockopt(2)** for more information.
+ * The option value of length *optlen* is pointed by *optval*.
+ *
+ * *bpf_socket* should be one of the following:
+ *
+ * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**.
+ * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**
+ * and **BPF_CGROUP_INET6_CONNECT**.
+ *
+ * This helper actually implements a subset of **setsockopt()**.
+ * It supports the following *level*\ s:
+ *
+ * * **SOL_SOCKET**, which supports the following *optname*\ s:
+ * **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**,
+ * **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**,
+ * **SO_BINDTODEVICE**, **SO_KEEPALIVE**, **SO_REUSEADDR**,
+ * **SO_REUSEPORT**, **SO_BINDTOIFINDEX**, **SO_TXREHASH**.
+ * * **IPPROTO_TCP**, which supports the following *optname*\ s:
+ * **TCP_CONGESTION**, **TCP_BPF_IW**,
+ * **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**,
+ * **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**,
+ * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**,
+ * **TCP_NODELAY**, **TCP_MAXSEG**, **TCP_WINDOW_CLAMP**,
+ * **TCP_THIN_LINEAR_TIMEOUTS**, **TCP_BPF_DELACK_MAX**,
+ * **TCP_BPF_RTO_MIN**.
+ * * **IPPROTO_IP**, which supports *optname* **IP_TOS**.
+ * * **IPPROTO_IPV6**, which supports the following *optname*\ s:
+ * **IPV6_TCLASS**, **IPV6_AUTOFLOWLABEL**.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags)
+ * Description
+ * Grow or shrink the room for data in the packet associated to
+ * *skb* by *len_diff*, and according to the selected *mode*.
+ *
+ * By default, the helper will reset any offloaded checksum
+ * indicator of the skb to CHECKSUM_NONE. This can be avoided
+ * by the following flag:
+ *
+ * * **BPF_F_ADJ_ROOM_NO_CSUM_RESET**: Do not reset offloaded
+ * checksum data of the skb to CHECKSUM_NONE.
+ *
+ * There are two supported modes at this time:
+ *
+ * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer
+ * (room space is added or removed between the layer 2 and
+ * layer 3 headers).
+ *
+ * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer
+ * (room space is added or removed between the layer 3 and
+ * layer 4 headers).
+ *
+ * The following flags are supported at this time:
+ *
+ * * **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size.
+ * Adjusting mss in this way is not allowed for datagrams.
+ *
+ * * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV4**,
+ * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV6**:
+ * Any new space is reserved to hold a tunnel header.
+ * Configure skb offsets and other fields accordingly.
+ *
+ * * **BPF_F_ADJ_ROOM_ENCAP_L4_GRE**,
+ * **BPF_F_ADJ_ROOM_ENCAP_L4_UDP**:
+ * Use with ENCAP_L3 flags to further specify the tunnel type.
+ *
+ * * **BPF_F_ADJ_ROOM_ENCAP_L2**\ (*len*):
+ * Use with ENCAP_L3/L4 flags to further specify the tunnel
+ * type; *len* is the length of the inner MAC header.
+ *
+ * * **BPF_F_ADJ_ROOM_ENCAP_L2_ETH**:
+ * Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the
+ * L2 type as Ethernet.
+ *
+ * A call to this helper is susceptible to change the underlying
+ * packet buffer. Therefore, at load time, all checks on pointers
+ * previously done by the verifier are invalidated and must be
+ * performed again, if the helper is used in combination with
+ * direct packet access.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_redirect_map(struct bpf_map *map, u64 key, u64 flags)
+ * Description
+ * Redirect the packet to the endpoint referenced by *map* at
+ * index *key*. Depending on its type, this *map* can contain
+ * references to net devices (for forwarding packets through other
+ * ports), or to CPUs (for redirecting XDP frames to another CPU;
+ * but this is only implemented for native XDP (with driver
+ * support) as of this writing).
+ *
+ * The lower two bits of *flags* are used as the return code if
+ * the map lookup fails. This is so that the return value can be
+ * one of the XDP program return codes up to **XDP_TX**, as chosen
+ * by the caller. The higher bits of *flags* can be set to
+ * BPF_F_BROADCAST or BPF_F_EXCLUDE_INGRESS as defined below.
+ *
+ * With BPF_F_BROADCAST the packet will be broadcasted to all the
+ * interfaces in the map, with BPF_F_EXCLUDE_INGRESS the ingress
+ * interface will be excluded when broadcasting.
+ *
+ * See also **bpf_redirect**\ (), which only supports redirecting
+ * to an ifindex, but doesn't require a map to do so.
+ * Return
+ * **XDP_REDIRECT** on success, or the value of the two lower bits
+ * of the *flags* argument on error.
+ *
+ * long bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map, u32 key, u64 flags)
+ * Description
+ * Redirect the packet to the socket referenced by *map* (of type
+ * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and
+ * egress interfaces can be used for redirection. The
+ * **BPF_F_INGRESS** value in *flags* is used to make the
+ * distinction (ingress path is selected if the flag is present,
+ * egress path otherwise). This is the only flag supported for now.
+ * Return
+ * **SK_PASS** on success, or **SK_DROP** on error.
+ *
+ * long bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags)
+ * Description
+ * Add an entry to, or update a *map* referencing sockets. The
+ * *skops* is used as a new value for the entry associated to
+ * *key*. *flags* is one of:
+ *
+ * **BPF_NOEXIST**
+ * The entry for *key* must not exist in the map.
+ * **BPF_EXIST**
+ * The entry for *key* must already exist in the map.
+ * **BPF_ANY**
+ * No condition on the existence of the entry for *key*.
+ *
+ * If the *map* has eBPF programs (parser and verdict), those will
+ * be inherited by the socket being added. If the socket is
+ * already attached to eBPF programs, this results in an error.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta)
+ * Description
+ * Adjust the address pointed by *xdp_md*\ **->data_meta** by
+ * *delta* (which can be positive or negative). Note that this
+ * operation modifies the address stored in *xdp_md*\ **->data**,
+ * so the latter must be loaded only after the helper has been
+ * called.
+ *
+ * The use of *xdp_md*\ **->data_meta** is optional and programs
+ * are not required to use it. The rationale is that when the
+ * packet is processed with XDP (e.g. as DoS filter), it is
+ * possible to push further meta data along with it before passing
+ * to the stack, and to give the guarantee that an ingress eBPF
+ * program attached as a TC classifier on the same device can pick
+ * this up for further post-processing. Since TC works with socket
+ * buffers, it remains possible to set from XDP the **mark** or
+ * **priority** pointers, or other pointers for the socket buffer.
+ * Having this scratch space generic and programmable allows for
+ * more flexibility as the user is free to store whatever meta
+ * data they need.
+ *
+ * A call to this helper is susceptible to change the underlying
+ * packet buffer. Therefore, at load time, all checks on pointers
+ * previously done by the verifier are invalidated and must be
+ * performed again, if the helper is used in combination with
+ * direct packet access.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size)
+ * Description
+ * Read the value of a perf event counter, and store it into *buf*
+ * of size *buf_size*. This helper relies on a *map* of type
+ * **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of the perf event
+ * counter is selected when *map* is updated with perf event file
+ * descriptors. The *map* is an array whose size is the number of
+ * available CPUs, and each cell contains a value relative to one
+ * CPU. The value to retrieve is indicated by *flags*, that
+ * contains the index of the CPU to look up, masked with
+ * **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to
+ * **BPF_F_CURRENT_CPU** to indicate that the value for the
+ * current CPU should be retrieved.
+ *
+ * This helper behaves in a way close to
+ * **bpf_perf_event_read**\ () helper, save that instead of
+ * just returning the value observed, it fills the *buf*
+ * structure. This allows for additional data to be retrieved: in
+ * particular, the enabled and running times (in *buf*\
+ * **->enabled** and *buf*\ **->running**, respectively) are
+ * copied. In general, **bpf_perf_event_read_value**\ () is
+ * recommended over **bpf_perf_event_read**\ (), which has some
+ * ABI issues and provides fewer functionalities.
+ *
+ * These values are interesting, because hardware PMU (Performance
+ * Monitoring Unit) counters are limited resources. When there are
+ * more PMU based perf events opened than available counters,
+ * kernel will multiplex these events so each event gets certain
+ * percentage (but not all) of the PMU time. In case that
+ * multiplexing happens, the number of samples or counter value
+ * will not reflect the case compared to when no multiplexing
+ * occurs. This makes comparison between different runs difficult.
+ * Typically, the counter value should be normalized before
+ * comparing to other experiments. The usual normalization is done
+ * as follows.
+ *
+ * ::
+ *
+ * normalized_counter = counter * t_enabled / t_running
+ *
+ * Where t_enabled is the time enabled for event and t_running is
+ * the time running for event since last normalization. The
+ * enabled and running times are accumulated since the perf event
+ * open. To achieve scaling factor between two invocations of an
+ * eBPF program, users can use CPU id as the key (which is
+ * typical for perf array usage model) to remember the previous
+ * value and do the calculation inside the eBPF program.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size)
+ * Description
+ * For an eBPF program attached to a perf event, retrieve the
+ * value of the event counter associated to *ctx* and store it in
+ * the structure pointed by *buf* and of size *buf_size*. Enabled
+ * and running times are also stored in the structure (see
+ * description of helper **bpf_perf_event_read_value**\ () for
+ * more details).
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_getsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen)
+ * Description
+ * Emulate a call to **getsockopt()** on the socket associated to
+ * *bpf_socket*, which must be a full socket. The *level* at
+ * which the option resides and the name *optname* of the option
+ * must be specified, see **getsockopt(2)** for more information.
+ * The retrieved value is stored in the structure pointed by
+ * *optval* and of length *optlen*.
+ *
+ * *bpf_socket* should be one of the following:
+ *
+ * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**.
+ * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**
+ * and **BPF_CGROUP_INET6_CONNECT**.
+ *
+ * This helper actually implements a subset of **getsockopt()**.
+ * It supports the same set of *optname*\ s that is supported by
+ * the **bpf_setsockopt**\ () helper. The exceptions are
+ * **TCP_BPF_*** is **bpf_setsockopt**\ () only and
+ * **TCP_SAVED_SYN** is **bpf_getsockopt**\ () only.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_override_return(struct pt_regs *regs, u64 rc)
+ * Description
+ * Used for error injection, this helper uses kprobes to override
+ * the return value of the probed function, and to set it to *rc*.
+ * The first argument is the context *regs* on which the kprobe
+ * works.
+ *
+ * This helper works by setting the PC (program counter)
+ * to an override function which is run in place of the original
+ * probed function. This means the probed function is not run at
+ * all. The replacement function just returns with the required
+ * value.
+ *
+ * This helper has security implications, and thus is subject to
+ * restrictions. It is only available if the kernel was compiled
+ * with the **CONFIG_BPF_KPROBE_OVERRIDE** configuration
+ * option, and in this case it only works on functions tagged with
+ * **ALLOW_ERROR_INJECTION** in the kernel code.
+ *
+ * Also, the helper is only available for the architectures having
+ * the CONFIG_FUNCTION_ERROR_INJECTION option. As of this writing,
+ * x86 architecture is the only one to support this feature.
+ * Return
+ * 0
+ *
+ * long bpf_sock_ops_cb_flags_set(struct bpf_sock_ops *bpf_sock, int argval)
+ * Description
+ * Attempt to set the value of the **bpf_sock_ops_cb_flags** field
+ * for the full TCP socket associated to *bpf_sock* to
+ * *argval*.
+ *
+ * The primary use of this field is to determine if there should
+ * be calls to eBPF programs of type
+ * **BPF_PROG_TYPE_SOCK_OPS** at various points in the TCP
+ * code. A program of the same type can change its value, per
+ * connection and as necessary, when the connection is
+ * established. This field is directly accessible for reading, but
+ * this helper must be used for updates in order to return an
+ * error if an eBPF program tries to set a callback that is not
+ * supported in the current kernel.
+ *
+ * *argval* is a flag array which can combine these flags:
+ *
+ * * **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out)
+ * * **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission)
+ * * **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change)
+ * * **BPF_SOCK_OPS_RTT_CB_FLAG** (every RTT)
+ *
+ * Therefore, this function can be used to clear a callback flag by
+ * setting the appropriate bit to zero. e.g. to disable the RTO
+ * callback:
+ *
+ * **bpf_sock_ops_cb_flags_set(bpf_sock,**
+ * **bpf_sock->bpf_sock_ops_cb_flags & ~BPF_SOCK_OPS_RTO_CB_FLAG)**
+ *
+ * Here are some examples of where one could call such eBPF
+ * program:
+ *
+ * * When RTO fires.
+ * * When a packet is retransmitted.
+ * * When the connection terminates.
+ * * When a packet is sent.
+ * * When a packet is received.
+ * Return
+ * Code **-EINVAL** if the socket is not a full TCP socket;
+ * otherwise, a positive number containing the bits that could not
+ * be set is returned (which comes down to 0 if all bits were set
+ * as required).
+ *
+ * long bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags)
+ * Description
+ * This helper is used in programs implementing policies at the
+ * socket level. If the message *msg* is allowed to pass (i.e. if
+ * the verdict eBPF program returns **SK_PASS**), redirect it to
+ * the socket referenced by *map* (of type
+ * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and
+ * egress interfaces can be used for redirection. The
+ * **BPF_F_INGRESS** value in *flags* is used to make the
+ * distinction (ingress path is selected if the flag is present,
+ * egress path otherwise). This is the only flag supported for now.
+ * Return
+ * **SK_PASS** on success, or **SK_DROP** on error.
+ *
+ * long bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes)
+ * Description
+ * For socket policies, apply the verdict of the eBPF program to
+ * the next *bytes* (number of bytes) of message *msg*.
+ *
+ * For example, this helper can be used in the following cases:
+ *
+ * * A single **sendmsg**\ () or **sendfile**\ () system call
+ * contains multiple logical messages that the eBPF program is
+ * supposed to read and for which it should apply a verdict.
+ * * An eBPF program only cares to read the first *bytes* of a
+ * *msg*. If the message has a large payload, then setting up
+ * and calling the eBPF program repeatedly for all bytes, even
+ * though the verdict is already known, would create unnecessary
+ * overhead.
+ *
+ * When called from within an eBPF program, the helper sets a
+ * counter internal to the BPF infrastructure, that is used to
+ * apply the last verdict to the next *bytes*. If *bytes* is
+ * smaller than the current data being processed from a
+ * **sendmsg**\ () or **sendfile**\ () system call, the first
+ * *bytes* will be sent and the eBPF program will be re-run with
+ * the pointer for start of data pointing to byte number *bytes*
+ * **+ 1**. If *bytes* is larger than the current data being
+ * processed, then the eBPF verdict will be applied to multiple
+ * **sendmsg**\ () or **sendfile**\ () calls until *bytes* are
+ * consumed.
+ *
+ * Note that if a socket closes with the internal counter holding
+ * a non-zero value, this is not a problem because data is not
+ * being buffered for *bytes* and is sent as it is received.
+ * Return
+ * 0
+ *
+ * long bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes)
+ * Description
+ * For socket policies, prevent the execution of the verdict eBPF
+ * program for message *msg* until *bytes* (byte number) have been
+ * accumulated.
+ *
+ * This can be used when one needs a specific number of bytes
+ * before a verdict can be assigned, even if the data spans
+ * multiple **sendmsg**\ () or **sendfile**\ () calls. The extreme
+ * case would be a user calling **sendmsg**\ () repeatedly with
+ * 1-byte long message segments. Obviously, this is bad for
+ * performance, but it is still valid. If the eBPF program needs
+ * *bytes* bytes to validate a header, this helper can be used to
+ * prevent the eBPF program from being called again until *bytes* have
+ * been accumulated.
+ * Return
+ * 0
+ *
+ * long bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags)
+ * Description
+ * For socket policies, pull in non-linear data from user space
+ * for *msg* and set pointers *msg*\ **->data** and *msg*\
+ * **->data_end** to *start* and *end* bytes offsets into *msg*,
+ * respectively.
+ *
+ * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a
+ * *msg* it can only parse data that the (**data**, **data_end**)
+ * pointers have already consumed. For **sendmsg**\ () hooks this
+ * is likely the first scatterlist element. But for calls relying
+ * on the **sendpage** handler (e.g. **sendfile**\ ()) this will
+ * be the range (**0**, **0**) because the data is shared with
+ * user space and by default the objective is to avoid allowing
+ * user space to modify data while (or after) eBPF verdict is
+ * being decided. This helper can be used to pull in data and to
+ * set the start and end pointer to given values. Data will be
+ * copied if necessary (i.e. if data was not linear and if start
+ * and end pointers do not point to the same chunk).
+ *
+ * A call to this helper is susceptible to change the underlying
+ * packet buffer. Therefore, at load time, all checks on pointers
+ * previously done by the verifier are invalidated and must be
+ * performed again, if the helper is used in combination with
+ * direct packet access.
+ *
+ * All values for *flags* are reserved for future usage, and must
+ * be left at zero.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_bind(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len)
+ * Description
+ * Bind the socket associated to *ctx* to the address pointed by
+ * *addr*, of length *addr_len*. This allows for making outgoing
+ * connection from the desired IP address, which can be useful for
+ * example when all processes inside a cgroup should use one
+ * single IP address on a host that has multiple IP addresses configured.
+ *
+ * This helper works for IPv4 and IPv6, TCP and UDP sockets. The
+ * domain (*addr*\ **->sa_family**) must be **AF_INET** (or
+ * **AF_INET6**). It's advised to pass zero port (**sin_port**
+ * or **sin6_port**) which triggers IP_BIND_ADDRESS_NO_PORT-like
+ * behavior and lets the kernel efficiently pick up an unused
+ * port as long as 4-tuple is unique. Passing non-zero port might
+ * lead to degraded performance.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta)
+ * Description
+ * Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is
+ * possible to both shrink and grow the packet tail.
+ * Shrinking is done by passing a negative integer as *delta*.
+ *
+ * A call to this helper is susceptible to change the underlying
+ * packet buffer. Therefore, at load time, all checks on pointers
+ * previously done by the verifier are invalidated and must be
+ * performed again, if the helper is used in combination with
+ * direct packet access.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_skb_get_xfrm_state(struct sk_buff *skb, u32 index, struct bpf_xfrm_state *xfrm_state, u32 size, u64 flags)
+ * Description
+ * Retrieve the XFRM state (IP transform framework, see also
+ * **ip-xfrm(8)**) at *index* in XFRM "security path" for *skb*.
+ *
+ * The retrieved value is stored in the **struct bpf_xfrm_state**
+ * pointed by *xfrm_state* and of length *size*.
+ *
+ * All values for *flags* are reserved for future usage, and must
+ * be left at zero.
+ *
+ * This helper is available only if the kernel was compiled with
+ * **CONFIG_XFRM** configuration option.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_get_stack(void *ctx, void *buf, u32 size, u64 flags)
+ * Description
+ * Return a user or a kernel stack in bpf program provided buffer.
+ * To achieve this, the helper needs *ctx*, which is a pointer
+ * to the context on which the tracing program is executed.
+ * To store the stacktrace, the bpf program provides *buf* with
+ * a nonnegative *size*.
+ *
+ * The last argument, *flags*, holds the number of stack frames to
+ * skip (from 0 to 255), masked with
+ * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
+ * the following flags:
+ *
+ * **BPF_F_USER_STACK**
+ * Collect a user space stack instead of a kernel stack.
+ * **BPF_F_USER_BUILD_ID**
+ * Collect (build_id, file_offset) instead of ips for user
+ * stack, only valid if **BPF_F_USER_STACK** is also
+ * specified.
+ *
+ * *file_offset* is an offset relative to the beginning
+ * of the executable or shared object file backing the vma
+ * which the *ip* falls in. It is *not* an offset relative
+ * to that object's base address. Accordingly, it must be
+ * adjusted by adding (sh_addr - sh_offset), where
+ * sh_{addr,offset} correspond to the executable section
+ * containing *file_offset* in the object, for comparisons
+ * to symbols' st_value to be valid.
+ *
+ * **bpf_get_stack**\ () can collect up to
+ * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject
+ * to a sufficiently large buffer size. Note that
+ * this limit can be controlled with the **sysctl** program, and
+ * that it should be manually increased in order to profile long
+ * user stacks (such as stacks for Java programs). To do so, use:
+ *
+ * ::
+ *
+ * # sysctl kernel.perf_event_max_stack=<new value>
+ * Return
+ * The non-negative copied *buf* length equal to or less than
+ * *size* on success, or a negative error in case of failure.
+ *
+ * long bpf_skb_load_bytes_relative(const void *skb, u32 offset, void *to, u32 len, u32 start_header)
+ * Description
+ * This helper is similar to **bpf_skb_load_bytes**\ () in that
+ * it provides an easy way to load *len* bytes from *offset*
+ * from the packet associated to *skb*, into the buffer pointed
+ * by *to*. The difference to **bpf_skb_load_bytes**\ () is that
+ * a fifth argument *start_header* exists in order to select a
+ * base offset to start from. *start_header* can be one of:
+ *
+ * **BPF_HDR_START_MAC**
+ * Base offset to load data from is *skb*'s mac header.
+ * **BPF_HDR_START_NET**
+ * Base offset to load data from is *skb*'s network header.
+ *
+ * In general, "direct packet access" is the preferred method to
+ * access packet data, however, this helper is in particular useful
+ * in socket filters where *skb*\ **->data** does not always point
+ * to the start of the mac header and where "direct packet access"
+ * is not available.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags)
+ * Description
+ * Do FIB lookup in kernel tables using parameters in *params*.
+ * If lookup is successful and result shows packet is to be
+ * forwarded, the neighbor tables are searched for the nexthop.
+ * If successful (i.e., FIB lookup shows forwarding and nexthop
+ * is resolved), the nexthop address is returned in ipv4_dst
+ * or ipv6_dst based on family, smac is set to mac address of
+ * egress device, dmac is set to nexthop mac address, rt_metric
+ * is set to metric from route (IPv4/IPv6 only), and ifindex
+ * is set to the device index of the nexthop from the FIB lookup.
+ *
+ * *plen* argument is the size of the passed in struct.
+ * *flags* argument can be a combination of one or more of the
+ * following values:
+ *
+ * **BPF_FIB_LOOKUP_DIRECT**
+ * Do a direct table lookup vs full lookup using FIB
+ * rules.
+ * **BPF_FIB_LOOKUP_OUTPUT**
+ * Perform lookup from an egress perspective (default is
+ * ingress).
+ *
+ * *ctx* is either **struct xdp_md** for XDP programs or
+ * **struct sk_buff** for tc cls_act programs.
+ * Return
+ * * < 0 if any input argument is invalid
+ * * 0 on success (packet is forwarded, nexthop neighbor exists)
+ * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the
+ * packet is not forwarded or needs assist from full stack
+ *
+ * If lookup fails with BPF_FIB_LKUP_RET_FRAG_NEEDED, then the MTU
+ * was exceeded and output params->mtu_result contains the MTU.
+ *
+ * long bpf_sock_hash_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags)
+ * Description
+ * Add an entry to, or update a sockhash *map* referencing sockets.
+ * The *skops* is used as a new value for the entry associated to
+ * *key*. *flags* is one of:
+ *
+ * **BPF_NOEXIST**
+ * The entry for *key* must not exist in the map.
+ * **BPF_EXIST**
+ * The entry for *key* must already exist in the map.
+ * **BPF_ANY**
+ * No condition on the existence of the entry for *key*.
+ *
+ * If the *map* has eBPF programs (parser and verdict), those will
+ * be inherited by the socket being added. If the socket is
+ * already attached to eBPF programs, this results in an error.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags)
+ * Description
+ * This helper is used in programs implementing policies at the
+ * socket level. If the message *msg* is allowed to pass (i.e. if
+ * the verdict eBPF program returns **SK_PASS**), redirect it to
+ * the socket referenced by *map* (of type
+ * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and
+ * egress interfaces can be used for redirection. The
+ * **BPF_F_INGRESS** value in *flags* is used to make the
+ * distinction (ingress path is selected if the flag is present,
+ * egress path otherwise). This is the only flag supported for now.
+ * Return
+ * **SK_PASS** on success, or **SK_DROP** on error.
+ *
+ * long bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags)
+ * Description
+ * This helper is used in programs implementing policies at the
+ * skb socket level. If the sk_buff *skb* is allowed to pass (i.e.
+ * if the verdict eBPF program returns **SK_PASS**), redirect it
+ * to the socket referenced by *map* (of type
+ * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and
+ * egress interfaces can be used for redirection. The
+ * **BPF_F_INGRESS** value in *flags* is used to make the
+ * distinction (ingress path is selected if the flag is present,
+ * egress otherwise). This is the only flag supported for now.
+ * Return
+ * **SK_PASS** on success, or **SK_DROP** on error.
+ *
+ * long bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len)
+ * Description
+ * Encapsulate the packet associated to *skb* within a Layer 3
+ * protocol header. This header is provided in the buffer at
+ * address *hdr*, with *len* its size in bytes. *type* indicates
+ * the protocol of the header and can be one of:
+ *
+ * **BPF_LWT_ENCAP_SEG6**
+ * IPv6 encapsulation with Segment Routing Header
+ * (**struct ipv6_sr_hdr**). *hdr* only contains the SRH,
+ * the IPv6 header is computed by the kernel.
+ * **BPF_LWT_ENCAP_SEG6_INLINE**
+ * Only works if *skb* contains an IPv6 packet. Insert a
+ * Segment Routing Header (**struct ipv6_sr_hdr**) inside
+ * the IPv6 header.
+ * **BPF_LWT_ENCAP_IP**
+ * IP encapsulation (GRE/GUE/IPIP/etc). The outer header
+ * must be IPv4 or IPv6, followed by zero or more
+ * additional headers, up to **LWT_BPF_MAX_HEADROOM**
+ * total bytes in all prepended headers. Please note that
+ * if **skb_is_gso**\ (*skb*) is true, no more than two
+ * headers can be prepended, and the inner header, if
+ * present, should be either GRE or UDP/GUE.
+ *
+ * **BPF_LWT_ENCAP_SEG6**\ \* types can be called by BPF programs
+ * of type **BPF_PROG_TYPE_LWT_IN**; **BPF_LWT_ENCAP_IP** type can
+ * be called by bpf programs of types **BPF_PROG_TYPE_LWT_IN** and
+ * **BPF_PROG_TYPE_LWT_XMIT**.
+ *
+ * A call to this helper is susceptible to change the underlying
+ * packet buffer. Therefore, at load time, all checks on pointers
+ * previously done by the verifier are invalidated and must be
+ * performed again, if the helper is used in combination with
+ * direct packet access.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len)
+ * Description
+ * Store *len* bytes from address *from* into the packet
+ * associated to *skb*, at *offset*. Only the flags, tag and TLVs
+ * inside the outermost IPv6 Segment Routing Header can be
+ * modified through this helper.
+ *
+ * A call to this helper is susceptible to change the underlying
+ * packet buffer. Therefore, at load time, all checks on pointers
+ * previously done by the verifier are invalidated and must be
+ * performed again, if the helper is used in combination with
+ * direct packet access.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta)
+ * Description
+ * Adjust the size allocated to TLVs in the outermost IPv6
+ * Segment Routing Header contained in the packet associated to
+ * *skb*, at position *offset* by *delta* bytes. Only offsets
+ * after the segments are accepted. *delta* can be either
+ * positive (growing) or negative (shrinking).
+ *
+ * A call to this helper is susceptible to change the underlying
+ * packet buffer. Therefore, at load time, all checks on pointers
+ * previously done by the verifier are invalidated and must be
+ * performed again, if the helper is used in combination with
+ * direct packet access.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len)
+ * Description
+ * Apply an IPv6 Segment Routing action of type *action* to the
+ * packet associated to *skb*. Each action takes a parameter
+ * contained at address *param*, and of length *param_len* bytes.
+ * *action* can be one of:
+ *
+ * **SEG6_LOCAL_ACTION_END_X**
+ * End.X action: Endpoint with Layer-3 cross-connect.
+ * Type of *param*: **struct in6_addr**.
+ * **SEG6_LOCAL_ACTION_END_T**
+ * End.T action: Endpoint with specific IPv6 table lookup.
+ * Type of *param*: **int**.
+ * **SEG6_LOCAL_ACTION_END_B6**
+ * End.B6 action: Endpoint bound to an SRv6 policy.
+ * Type of *param*: **struct ipv6_sr_hdr**.
+ * **SEG6_LOCAL_ACTION_END_B6_ENCAP**
+ * End.B6.Encap action: Endpoint bound to an SRv6
+ * encapsulation policy.
+ * Type of *param*: **struct ipv6_sr_hdr**.
+ *
+ * A call to this helper is susceptible to change the underlying
+ * packet buffer. Therefore, at load time, all checks on pointers
+ * previously done by the verifier are invalidated and must be
+ * performed again, if the helper is used in combination with
+ * direct packet access.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_rc_repeat(void *ctx)
+ * Description
+ * This helper is used in programs implementing IR decoding, to
+ * report a successfully decoded repeat key message. This delays
+ * the generation of a key up event for previously generated
+ * key down event.
+ *
+ * Some IR protocols like NEC have a special IR message for
+ * repeating last button, for when a button is held down.
+ *
+ * The *ctx* should point to the lirc sample as passed into
+ * the program.
+ *
+ * This helper is only available if the kernel was compiled with
+ * the **CONFIG_BPF_LIRC_MODE2** configuration option set to
+ * "**y**".
+ * Return
+ * 0
+ *
+ * long bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle)
+ * Description
+ * This helper is used in programs implementing IR decoding, to
+ * report a successfully decoded key press with *scancode*,
+ * *toggle* value in the given *protocol*. The scancode will be
+ * translated to a keycode using the rc keymap, and reported as
+ * an input key down event. After a period a key up event is
+ * generated. This period can be extended by calling either
+ * **bpf_rc_keydown**\ () again with the same values, or calling
+ * **bpf_rc_repeat**\ ().
+ *
+ * Some protocols include a toggle bit, in case the button was
+ * released and pressed again between consecutive scancodes.
+ *
+ * The *ctx* should point to the lirc sample as passed into
+ * the program.
+ *
+ * The *protocol* is the decoded protocol number (see
+ * **enum rc_proto** for some predefined values).
+ *
+ * This helper is only available if the kernel was compiled with
+ * the **CONFIG_BPF_LIRC_MODE2** configuration option set to
+ * "**y**".
+ * Return
+ * 0
+ *
+ * u64 bpf_skb_cgroup_id(struct sk_buff *skb)
+ * Description
+ * Return the cgroup v2 id of the socket associated with the *skb*.
+ * This is roughly similar to the **bpf_get_cgroup_classid**\ ()
+ * helper for cgroup v1 by providing a tag resp. identifier that
+ * can be matched on or used for map lookups e.g. to implement
+ * policy. The cgroup v2 id of a given path in the hierarchy is
+ * exposed in user space through the f_handle API in order to get
+ * to the same 64-bit id.
+ *
+ * This helper can be used on TC egress path, but not on ingress,
+ * and is available only if the kernel was compiled with the
+ * **CONFIG_SOCK_CGROUP_DATA** configuration option.
+ * Return
+ * The id is returned or 0 in case the id could not be retrieved.
+ *
+ * u64 bpf_get_current_cgroup_id(void)
+ * Description
+ * Get the current cgroup id based on the cgroup within which
+ * the current task is running.
+ * Return
+ * A 64-bit integer containing the current cgroup id based
+ * on the cgroup within which the current task is running.
+ *
+ * void *bpf_get_local_storage(void *map, u64 flags)
+ * Description
+ * Get the pointer to the local storage area.
+ * The type and the size of the local storage is defined
+ * by the *map* argument.
+ * The *flags* meaning is specific for each map type,
+ * and has to be 0 for cgroup local storage.
+ *
+ * Depending on the BPF program type, a local storage area
+ * can be shared between multiple instances of the BPF program,
+ * running simultaneously.
+ *
+ * Users must take care of the synchronization themselves.
+ * For example, by using the **BPF_ATOMIC** instructions to alter
+ * the shared data.
+ * Return
+ * A pointer to the local storage area.
+ *
+ * long bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags)
+ * Description
+ * Select a **SO_REUSEPORT** socket from a
+ * **BPF_MAP_TYPE_REUSEPORT_SOCKARRAY** *map*.
+ * It checks the selected socket is matching the incoming
+ * request in the socket buffer.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level)
+ * Description
+ * Return id of cgroup v2 that is ancestor of cgroup associated
+ * with the *skb* at the *ancestor_level*. The root cgroup is at
+ * *ancestor_level* zero and each step down the hierarchy
+ * increments the level. If *ancestor_level* == level of cgroup
+ * associated with *skb*, then return value will be same as that
+ * of **bpf_skb_cgroup_id**\ ().
+ *
+ * The helper is useful to implement policies based on cgroups
+ * that are upper in hierarchy than immediate cgroup associated
+ * with *skb*.
+ *
+ * The format of returned id and helper limitations are same as in
+ * **bpf_skb_cgroup_id**\ ().
+ * Return
+ * The id is returned or 0 in case the id could not be retrieved.
+ *
+ * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags)
+ * Description
+ * Look for TCP socket matching *tuple*, optionally in a child
+ * network namespace *netns*. The return value must be checked,
+ * and if non-**NULL**, released via **bpf_sk_release**\ ().
+ *
+ * The *ctx* should point to the context of the program, such as
+ * the skb or socket (depending on the hook in use). This is used
+ * to determine the base network namespace for the lookup.
+ *
+ * *tuple_size* must be one of:
+ *
+ * **sizeof**\ (*tuple*\ **->ipv4**)
+ * Look for an IPv4 socket.
+ * **sizeof**\ (*tuple*\ **->ipv6**)
+ * Look for an IPv6 socket.
+ *
+ * If the *netns* is a negative signed 32-bit integer, then the
+ * socket lookup table in the netns associated with the *ctx*
+ * will be used. For the TC hooks, this is the netns of the device
+ * in the skb. For socket hooks, this is the netns of the socket.
+ * If *netns* is any other signed 32-bit value greater than or
+ * equal to zero then it specifies the ID of the netns relative to
+ * the netns associated with the *ctx*. *netns* values beyond the
+ * range of 32-bit integers are reserved for future use.
+ *
+ * All values for *flags* are reserved for future usage, and must
+ * be left at zero.
+ *
+ * This helper is available only if the kernel was compiled with
+ * **CONFIG_NET** configuration option.
+ * Return
+ * Pointer to **struct bpf_sock**, or **NULL** in case of failure.
+ * For sockets with reuseport option, the **struct bpf_sock**
+ * result is from *reuse*\ **->socks**\ [] using the hash of the
+ * tuple.
+ *
+ * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags)
+ * Description
+ * Look for UDP socket matching *tuple*, optionally in a child
+ * network namespace *netns*. The return value must be checked,
+ * and if non-**NULL**, released via **bpf_sk_release**\ ().
+ *
+ * The *ctx* should point to the context of the program, such as
+ * the skb or socket (depending on the hook in use). This is used
+ * to determine the base network namespace for the lookup.
+ *
+ * *tuple_size* must be one of:
+ *
+ * **sizeof**\ (*tuple*\ **->ipv4**)
+ * Look for an IPv4 socket.
+ * **sizeof**\ (*tuple*\ **->ipv6**)
+ * Look for an IPv6 socket.
+ *
+ * If the *netns* is a negative signed 32-bit integer, then the
+ * socket lookup table in the netns associated with the *ctx*
+ * will be used. For the TC hooks, this is the netns of the device
+ * in the skb. For socket hooks, this is the netns of the socket.
+ * If *netns* is any other signed 32-bit value greater than or
+ * equal to zero then it specifies the ID of the netns relative to
+ * the netns associated with the *ctx*. *netns* values beyond the
+ * range of 32-bit integers are reserved for future use.
+ *
+ * All values for *flags* are reserved for future usage, and must
+ * be left at zero.
+ *
+ * This helper is available only if the kernel was compiled with
+ * **CONFIG_NET** configuration option.
+ * Return
+ * Pointer to **struct bpf_sock**, or **NULL** in case of failure.
+ * For sockets with reuseport option, the **struct bpf_sock**
+ * result is from *reuse*\ **->socks**\ [] using the hash of the
+ * tuple.
+ *
+ * long bpf_sk_release(void *sock)
+ * Description
+ * Release the reference held by *sock*. *sock* must be a
+ * non-**NULL** pointer that was returned from
+ * **bpf_sk_lookup_xxx**\ ().
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags)
+ * Description
+ * Push an element *value* in *map*. *flags* is one of:
+ *
+ * **BPF_EXIST**
+ * If the queue/stack is full, the oldest element is
+ * removed to make room for this.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_map_pop_elem(struct bpf_map *map, void *value)
+ * Description
+ * Pop an element from *map*.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_map_peek_elem(struct bpf_map *map, void *value)
+ * Description
+ * Get an element from *map* without removing it.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_msg_push_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags)
+ * Description
+ * For socket policies, insert *len* bytes into *msg* at offset
+ * *start*.
+ *
+ * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a
+ * *msg* it may want to insert metadata or options into the *msg*.
+ * This can later be read and used by any of the lower layer BPF
+ * hooks.
+ *
+ * This helper may fail if under memory pressure (a malloc
+ * fails) in these cases BPF programs will get an appropriate
+ * error and BPF programs will need to handle them.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags)
+ * Description
+ * Will remove *len* bytes from a *msg* starting at byte *start*.
+ * This may result in **ENOMEM** errors under certain situations if
+ * an allocation and copy are required due to a full ring buffer.
+ * However, the helper will try to avoid doing the allocation
+ * if possible. Other errors can occur if input parameters are
+ * invalid either due to *start* byte not being valid part of *msg*
+ * payload and/or *pop* value being too large.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_rc_pointer_rel(void *ctx, s32 rel_x, s32 rel_y)
+ * Description
+ * This helper is used in programs implementing IR decoding, to
+ * report a successfully decoded pointer movement.
+ *
+ * The *ctx* should point to the lirc sample as passed into
+ * the program.
+ *
+ * This helper is only available if the kernel was compiled with
+ * the **CONFIG_BPF_LIRC_MODE2** configuration option set to
+ * "**y**".
+ * Return
+ * 0
+ *
+ * long bpf_spin_lock(struct bpf_spin_lock *lock)
+ * Description
+ * Acquire a spinlock represented by the pointer *lock*, which is
+ * stored as part of a value of a map. Taking the lock allows
+ * safely updating the rest of the fields in that value. The
+ * spinlock can (and must) later be released with a call to
+ * **bpf_spin_unlock**\ (\ *lock*\ ).
+ *
+ * Spinlocks in BPF programs come with a number of restrictions
+ * and constraints:
+ *
+ * * **bpf_spin_lock** objects are only allowed inside maps of
+ * types **BPF_MAP_TYPE_HASH** and **BPF_MAP_TYPE_ARRAY** (this
+ * list could be extended in the future).
+ * * BTF description of the map is mandatory.
+ * * The BPF program can take ONE lock at a time, since taking two
+ * or more could cause deadlocks.
+ * * Only one **struct bpf_spin_lock** is allowed per map element.
+ * * When the lock is taken, calls (either BPF to BPF or helpers)
+ * are not allowed.
+ * * The **BPF_LD_ABS** and **BPF_LD_IND** instructions are not
+ * allowed inside a spinlock-ed region.
+ * * The BPF program MUST call **bpf_spin_unlock**\ () to release
+ * the lock, on all execution paths, before it returns.
+ * * The BPF program can access **struct bpf_spin_lock** only via
+ * the **bpf_spin_lock**\ () and **bpf_spin_unlock**\ ()
+ * helpers. Loading or storing data into the **struct
+ * bpf_spin_lock** *lock*\ **;** field of a map is not allowed.
+ * * To use the **bpf_spin_lock**\ () helper, the BTF description
+ * of the map value must be a struct and have **struct
+ * bpf_spin_lock** *anyname*\ **;** field at the top level.
+ * Nested lock inside another struct is not allowed.
+ * * The **struct bpf_spin_lock** *lock* field in a map value must
+ * be aligned on a multiple of 4 bytes in that value.
+ * * Syscall with command **BPF_MAP_LOOKUP_ELEM** does not copy
+ * the **bpf_spin_lock** field to user space.
+ * * Syscall with command **BPF_MAP_UPDATE_ELEM**, or update from
+ * a BPF program, do not update the **bpf_spin_lock** field.
+ * * **bpf_spin_lock** cannot be on the stack or inside a
+ * networking packet (it can only be inside of a map value).
+ * * **bpf_spin_lock** is available to root only.
+ * * Tracing programs and socket filter programs cannot use
+ * **bpf_spin_lock**\ () due to insufficient preemption checks
+ * (but this may change in the future).
+ * * **bpf_spin_lock** is not allowed in inner maps of map-in-map.
+ * Return
+ * 0
+ *
+ * long bpf_spin_unlock(struct bpf_spin_lock *lock)
+ * Description
+ * Release the *lock* previously locked by a call to
+ * **bpf_spin_lock**\ (\ *lock*\ ).
+ * Return
+ * 0
+ *
+ * struct bpf_sock *bpf_sk_fullsock(struct bpf_sock *sk)
+ * Description
+ * This helper gets a **struct bpf_sock** pointer such
+ * that all the fields in this **bpf_sock** can be accessed.
+ * Return
+ * A **struct bpf_sock** pointer on success, or **NULL** in
+ * case of failure.
+ *
+ * struct bpf_tcp_sock *bpf_tcp_sock(struct bpf_sock *sk)
+ * Description
+ * This helper gets a **struct bpf_tcp_sock** pointer from a
+ * **struct bpf_sock** pointer.
+ * Return
+ * A **struct bpf_tcp_sock** pointer on success, or **NULL** in
+ * case of failure.
+ *
+ * long bpf_skb_ecn_set_ce(struct sk_buff *skb)
+ * Description
+ * Set ECN (Explicit Congestion Notification) field of IP header
+ * to **CE** (Congestion Encountered) if current value is **ECT**
+ * (ECN Capable Transport). Otherwise, do nothing. Works with IPv6
+ * and IPv4.
+ * Return
+ * 1 if the **CE** flag is set (either by the current helper call
+ * or because it was already present), 0 if it is not set.
+ *
+ * struct bpf_sock *bpf_get_listener_sock(struct bpf_sock *sk)
+ * Description
+ * Return a **struct bpf_sock** pointer in **TCP_LISTEN** state.
+ * **bpf_sk_release**\ () is unnecessary and not allowed.
+ * Return
+ * A **struct bpf_sock** pointer on success, or **NULL** in
+ * case of failure.
+ *
+ * struct bpf_sock *bpf_skc_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags)
+ * Description
+ * Look for TCP socket matching *tuple*, optionally in a child
+ * network namespace *netns*. The return value must be checked,
+ * and if non-**NULL**, released via **bpf_sk_release**\ ().
+ *
+ * This function is identical to **bpf_sk_lookup_tcp**\ (), except
+ * that it also returns timewait or request sockets. Use
+ * **bpf_sk_fullsock**\ () or **bpf_tcp_sock**\ () to access the
+ * full structure.
+ *
+ * This helper is available only if the kernel was compiled with
+ * **CONFIG_NET** configuration option.
+ * Return
+ * Pointer to **struct bpf_sock**, or **NULL** in case of failure.
+ * For sockets with reuseport option, the **struct bpf_sock**
+ * result is from *reuse*\ **->socks**\ [] using the hash of the
+ * tuple.
+ *
+ * long bpf_tcp_check_syncookie(void *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len)
+ * Description
+ * Check whether *iph* and *th* contain a valid SYN cookie ACK for
+ * the listening socket in *sk*.
+ *
+ * *iph* points to the start of the IPv4 or IPv6 header, while
+ * *iph_len* contains **sizeof**\ (**struct iphdr**) or
+ * **sizeof**\ (**struct ipv6hdr**).
+ *
+ * *th* points to the start of the TCP header, while *th_len*
+ * contains the length of the TCP header (at least
+ * **sizeof**\ (**struct tcphdr**)).
+ * Return
+ * 0 if *iph* and *th* are a valid SYN cookie ACK, or a negative
+ * error otherwise.
+ *
+ * long bpf_sysctl_get_name(struct bpf_sysctl *ctx, char *buf, size_t buf_len, u64 flags)
+ * Description
+ * Get name of sysctl in /proc/sys/ and copy it into the
+ * program-provided buffer *buf* of size *buf_len*.
+ *
+ * The buffer is always NUL terminated, unless it's zero-sized.
+ *
+ * If *flags* is zero, full name (e.g. "net/ipv4/tcp_mem") is
+ * copied. Use **BPF_F_SYSCTL_BASE_NAME** flag to copy base name
+ * only (e.g. "tcp_mem").
+ * Return
+ * Number of characters copied (not including the trailing NUL).
+ *
+ * **-E2BIG** if the buffer wasn't big enough (*buf* will contain
+ * truncated name in this case).
+ *
+ * long bpf_sysctl_get_current_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len)
+ * Description
+ * Get current value of sysctl as it is presented in /proc/sys
+ * (incl. newline, etc), and copy it as a string into the
+ * program-provided buffer *buf* of size *buf_len*.
+ *
+ * The whole value is copied, no matter what file position user
+ * space issued e.g. sys_read at.
+ *
+ * The buffer is always NUL terminated, unless it's zero-sized.
+ * Return
+ * Number of characters copied (not including the trailing NUL).
+ *
+ * **-E2BIG** if the buffer wasn't big enough (*buf* will contain
+ * truncated value in this case).
+ *
+ * **-EINVAL** if current value was unavailable, e.g. because
+ * sysctl is uninitialized and read returns -EIO for it.
+ *
+ * long bpf_sysctl_get_new_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len)
+ * Description
+ * Get new value being written by user space to sysctl (before
+ * the actual write happens) and copy it as a string into the
+ * program-provided buffer *buf* of size *buf_len*.
+ *
+ * User space may write new value at file position > 0.
+ *
+ * The buffer is always NUL terminated, unless it's zero-sized.
+ * Return
+ * Number of characters copied (not including the trailing NUL).
+ *
+ * **-E2BIG** if the buffer wasn't big enough (*buf* will contain
+ * truncated value in this case).
+ *
+ * **-EINVAL** if sysctl is being read.
+ *
+ * long bpf_sysctl_set_new_value(struct bpf_sysctl *ctx, const char *buf, size_t buf_len)
+ * Description
+ * Override new value being written by user space to sysctl with
+ * value provided by program in buffer *buf* of size *buf_len*.
+ *
+ * *buf* should contain a string in same form as provided by user
+ * space on sysctl write.
+ *
+ * User space may write new value at file position > 0. To override
+ * the whole sysctl value file position should be set to zero.
+ * Return
+ * 0 on success.
+ *
+ * **-E2BIG** if the *buf_len* is too big.
+ *
+ * **-EINVAL** if sysctl is being read.
+ *
+ * long bpf_strtol(const char *buf, size_t buf_len, u64 flags, long *res)
+ * Description
+ * Convert the initial part of the string from buffer *buf* of
+ * size *buf_len* to a long integer according to the given base
+ * and save the result in *res*.
+ *
+ * The string may begin with an arbitrary amount of white space
+ * (as determined by **isspace**\ (3)) followed by a single
+ * optional '**-**' sign.
+ *
+ * Five least significant bits of *flags* encode base, other bits
+ * are currently unused.
+ *
+ * Base must be either 8, 10, 16 or 0 to detect it automatically
+ * similar to user space **strtol**\ (3).
+ * Return
+ * Number of characters consumed on success. Must be positive but
+ * no more than *buf_len*.
+ *
+ * **-EINVAL** if no valid digits were found or unsupported base
+ * was provided.
+ *
+ * **-ERANGE** if resulting value was out of range.
+ *
+ * long bpf_strtoul(const char *buf, size_t buf_len, u64 flags, unsigned long *res)
+ * Description
+ * Convert the initial part of the string from buffer *buf* of
+ * size *buf_len* to an unsigned long integer according to the
+ * given base and save the result in *res*.
+ *
+ * The string may begin with an arbitrary amount of white space
+ * (as determined by **isspace**\ (3)).
+ *
+ * Five least significant bits of *flags* encode base, other bits
+ * are currently unused.
+ *
+ * Base must be either 8, 10, 16 or 0 to detect it automatically
+ * similar to user space **strtoul**\ (3).
+ * Return
+ * Number of characters consumed on success. Must be positive but
+ * no more than *buf_len*.
+ *
+ * **-EINVAL** if no valid digits were found or unsupported base
+ * was provided.
+ *
+ * **-ERANGE** if resulting value was out of range.
+ *
+ * void *bpf_sk_storage_get(struct bpf_map *map, void *sk, void *value, u64 flags)
+ * Description
+ * Get a bpf-local-storage from a *sk*.
+ *
+ * Logically, it could be thought of as getting the value from
+ * a *map* with *sk* as the **key**. From this
+ * perspective, the usage is not much different from
+ * **bpf_map_lookup_elem**\ (*map*, **&**\ *sk*) except this
+ * helper enforces the key must be a full socket and the map must
+ * be a **BPF_MAP_TYPE_SK_STORAGE** also.
+ *
+ * Underneath, the value is stored locally at *sk* instead of
+ * the *map*. The *map* is used as the bpf-local-storage
+ * "type". The bpf-local-storage "type" (i.e. the *map*) is
+ * searched against all bpf-local-storages residing at *sk*.
+ *
+ * *sk* is a kernel **struct sock** pointer for LSM program.
+ * *sk* is a **struct bpf_sock** pointer for other program types.
+ *
+ * An optional *flags* (**BPF_SK_STORAGE_GET_F_CREATE**) can be
+ * used such that a new bpf-local-storage will be
+ * created if one does not exist. *value* can be used
+ * together with **BPF_SK_STORAGE_GET_F_CREATE** to specify
+ * the initial value of a bpf-local-storage. If *value* is
+ * **NULL**, the new bpf-local-storage will be zero initialized.
+ * Return
+ * A bpf-local-storage pointer is returned on success.
+ *
+ * **NULL** if not found or there was an error in adding
+ * a new bpf-local-storage.
+ *
+ * long bpf_sk_storage_delete(struct bpf_map *map, void *sk)
+ * Description
+ * Delete a bpf-local-storage from a *sk*.
+ * Return
+ * 0 on success.
+ *
+ * **-ENOENT** if the bpf-local-storage cannot be found.
+ * **-EINVAL** if sk is not a fullsock (e.g. a request_sock).
+ *
+ * long bpf_send_signal(u32 sig)
+ * Description
+ * Send signal *sig* to the process of the current task.
+ * The signal may be delivered to any of this process's threads.
+ * Return
+ * 0 on success or successfully queued.
+ *
+ * **-EBUSY** if work queue under nmi is full.
+ *
+ * **-EINVAL** if *sig* is invalid.
+ *
+ * **-EPERM** if no permission to send the *sig*.
+ *
+ * **-EAGAIN** if bpf program can try again.
+ *
+ * s64 bpf_tcp_gen_syncookie(void *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len)
+ * Description
+ * Try to issue a SYN cookie for the packet with corresponding
+ * IP/TCP headers, *iph* and *th*, on the listening socket in *sk*.
+ *
+ * *iph* points to the start of the IPv4 or IPv6 header, while
+ * *iph_len* contains **sizeof**\ (**struct iphdr**) or
+ * **sizeof**\ (**struct ipv6hdr**).
+ *
+ * *th* points to the start of the TCP header, while *th_len*
+ * contains the length of the TCP header with options (at least
+ * **sizeof**\ (**struct tcphdr**)).
+ * Return
+ * On success, the lower 32 bits hold the generated SYN cookie,
+ * followed by 16 bits which hold the MSS value for that cookie,
+ * and the top 16 bits are unused.
+ *
+ * On failure, the returned value is one of the following:
+ *
+ * **-EINVAL** SYN cookie cannot be issued due to error
+ *
+ * **-ENOENT** SYN cookie should not be issued (no SYN flood)
+ *
+ * **-EOPNOTSUPP** kernel configuration does not enable SYN cookies
+ *
+ * **-EPROTONOSUPPORT** IP packet version is not 4 or 6
+ *
+ * long bpf_skb_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
+ * Description
+ * Write raw *data* blob into a special BPF perf event held by
+ * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf
+ * event must have the following attributes: **PERF_SAMPLE_RAW**
+ * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and
+ * **PERF_COUNT_SW_BPF_OUTPUT** as **config**.
+ *
+ * The *flags* are used to indicate the index in *map* for which
+ * the value must be put, masked with **BPF_F_INDEX_MASK**.
+ * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU**
+ * to indicate that the index of the current CPU core should be
+ * used.
+ *
+ * The value to write, of *size*, is passed through eBPF stack and
+ * pointed by *data*.
+ *
+ * *ctx* is a pointer to in-kernel struct sk_buff.
+ *
+ * This helper is similar to **bpf_perf_event_output**\ () but
+ * restricted to raw_tracepoint bpf programs.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_probe_read_user(void *dst, u32 size, const void *unsafe_ptr)
+ * Description
+ * Safely attempt to read *size* bytes from user space address
+ * *unsafe_ptr* and store the data in *dst*.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr)
+ * Description
+ * Safely attempt to read *size* bytes from kernel space address
+ * *unsafe_ptr* and store the data in *dst*.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_probe_read_user_str(void *dst, u32 size, const void *unsafe_ptr)
+ * Description
+ * Copy a NUL terminated string from an unsafe user address
+ * *unsafe_ptr* to *dst*. The *size* should include the
+ * terminating NUL byte. In case the string length is smaller than
+ * *size*, the target is not padded with further NUL bytes. If the
+ * string length is larger than *size*, just *size*-1 bytes are
+ * copied and the last byte is set to NUL.
+ *
+ * On success, returns the number of bytes that were written,
+ * including the terminal NUL. This makes this helper useful in
+ * tracing programs for reading strings, and more importantly to
+ * get its length at runtime. See the following snippet:
+ *
+ * ::
+ *
+ * SEC("kprobe/sys_open")
+ * void bpf_sys_open(struct pt_regs *ctx)
+ * {
+ * char buf[PATHLEN]; // PATHLEN is defined to 256
+ * int res = bpf_probe_read_user_str(buf, sizeof(buf),
+ * ctx->di);
+ *
+ * // Consume buf, for example push it to
+ * // userspace via bpf_perf_event_output(); we
+ * // can use res (the string length) as event
+ * // size, after checking its boundaries.
+ * }
+ *
+ * In comparison, using **bpf_probe_read_user**\ () helper here
+ * instead to read the string would require to estimate the length
+ * at compile time, and would often result in copying more memory
+ * than necessary.
+ *
+ * Another useful use case is when parsing individual process
+ * arguments or individual environment variables navigating
+ * *current*\ **->mm->arg_start** and *current*\
+ * **->mm->env_start**: using this helper and the return value,
+ * one can quickly iterate at the right offset of the memory area.
+ * Return
+ * On success, the strictly positive length of the output string,
+ * including the trailing NUL character. On error, a negative
+ * value.
+ *
+ * long bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr)
+ * Description
+ * Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr*
+ * to *dst*. Same semantics as with **bpf_probe_read_user_str**\ () apply.
+ * Return
+ * On success, the strictly positive length of the string, including
+ * the trailing NUL character. On error, a negative value.
+ *
+ * long bpf_tcp_send_ack(void *tp, u32 rcv_nxt)
+ * Description
+ * Send out a tcp-ack. *tp* is the in-kernel struct **tcp_sock**.
+ * *rcv_nxt* is the ack_seq to be sent out.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_send_signal_thread(u32 sig)
+ * Description
+ * Send signal *sig* to the thread corresponding to the current task.
+ * Return
+ * 0 on success or successfully queued.
+ *
+ * **-EBUSY** if work queue under nmi is full.
+ *
+ * **-EINVAL** if *sig* is invalid.
+ *
+ * **-EPERM** if no permission to send the *sig*.
+ *
+ * **-EAGAIN** if bpf program can try again.
+ *
+ * u64 bpf_jiffies64(void)
+ * Description
+ * Obtain the 64-bit jiffies.
+ * Return
+ * The 64-bit jiffies.
+ *
+ * long bpf_read_branch_records(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags)
+ * Description
+ * For an eBPF program attached to a perf event, retrieve the
+ * branch records (**struct perf_branch_entry**) associated to *ctx*
+ * and store it in the buffer pointed by *buf* up to size
+ * *size* bytes.
+ * Return
+ * On success, number of bytes written to *buf*. On error, a
+ * negative value.
+ *
+ * The *flags* can be set to **BPF_F_GET_BRANCH_RECORDS_SIZE** to
+ * instead return the number of bytes required to store all the
+ * branch entries. If this flag is set, *buf* may be NULL.
+ *
+ * **-EINVAL** if arguments invalid or **size** not a multiple
+ * of **sizeof**\ (**struct perf_branch_entry**\ ).
+ *
+ * **-ENOENT** if architecture does not support branch records.
+ *
+ * long bpf_get_ns_current_pid_tgid(u64 dev, u64 ino, struct bpf_pidns_info *nsdata, u32 size)
+ * Description
+ * Returns 0 on success, values for *pid* and *tgid* as seen from the current
+ * *namespace* will be returned in *nsdata*.
+ * Return
+ * 0 on success, or one of the following in case of failure:
+ *
+ * **-EINVAL** if dev and inum supplied don't match dev_t and inode number
+ * with nsfs of current task, or if dev conversion to dev_t lost high bits.
+ *
+ * **-ENOENT** if pidns does not exist for the current task.
+ *
+ * long bpf_xdp_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size)
+ * Description
+ * Write raw *data* blob into a special BPF perf event held by
+ * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf
+ * event must have the following attributes: **PERF_SAMPLE_RAW**
+ * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and
+ * **PERF_COUNT_SW_BPF_OUTPUT** as **config**.
+ *
+ * The *flags* are used to indicate the index in *map* for which
+ * the value must be put, masked with **BPF_F_INDEX_MASK**.
+ * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU**
+ * to indicate that the index of the current CPU core should be
+ * used.
+ *
+ * The value to write, of *size*, is passed through eBPF stack and
+ * pointed by *data*.
+ *
+ * *ctx* is a pointer to in-kernel struct xdp_buff.
+ *
+ * This helper is similar to **bpf_perf_event_output**\ () but
+ * restricted to raw_tracepoint bpf programs.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * u64 bpf_get_netns_cookie(void *ctx)
+ * Description
+ * Retrieve the cookie (generated by the kernel) of the network
+ * namespace the input *ctx* is associated with. The network
+ * namespace cookie remains stable for its lifetime and provides
+ * a global identifier that can be assumed unique. If *ctx* is
+ * NULL, then the helper returns the cookie for the initial
+ * network namespace. The cookie itself is very similar to that
+ * of **bpf_get_socket_cookie**\ () helper, but for network
+ * namespaces instead of sockets.
+ * Return
+ * An 8-byte long opaque number.
+ *
+ * u64 bpf_get_current_ancestor_cgroup_id(int ancestor_level)
+ * Description
+ * Return id of cgroup v2 that is ancestor of the cgroup associated
+ * with the current task at the *ancestor_level*. The root cgroup
+ * is at *ancestor_level* zero and each step down the hierarchy
+ * increments the level. If *ancestor_level* == level of cgroup
+ * associated with the current task, then return value will be the
+ * same as that of **bpf_get_current_cgroup_id**\ ().
+ *
+ * The helper is useful to implement policies based on cgroups
+ * that are upper in hierarchy than immediate cgroup associated
+ * with the current task.
+ *
+ * The format of returned id and helper limitations are same as in
+ * **bpf_get_current_cgroup_id**\ ().
+ * Return
+ * The id is returned or 0 in case the id could not be retrieved.
+ *
+ * long bpf_sk_assign(struct sk_buff *skb, void *sk, u64 flags)
+ * Description
+ * Helper is overloaded depending on BPF program type. This
+ * description applies to **BPF_PROG_TYPE_SCHED_CLS** and
+ * **BPF_PROG_TYPE_SCHED_ACT** programs.
+ *
+ * Assign the *sk* to the *skb*. When combined with appropriate
+ * routing configuration to receive the packet towards the socket,
+ * will cause *skb* to be delivered to the specified socket.
+ * Subsequent redirection of *skb* via **bpf_redirect**\ (),
+ * **bpf_clone_redirect**\ () or other methods outside of BPF may
+ * interfere with successful delivery to the socket.
+ *
+ * This operation is only valid from TC ingress path.
+ *
+ * The *flags* argument must be zero.
+ * Return
+ * 0 on success, or a negative error in case of failure:
+ *
+ * **-EINVAL** if specified *flags* are not supported.
+ *
+ * **-ENOENT** if the socket is unavailable for assignment.
+ *
+ * **-ENETUNREACH** if the socket is unreachable (wrong netns).
+ *
+ * **-EOPNOTSUPP** if the operation is not supported, for example
+ * a call from outside of TC ingress.
+ *
+ * **-ESOCKTNOSUPPORT** if the socket type is not supported
+ * (reuseport).
+ *
+ * long bpf_sk_assign(struct bpf_sk_lookup *ctx, struct bpf_sock *sk, u64 flags)
+ * Description
+ * Helper is overloaded depending on BPF program type. This
+ * description applies to **BPF_PROG_TYPE_SK_LOOKUP** programs.
+ *
+ * Select the *sk* as a result of a socket lookup.
+ *
+ * For the operation to succeed passed socket must be compatible
+ * with the packet description provided by the *ctx* object.
+ *
+ * L4 protocol (**IPPROTO_TCP** or **IPPROTO_UDP**) must
+ * be an exact match. While IP family (**AF_INET** or
+ * **AF_INET6**) must be compatible, that is IPv6 sockets
+ * that are not v6-only can be selected for IPv4 packets.
+ *
+ * Only TCP listeners and UDP unconnected sockets can be
+ * selected. *sk* can also be NULL to reset any previous
+ * selection.
+ *
+ * *flags* argument can be a combination of the following values:
+ *
+ * * **BPF_SK_LOOKUP_F_REPLACE** to override the previous
+ * socket selection, potentially done by a BPF program
+ * that ran before us.
+ *
+ * * **BPF_SK_LOOKUP_F_NO_REUSEPORT** to skip
+ * load-balancing within reuseport group for the socket
+ * being selected.
+ *
+ * On success *ctx->sk* will point to the selected socket.
+ *
+ * Return
+ * 0 on success, or a negative errno in case of failure.
+ *
+ * * **-EAFNOSUPPORT** if socket family (*sk->family*) is
+ * not compatible with packet family (*ctx->family*).
+ *
+ * * **-EEXIST** if socket has been already selected,
+ * potentially by another program, and
+ * **BPF_SK_LOOKUP_F_REPLACE** flag was not specified.
+ *
+ * * **-EINVAL** if unsupported flags were specified.
+ *
+ * * **-EPROTOTYPE** if socket L4 protocol
+ * (*sk->protocol*) doesn't match packet protocol
+ * (*ctx->protocol*).
+ *
+ * * **-ESOCKTNOSUPPORT** if socket is not in allowed
+ * state (TCP listening or UDP unconnected).
+ *
+ * u64 bpf_ktime_get_boot_ns(void)
+ * Description
+ * Return the time elapsed since system boot, in nanoseconds.
+ * Does include the time the system was suspended.
+ * See: **clock_gettime**\ (**CLOCK_BOOTTIME**)
+ * Return
+ * Current *ktime*.
+ *
+ * long bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len)
+ * Description
+ * **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print
+ * out the format string.
+ * The *m* represents the seq_file. The *fmt* and *fmt_size* are for
+ * the format string itself. The *data* and *data_len* are format string
+ * arguments. The *data* are a **u64** array and corresponding format string
+ * values are stored in the array. For strings and pointers where pointees
+ * are accessed, only the pointer values are stored in the *data* array.
+ * The *data_len* is the size of *data* in bytes - must be a multiple of 8.
+ *
+ * Formats **%s** and **%p{i,I}{4,6}** require reading kernel memory.
+ * Reading kernel memory may fail due to either invalid address or
+ * valid address but requiring a major memory fault. If reading kernel memory
+ * fails, the string for **%s** will be an empty string, and the ip
+ * address for **%p{i,I}{4,6}** will be 0. Not returning error to
+ * bpf program is consistent with what **bpf_trace_printk**\ () does for now.
+ * Return
+ * 0 on success, or a negative error in case of failure:
+ *
+ * **-EBUSY** if per-CPU memory copy buffer is busy, can try again
+ * by returning 1 from bpf program.
+ *
+ * **-EINVAL** if arguments are invalid, or if *fmt* is invalid/unsupported.
+ *
+ * **-E2BIG** if *fmt* contains too many format specifiers.
+ *
+ * **-EOVERFLOW** if an overflow happened: The same object will be tried again.
+ *
+ * long bpf_seq_write(struct seq_file *m, const void *data, u32 len)
+ * Description
+ * **bpf_seq_write**\ () uses seq_file **seq_write**\ () to write the data.
+ * The *m* represents the seq_file. The *data* and *len* represent the
+ * data to write in bytes.
+ * Return
+ * 0 on success, or a negative error in case of failure:
+ *
+ * **-EOVERFLOW** if an overflow happened: The same object will be tried again.
+ *
+ * u64 bpf_sk_cgroup_id(void *sk)
+ * Description
+ * Return the cgroup v2 id of the socket *sk*.
+ *
+ * *sk* must be a non-**NULL** pointer to a socket, e.g. one
+ * returned from **bpf_sk_lookup_xxx**\ (),
+ * **bpf_sk_fullsock**\ (), etc. The format of returned id is
+ * same as in **bpf_skb_cgroup_id**\ ().
+ *
+ * This helper is available only if the kernel was compiled with
+ * the **CONFIG_SOCK_CGROUP_DATA** configuration option.
+ * Return
+ * The id is returned or 0 in case the id could not be retrieved.
+ *
+ * u64 bpf_sk_ancestor_cgroup_id(void *sk, int ancestor_level)
+ * Description
+ * Return id of cgroup v2 that is ancestor of cgroup associated
+ * with the *sk* at the *ancestor_level*. The root cgroup is at
+ * *ancestor_level* zero and each step down the hierarchy
+ * increments the level. If *ancestor_level* == level of cgroup
+ * associated with *sk*, then return value will be same as that
+ * of **bpf_sk_cgroup_id**\ ().
+ *
+ * The helper is useful to implement policies based on cgroups
+ * that are upper in hierarchy than immediate cgroup associated
+ * with *sk*.
+ *
+ * The format of returned id and helper limitations are same as in
+ * **bpf_sk_cgroup_id**\ ().
+ * Return
+ * The id is returned or 0 in case the id could not be retrieved.
+ *
+ * long bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags)
+ * Description
+ * Copy *size* bytes from *data* into a ring buffer *ringbuf*.
+ * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification
+ * of new data availability is sent.
+ * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification
+ * of new data availability is sent unconditionally.
+ * If **0** is specified in *flags*, an adaptive notification
+ * of new data availability is sent.
+ *
+ * An adaptive notification is a notification sent whenever the user-space
+ * process has caught up and consumed all available payloads. In case the user-space
+ * process is still processing a previous payload, then no notification is needed
+ * as it will process the newly added payload automatically.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags)
+ * Description
+ * Reserve *size* bytes of payload in a ring buffer *ringbuf*.
+ * *flags* must be 0.
+ * Return
+ * Valid pointer with *size* bytes of memory available; NULL,
+ * otherwise.
+ *
+ * void bpf_ringbuf_submit(void *data, u64 flags)
+ * Description
+ * Submit reserved ring buffer sample, pointed to by *data*.
+ * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification
+ * of new data availability is sent.
+ * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification
+ * of new data availability is sent unconditionally.
+ * If **0** is specified in *flags*, an adaptive notification
+ * of new data availability is sent.
+ *
+ * See 'bpf_ringbuf_output()' for the definition of adaptive notification.
+ * Return
+ * Nothing. Always succeeds.
+ *
+ * void bpf_ringbuf_discard(void *data, u64 flags)
+ * Description
+ * Discard reserved ring buffer sample, pointed to by *data*.
+ * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification
+ * of new data availability is sent.
+ * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification
+ * of new data availability is sent unconditionally.
+ * If **0** is specified in *flags*, an adaptive notification
+ * of new data availability is sent.
+ *
+ * See 'bpf_ringbuf_output()' for the definition of adaptive notification.
+ * Return
+ * Nothing. Always succeeds.
+ *
+ * u64 bpf_ringbuf_query(void *ringbuf, u64 flags)
+ * Description
+ * Query various characteristics of the provided ring buffer. What
+ * exactly is queried is determined by *flags*:
+ *
+ * * **BPF_RB_AVAIL_DATA**: Amount of data not yet consumed.
+ * * **BPF_RB_RING_SIZE**: The size of ring buffer.
+ * * **BPF_RB_CONS_POS**: Consumer position (can wrap around).
+ * * **BPF_RB_PROD_POS**: Producer(s) position (can wrap around).
+ *
+ * Data returned is just a momentary snapshot of actual values
+ * and could be inaccurate, so this facility should be used to
+ * power heuristics and for reporting, not to make 100% correct
+ * calculation.
+ * Return
+ * Requested value, or 0, if *flags* are not recognized.
+ *
+ * long bpf_csum_level(struct sk_buff *skb, u64 level)
+ * Description
+ * Change the skb's checksum level by one layer up or down, or
+ * reset it entirely to none in order to have the stack perform
+ * checksum validation. The level is applicable to the following
+ * protocols: TCP, UDP, GRE, SCTP, FCOE. For example, a decap of
+ * | ETH | IP | UDP | GUE | IP | TCP | into | ETH | IP | TCP |
+ * through **bpf_skb_adjust_room**\ () helper with passing in
+ * **BPF_F_ADJ_ROOM_NO_CSUM_RESET** flag would require one call
+ * to **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_DEC** since
+ * the UDP header is removed. Similarly, an encap of the latter
+ * into the former could be accompanied by a helper call to
+ * **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_INC** if the
+ * skb is still intended to be processed in higher layers of the
+ * stack instead of just egressing at tc.
+ *
+ * There are four supported level settings at this time:
+ *
+ * * **BPF_CSUM_LEVEL_INC**: Increases skb->csum_level for skbs
+ * with CHECKSUM_UNNECESSARY.
+ * * **BPF_CSUM_LEVEL_DEC**: Decreases skb->csum_level for skbs
+ * with CHECKSUM_UNNECESSARY.
+ * * **BPF_CSUM_LEVEL_RESET**: Resets skb->csum_level to 0 and
+ * sets CHECKSUM_NONE to force checksum validation by the stack.
+ * * **BPF_CSUM_LEVEL_QUERY**: No-op, returns the current
+ * skb->csum_level.
+ * Return
+ * 0 on success, or a negative error in case of failure. In the
+ * case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level
+ * is returned or the error code -EACCES in case the skb is not
+ * subject to CHECKSUM_UNNECESSARY.
+ *
+ * struct tcp6_sock *bpf_skc_to_tcp6_sock(void *sk)
+ * Description
+ * Dynamically cast a *sk* pointer to a *tcp6_sock* pointer.
+ * Return
+ * *sk* if casting is valid, or **NULL** otherwise.
+ *
+ * struct tcp_sock *bpf_skc_to_tcp_sock(void *sk)
+ * Description
+ * Dynamically cast a *sk* pointer to a *tcp_sock* pointer.
+ * Return
+ * *sk* if casting is valid, or **NULL** otherwise.
+ *
+ * struct tcp_timewait_sock *bpf_skc_to_tcp_timewait_sock(void *sk)
+ * Description
+ * Dynamically cast a *sk* pointer to a *tcp_timewait_sock* pointer.
+ * Return
+ * *sk* if casting is valid, or **NULL** otherwise.
+ *
+ * struct tcp_request_sock *bpf_skc_to_tcp_request_sock(void *sk)
+ * Description
+ * Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer.
+ * Return
+ * *sk* if casting is valid, or **NULL** otherwise.
+ *
+ * struct udp6_sock *bpf_skc_to_udp6_sock(void *sk)
+ * Description
+ * Dynamically cast a *sk* pointer to a *udp6_sock* pointer.
+ * Return
+ * *sk* if casting is valid, or **NULL** otherwise.
+ *
+ * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags)
+ * Description
+ * Return a user or a kernel stack in bpf program provided buffer.
+ * To achieve this, the helper needs *task*, which is a valid
+ * pointer to **struct task_struct**. To store the stacktrace, the
+ * bpf program provides *buf* with a nonnegative *size*.
+ *
+ * The last argument, *flags*, holds the number of stack frames to
+ * skip (from 0 to 255), masked with
+ * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
+ * the following flags:
+ *
+ * **BPF_F_USER_STACK**
+ * Collect a user space stack instead of a kernel stack.
+ * **BPF_F_USER_BUILD_ID**
+ * Collect buildid+offset instead of ips for user stack,
+ * only valid if **BPF_F_USER_STACK** is also specified.
+ *
+ * **bpf_get_task_stack**\ () can collect up to
+ * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject
+ * to a sufficiently large buffer size. Note that
+ * this limit can be controlled with the **sysctl** program, and
+ * that it should be manually increased in order to profile long
+ * user stacks (such as stacks for Java programs). To do so, use:
+ *
+ * ::
+ *
+ * # sysctl kernel.perf_event_max_stack=<new value>
+ * Return
+ * The non-negative copied *buf* length equal to or less than
+ * *size* on success, or a negative error in case of failure.
+ *
+ * long bpf_load_hdr_opt(struct bpf_sock_ops *skops, void *searchby_res, u32 len, u64 flags)
+ * Description
+ * Load header option. Support reading a particular TCP header
+ * option for bpf program (**BPF_PROG_TYPE_SOCK_OPS**).
+ *
+ * If *flags* is 0, it will search the option from the
+ * *skops*\ **->skb_data**. The comment in **struct bpf_sock_ops**
+ * has details on what skb_data contains under different
+ * *skops*\ **->op**.
+ *
+ * The first byte of the *searchby_res* specifies the
+ * kind that it wants to search.
+ *
+ * If the searching kind is an experimental kind
+ * (i.e. 253 or 254 according to RFC6994), it also
+ * needs to specify the "magic" which is either
+ * 2 bytes or 4 bytes. It then also needs to
+ * specify the size of the magic by using
+ * the 2nd byte which is "kind-length" of a TCP
+ * header option and the "kind-length" also
+ * includes the first 2 bytes "kind" and "kind-length"
+ * itself as a normal TCP header option also does.
+ *
+ * For example, to search experimental kind 254 with
+ * 2 byte magic 0xeB9F, the searchby_res should be
+ * [ 254, 4, 0xeB, 0x9F, 0, 0, .... 0 ].
+ *
+ * To search for the standard window scale option (3),
+ * the *searchby_res* should be [ 3, 0, 0, .... 0 ].
+ * Note, kind-length must be 0 for regular option.
+ *
+ * Searching for No-Op (0) and End-of-Option-List (1) are
+ * not supported.
+ *
+ * *len* must be at least 2 bytes which is the minimal size
+ * of a header option.
+ *
+ * Supported flags:
+ *
+ * * **BPF_LOAD_HDR_OPT_TCP_SYN** to search from the
+ * saved_syn packet or the just-received syn packet.
+ *
+ * Return
+ * > 0 when found, the header option is copied to *searchby_res*.
+ * The return value is the total length copied. On failure, a
+ * negative error code is returned:
+ *
+ * **-EINVAL** if a parameter is invalid.
+ *
+ * **-ENOMSG** if the option is not found.
+ *
+ * **-ENOENT** if no syn packet is available when
+ * **BPF_LOAD_HDR_OPT_TCP_SYN** is used.
+ *
+ * **-ENOSPC** if there is not enough space. Only *len* number of
+ * bytes are copied.
+ *
+ * **-EFAULT** on failure to parse the header options in the
+ * packet.
+ *
+ * **-EPERM** if the helper cannot be used under the current
+ * *skops*\ **->op**.
+ *
+ * long bpf_store_hdr_opt(struct bpf_sock_ops *skops, const void *from, u32 len, u64 flags)
+ * Description
+ * Store header option. The data will be copied
+ * from buffer *from* with length *len* to the TCP header.
+ *
+ * The buffer *from* should have the whole option that
+ * includes the kind, kind-length, and the actual
+ * option data. The *len* must be at least kind-length
+ * long. The kind-length does not have to be 4 byte
+ * aligned. The kernel will take care of the padding
+ * and setting the 4 bytes aligned value to th->doff.
+ *
+ * This helper will check for duplicated option
+ * by searching the same option in the outgoing skb.
+ *
+ * This helper can only be called during
+ * **BPF_SOCK_OPS_WRITE_HDR_OPT_CB**.
+ *
+ * Return
+ * 0 on success, or negative error in case of failure:
+ *
+ * **-EINVAL** if a parameter is invalid.
+ *
+ * **-ENOSPC** if there is not enough space in the header.
+ * Nothing has been written.
+ *
+ * **-EEXIST** if the option already exists.
+ *
+ * **-EFAULT** on failure to parse the existing header options.
+ *
+ * **-EPERM** if the helper cannot be used under the current
+ * *skops*\ **->op**.
+ *
+ * long bpf_reserve_hdr_opt(struct bpf_sock_ops *skops, u32 len, u64 flags)
+ * Description
+ * Reserve *len* bytes for the bpf header option. The
+ * space will be used by **bpf_store_hdr_opt**\ () later in
+ * **BPF_SOCK_OPS_WRITE_HDR_OPT_CB**.
+ *
+ * If **bpf_reserve_hdr_opt**\ () is called multiple times,
+ * the total number of bytes will be reserved.
+ *
+ * This helper can only be called during
+ * **BPF_SOCK_OPS_HDR_OPT_LEN_CB**.
+ *
+ * Return
+ * 0 on success, or negative error in case of failure:
+ *
+ * **-EINVAL** if a parameter is invalid.
+ *
+ * **-ENOSPC** if there is not enough space in the header.
+ *
+ * **-EPERM** if the helper cannot be used under the current
+ * *skops*\ **->op**.
+ *
+ * void *bpf_inode_storage_get(struct bpf_map *map, void *inode, void *value, u64 flags)
+ * Description
+ * Get a bpf_local_storage from an *inode*.
+ *
+ * Logically, it could be thought of as getting the value from
+ * a *map* with *inode* as the **key**. From this
+ * perspective, the usage is not much different from
+ * **bpf_map_lookup_elem**\ (*map*, **&**\ *inode*) except this
+ * helper enforces the key must be an inode and the map must also
+ * be a **BPF_MAP_TYPE_INODE_STORAGE**.
+ *
+ * Underneath, the value is stored locally at *inode* instead of
+ * the *map*. The *map* is used as the bpf-local-storage
+ * "type". The bpf-local-storage "type" (i.e. the *map*) is
+ * searched against all bpf_local_storage residing at *inode*.
+ *
+ * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be
+ * used such that a new bpf_local_storage will be
+ * created if one does not exist. *value* can be used
+ * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify
+ * the initial value of a bpf_local_storage. If *value* is
+ * **NULL**, the new bpf_local_storage will be zero initialized.
+ * Return
+ * A bpf_local_storage pointer is returned on success.
+ *
+ * **NULL** if not found or there was an error in adding
+ * a new bpf_local_storage.
+ *
+ * int bpf_inode_storage_delete(struct bpf_map *map, void *inode)
+ * Description
+ * Delete a bpf_local_storage from an *inode*.
+ * Return
+ * 0 on success.
+ *
+ * **-ENOENT** if the bpf_local_storage cannot be found.
+ *
+ * long bpf_d_path(struct path *path, char *buf, u32 sz)
+ * Description
+ * Return full path for given **struct path** object, which
+ * needs to be the kernel BTF *path* object. The path is
+ * returned in the provided buffer *buf* of size *sz* and
+ * is zero terminated.
+ *
+ * Return
+ * On success, the strictly positive length of the string,
+ * including the trailing NUL character. On error, a negative
+ * value.
+ *
+ * long bpf_copy_from_user(void *dst, u32 size, const void *user_ptr)
+ * Description
+ * Read *size* bytes from user space address *user_ptr* and store
+ * the data in *dst*. This is a wrapper of **copy_from_user**\ ().
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_snprintf_btf(char *str, u32 str_size, struct btf_ptr *ptr, u32 btf_ptr_size, u64 flags)
+ * Description
+ * Use BTF to store a string representation of *ptr*->ptr in *str*,
+ * using *ptr*->type_id. This value should specify the type
+ * that *ptr*->ptr points to. LLVM __builtin_btf_type_id(type, 1)
+ * can be used to look up vmlinux BTF type ids. Traversing the
+ * data structure using BTF, the type information and values are
+ * stored in the first *str_size* - 1 bytes of *str*. Safe copy of
+ * the pointer data is carried out to avoid kernel crashes during
+ * operation. Smaller types can use string space on the stack;
+ * larger programs can use map data to store the string
+ * representation.
+ *
+ * The string can be subsequently shared with userspace via
+ * bpf_perf_event_output() or ring buffer interfaces.
+ * bpf_trace_printk() is to be avoided as it places too small
+ * a limit on string size to be useful.
+ *
+ * *flags* is a combination of
+ *
+ * **BTF_F_COMPACT**
+ * no formatting around type information
+ * **BTF_F_NONAME**
+ * no struct/union member names/types
+ * **BTF_F_PTR_RAW**
+ * show raw (unobfuscated) pointer values;
+ * equivalent to printk specifier %px.
+ * **BTF_F_ZERO**
+ * show zero-valued struct/union members; they
+ * are not displayed by default
+ *
+ * Return
+ * The number of bytes that were written (or would have been
+ * written if output had to be truncated due to string size),
+ * or a negative error in cases of failure.
+ *
+ * long bpf_seq_printf_btf(struct seq_file *m, struct btf_ptr *ptr, u32 ptr_size, u64 flags)
+ * Description
+ * Use BTF to write to seq_write a string representation of
+ * *ptr*->ptr, using *ptr*->type_id as per bpf_snprintf_btf().
+ * *flags* are identical to those used for bpf_snprintf_btf.
+ * Return
+ * 0 on success or a negative error in case of failure.
+ *
+ * u64 bpf_skb_cgroup_classid(struct sk_buff *skb)
+ * Description
+ * See **bpf_get_cgroup_classid**\ () for the main description.
+ * This helper differs from **bpf_get_cgroup_classid**\ () in that
+ * the cgroup v1 net_cls class is retrieved only from the *skb*'s
+ * associated socket instead of the current process.
+ * Return
+ * The id is returned or 0 in case the id could not be retrieved.
+ *
+ * long bpf_redirect_neigh(u32 ifindex, struct bpf_redir_neigh *params, int plen, u64 flags)
+ * Description
+ * Redirect the packet to another net device of index *ifindex*
+ * and fill in L2 addresses from neighboring subsystem. This helper
+ * is somewhat similar to **bpf_redirect**\ (), except that it
+ * populates L2 addresses as well, meaning, internally, the helper
+ * relies on the neighbor lookup for the L2 address of the nexthop.
+ *
+ * The helper will perform a FIB lookup based on the skb's
+ * networking header to get the address of the next hop, unless
+ * this is supplied by the caller in the *params* argument. The
+ * *plen* argument indicates the len of *params* and should be set
+ * to 0 if *params* is NULL.
+ *
+ * The *flags* argument is reserved and must be 0. The helper is
+ * currently only supported for tc BPF program types, and enabled
+ * for IPv4 and IPv6 protocols.
+ * Return
+ * The helper returns **TC_ACT_REDIRECT** on success or
+ * **TC_ACT_SHOT** on error.
+ *
+ * void *bpf_per_cpu_ptr(const void *percpu_ptr, u32 cpu)
+ * Description
+ * Take a pointer to a percpu ksym, *percpu_ptr*, and return a
+ * pointer to the percpu kernel variable on *cpu*. A ksym is an
+ * extern variable decorated with '__ksym'. For ksym, there is a
+ * global var (either static or global) defined of the same name
+ * in the kernel. The ksym is percpu if the global var is percpu.
+ * The returned pointer points to the global percpu var on *cpu*.
+ *
+ * bpf_per_cpu_ptr() has the same semantic as per_cpu_ptr() in the
+ * kernel, except that bpf_per_cpu_ptr() may return NULL. This
+ * happens if *cpu* is larger than nr_cpu_ids. The caller of
+ * bpf_per_cpu_ptr() must check the returned value.
+ * Return
+ * A pointer pointing to the kernel percpu variable on *cpu*, or
+ * NULL, if *cpu* is invalid.
+ *
+ * void *bpf_this_cpu_ptr(const void *percpu_ptr)
+ * Description
+ * Take a pointer to a percpu ksym, *percpu_ptr*, and return a
+ * pointer to the percpu kernel variable on this cpu. See the
+ * description of 'ksym' in **bpf_per_cpu_ptr**\ ().
+ *
+ * bpf_this_cpu_ptr() has the same semantic as this_cpu_ptr() in
+ * the kernel. Different from **bpf_per_cpu_ptr**\ (), it would
+ * never return NULL.
+ * Return
+ * A pointer pointing to the kernel percpu variable on this cpu.
+ *
+ * long bpf_redirect_peer(u32 ifindex, u64 flags)
+ * Description
+ * Redirect the packet to another net device of index *ifindex*.
+ * This helper is somewhat similar to **bpf_redirect**\ (), except
+ * that the redirection happens to the *ifindex*' peer device and
+ * the netns switch takes place from ingress to ingress without
+ * going through the CPU's backlog queue.
+ *
+ * The *flags* argument is reserved and must be 0. The helper is
+ * currently only supported for tc BPF program types at the ingress
+ * hook and for veth device types. The peer device must reside in a
+ * different network namespace.
+ * Return
+ * The helper returns **TC_ACT_REDIRECT** on success or
+ * **TC_ACT_SHOT** on error.
+ *
+ * void *bpf_task_storage_get(struct bpf_map *map, struct task_struct *task, void *value, u64 flags)
+ * Description
+ * Get a bpf_local_storage from the *task*.
+ *
+ * Logically, it could be thought of as getting the value from
+ * a *map* with *task* as the **key**. From this
+ * perspective, the usage is not much different from
+ * **bpf_map_lookup_elem**\ (*map*, **&**\ *task*) except this
+ * helper enforces the key must be a task_struct and the map must also
+ * be a **BPF_MAP_TYPE_TASK_STORAGE**.
+ *
+ * Underneath, the value is stored locally at *task* instead of
+ * the *map*. The *map* is used as the bpf-local-storage
+ * "type". The bpf-local-storage "type" (i.e. the *map*) is
+ * searched against all bpf_local_storage residing at *task*.
+ *
+ * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be
+ * used such that a new bpf_local_storage will be
+ * created if one does not exist. *value* can be used
+ * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify
+ * the initial value of a bpf_local_storage. If *value* is
+ * **NULL**, the new bpf_local_storage will be zero initialized.
+ * Return
+ * A bpf_local_storage pointer is returned on success.
+ *
+ * **NULL** if not found or there was an error in adding
+ * a new bpf_local_storage.
+ *
+ * long bpf_task_storage_delete(struct bpf_map *map, struct task_struct *task)
+ * Description
+ * Delete a bpf_local_storage from a *task*.
+ * Return
+ * 0 on success.
+ *
+ * **-ENOENT** if the bpf_local_storage cannot be found.
+ *
+ * struct task_struct *bpf_get_current_task_btf(void)
+ * Description
+ * Return a BTF pointer to the "current" task.
+ * This pointer can also be used in helpers that accept an
+ * *ARG_PTR_TO_BTF_ID* of type *task_struct*.
+ * Return
+ * Pointer to the current task.
+ *
+ * long bpf_bprm_opts_set(struct linux_binprm *bprm, u64 flags)
+ * Description
+ * Set or clear certain options on *bprm*:
+ *
+ * **BPF_F_BPRM_SECUREEXEC** Set the secureexec bit
+ * which sets the **AT_SECURE** auxv for glibc. The bit
+ * is cleared if the flag is not specified.
+ * Return
+ * **-EINVAL** if invalid *flags* are passed, zero otherwise.
+ *
+ * u64 bpf_ktime_get_coarse_ns(void)
+ * Description
+ * Return a coarse-grained version of the time elapsed since
+ * system boot, in nanoseconds. Does not include time the system
+ * was suspended.
+ *
+ * See: **clock_gettime**\ (**CLOCK_MONOTONIC_COARSE**)
+ * Return
+ * Current *ktime*.
+ *
+ * long bpf_ima_inode_hash(struct inode *inode, void *dst, u32 size)
+ * Description
+ * Returns the stored IMA hash of the *inode* (if it's available).
+ * If the hash is larger than *size*, then only *size*
+ * bytes will be copied to *dst*.
+ * Return
+ * The **hash_algo** is returned on success,
+ * **-EOPNOTSUPP** if IMA is disabled or **-EINVAL** if
+ * invalid arguments are passed.
+ *
+ * struct socket *bpf_sock_from_file(struct file *file)
+ * Description
+ * If the given file represents a socket, returns the associated
+ * socket.
+ * Return
+ * A pointer to a struct socket on success or NULL if the file is
+ * not a socket.
+ *
+ * long bpf_check_mtu(void *ctx, u32 ifindex, u32 *mtu_len, s32 len_diff, u64 flags)
+ * Description
+ * Check packet size against exceeding MTU of net device (based
+ * on *ifindex*). This helper will likely be used in combination
+ * with helpers that adjust/change the packet size.
+ *
+ * The argument *len_diff* can be used for querying with a planned
+ * size change. This allows to check MTU prior to changing packet
+ * ctx. Providing a *len_diff* adjustment that is larger than the
+ * actual packet size (resulting in negative packet size) will in
+ * principle not exceed the MTU, which is why it is not considered
+ * a failure. Other BPF helpers are needed for performing the
+ * planned size change; therefore the responsibility for catching
+ * a negative packet size belongs in those helpers.
+ *
+ * Specifying *ifindex* zero means the MTU check is performed
+ * against the current net device. This is practical if this isn't
+ * used prior to redirect.
+ *
+ * On input, *mtu_len* must be a valid pointer, else the verifier will
+ * reject the BPF program. If the value *mtu_len* is initialized to
+ * zero then the ctx packet size is used. When a value *mtu_len* is
+ * provided as input, this specifies the L3 length that the MTU check
+ * is done against. Remember XDP and TC length operate at L2, but
+ * this value is L3 as this correlates to MTU and IP-header tot_len
+ * values which are L3 (similar behavior as bpf_fib_lookup).
+ *
+ * The Linux kernel route table can configure MTUs on a more
+ * specific per route level, which is not provided by this helper.
+ * For route level MTU checks use the **bpf_fib_lookup**\ ()
+ * helper.
+ *
+ * *ctx* is either **struct xdp_md** for XDP programs or
+ * **struct sk_buff** for tc cls_act programs.
+ *
+ * The *flags* argument can be a combination of one or more of the
+ * following values:
+ *
+ * **BPF_MTU_CHK_SEGS**
+ * This flag will only work for *ctx* **struct sk_buff**.
+ * If packet context contains extra packet segment buffers
+ * (often known as GSO skb), then MTU check is harder to
+ * check at this point, because in transmit path it is
+ * possible for the skb packet to get re-segmented
+ * (depending on net device features). This could still be
+ * a MTU violation, so this flag enables performing MTU
+ * check against segments, with a different violation
+ * return code to tell it apart. Check cannot use len_diff.
+ *
+ * On return *mtu_len* pointer contains the MTU value of the net
+ * device. Remember the net device configured MTU is the L3 size,
+ * which is returned here and XDP and TC length operate at L2.
+ * The helper takes this into account for you, but remember this when
+ * using the MTU value in your BPF code.
+ *
+ * Return
+ * * 0 on success, and populate MTU value in *mtu_len* pointer.
+ *
+ * * < 0 if any input argument is invalid (*mtu_len* not updated)
+ *
+ * MTU violations return positive values, but also populate MTU
+ * value in *mtu_len* pointer, as this can be needed for
+ * implementing PMTU handling:
+ *
+ * * **BPF_MTU_CHK_RET_FRAG_NEEDED**
+ * * **BPF_MTU_CHK_RET_SEGS_TOOBIG**
+ *
+ * long bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn, void *callback_ctx, u64 flags)
+ * Description
+ * For each element in **map**, call **callback_fn** function with
+ * **map**, **callback_ctx** and other map-specific parameters.
+ * The **callback_fn** should be a static function and
+ * the **callback_ctx** should be a pointer to the stack.
+ * The **flags** is used to control certain aspects of the helper.
+ * Currently, the **flags** must be 0.
+ *
+ * The following is a list of supported map types and their
+ * respective expected callback signatures:
+ *
+ * BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_PERCPU_HASH,
+ * BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH,
+ * BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PERCPU_ARRAY
+ *
+ * long (\*callback_fn)(struct bpf_map \*map, const void \*key, void \*value, void \*ctx);
+ *
+ * For per_cpu maps, the map_value is the value on the cpu where the
+ * bpf_prog is running.
+ *
+ * If **callback_fn** returns 0, the helper will continue to the next
+ * element. If return value is 1, the helper will skip the rest of
+ * elements and return. Other return values are not used now.
+ *
+ * Return
+ * The number of traversed map elements for success, **-EINVAL** for
+ * invalid **flags**.
+ *
+ * long bpf_snprintf(char *str, u32 str_size, const char *fmt, u64 *data, u32 data_len)
+ * Description
+ * Outputs a string into the **str** buffer of size **str_size**
+ * based on a format string stored in a read-only map pointed by
+ * **fmt**.
+ *
+ * Each format specifier in **fmt** corresponds to one u64 element
+ * in the **data** array. For strings and pointers where pointees
+ * are accessed, only the pointer values are stored in the *data*
+ * array. The *data_len* is the size of *data* in bytes - must be
+ * a multiple of 8.
+ *
+ * Formats **%s** and **%p{i,I}{4,6}** require to read kernel
+ * memory. Reading kernel memory may fail due to either invalid
+ * address or valid address but requiring a major memory fault. If
+ * reading kernel memory fails, the string for **%s** will be an
+ * empty string, and the ip address for **%p{i,I}{4,6}** will be 0.
+ * Not returning error to bpf program is consistent with what
+ * **bpf_trace_printk**\ () does for now.
+ *
+ * Return
+ * The strictly positive length of the formatted string, including
+ * the trailing zero character. If the return value is greater than
+ * **str_size**, **str** contains a truncated string, guaranteed to
+ * be zero-terminated except when **str_size** is 0.
+ *
+ * Or **-EBUSY** if the per-CPU memory copy buffer is busy.
+ *
+ * long bpf_sys_bpf(u32 cmd, void *attr, u32 attr_size)
+ * Description
+ * Execute bpf syscall with given arguments.
+ * Return
+ * A syscall result.
+ *
+ * long bpf_btf_find_by_name_kind(char *name, int name_sz, u32 kind, int flags)
+ * Description
+ * Find BTF type with given name and kind in vmlinux BTF or in module's BTFs.
+ * Return
+ * Returns btf_id and btf_obj_fd in lower and upper 32 bits.
+ *
+ * long bpf_sys_close(u32 fd)
+ * Description
+ * Execute close syscall for given FD.
+ * Return
+ * A syscall result.
+ *
+ * long bpf_timer_init(struct bpf_timer *timer, struct bpf_map *map, u64 flags)
+ * Description
+ * Initialize the timer.
+ * First 4 bits of *flags* specify clockid.
+ * Only CLOCK_MONOTONIC, CLOCK_REALTIME, CLOCK_BOOTTIME are allowed.
+ * All other bits of *flags* are reserved.
+ * The verifier will reject the program if *timer* is not from
+ * the same *map*.
+ * Return
+ * 0 on success.
+ * **-EBUSY** if *timer* is already initialized.
+ * **-EINVAL** if invalid *flags* are passed.
+ * **-EPERM** if *timer* is in a map that doesn't have any user references.
+ * The user space should either hold a file descriptor to a map with timers
+ * or pin such map in bpffs. When map is unpinned or file descriptor is
+ * closed all timers in the map will be cancelled and freed.
+ *
+ * long bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn)
+ * Description
+ * Configure the timer to call *callback_fn* static function.
+ * Return
+ * 0 on success.
+ * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier.
+ * **-EPERM** if *timer* is in a map that doesn't have any user references.
+ * The user space should either hold a file descriptor to a map with timers
+ * or pin such map in bpffs. When map is unpinned or file descriptor is
+ * closed all timers in the map will be cancelled and freed.
+ *
+ * long bpf_timer_start(struct bpf_timer *timer, u64 nsecs, u64 flags)
+ * Description
+ * Set timer expiration N nanoseconds from the current time. The
+ * configured callback will be invoked in soft irq context on some cpu
+ * and will not repeat unless another bpf_timer_start() is made.
+ * In such case the next invocation can migrate to a different cpu.
+ * Since struct bpf_timer is a field inside map element the map
+ * owns the timer. The bpf_timer_set_callback() will increment refcnt
+ * of BPF program to make sure that callback_fn code stays valid.
+ * When user space reference to a map reaches zero all timers
+ * in a map are cancelled and corresponding program's refcnts are
+ * decremented. This is done to make sure that Ctrl-C of a user
+ * process doesn't leave any timers running. If map is pinned in
+ * bpffs the callback_fn can re-arm itself indefinitely.
+ * bpf_map_update/delete_elem() helpers and user space sys_bpf commands
+ * cancel and free the timer in the given map element.
+ * The map can contain timers that invoke callback_fn-s from different
+ * programs. The same callback_fn can serve different timers from
+ * different maps if key/value layout matches across maps.
+ * Every bpf_timer_set_callback() can have different callback_fn.
+ *
+ * Return
+ * 0 on success.
+ * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier
+ * or invalid *flags* are passed.
+ *
+ * long bpf_timer_cancel(struct bpf_timer *timer)
+ * Description
+ * Cancel the timer and wait for callback_fn to finish if it was running.
+ * Return
+ * 0 if the timer was not active.
+ * 1 if the timer was active.
+ * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier.
+ * **-EDEADLK** if callback_fn tried to call bpf_timer_cancel() on its
+ * own timer which would have led to a deadlock otherwise.
+ *
+ * u64 bpf_get_func_ip(void *ctx)
+ * Description
+ * Get address of the traced function (for tracing and kprobe programs).
+ * Return
+ * Address of the traced function.
+ * 0 for kprobes placed within the function (not at the entry).
+ *
+ * u64 bpf_get_attach_cookie(void *ctx)
+ * Description
+ * Get bpf_cookie value provided (optionally) during the program
+ * attachment. It might be different for each individual
+ * attachment, even if BPF program itself is the same.
+ * Expects BPF program context *ctx* as a first argument.
+ *
+ * Supported for the following program types:
+ * - kprobe/uprobe;
+ * - tracepoint;
+ * - perf_event.
+ * Return
+ * Value specified by user at BPF link creation/attachment time
+ * or 0, if it was not specified.
+ *
+ * long bpf_task_pt_regs(struct task_struct *task)
+ * Description
+ * Get the struct pt_regs associated with **task**.
+ * Return
+ * A pointer to struct pt_regs.
+ *
+ * long bpf_get_branch_snapshot(void *entries, u32 size, u64 flags)
+ * Description
+ * Get branch trace from hardware engines like Intel LBR. The
+ * hardware engine is stopped shortly after the helper is
+ * called. Therefore, the user needs to filter branch entries
+ * based on the actual use case. To capture branch trace
+ * before the trigger point of the BPF program, the helper
+ * should be called at the beginning of the BPF program.
+ *
+ * The data is stored as struct perf_branch_entry into output
+ * buffer *entries*. *size* is the size of *entries* in bytes.
+ * *flags* is reserved for now and must be zero.
+ *
+ * Return
+ * On success, number of bytes written to *entries*. On error, a
+ * negative value.
+ *
+ * **-EINVAL** if *flags* is not zero.
+ *
+ * **-ENOENT** if architecture does not support branch records.
+ *
+ * long bpf_trace_vprintk(const char *fmt, u32 fmt_size, const void *data, u32 data_len)
+ * Description
+ * Behaves like **bpf_trace_printk**\ () helper, but takes an array of u64
+ * to format and can handle more format args as a result.
+ *
+ * Arguments are to be used as in **bpf_seq_printf**\ () helper.
+ * Return
+ * The number of bytes written to the buffer, or a negative error
+ * in case of failure.
+ *
+ * struct unix_sock *bpf_skc_to_unix_sock(void *sk)
+ * Description
+ * Dynamically cast a *sk* pointer to a *unix_sock* pointer.
+ * Return
+ * *sk* if casting is valid, or **NULL** otherwise.
+ *
+ * long bpf_kallsyms_lookup_name(const char *name, int name_sz, int flags, u64 *res)
+ * Description
+ * Get the address of a kernel symbol, returned in *res*. *res* is
+ * set to 0 if the symbol is not found.
+ * Return
+ * On success, zero. On error, a negative value.
+ *
+ * **-EINVAL** if *flags* is not zero.
+ *
+ * **-EINVAL** if string *name* is not the same size as *name_sz*.
+ *
+ * **-ENOENT** if symbol is not found.
+ *
+ * **-EPERM** if caller does not have permission to obtain kernel address.
+ *
+ * long bpf_find_vma(struct task_struct *task, u64 addr, void *callback_fn, void *callback_ctx, u64 flags)
+ * Description
+ * Find vma of *task* that contains *addr*, call *callback_fn*
+ * function with *task*, *vma*, and *callback_ctx*.
+ * The *callback_fn* should be a static function and
+ * the *callback_ctx* should be a pointer to the stack.
+ * The *flags* is used to control certain aspects of the helper.
+ * Currently, the *flags* must be 0.
+ *
+ * The expected callback signature is
+ *
+ * long (\*callback_fn)(struct task_struct \*task, struct vm_area_struct \*vma, void \*callback_ctx);
+ *
+ * Return
+ * 0 on success.
+ * **-ENOENT** if *task->mm* is NULL, or no vma contains *addr*.
+ * **-EBUSY** if failed to try lock mmap_lock.
+ * **-EINVAL** for invalid **flags**.
+ *
+ * long bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx, u64 flags)
+ * Description
+ * For **nr_loops**, call **callback_fn** function
+ * with **callback_ctx** as the context parameter.
+ * The **callback_fn** should be a static function and
+ * the **callback_ctx** should be a pointer to the stack.
+ * The **flags** is used to control certain aspects of the helper.
+ * Currently, the **flags** must be 0. Currently, nr_loops is
+ * limited to 1 << 23 (~8 million) loops.
+ *
+ * long (\*callback_fn)(u32 index, void \*ctx);
+ *
+ * where **index** is the current index in the loop. The index
+ * is zero-indexed.
+ *
+ * If **callback_fn** returns 0, the helper will continue to the next
+ * loop. If return value is 1, the helper will skip the rest of
+ * the loops and return. Other return values are not used now,
+ * and will be rejected by the verifier.
+ *
+ * Return
+ * The number of loops performed, **-EINVAL** for invalid **flags**,
+ * **-E2BIG** if **nr_loops** exceeds the maximum number of loops.
+ *
+ * long bpf_strncmp(const char *s1, u32 s1_sz, const char *s2)
+ * Description
+ * Do strncmp() between **s1** and **s2**. **s1** doesn't need
+ * to be null-terminated and **s1_sz** is the maximum storage
+ * size of **s1**. **s2** must be a read-only string.
+ * Return
+ * An integer less than, equal to, or greater than zero
+ * if the first **s1_sz** bytes of **s1** is found to be
+ * less than, to match, or be greater than **s2**.
+ *
+ * long bpf_get_func_arg(void *ctx, u32 n, u64 *value)
+ * Description
+ * Get **n**-th argument register (zero based) of the traced function (for tracing programs)
+ * returned in **value**.
+ *
+ * Return
+ * 0 on success.
+ * **-EINVAL** if n >= argument register count of traced function.
+ *
+ * long bpf_get_func_ret(void *ctx, u64 *value)
+ * Description
+ * Get return value of the traced function (for tracing programs)
+ * in **value**.
+ *
+ * Return
+ * 0 on success.
+ * **-EOPNOTSUPP** for tracing programs other than BPF_TRACE_FEXIT or BPF_MODIFY_RETURN.
+ *
+ * long bpf_get_func_arg_cnt(void *ctx)
+ * Description
+ * Get number of registers of the traced function (for tracing programs) where
+ * function arguments are stored in these registers.
+ *
+ * Return
+ * The number of argument registers of the traced function.
+ *
+ * int bpf_get_retval(void)
+ * Description
+ * Get the BPF program's return value that will be returned to the upper layers.
+ *
+ * This helper is currently supported by cgroup programs and only by the hooks
+ * where BPF program's return value is returned to the userspace via errno.
+ * Return
+ * The BPF program's return value.
+ *
+ * int bpf_set_retval(int retval)
+ * Description
+ * Set the BPF program's return value that will be returned to the upper layers.
+ *
+ * This helper is currently supported by cgroup programs and only by the hooks
+ * where BPF program's return value is returned to the userspace via errno.
+ *
+ * Note that there is the following corner case where the program exports an error
+ * via bpf_set_retval but signals success via 'return 1':
+ *
+ * bpf_set_retval(-EPERM);
+ * return 1;
+ *
+ * In this case, the BPF program's return value will use helper's -EPERM. This
+ * still holds true for cgroup/bind{4,6} which supports extra 'return 3' success case.
+ *
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * u64 bpf_xdp_get_buff_len(struct xdp_buff *xdp_md)
+ * Description
+ * Get the total size of a given xdp buff (linear and paged area)
+ * Return
+ * The total size of a given xdp buffer.
+ *
+ * long bpf_xdp_load_bytes(struct xdp_buff *xdp_md, u32 offset, void *buf, u32 len)
+ * Description
+ * This helper is provided as an easy way to load data from a
+ * xdp buffer. It can be used to load *len* bytes from *offset* from
+ * the frame associated to *xdp_md*, into the buffer pointed by
+ * *buf*.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_xdp_store_bytes(struct xdp_buff *xdp_md, u32 offset, void *buf, u32 len)
+ * Description
+ * Store *len* bytes from buffer *buf* into the frame
+ * associated to *xdp_md*, at *offset*.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_copy_from_user_task(void *dst, u32 size, const void *user_ptr, struct task_struct *tsk, u64 flags)
+ * Description
+ * Read *size* bytes from user space address *user_ptr* in *tsk*'s
+ * address space, and store the data in *dst*. *flags* is not
+ * used yet and is provided for future extensibility. This helper
+ * can only be used by sleepable programs.
+ * Return
+ * 0 on success, or a negative error in case of failure. On error
+ * *dst* buffer is zeroed out.
+ *
+ * long bpf_skb_set_tstamp(struct sk_buff *skb, u64 tstamp, u32 tstamp_type)
+ * Description
+ * Change the __sk_buff->tstamp_type to *tstamp_type*
+ * and set *tstamp* to the __sk_buff->tstamp together.
+ *
+ * If there is no need to change the __sk_buff->tstamp_type,
+ * the tstamp value can be directly written to __sk_buff->tstamp
+ * instead.
+ *
+ * BPF_SKB_TSTAMP_DELIVERY_MONO is the only tstamp that
+ * will be kept during bpf_redirect_*(). A non zero
+ * *tstamp* must be used with the BPF_SKB_TSTAMP_DELIVERY_MONO
+ * *tstamp_type*.
+ *
+ * A BPF_SKB_TSTAMP_UNSPEC *tstamp_type* can only be used
+ * with a zero *tstamp*.
+ *
+ * Only IPv4 and IPv6 skb->protocol are supported.
+ *
+ * This function is most useful when it needs to set a
+ * mono delivery time to __sk_buff->tstamp and then
+ * bpf_redirect_*() to the egress of an iface. For example,
+ * changing the (rcv) timestamp in __sk_buff->tstamp at
+ * ingress to a mono delivery time and then bpf_redirect_*()
+ * to sch_fq@phy-dev.
+ * Return
+ * 0 on success.
+ * **-EINVAL** for invalid input
+ * **-EOPNOTSUPP** for unsupported protocol
+ *
+ * long bpf_ima_file_hash(struct file *file, void *dst, u32 size)
+ * Description
+ * Returns a calculated IMA hash of the *file*.
+ * If the hash is larger than *size*, then only *size*
+ * bytes will be copied to *dst*.
+ * Return
+ * The **hash_algo** is returned on success,
+ * **-EOPNOTSUPP** if the hash calculation failed or **-EINVAL** if
+ * invalid arguments are passed.
+ *
+ * void *bpf_kptr_xchg(void *map_value, void *ptr)
+ * Description
+ * Exchange kptr at pointer *map_value* with *ptr*, and return the
+ * old value. *ptr* can be NULL, otherwise it must be a referenced
+ * pointer which will be released when this helper is called.
+ * Return
+ * The old value of kptr (which can be NULL). The returned pointer
+ * if not NULL, is a reference which must be released using its
+ * corresponding release function, or moved into a BPF map before
+ * program exit.
+ *
+ * void *bpf_map_lookup_percpu_elem(struct bpf_map *map, const void *key, u32 cpu)
+ * Description
+ * Perform a lookup in *percpu map* for an entry associated to
+ * *key* on *cpu*.
+ * Return
+ * Map value associated to *key* on *cpu*, or **NULL** if no entry
+ * was found or *cpu* is invalid.
+ *
+ * struct mptcp_sock *bpf_skc_to_mptcp_sock(void *sk)
+ * Description
+ * Dynamically cast a *sk* pointer to a *mptcp_sock* pointer.
+ * Return
+ * *sk* if casting is valid, or **NULL** otherwise.
+ *
+ * long bpf_dynptr_from_mem(void *data, u32 size, u64 flags, struct bpf_dynptr *ptr)
+ * Description
+ * Get a dynptr to local memory *data*.
+ *
+ * *data* must be a ptr to a map value.
+ * The maximum *size* supported is DYNPTR_MAX_SIZE.
+ * *flags* is currently unused.
+ * Return
+ * 0 on success, -E2BIG if the size exceeds DYNPTR_MAX_SIZE,
+ * -EINVAL if flags is not 0.
+ *
+ * long bpf_ringbuf_reserve_dynptr(void *ringbuf, u32 size, u64 flags, struct bpf_dynptr *ptr)
+ * Description
+ * Reserve *size* bytes of payload in a ring buffer *ringbuf*
+ * through the dynptr interface. *flags* must be 0.
+ *
+ * Please note that a corresponding bpf_ringbuf_submit_dynptr or
+ * bpf_ringbuf_discard_dynptr must be called on *ptr*, even if the
+ * reservation fails. This is enforced by the verifier.
+ * Return
+ * 0 on success, or a negative error in case of failure.
+ *
+ * void bpf_ringbuf_submit_dynptr(struct bpf_dynptr *ptr, u64 flags)
+ * Description
+ * Submit reserved ring buffer sample, pointed to by *ptr*,
+ * through the dynptr interface. This is a no-op if the dynptr is
+ * invalid/null.
+ *
+ * For more information on *flags*, please see
+ * 'bpf_ringbuf_submit'.
+ * Return
+ * Nothing. Always succeeds.
+ *
+ * void bpf_ringbuf_discard_dynptr(struct bpf_dynptr *ptr, u64 flags)
+ * Description
+ * Discard reserved ring buffer sample through the dynptr
+ * interface. This is a no-op if the dynptr is invalid/null.
+ *
+ * For more information on *flags*, please see
+ * 'bpf_ringbuf_discard'.
+ * Return
+ * Nothing. Always succeeds.
+ *
+ * long bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr *src, u32 offset, u64 flags)
+ * Description
+ * Read *len* bytes from *src* into *dst*, starting from *offset*
+ * into *src*.
+ * *flags* is currently unused.
+ * Return
+ * 0 on success, -E2BIG if *offset* + *len* exceeds the length
+ * of *src*'s data, -EINVAL if *src* is an invalid dynptr or if
+ * *flags* is not 0.
+ *
+ * long bpf_dynptr_write(const struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags)
+ * Description
+ * Write *len* bytes from *src* into *dst*, starting from *offset*
+ * into *dst*.
+ * *flags* is currently unused.
+ * Return
+ * 0 on success, -E2BIG if *offset* + *len* exceeds the length
+ * of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst*
+ * is a read-only dynptr or if *flags* is not 0.
+ *
+ * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len)
+ * Description
+ * Get a pointer to the underlying dynptr data.
+ *
+ * *len* must be a statically known value. The returned data slice
+ * is invalidated whenever the dynptr is invalidated.
+ * Return
+ * Pointer to the underlying dynptr data, NULL if the dynptr is
+ * read-only, if the dynptr is invalid, or if the offset and length
+ * are out of bounds.
+ *
+ * s64 bpf_tcp_raw_gen_syncookie_ipv4(struct iphdr *iph, struct tcphdr *th, u32 th_len)
+ * Description
+ * Try to issue a SYN cookie for the packet with corresponding
+ * IPv4/TCP headers, *iph* and *th*, without depending on a
+ * listening socket.
+ *
+ * *iph* points to the IPv4 header.
+ *
+ * *th* points to the start of the TCP header, while *th_len*
+ * contains the length of the TCP header (at least
+ * **sizeof**\ (**struct tcphdr**)).
+ * Return
+ * On success, the lower 32 bits hold the generated SYN cookie,
+ * followed by 16 bits which hold the MSS value for that cookie,
+ * and the top 16 bits are unused.
+ *
+ * On failure, the returned value is one of the following:
+ *
+ * **-EINVAL** if *th_len* is invalid.
+ *
+ * s64 bpf_tcp_raw_gen_syncookie_ipv6(struct ipv6hdr *iph, struct tcphdr *th, u32 th_len)
+ * Description
+ * Try to issue a SYN cookie for the packet with corresponding
+ * IPv6/TCP headers, *iph* and *th*, without depending on a
+ * listening socket.
+ *
+ * *iph* points to the IPv6 header.
+ *
+ * *th* points to the start of the TCP header, while *th_len*
+ * contains the length of the TCP header (at least
+ * **sizeof**\ (**struct tcphdr**)).
+ * Return
+ * On success, the lower 32 bits hold the generated SYN cookie,
+ * followed by 16 bits which hold the MSS value for that cookie,
+ * and the top 16 bits are unused.
+ *
+ * On failure, the returned value is one of the following:
+ *
+ * **-EINVAL** if *th_len* is invalid.
+ *
+ * **-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin.
+ *
+ * long bpf_tcp_raw_check_syncookie_ipv4(struct iphdr *iph, struct tcphdr *th)
+ * Description
+ * Check whether *iph* and *th* contain a valid SYN cookie ACK
+ * without depending on a listening socket.
+ *
+ * *iph* points to the IPv4 header.
+ *
+ * *th* points to the TCP header.
+ * Return
+ * 0 if *iph* and *th* are a valid SYN cookie ACK.
+ *
+ * On failure, the returned value is one of the following:
+ *
+ * **-EACCES** if the SYN cookie is not valid.
+ *
+ * long bpf_tcp_raw_check_syncookie_ipv6(struct ipv6hdr *iph, struct tcphdr *th)
+ * Description
+ * Check whether *iph* and *th* contain a valid SYN cookie ACK
+ * without depending on a listening socket.
+ *
+ * *iph* points to the IPv6 header.
+ *
+ * *th* points to the TCP header.
+ * Return
+ * 0 if *iph* and *th* are a valid SYN cookie ACK.
+ *
+ * On failure, the returned value is one of the following:
+ *
+ * **-EACCES** if the SYN cookie is not valid.
+ *
+ * **-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin.
+ *
+ * u64 bpf_ktime_get_tai_ns(void)
+ * Description
+ * A nonsettable system-wide clock derived from wall-clock time but
+ * ignoring leap seconds. This clock does not experience
+ * discontinuities and backwards jumps caused by NTP inserting leap
+ * seconds as CLOCK_REALTIME does.
+ *
+ * See: **clock_gettime**\ (**CLOCK_TAI**)
+ * Return
+ * Current *ktime*.
+ *
+ * long bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void *ctx, u64 flags)
+ * Description
+ * Drain samples from the specified user ring buffer, and invoke
+ * the provided callback for each such sample:
+ *
+ * long (\*callback_fn)(const struct bpf_dynptr \*dynptr, void \*ctx);
+ *
+ * If **callback_fn** returns 0, the helper will continue to try
+ * and drain the next sample, up to a maximum of
+ * BPF_MAX_USER_RINGBUF_SAMPLES samples. If the return value is 1,
+ * the helper will skip the rest of the samples and return. Other
+ * return values are not used now, and will be rejected by the
+ * verifier.
+ * Return
+ * The number of drained samples if no error was encountered while
+ * draining samples, or 0 if no samples were present in the ring
+ * buffer. If a user-space producer was epoll-waiting on this map,
+ * and at least one sample was drained, they will receive an event
+ * notification notifying them of available space in the ring
+ * buffer. If the BPF_RB_NO_WAKEUP flag is passed to this
+ * function, no wakeup notification will be sent. If the
+ * BPF_RB_FORCE_WAKEUP flag is passed, a wakeup notification will
+ * be sent even if no sample was drained.
+ *
+ * On failure, the returned value is one of the following:
+ *
+ * **-EBUSY** if the ring buffer is contended, and another calling
+ * context was concurrently draining the ring buffer.
+ *
+ * **-EINVAL** if user-space is not properly tracking the ring
+ * buffer due to the producer position not being aligned to 8
+ * bytes, a sample not being aligned to 8 bytes, or the producer
+ * position not matching the advertised length of a sample.
+ *
+ * **-E2BIG** if user-space has tried to publish a sample which is
+ * larger than the size of the ring buffer, or which cannot fit
+ * within a struct bpf_dynptr.
+ *
+ * void *bpf_cgrp_storage_get(struct bpf_map *map, struct cgroup *cgroup, void *value, u64 flags)
+ * Description
+ * Get a bpf_local_storage from the *cgroup*.
+ *
+ * Logically, it could be thought of as getting the value from
+ * a *map* with *cgroup* as the **key**. From this
+ * perspective, the usage is not much different from
+ * **bpf_map_lookup_elem**\ (*map*, **&**\ *cgroup*) except this
+ * helper enforces the key must be a cgroup struct and the map must also
+ * be a **BPF_MAP_TYPE_CGRP_STORAGE**.
+ *
+ * In reality, the local-storage value is embedded directly inside of the
+ * *cgroup* object itself, rather than being located in the
+ * **BPF_MAP_TYPE_CGRP_STORAGE** map. When the local-storage value is
+ * queried for some *map* on a *cgroup* object, the kernel will perform an
+ * O(n) iteration over all of the live local-storage values for that
+ * *cgroup* object until the local-storage value for the *map* is found.
+ *
+ * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be
+ * used such that a new bpf_local_storage will be
+ * created if one does not exist. *value* can be used
+ * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify
+ * the initial value of a bpf_local_storage. If *value* is
+ * **NULL**, the new bpf_local_storage will be zero initialized.
+ * Return
+ * A bpf_local_storage pointer is returned on success.
+ *
+ * **NULL** if not found or there was an error in adding
+ * a new bpf_local_storage.
+ *
+ * long bpf_cgrp_storage_delete(struct bpf_map *map, struct cgroup *cgroup)
+ * Description
+ * Delete a bpf_local_storage from a *cgroup*.
+ * Return
+ * 0 on success.
+ *
+ * **-ENOENT** if the bpf_local_storage cannot be found.
+ */
+#define ___BPF_FUNC_MAPPER(FN, ctx...) \
+ FN(unspec, 0, ##ctx) \
+ FN(map_lookup_elem, 1, ##ctx) \
+ FN(map_update_elem, 2, ##ctx) \
+ FN(map_delete_elem, 3, ##ctx) \
+ FN(probe_read, 4, ##ctx) \
+ FN(ktime_get_ns, 5, ##ctx) \
+ FN(trace_printk, 6, ##ctx) \
+ FN(get_prandom_u32, 7, ##ctx) \
+ FN(get_smp_processor_id, 8, ##ctx) \
+ FN(skb_store_bytes, 9, ##ctx) \
+ FN(l3_csum_replace, 10, ##ctx) \
+ FN(l4_csum_replace, 11, ##ctx) \
+ FN(tail_call, 12, ##ctx) \
+ FN(clone_redirect, 13, ##ctx) \
+ FN(get_current_pid_tgid, 14, ##ctx) \
+ FN(get_current_uid_gid, 15, ##ctx) \
+ FN(get_current_comm, 16, ##ctx) \
+ FN(get_cgroup_classid, 17, ##ctx) \
+ FN(skb_vlan_push, 18, ##ctx) \
+ FN(skb_vlan_pop, 19, ##ctx) \
+ FN(skb_get_tunnel_key, 20, ##ctx) \
+ FN(skb_set_tunnel_key, 21, ##ctx) \
+ FN(perf_event_read, 22, ##ctx) \
+ FN(redirect, 23, ##ctx) \
+ FN(get_route_realm, 24, ##ctx) \
+ FN(perf_event_output, 25, ##ctx) \
+ FN(skb_load_bytes, 26, ##ctx) \
+ FN(get_stackid, 27, ##ctx) \
+ FN(csum_diff, 28, ##ctx) \
+ FN(skb_get_tunnel_opt, 29, ##ctx) \
+ FN(skb_set_tunnel_opt, 30, ##ctx) \
+ FN(skb_change_proto, 31, ##ctx) \
+ FN(skb_change_type, 32, ##ctx) \
+ FN(skb_under_cgroup, 33, ##ctx) \
+ FN(get_hash_recalc, 34, ##ctx) \
+ FN(get_current_task, 35, ##ctx) \
+ FN(probe_write_user, 36, ##ctx) \
+ FN(current_task_under_cgroup, 37, ##ctx) \
+ FN(skb_change_tail, 38, ##ctx) \
+ FN(skb_pull_data, 39, ##ctx) \
+ FN(csum_update, 40, ##ctx) \
+ FN(set_hash_invalid, 41, ##ctx) \
+ FN(get_numa_node_id, 42, ##ctx) \
+ FN(skb_change_head, 43, ##ctx) \
+ FN(xdp_adjust_head, 44, ##ctx) \
+ FN(probe_read_str, 45, ##ctx) \
+ FN(get_socket_cookie, 46, ##ctx) \
+ FN(get_socket_uid, 47, ##ctx) \
+ FN(set_hash, 48, ##ctx) \
+ FN(setsockopt, 49, ##ctx) \
+ FN(skb_adjust_room, 50, ##ctx) \
+ FN(redirect_map, 51, ##ctx) \
+ FN(sk_redirect_map, 52, ##ctx) \
+ FN(sock_map_update, 53, ##ctx) \
+ FN(xdp_adjust_meta, 54, ##ctx) \
+ FN(perf_event_read_value, 55, ##ctx) \
+ FN(perf_prog_read_value, 56, ##ctx) \
+ FN(getsockopt, 57, ##ctx) \
+ FN(override_return, 58, ##ctx) \
+ FN(sock_ops_cb_flags_set, 59, ##ctx) \
+ FN(msg_redirect_map, 60, ##ctx) \
+ FN(msg_apply_bytes, 61, ##ctx) \
+ FN(msg_cork_bytes, 62, ##ctx) \
+ FN(msg_pull_data, 63, ##ctx) \
+ FN(bind, 64, ##ctx) \
+ FN(xdp_adjust_tail, 65, ##ctx) \
+ FN(skb_get_xfrm_state, 66, ##ctx) \
+ FN(get_stack, 67, ##ctx) \
+ FN(skb_load_bytes_relative, 68, ##ctx) \
+ FN(fib_lookup, 69, ##ctx) \
+ FN(sock_hash_update, 70, ##ctx) \
+ FN(msg_redirect_hash, 71, ##ctx) \
+ FN(sk_redirect_hash, 72, ##ctx) \
+ FN(lwt_push_encap, 73, ##ctx) \
+ FN(lwt_seg6_store_bytes, 74, ##ctx) \
+ FN(lwt_seg6_adjust_srh, 75, ##ctx) \
+ FN(lwt_seg6_action, 76, ##ctx) \
+ FN(rc_repeat, 77, ##ctx) \
+ FN(rc_keydown, 78, ##ctx) \
+ FN(skb_cgroup_id, 79, ##ctx) \
+ FN(get_current_cgroup_id, 80, ##ctx) \
+ FN(get_local_storage, 81, ##ctx) \
+ FN(sk_select_reuseport, 82, ##ctx) \
+ FN(skb_ancestor_cgroup_id, 83, ##ctx) \
+ FN(sk_lookup_tcp, 84, ##ctx) \
+ FN(sk_lookup_udp, 85, ##ctx) \
+ FN(sk_release, 86, ##ctx) \
+ FN(map_push_elem, 87, ##ctx) \
+ FN(map_pop_elem, 88, ##ctx) \
+ FN(map_peek_elem, 89, ##ctx) \
+ FN(msg_push_data, 90, ##ctx) \
+ FN(msg_pop_data, 91, ##ctx) \
+ FN(rc_pointer_rel, 92, ##ctx) \
+ FN(spin_lock, 93, ##ctx) \
+ FN(spin_unlock, 94, ##ctx) \
+ FN(sk_fullsock, 95, ##ctx) \
+ FN(tcp_sock, 96, ##ctx) \
+ FN(skb_ecn_set_ce, 97, ##ctx) \
+ FN(get_listener_sock, 98, ##ctx) \
+ FN(skc_lookup_tcp, 99, ##ctx) \
+ FN(tcp_check_syncookie, 100, ##ctx) \
+ FN(sysctl_get_name, 101, ##ctx) \
+ FN(sysctl_get_current_value, 102, ##ctx) \
+ FN(sysctl_get_new_value, 103, ##ctx) \
+ FN(sysctl_set_new_value, 104, ##ctx) \
+ FN(strtol, 105, ##ctx) \
+ FN(strtoul, 106, ##ctx) \
+ FN(sk_storage_get, 107, ##ctx) \
+ FN(sk_storage_delete, 108, ##ctx) \
+ FN(send_signal, 109, ##ctx) \
+ FN(tcp_gen_syncookie, 110, ##ctx) \
+ FN(skb_output, 111, ##ctx) \
+ FN(probe_read_user, 112, ##ctx) \
+ FN(probe_read_kernel, 113, ##ctx) \
+ FN(probe_read_user_str, 114, ##ctx) \
+ FN(probe_read_kernel_str, 115, ##ctx) \
+ FN(tcp_send_ack, 116, ##ctx) \
+ FN(send_signal_thread, 117, ##ctx) \
+ FN(jiffies64, 118, ##ctx) \
+ FN(read_branch_records, 119, ##ctx) \
+ FN(get_ns_current_pid_tgid, 120, ##ctx) \
+ FN(xdp_output, 121, ##ctx) \
+ FN(get_netns_cookie, 122, ##ctx) \
+ FN(get_current_ancestor_cgroup_id, 123, ##ctx) \
+ FN(sk_assign, 124, ##ctx) \
+ FN(ktime_get_boot_ns, 125, ##ctx) \
+ FN(seq_printf, 126, ##ctx) \
+ FN(seq_write, 127, ##ctx) \
+ FN(sk_cgroup_id, 128, ##ctx) \
+ FN(sk_ancestor_cgroup_id, 129, ##ctx) \
+ FN(ringbuf_output, 130, ##ctx) \
+ FN(ringbuf_reserve, 131, ##ctx) \
+ FN(ringbuf_submit, 132, ##ctx) \
+ FN(ringbuf_discard, 133, ##ctx) \
+ FN(ringbuf_query, 134, ##ctx) \
+ FN(csum_level, 135, ##ctx) \
+ FN(skc_to_tcp6_sock, 136, ##ctx) \
+ FN(skc_to_tcp_sock, 137, ##ctx) \
+ FN(skc_to_tcp_timewait_sock, 138, ##ctx) \
+ FN(skc_to_tcp_request_sock, 139, ##ctx) \
+ FN(skc_to_udp6_sock, 140, ##ctx) \
+ FN(get_task_stack, 141, ##ctx) \
+ FN(load_hdr_opt, 142, ##ctx) \
+ FN(store_hdr_opt, 143, ##ctx) \
+ FN(reserve_hdr_opt, 144, ##ctx) \
+ FN(inode_storage_get, 145, ##ctx) \
+ FN(inode_storage_delete, 146, ##ctx) \
+ FN(d_path, 147, ##ctx) \
+ FN(copy_from_user, 148, ##ctx) \
+ FN(snprintf_btf, 149, ##ctx) \
+ FN(seq_printf_btf, 150, ##ctx) \
+ FN(skb_cgroup_classid, 151, ##ctx) \
+ FN(redirect_neigh, 152, ##ctx) \
+ FN(per_cpu_ptr, 153, ##ctx) \
+ FN(this_cpu_ptr, 154, ##ctx) \
+ FN(redirect_peer, 155, ##ctx) \
+ FN(task_storage_get, 156, ##ctx) \
+ FN(task_storage_delete, 157, ##ctx) \
+ FN(get_current_task_btf, 158, ##ctx) \
+ FN(bprm_opts_set, 159, ##ctx) \
+ FN(ktime_get_coarse_ns, 160, ##ctx) \
+ FN(ima_inode_hash, 161, ##ctx) \
+ FN(sock_from_file, 162, ##ctx) \
+ FN(check_mtu, 163, ##ctx) \
+ FN(for_each_map_elem, 164, ##ctx) \
+ FN(snprintf, 165, ##ctx) \
+ FN(sys_bpf, 166, ##ctx) \
+ FN(btf_find_by_name_kind, 167, ##ctx) \
+ FN(sys_close, 168, ##ctx) \
+ FN(timer_init, 169, ##ctx) \
+ FN(timer_set_callback, 170, ##ctx) \
+ FN(timer_start, 171, ##ctx) \
+ FN(timer_cancel, 172, ##ctx) \
+ FN(get_func_ip, 173, ##ctx) \
+ FN(get_attach_cookie, 174, ##ctx) \
+ FN(task_pt_regs, 175, ##ctx) \
+ FN(get_branch_snapshot, 176, ##ctx) \
+ FN(trace_vprintk, 177, ##ctx) \
+ FN(skc_to_unix_sock, 178, ##ctx) \
+ FN(kallsyms_lookup_name, 179, ##ctx) \
+ FN(find_vma, 180, ##ctx) \
+ FN(loop, 181, ##ctx) \
+ FN(strncmp, 182, ##ctx) \
+ FN(get_func_arg, 183, ##ctx) \
+ FN(get_func_ret, 184, ##ctx) \
+ FN(get_func_arg_cnt, 185, ##ctx) \
+ FN(get_retval, 186, ##ctx) \
+ FN(set_retval, 187, ##ctx) \
+ FN(xdp_get_buff_len, 188, ##ctx) \
+ FN(xdp_load_bytes, 189, ##ctx) \
+ FN(xdp_store_bytes, 190, ##ctx) \
+ FN(copy_from_user_task, 191, ##ctx) \
+ FN(skb_set_tstamp, 192, ##ctx) \
+ FN(ima_file_hash, 193, ##ctx) \
+ FN(kptr_xchg, 194, ##ctx) \
+ FN(map_lookup_percpu_elem, 195, ##ctx) \
+ FN(skc_to_mptcp_sock, 196, ##ctx) \
+ FN(dynptr_from_mem, 197, ##ctx) \
+ FN(ringbuf_reserve_dynptr, 198, ##ctx) \
+ FN(ringbuf_submit_dynptr, 199, ##ctx) \
+ FN(ringbuf_discard_dynptr, 200, ##ctx) \
+ FN(dynptr_read, 201, ##ctx) \
+ FN(dynptr_write, 202, ##ctx) \
+ FN(dynptr_data, 203, ##ctx) \
+ FN(tcp_raw_gen_syncookie_ipv4, 204, ##ctx) \
+ FN(tcp_raw_gen_syncookie_ipv6, 205, ##ctx) \
+ FN(tcp_raw_check_syncookie_ipv4, 206, ##ctx) \
+ FN(tcp_raw_check_syncookie_ipv6, 207, ##ctx) \
+ FN(ktime_get_tai_ns, 208, ##ctx) \
+ FN(user_ringbuf_drain, 209, ##ctx) \
+ FN(cgrp_storage_get, 210, ##ctx) \
+ FN(cgrp_storage_delete, 211, ##ctx) \
+ /* */
+
+/* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't
+ * know or care about integer value that is now passed as second argument
+ */
+#define __BPF_FUNC_MAPPER_APPLY(name, value, FN) FN(name), /* drops 'value' and forwards only 'name' to FN */
+#define __BPF_FUNC_MAPPER(FN) ___BPF_FUNC_MAPPER(__BPF_FUNC_MAPPER_APPLY, FN) /* FN rides along as the mapper's extra 'ctx' argument */
+
+/* integer value in 'imm' field of BPF_CALL instruction selects which helper
+ * function eBPF program intends to call
+ */
+#define __BPF_ENUM_FN(x, y) BPF_FUNC_ ## x = y, /* emits one "BPF_FUNC_<name> = <id>," enumerator */
+enum bpf_func_id {
+ ___BPF_FUNC_MAPPER(__BPF_ENUM_FN) /* one enumerator per FN(...) entry of the mapper list */
+ __BPF_FUNC_MAX_ID, /* no explicit value: one more than the last mapper entry (212 while the list ends at 211) */
+};
+#undef __BPF_ENUM_FN /* the helper macro is only needed while building the enum */
+
+/* All flags used by eBPF helper functions, placed here. */
+
+/* BPF_FUNC_skb_store_bytes flags. */
+enum {
+ BPF_F_RECOMPUTE_CSUM = (1ULL << 0),
+ BPF_F_INVALIDATE_HASH = (1ULL << 1),
+};
+
+/* BPF_FUNC_l3_csum_replace and BPF_FUNC_l4_csum_replace flags.
+ * First 4 bits are for passing the header field size.
+ */
+enum {
+ BPF_F_HDR_FIELD_MASK = 0xfULL,
+};
+
+/* BPF_FUNC_l4_csum_replace flags. */
+enum {
+ BPF_F_PSEUDO_HDR = (1ULL << 4),
+ BPF_F_MARK_MANGLED_0 = (1ULL << 5),
+ BPF_F_MARK_ENFORCE = (1ULL << 6),
+};
+
+/* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */
+enum {
+ BPF_F_INGRESS = (1ULL << 0),
+};
+
+/* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */
+enum {
+ BPF_F_TUNINFO_IPV6 = (1ULL << 0),
+};
+
+/* flags for both BPF_FUNC_get_stackid and BPF_FUNC_get_stack. */
+enum {
+ BPF_F_SKIP_FIELD_MASK = 0xffULL, /* low 8 bits; the single-bit flags below start at bit 8 */
+ BPF_F_USER_STACK = (1ULL << 8),
+/* flags used by BPF_FUNC_get_stackid only. */
+ BPF_F_FAST_STACK_CMP = (1ULL << 9),
+ BPF_F_REUSE_STACKID = (1ULL << 10),
+/* flags used by BPF_FUNC_get_stack only. */
+ BPF_F_USER_BUILD_ID = (1ULL << 11),
+};
+
+/* BPF_FUNC_skb_set_tunnel_key flags. */
+enum {
+ BPF_F_ZERO_CSUM_TX = (1ULL << 1),
+ BPF_F_DONT_FRAGMENT = (1ULL << 2),
+ BPF_F_SEQ_NUMBER = (1ULL << 3),
+};
+
+/* BPF_FUNC_skb_get_tunnel_key flags. */
+enum {
+ BPF_F_TUNINFO_FLAGS = (1ULL << 4),
+};
+
+/* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and
+ * BPF_FUNC_perf_event_read_value flags.
+ */
+enum {
+ BPF_F_INDEX_MASK = 0xffffffffULL, /* low 32 bits */
+ BPF_F_CURRENT_CPU = BPF_F_INDEX_MASK, /* same value as the mask, i.e. all index bits set */
+/* BPF_FUNC_perf_event_output for sk_buff input context. */
+ BPF_F_CTXLEN_MASK = (0xfffffULL << 32), /* 20-bit field occupying bits 32..51 */
+};
+
+/* Current network namespace */
+enum {
+ BPF_F_CURRENT_NETNS = (-1L), /* -1: a sentinel value rather than a bit mask */
+};
+
+/* BPF_FUNC_csum_level level values. */
+enum {
+ BPF_CSUM_LEVEL_QUERY, /* implicitly 0 */
+ BPF_CSUM_LEVEL_INC, /* 1 */
+ BPF_CSUM_LEVEL_DEC, /* 2 */
+ BPF_CSUM_LEVEL_RESET, /* 3 */
+};
+
+/* BPF_FUNC_skb_adjust_room flags. */
+enum {
+ BPF_F_ADJ_ROOM_FIXED_GSO = (1ULL << 0),
+ BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 = (1ULL << 1),
+ BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 = (1ULL << 2),
+ BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3),
+ BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4),
+ BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5),
+ BPF_F_ADJ_ROOM_ENCAP_L2_ETH = (1ULL << 6),
+};
+
+enum { /* field geometry used by BPF_F_ADJ_ROOM_ENCAP_L2() */
+ BPF_ADJ_ROOM_ENCAP_L2_MASK = 0xff, /* 8-bit field; the mask is applied before shifting */
+ BPF_ADJ_ROOM_ENCAP_L2_SHIFT = 56, /* places the field at bits 56..63 of the 64-bit value */
+};
+
+/* Encode an L2 header length for use alongside the BPF_F_ADJ_ROOM_* flags:
+ * the low 8 bits of *len* (BPF_ADJ_ROOM_ENCAP_L2_MASK) are moved up to
+ * bit 56 (BPF_ADJ_ROOM_ENCAP_L2_SHIFT) of a __u64. *len* is parenthesized
+ * so that an expression argument (e.g. "a | b") is cast and masked as a
+ * whole instead of being split by operator precedence.
+ */
+#define BPF_F_ADJ_ROOM_ENCAP_L2(len) (((__u64)(len) & \
+ BPF_ADJ_ROOM_ENCAP_L2_MASK) \
+ << BPF_ADJ_ROOM_ENCAP_L2_SHIFT)
+
+/* BPF_FUNC_sysctl_get_name flags. */
+enum {
+ BPF_F_SYSCTL_BASE_NAME = (1ULL << 0),
+};
+
+/* BPF_FUNC_<kernel_obj>_storage_get flags */
+enum {
+ BPF_LOCAL_STORAGE_GET_F_CREATE = (1ULL << 0),
+ /* BPF_SK_STORAGE_GET_F_CREATE is only kept for backward compatibility
+ * and BPF_LOCAL_STORAGE_GET_F_CREATE must be used instead.
+ */
+ BPF_SK_STORAGE_GET_F_CREATE = BPF_LOCAL_STORAGE_GET_F_CREATE,
+};
+
+/* BPF_FUNC_read_branch_records flags. */
+enum {
+ BPF_F_GET_BRANCH_RECORDS_SIZE = (1ULL << 0),
+};
+
+/* BPF_FUNC_ringbuf_submit, BPF_FUNC_ringbuf_discard, and
+ * BPF_FUNC_ringbuf_output flags.
+ */
+enum {
+ BPF_RB_NO_WAKEUP = (1ULL << 0),
+ BPF_RB_FORCE_WAKEUP = (1ULL << 1),
+};
+
+/* BPF_FUNC_ringbuf_query flags */
+enum {
+ BPF_RB_AVAIL_DATA = 0,
+ BPF_RB_RING_SIZE = 1,
+ BPF_RB_CONS_POS = 2,
+ BPF_RB_PROD_POS = 3,
+};
+
+/* BPF ring buffer constants */
+enum {
+ BPF_RINGBUF_BUSY_BIT = (1U << 31), /* top bit of a 32-bit word */
+ BPF_RINGBUF_DISCARD_BIT = (1U << 30), /* next bit down (bit 30) */
+ BPF_RINGBUF_HDR_SZ = 8, /* NOTE(review): presumably the per-record header size in bytes - confirm */
+};
+
+/* BPF_FUNC_sk_assign flags in bpf_sk_lookup context. */
+enum {
+ BPF_SK_LOOKUP_F_REPLACE = (1ULL << 0),
+ BPF_SK_LOOKUP_F_NO_REUSEPORT = (1ULL << 1),
+};
+
+/* Mode for BPF_FUNC_skb_adjust_room helper. */
+enum bpf_adj_room_mode {
+ BPF_ADJ_ROOM_NET,
+ BPF_ADJ_ROOM_MAC,
+};
+
+/* Mode for BPF_FUNC_skb_load_bytes_relative helper. */
+enum bpf_hdr_start_off {
+ BPF_HDR_START_MAC,
+ BPF_HDR_START_NET,
+};
+
+/* Encapsulation type for BPF_FUNC_lwt_push_encap helper. */
+enum bpf_lwt_encap_mode {
+ BPF_LWT_ENCAP_SEG6,
+ BPF_LWT_ENCAP_SEG6_INLINE,
+ BPF_LWT_ENCAP_IP,
+};
+
+/* Flags for bpf_bprm_opts_set helper */
+enum {
+ BPF_F_BPRM_SECUREEXEC = (1ULL << 0),
+};
+
+/* Flags for bpf_redirect_map helper */
+enum {
+ BPF_F_BROADCAST = (1ULL << 3),
+ BPF_F_EXCLUDE_INGRESS = (1ULL << 4),
+};
+
+#define __bpf_md_ptr(type, name) \
+union { \
+ type name; \
+ __u64 :64; \
+} __attribute__((aligned(8))) /* Declares an anonymous union of member 'name'
+ * (of type 'type') and an unnamed 64-bit bit-field, aligned to 8 bytes. The
+ * union is therefore at least 64 bits wide and 8-byte aligned even when
+ * 'type' is narrower (presumably so pointer members keep one layout on
+ * 32- and 64-bit builds - confirm).
+ */
+
+enum { /* NOTE(review): presumably the values carried by __sk_buff.tstamp_type - confirm */
+ BPF_SKB_TSTAMP_UNSPEC, /* implicitly 0 */
+ BPF_SKB_TSTAMP_DELIVERY_MONO, /* tstamp has mono delivery time */
+ /* For any BPF_SKB_TSTAMP_* that the bpf prog cannot handle,
+ * the bpf prog should handle it like BPF_SKB_TSTAMP_UNSPEC
+ * and try to deduce it by ingress, egress or skb->sk->sk_clockid.
+ */
+};
+
+/* user accessible mirror of in-kernel sk_buff.
+ * new fields can only be added to the end of this structure
+ */
+struct __sk_buff {
+ __u32 len;
+ __u32 pkt_type;
+ __u32 mark;
+ __u32 queue_mapping;
+ __u32 protocol;
+ __u32 vlan_present;
+ __u32 vlan_tci;
+ __u32 vlan_proto;
+ __u32 priority;
+ __u32 ingress_ifindex;
+ __u32 ifindex;
+ __u32 tc_index;
+ __u32 cb[5];
+ __u32 hash;
+ __u32 tc_classid;
+ __u32 data;
+ __u32 data_end;
+ __u32 napi_id;
+
+ /* Accessed by BPF_PROG_TYPE_sk_skb types from here to ... */
+ __u32 family;
+ __u32 remote_ip4; /* Stored in network byte order */
+ __u32 local_ip4; /* Stored in network byte order */
+ __u32 remote_ip6[4]; /* Stored in network byte order */
+ __u32 local_ip6[4]; /* Stored in network byte order */
+ __u32 remote_port; /* Stored in network byte order */
+ __u32 local_port; /* stored in host byte order */
+ /* ... here. */
+
+ __u32 data_meta;
+ __bpf_md_ptr(struct bpf_flow_keys *, flow_keys);
+ __u64 tstamp;
+ __u32 wire_len;
+ __u32 gso_segs;
+ __bpf_md_ptr(struct bpf_sock *, sk);
+ __u32 gso_size;
+ __u8 tstamp_type;
+ __u32 :24; /* Padding, future use. */
+ __u64 hwtstamp;
+};
+
+struct bpf_tunnel_key { /* NOTE(review): presumably the key used with BPF_FUNC_skb_{get,set}_tunnel_key - confirm */
+ __u32 tunnel_id;
+ union { /* remote address: the IPv4 and IPv6 forms share storage */
+ __u32 remote_ipv4;
+ __u32 remote_ipv6[4];
+ };
+ __u8 tunnel_tos;
+ __u8 tunnel_ttl;
+ union { /* two names for the same 16 bits */
+ __u16 tunnel_ext; /* compat */
+ __be16 tunnel_flags;
+ };
+ __u32 tunnel_label;
+ union { /* local address: the IPv4 and IPv6 forms share storage */
+ __u32 local_ipv4;
+ __u32 local_ipv6[4];
+ };
+};
+
+/* user accessible mirror of in-kernel xfrm_state.
+ * new fields can only be added to the end of this structure
+ */
+struct bpf_xfrm_state {
+ __u32 reqid;
+ __u32 spi; /* Stored in network byte order */
+ __u16 family;
+ __u16 ext; /* Padding, future use. */
+ union {
+ __u32 remote_ipv4; /* Stored in network byte order */
+ __u32 remote_ipv6[4]; /* Stored in network byte order */
+ };
+};
+
+/* Generic BPF return codes which all BPF program types may support.
+ * The values are binary compatible with their TC_ACT_* counter-part to
+ * provide backwards compatibility with existing SCHED_CLS and SCHED_ACT
+ * programs.
+ *
+ * XDP is handled separately, see XDP_*.
+ */
+enum bpf_ret_code {
+ BPF_OK = 0,
+ /* 1 reserved */
+ BPF_DROP = 2,
+ /* 3-6 reserved */
+ BPF_REDIRECT = 7,
+ /* >127 are reserved for prog type specific return codes.
+ *
+ * BPF_LWT_REROUTE: used by BPF_PROG_TYPE_LWT_IN and
+ * BPF_PROG_TYPE_LWT_XMIT to indicate that skb had been
+ * changed and should be routed based on its new L3 header.
+ * (This is an L3 redirect, as opposed to L2 redirect
+ * represented by BPF_REDIRECT above).
+ */
+ BPF_LWT_REROUTE = 128,
+ /* BPF_FLOW_DISSECTOR_CONTINUE: used by BPF_PROG_TYPE_FLOW_DISSECTOR
+ * to indicate that no custom dissection was performed, and
+ * fallback to standard dissector is requested.
+ */
+ BPF_FLOW_DISSECTOR_CONTINUE = 129,
+};
+
+struct bpf_sock { /* socket fields; reached through the 'struct bpf_sock *' members of the context structs in this header (e.g. __sk_buff.sk, sk_msg_md.sk) */
+ __u32 bound_dev_if;
+ __u32 family;
+ __u32 type;
+ __u32 protocol;
+ __u32 mark;
+ __u32 priority;
+ /* IP address also allows 1 and 2 bytes access */
+ __u32 src_ip4;
+ __u32 src_ip6[4];
+ __u32 src_port; /* host byte order */
+ __be16 dst_port; /* network byte order */
+ __u16 :16; /* zero padding; dst_port plus this pad fill one 32-bit slot */
+ __u32 dst_ip4;
+ __u32 dst_ip6[4];
+ __u32 state;
+ __s32 rx_queue_mapping; /* the only signed field in this struct */
+};
+
+struct bpf_tcp_sock {
+ __u32 snd_cwnd; /* Sending congestion window */
+ __u32 srtt_us; /* smoothed round trip time << 3 in usecs */
+ __u32 rtt_min;
+ __u32 snd_ssthresh; /* Slow start size threshold */
+ __u32 rcv_nxt; /* What we want to receive next */
+ __u32 snd_nxt; /* Next sequence we send */
+ __u32 snd_una; /* First byte we want an ack for */
+ __u32 mss_cache; /* Cached effective mss, not including SACKS */
+ __u32 ecn_flags; /* ECN status bits. */
+ __u32 rate_delivered; /* saved rate sample: packets delivered */
+ __u32 rate_interval_us; /* saved rate sample: time elapsed */
+ __u32 packets_out; /* Packets which are "in flight" */
+ __u32 retrans_out; /* Retransmitted packets out */
+ __u32 total_retrans; /* Total retransmits for entire connection */
+ __u32 segs_in; /* RFC4898 tcpEStatsPerfSegsIn
+ * total number of segments in.
+ */
+ __u32 data_segs_in; /* RFC4898 tcpEStatsPerfDataSegsIn
+ * total number of data segments in.
+ */
+ __u32 segs_out; /* RFC4898 tcpEStatsPerfSegsOut
+ * The total number of segments sent.
+ */
+ __u32 data_segs_out; /* RFC4898 tcpEStatsPerfDataSegsOut
+ * total number of data segments sent.
+ */
+ __u32 lost_out; /* Lost packets */
+ __u32 sacked_out; /* SACK'd packets */
+ __u64 bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived
+ * sum(delta(rcv_nxt)), or how many bytes
+ * were acked.
+ */
+ __u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked
+ * sum(delta(snd_una)), or how many bytes
+ * were acked.
+ */
+ __u32 dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups
+ * total number of DSACK blocks received
+ */
+ __u32 delivered; /* Total data packets delivered incl. rexmits */
+ __u32 delivered_ce; /* Like the above but only ECE marked packets */
+ __u32 icsk_retransmits; /* Number of unrecovered [RTO] timeouts */
+};
+
+struct bpf_sock_tuple { /* source/destination address and port tuple; the IPv4 and IPv6 layouts overlap in one union */
+ union {
+ struct { /* 12 bytes: two 32-bit addresses, two 16-bit ports */
+ __be32 saddr;
+ __be32 daddr;
+ __be16 sport;
+ __be16 dport;
+ } ipv4;
+ struct { /* 36 bytes: two 128-bit addresses, two 16-bit ports */
+ __be32 saddr[4];
+ __be32 daddr[4];
+ __be16 sport;
+ __be16 dport;
+ } ipv6;
+ };
+};
+
+struct bpf_xdp_sock { /* single-field struct; NOTE(review): presumably the BPF-visible view of an XDP socket - confirm */
+ __u32 queue_id;
+};
+
+#define XDP_PACKET_HEADROOM 256
+
+/* User return codes for XDP prog type.
+ * A valid XDP program must return one of these defined values. All other
+ * return codes are reserved for future use. Unknown return codes will
+ * result in packet drops and a warning via bpf_warn_invalid_xdp_action().
+ */
+enum xdp_action {
+ XDP_ABORTED = 0,
+ XDP_DROP, /* 1 */
+ XDP_PASS, /* 2 */
+ XDP_TX, /* 3 */
+ XDP_REDIRECT, /* 4 */
+};
+
+/* user accessible metadata for XDP packet hook
+ * new fields must be added to the end of this structure
+ */
+struct xdp_md {
+ __u32 data;
+ __u32 data_end;
+ __u32 data_meta;
+ /* Below access go through struct xdp_rxq_info */
+ __u32 ingress_ifindex; /* rxq->dev->ifindex */
+ __u32 rx_queue_index; /* rxq->queue_index */
+
+ __u32 egress_ifindex; /* txq->dev->ifindex */
+};
+
+/* DEVMAP map-value layout
+ *
+ * The struct data-layout of map-value is a configuration interface.
+ * New members can only be added to the end of this structure.
+ */
+struct bpf_devmap_val {
+ __u32 ifindex; /* device index */
+ union {
+ int fd; /* prog fd on map write */
+ __u32 id; /* prog id on map read */
+ } bpf_prog;
+};
+
+/* CPUMAP map-value layout
+ *
+ * The struct data-layout of map-value is a configuration interface.
+ * New members can only be added to the end of this structure.
+ */
+struct bpf_cpumap_val {
+ __u32 qsize; /* queue size to remote target CPU */
+ union {
+ int fd; /* prog fd on map write */
+ __u32 id; /* prog id on map read */
+ } bpf_prog;
+};
+
+enum sk_action { /* NOTE(review): presumably the verdicts returned by socket-level programs - confirm */
+ SK_DROP = 0,
+ SK_PASS, /* implicitly 1 */
+};
+
+/* user accessible metadata for SK_MSG packet hook, new fields must
+ * be added to the end of this structure
+ */
+struct sk_msg_md {
+ __bpf_md_ptr(void *, data);
+ __bpf_md_ptr(void *, data_end);
+
+ __u32 family;
+ __u32 remote_ip4; /* Stored in network byte order */
+ __u32 local_ip4; /* Stored in network byte order */
+ __u32 remote_ip6[4]; /* Stored in network byte order */
+ __u32 local_ip6[4]; /* Stored in network byte order */
+ __u32 remote_port; /* Stored in network byte order */
+ __u32 local_port; /* stored in host byte order */
+ __u32 size; /* Total size of sk_msg */
+
+ __bpf_md_ptr(struct bpf_sock *, sk); /* current socket */
+};
+
+struct sk_reuseport_md {
+ /*
+ * Start of directly accessible data. It begins from
+ * the tcp/udp header.
+ */
+ __bpf_md_ptr(void *, data);
+ /* End of directly accessible data */
+ __bpf_md_ptr(void *, data_end);
+ /*
+ * Total length of packet (starting from the tcp/udp header).
+ * Note that the directly accessible bytes (data_end - data)
+ * could be less than this "len". Those bytes could be
+ * indirectly read by a helper "bpf_skb_load_bytes()".
+ */
+ __u32 len;
+ /*
+ * Eth protocol in the mac header (network byte order). e.g.
+ * ETH_P_IP(0x0800) and ETH_P_IPV6(0x86DD)
+ */
+ __u32 eth_protocol;
+ __u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */
+ __u32 bind_inany; /* Is sock bound to an INANY address? */
+ __u32 hash; /* A hash of the packet 4 tuples */
+ /* When reuse->migrating_sk is NULL, it is selecting a sk for the
+ * new incoming connection request (e.g. selecting a listen sk for
+ * the received SYN in the TCP case). reuse->sk is one of the sk
+ * in the reuseport group. The bpf prog can use reuse->sk to learn
+ * the local listening ip/port without looking into the skb.
+ *
+ * When reuse->migrating_sk is not NULL, reuse->sk is closed and
+ * reuse->migrating_sk is the socket that needs to be migrated
+ * to another listening socket. migrating_sk could be a fullsock
+ * sk that is fully established or a reqsk that is in-the-middle
+ * of 3-way handshake.
+ */
+ __bpf_md_ptr(struct bpf_sock *, sk);
+ __bpf_md_ptr(struct bpf_sock *, migrating_sk);
+};
+
+#define BPF_TAG_SIZE 8
+
+struct bpf_prog_info {
+ __u32 type;
+ __u32 id;
+ __u8 tag[BPF_TAG_SIZE];
+ __u32 jited_prog_len;
+ __u32 xlated_prog_len;
+ __aligned_u64 jited_prog_insns;
+ __aligned_u64 xlated_prog_insns;
+ __u64 load_time; /* ns since boottime */
+ __u32 created_by_uid;
+ __u32 nr_map_ids;
+ __aligned_u64 map_ids;
+ char name[BPF_OBJ_NAME_LEN];
+ __u32 ifindex;
+ __u32 gpl_compatible:1;
+ __u32 :31; /* alignment pad */
+ __u64 netns_dev;
+ __u64 netns_ino;
+ __u32 nr_jited_ksyms;
+ __u32 nr_jited_func_lens;
+ __aligned_u64 jited_ksyms;
+ __aligned_u64 jited_func_lens;
+ __u32 btf_id;
+ __u32 func_info_rec_size;
+ __aligned_u64 func_info;
+ __u32 nr_func_info;
+ __u32 nr_line_info;
+ __aligned_u64 line_info;
+ __aligned_u64 jited_line_info;
+ __u32 nr_jited_line_info;
+ __u32 line_info_rec_size;
+ __u32 jited_line_info_rec_size;
+ __u32 nr_prog_tags;
+ __aligned_u64 prog_tags;
+ __u64 run_time_ns;
+ __u64 run_cnt;
+ __u64 recursion_misses;
+ __u32 verified_insns;
+ __u32 attach_btf_obj_id;
+ __u32 attach_btf_id;
+} __attribute__((aligned(8)));
+
+struct bpf_map_info { /* NOTE(review): presumably the per-map record returned by the object-info query - confirm */
+ __u32 type;
+ __u32 id;
+ __u32 key_size;
+ __u32 value_size;
+ __u32 max_entries;
+ __u32 map_flags;
+ char name[BPF_OBJ_NAME_LEN];
+ __u32 ifindex;
+ __u32 btf_vmlinux_value_type_id;
+ __u64 netns_dev;
+ __u64 netns_ino;
+ __u32 btf_id;
+ __u32 btf_key_type_id;
+ __u32 btf_value_type_id;
+ __u32 :32; /* alignment pad: rounds the three __u32 ids above up to 16 bytes so map_extra starts 8-byte aligned */
+ __u64 map_extra;
+} __attribute__((aligned(8)));
+
+struct bpf_btf_info { /* two groups of one 64-bit slot followed by two __u32; field roles below are inferred from names - confirm */
+ __aligned_u64 btf; /* presumably a buffer address carried as a 64-bit integer */
+ __u32 btf_size; /* presumably the length belonging to 'btf' */
+ __u32 id;
+ __aligned_u64 name; /* presumably a buffer address carried as a 64-bit integer */
+ __u32 name_len; /* presumably the length belonging to 'name' */
+ __u32 kernel_btf;
+} __attribute__((aligned(8)));
+
+struct bpf_link_info {
+ __u32 type;
+ __u32 id;
+ __u32 prog_id;
+ union {
+ struct {
+ __aligned_u64 tp_name; /* in/out: tp_name buffer ptr */
+ __u32 tp_name_len; /* in/out: tp_name buffer len */
+ } raw_tracepoint;
+ struct {
+ __u32 attach_type;
+ __u32 target_obj_id; /* prog_id for PROG_EXT, otherwise btf object id */
+ __u32 target_btf_id; /* BTF type id inside the object */
+ } tracing;
+ struct {
+ __u64 cgroup_id;
+ __u32 attach_type;
+ } cgroup;
+ struct {
+ __aligned_u64 target_name; /* in/out: target_name buffer ptr */
+ __u32 target_name_len; /* in/out: target_name buffer len */
+
+ /* If the iter specific field is 32 bits, it can be put
+ * in the first or second union. Otherwise it should be
+ * put in the second union.
+ */
+ union {
+ struct {
+ __u32 map_id;
+ } map;
+ };
+ union {
+ struct {
+ __u64 cgroup_id;
+ __u32 order;
+ } cgroup;
+ struct {
+ __u32 tid;
+ __u32 pid;
+ } task;
+ };
+ } iter;
+ struct {
+ __u32 netns_ino;
+ __u32 attach_type;
+ } netns;
+ struct {
+ __u32 ifindex;
+ } xdp;
+ };
+} __attribute__((aligned(8)));
+
+/* User bpf_sock_addr struct to access socket fields and sockaddr struct passed
+ * by user and intended to be used by socket (e.g. to bind to, depends on
+ * attach type).
+ */
+struct bpf_sock_addr {
+ __u32 user_family; /* Allows 4-byte read, but no write. */
+ __u32 user_ip4; /* Allows 1,2,4-byte read and 4-byte write.
+ * Stored in network byte order.
+ */
+ __u32 user_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write.
+ * Stored in network byte order.
+ */
+ __u32 user_port; /* Allows 1,2,4-byte read and 4-byte write.
+ * Stored in network byte order
+ */
+ __u32 family; /* Allows 4-byte read, but no write */
+ __u32 type; /* Allows 4-byte read, but no write */
+ __u32 protocol; /* Allows 4-byte read, but no write */
+ __u32 msg_src_ip4; /* Allows 1,2,4-byte read and 4-byte write.
+ * Stored in network byte order.
+ */
+ __u32 msg_src_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write.
+ * Stored in network byte order.
+ */
+ __bpf_md_ptr(struct bpf_sock *, sk);
+};
+
+/* User bpf_sock_ops struct to access socket values and specify request ops
+ * and their replies.
+ * Some of these fields are in network (big-endian) byte order and may need
+ * to be converted before use (bpf_ntohl() defined in samples/bpf/bpf_endian.h).
+ * New fields can only be added at the end of this structure
+ */
+struct bpf_sock_ops {
+ __u32 op;
+ union {
+ __u32 args[4]; /* Optionally passed to bpf program */
+ __u32 reply; /* Returned by bpf program */
+ __u32 replylong[4]; /* Optionally returned by bpf prog */
+ };
+ __u32 family;
+ __u32 remote_ip4; /* Stored in network byte order */
+ __u32 local_ip4; /* Stored in network byte order */
+ __u32 remote_ip6[4]; /* Stored in network byte order */
+ __u32 local_ip6[4]; /* Stored in network byte order */
+ __u32 remote_port; /* Stored in network byte order */
+ __u32 local_port; /* stored in host byte order */
+ __u32 is_fullsock; /* Some TCP fields are only valid if
+ * there is a full socket. If not, the
+ * fields read as zero.
+ */
+ __u32 snd_cwnd;
+ __u32 srtt_us; /* Averaged RTT << 3 in usecs */
+ __u32 bpf_sock_ops_cb_flags; /* flags defined in uapi/linux/tcp.h */
+ __u32 state;
+ __u32 rtt_min;
+ __u32 snd_ssthresh;
+ __u32 rcv_nxt;
+ __u32 snd_nxt;
+ __u32 snd_una;
+ __u32 mss_cache;
+ __u32 ecn_flags;
+ __u32 rate_delivered;
+ __u32 rate_interval_us;
+ __u32 packets_out;
+ __u32 retrans_out;
+ __u32 total_retrans;
+ __u32 segs_in;
+ __u32 data_segs_in;
+ __u32 segs_out;
+ __u32 data_segs_out;
+ __u32 lost_out;
+ __u32 sacked_out;
+ __u32 sk_txhash;
+ __u64 bytes_received;
+ __u64 bytes_acked;
+ __bpf_md_ptr(struct bpf_sock *, sk);
+ /* [skb_data, skb_data_end) covers the whole TCP header.
+ *
+ * BPF_SOCK_OPS_PARSE_HDR_OPT_CB: The packet received
+ * BPF_SOCK_OPS_HDR_OPT_LEN_CB: Not useful because the
+ * header has not been written.
+ * BPF_SOCK_OPS_WRITE_HDR_OPT_CB: The header and options have
+ * been written so far.
+ * BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: The SYNACK that concludes
+ * the 3WHS.
+ * BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: The ACK that concludes
+ * the 3WHS.
+ *
+ * bpf_load_hdr_opt() can also be used to read a particular option.
+ */
+ __bpf_md_ptr(void *, skb_data);
+ __bpf_md_ptr(void *, skb_data_end);
+ __u32 skb_len; /* The total length of a packet.
+ * It includes the header, options,
+ * and payload.
+ */
+ __u32 skb_tcp_flags; /* tcp_flags of the header. It provides
+ * an easy way to check for tcp_flags
+ * without parsing skb_data.
+ *
+ * In particular, the skb_tcp_flags
+ * will still be available in
+ * BPF_SOCK_OPS_HDR_OPT_LEN even though
+ * the outgoing header has not
+ * been written yet.
+ */
+ __u64 skb_hwtstamp;
+};
+
+/* Definitions for bpf_sock_ops_cb_flags */
+enum {
+ BPF_SOCK_OPS_RTO_CB_FLAG = (1<<0),
+ BPF_SOCK_OPS_RETRANS_CB_FLAG = (1<<1),
+ BPF_SOCK_OPS_STATE_CB_FLAG = (1<<2),
+ BPF_SOCK_OPS_RTT_CB_FLAG = (1<<3),
+ /* Call bpf for all received TCP headers. The bpf prog will be
+ * called under sock_ops->op == BPF_SOCK_OPS_PARSE_HDR_OPT_CB
+ *
+ * Please refer to the comment in BPF_SOCK_OPS_PARSE_HDR_OPT_CB
+ * for the header option related helpers that will be useful
+ * to the bpf programs.
+ *
+ * It could be used at the client/active side (i.e. connect() side)
+ * when the server told it that the server was in syncookie
+ * mode and required the active side to resend the bpf-written
+ * options. The active side can keep writing the bpf-options until
+ * it received a valid packet from the server side to confirm
+ * the earlier packet (and options) has been received. The later
+ * example patch is using it like this at the active side when the
+ * server is in syncookie mode.
+ *
+ * The bpf prog will usually turn this off in the common cases.
+ */
+ BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG = (1<<4),
+ /* Call bpf when kernel has received a header option that
+ * the kernel cannot handle. The bpf prog will be called under
+ * sock_ops->op == BPF_SOCK_OPS_PARSE_HDR_OPT_CB.
+ *
+ * Please refer to the comment in BPF_SOCK_OPS_PARSE_HDR_OPT_CB
+ * for the header option related helpers that will be useful
+ * to the bpf programs.
+ */
+ BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG = (1<<5),
+ /* Call bpf when the kernel is writing header options for the
+ * outgoing packet. The bpf prog will first be called
+ * to reserve space in a skb under
+ * sock_ops->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB. Then
+ * the bpf prog will be called to write the header option(s)
+ * under sock_ops->op == BPF_SOCK_OPS_WRITE_HDR_OPT_CB.
+ *
+ * Please refer to the comment in BPF_SOCK_OPS_HDR_OPT_LEN_CB
+ * and BPF_SOCK_OPS_WRITE_HDR_OPT_CB for the header option
+ * related helpers that will be useful to the bpf programs.
+ *
+ * The kernel gets its chance to reserve space and write
+ * options first before the BPF program does.
+ */
+ BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG = (1<<6),
+/* Mask of all currently supported cb flags */
+ BPF_SOCK_OPS_ALL_CB_FLAGS = 0x7F,
+};
+
+/* List of known BPF sock_ops operators.
+ * New entries can only be added at the end
+ */
+enum {
+ BPF_SOCK_OPS_VOID,
+ BPF_SOCK_OPS_TIMEOUT_INIT, /* Should return SYN-RTO value to use or
+ * -1 if default value should be used
+ */
+ BPF_SOCK_OPS_RWND_INIT, /* Should return initial advertized
+ * window (in packets) or -1 if default
+ * value should be used
+ */
+ BPF_SOCK_OPS_TCP_CONNECT_CB, /* Calls BPF program right before an
+ * active connection is initialized
+ */
+ BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, /* Calls BPF program when an
+ * active connection is
+ * established
+ */
+ BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, /* Calls BPF program when a
+ * passive connection is
+ * established
+ */
+ BPF_SOCK_OPS_NEEDS_ECN, /* If connection's congestion control
+ * needs ECN
+ */
+ BPF_SOCK_OPS_BASE_RTT, /* Get base RTT. The correct value is
+ * based on the path and may be
+ * dependent on the congestion control
+ * algorithm. In general it indicates
+ * a congestion threshold. RTTs above
+ * this indicate congestion
+ */
+ BPF_SOCK_OPS_RTO_CB, /* Called when an RTO has triggered.
+ * Arg1: value of icsk_retransmits
+ * Arg2: value of icsk_rto
+ * Arg3: whether RTO has expired
+ */
+ BPF_SOCK_OPS_RETRANS_CB, /* Called when skb is retransmitted.
+ * Arg1: sequence number of 1st byte
+ * Arg2: # segments
+ * Arg3: return value of
+ * tcp_transmit_skb (0 => success)
+ */
+ BPF_SOCK_OPS_STATE_CB, /* Called when TCP changes state.
+ * Arg1: old_state
+ * Arg2: new_state
+ */
+ BPF_SOCK_OPS_TCP_LISTEN_CB, /* Called on listen(2), right after
+ * socket transition to LISTEN state.
+ */
+ BPF_SOCK_OPS_RTT_CB, /* Called on every RTT.
+ */
+ BPF_SOCK_OPS_PARSE_HDR_OPT_CB, /* Parse the header option.
+ * It will be called to handle
+ * the packets received at
+ * an already established
+ * connection.
+ *
+ * sock_ops->skb_data:
+ * Referring to the received skb.
+ * It covers the TCP header only.
+ *
+ * bpf_load_hdr_opt() can also
+ * be used to search for a
+ * particular option.
+ */
+ BPF_SOCK_OPS_HDR_OPT_LEN_CB, /* Reserve space for writing the
+ * header option later in
+ * BPF_SOCK_OPS_WRITE_HDR_OPT_CB.
+ * Arg1: bool want_cookie. (in
+ * writing SYNACK only)
+ *
+ * sock_ops->skb_data:
+ * Not available because no header has
+ * been written yet.
+ *
+ * sock_ops->skb_tcp_flags:
+ * The tcp_flags of the
+ * outgoing skb. (e.g. SYN, ACK, FIN).
+ *
+ * bpf_reserve_hdr_opt() should
+ * be used to reserve space.
+ */
+ BPF_SOCK_OPS_WRITE_HDR_OPT_CB, /* Write the header options
+ * Arg1: bool want_cookie. (in
+ * writing SYNACK only)
+ *
+ * sock_ops->skb_data:
+ * Referring to the outgoing skb.
+ * It covers the TCP header
+ * that has already been written
+ * by the kernel and the
+ * earlier bpf-progs.
+ *
+ * sock_ops->skb_tcp_flags:
+ * The tcp_flags of the outgoing
+ * skb. (e.g. SYN, ACK, FIN).
+ *
+ * bpf_store_hdr_opt() should
+ * be used to write the
+ * option.
+ *
+ * bpf_load_hdr_opt() can also
+ * be used to search for a
+ * particular option that
+ * has already been written
+ * by the kernel or the
+ * earlier bpf-progs.
+ */
+};
+
+/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
+ * changes between the TCP and BPF versions. Ideally this should never happen.
+ * If it does, we need to add code to convert them before calling
+ * the BPF sock_ops function.
+ */
+enum {
+ BPF_TCP_ESTABLISHED = 1,
+ BPF_TCP_SYN_SENT,
+ BPF_TCP_SYN_RECV,
+ BPF_TCP_FIN_WAIT1,
+ BPF_TCP_FIN_WAIT2,
+ BPF_TCP_TIME_WAIT,
+ BPF_TCP_CLOSE,
+ BPF_TCP_CLOSE_WAIT,
+ BPF_TCP_LAST_ACK,
+ BPF_TCP_LISTEN,
+ BPF_TCP_CLOSING, /* Now a valid state */
+ BPF_TCP_NEW_SYN_RECV,
+
+ BPF_TCP_MAX_STATES /* Leave at the end! */
+};
+
+enum {
+ TCP_BPF_IW = 1001, /* Set TCP initial congestion window */
+ TCP_BPF_SNDCWND_CLAMP = 1002, /* Set sndcwnd_clamp */
+ TCP_BPF_DELACK_MAX = 1003, /* Max delay ack in usecs */
+ TCP_BPF_RTO_MIN = 1004, /* Min RTO in usecs (NOTE(review): confirm units) */
+ /* Copy the SYN pkt to optval
+ *
+ * BPF_PROG_TYPE_SOCK_OPS only. It is similar to the
+ * bpf_getsockopt(TCP_SAVED_SYN) but it does not limit
+ * to only getting from the saved_syn. It can either get the
+ * syn packet from:
+ *
+ * 1. the just-received SYN packet (only available when writing the
+ * SYNACK). It will be useful when it is not necessary to
+ * save the SYN packet for later use. It is also the only way
+ * to get the SYN during syncookie mode because the syn
+ * packet cannot be saved during syncookie.
+ *
+ * OR
+ *
+ * 2. the earlier saved syn which was done by
+ * bpf_setsockopt(TCP_SAVE_SYN).
+ *
+ * The bpf_getsockopt(TCP_BPF_SYN*) option will hide where the
+ * SYN packet is obtained.
+ *
+ * If the bpf-prog does not need the IP[46] header, the
+ * bpf-prog can avoid parsing the IP header by using
+ * TCP_BPF_SYN. Otherwise, the bpf-prog can get both
+ * IP[46] and TCP header by using TCP_BPF_SYN_IP.
+ *
+ * >0: Total number of bytes copied
+ * -ENOSPC: Not enough space in optval. Only optlen number of
+ * bytes is copied.
+ * -ENOENT: The SYN skb is not available now and the earlier SYN pkt
+ * is not saved by setsockopt(TCP_SAVE_SYN).
+ */
+ TCP_BPF_SYN = 1005, /* Copy the TCP header */
+ TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */
+ TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */
+};
+
+enum {
+ BPF_LOAD_HDR_OPT_TCP_SYN = (1ULL << 0),
+};
+
+/* args[0] value during BPF_SOCK_OPS_HDR_OPT_LEN_CB and
+ * BPF_SOCK_OPS_WRITE_HDR_OPT_CB.
+ */
+enum {
+ BPF_WRITE_HDR_TCP_CURRENT_MSS = 1, /* Kernel is finding the
+ * total option spaces
+ * required for an established
+ * sk in order to calculate the
+ * MSS. No skb is actually
+ * sent.
+ */
+ BPF_WRITE_HDR_TCP_SYNACK_COOKIE = 2, /* Kernel is in syncookie mode
+ * when sending a SYNACK.
+ */
+};
+
+struct bpf_perf_event_value {
+ __u64 counter;
+ __u64 enabled;
+ __u64 running;
+};
+
+enum {
+ BPF_DEVCG_ACC_MKNOD = (1ULL << 0),
+ BPF_DEVCG_ACC_READ = (1ULL << 1),
+ BPF_DEVCG_ACC_WRITE = (1ULL << 2),
+};
+
+enum {
+ BPF_DEVCG_DEV_BLOCK = (1ULL << 0),
+ BPF_DEVCG_DEV_CHAR = (1ULL << 1),
+};
+
+struct bpf_cgroup_dev_ctx {
+ /* access_type encoded as (BPF_DEVCG_ACC_* << 16) | BPF_DEVCG_DEV_* */
+ __u32 access_type;
+ __u32 major;
+ __u32 minor;
+};
+
+struct bpf_raw_tracepoint_args {
+ __u64 args[0]; /* zero-length array: declares no storage of its own */
+};
+
+/* DIRECT: Skip the FIB rules and go to FIB table associated with device
+ * OUTPUT: Do lookup from egress perspective; default is ingress
+ */
+enum {
+ BPF_FIB_LOOKUP_DIRECT = (1U << 0),
+ BPF_FIB_LOOKUP_OUTPUT = (1U << 1),
+};
+
+enum {
+ BPF_FIB_LKUP_RET_SUCCESS, /* lookup successful */
+ BPF_FIB_LKUP_RET_BLACKHOLE, /* dest is blackholed; can be dropped */
+ BPF_FIB_LKUP_RET_UNREACHABLE, /* dest is unreachable; can be dropped */
+ BPF_FIB_LKUP_RET_PROHIBIT, /* dest not allowed; can be dropped */
+ BPF_FIB_LKUP_RET_NOT_FWDED, /* packet is not forwarded */
+ BPF_FIB_LKUP_RET_FWD_DISABLED, /* fwding is not enabled on ingress */
+ BPF_FIB_LKUP_RET_UNSUPP_LWT, /* fwd requires encapsulation */
+ BPF_FIB_LKUP_RET_NO_NEIGH, /* no neighbor entry for nh */
+ BPF_FIB_LKUP_RET_FRAG_NEEDED, /* fragmentation required to fwd */
+};
+
+struct bpf_fib_lookup {
+ /* input: network family for lookup (AF_INET, AF_INET6)
+ * output: network family of egress nexthop
+ */
+ __u8 family;
+
+ /* set if lookup is to consider L4 data - e.g., FIB rules */
+ __u8 l4_protocol;
+ __be16 sport;
+ __be16 dport;
+
+ union { /* used for MTU check */
+ /* input to lookup */
+ __u16 tot_len; /* L3 length from network hdr (iph->tot_len) */
+
+ /* output: MTU value */
+ __u16 mtu_result;
+ };
+ /* input: L3 device index for lookup
+ * output: device index from FIB lookup
+ */
+ __u32 ifindex;
+
+ union {
+ /* inputs to lookup */
+ __u8 tos; /* AF_INET */
+ __be32 flowinfo; /* AF_INET6, flow_label + priority */
+
+ /* output: metric of fib result (IPv4/IPv6 only) */
+ __u32 rt_metric;
+ };
+
+ union {
+ __be32 ipv4_src;
+ __u32 ipv6_src[4]; /* in6_addr; network order */
+ };
+
+ /* input to bpf_fib_lookup, ipv{4,6}_dst is destination address in
+ * network header. output: bpf_fib_lookup sets to gateway address
+ * if FIB lookup returns gateway route
+ */
+ union {
+ __be32 ipv4_dst;
+ __u32 ipv6_dst[4]; /* in6_addr; network order */
+ };
+
+ /* output */
+ __be16 h_vlan_proto;
+ __be16 h_vlan_TCI;
+ __u8 smac[6]; /* ETH_ALEN */
+ __u8 dmac[6]; /* ETH_ALEN */
+};
+
+struct bpf_redir_neigh {
+ /* network family for lookup (AF_INET, AF_INET6) */
+ __u32 nh_family;
+ /* network address of nexthop; skips fib lookup to find gateway */
+ union {
+ __be32 ipv4_nh;
+ __u32 ipv6_nh[4]; /* in6_addr; network order */
+ };
+};
+
+/* bpf_check_mtu flags */
+enum bpf_check_mtu_flags {
+ BPF_MTU_CHK_SEGS = (1U << 0),
+};
+
+enum bpf_check_mtu_ret {
+ BPF_MTU_CHK_RET_SUCCESS, /* check and lookup successful */
+ BPF_MTU_CHK_RET_FRAG_NEEDED, /* fragmentation required to fwd */
+ BPF_MTU_CHK_RET_SEGS_TOOBIG, /* GSO re-segmentation needed to fwd */
+};
+
+enum bpf_task_fd_type {
+ BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */
+ BPF_FD_TYPE_TRACEPOINT, /* tp name */
+ BPF_FD_TYPE_KPROBE, /* (symbol + offset) or addr */
+ BPF_FD_TYPE_KRETPROBE, /* (symbol + offset) or addr */
+ BPF_FD_TYPE_UPROBE, /* filename + offset */
+ BPF_FD_TYPE_URETPROBE, /* filename + offset */
+};
+
+enum {
+ BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG = (1U << 0),
+ BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL = (1U << 1),
+ BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP = (1U << 2),
+};
+
+struct bpf_flow_keys {
+ __u16 nhoff;
+ __u16 thoff;
+ __u16 addr_proto; /* ETH_P_* of valid addrs */
+ __u8 is_frag;
+ __u8 is_first_frag;
+ __u8 is_encap;
+ __u8 ip_proto;
+ __be16 n_proto;
+ __be16 sport;
+ __be16 dport;
+ union {
+ struct {
+ __be32 ipv4_src;
+ __be32 ipv4_dst;
+ };
+ struct {
+ __u32 ipv6_src[4]; /* in6_addr; network order */
+ __u32 ipv6_dst[4]; /* in6_addr; network order */
+ };
+ };
+ __u32 flags;
+ __be32 flow_label;
+};
+
+struct bpf_func_info {
+ __u32 insn_off;
+ __u32 type_id;
+};
+
+#define BPF_LINE_INFO_LINE_NUM(line_col) ((line_col) >> 10) /* line number: the bits above the low 10 of line_col */
+#define BPF_LINE_INFO_LINE_COL(line_col) ((line_col) & 0x3ff) /* column: the low 10 bits of line_col */
+
+struct bpf_line_info {
+ __u32 insn_off;
+ __u32 file_name_off;
+ __u32 line_off;
+ __u32 line_col;
+};
+
+struct bpf_spin_lock {
+ __u32 val;
+};
+
+struct bpf_timer {
+ __u64 :64; /* unnamed 64-bit bit-field: occupies space but has no member name */
+ __u64 :64; /* second unnamed 64-bit bit-field */
+} __attribute__((aligned(8)));
+
+struct bpf_dynptr {
+ __u64 :64; /* unnamed 64-bit bit-field: occupies space but has no member name */
+ __u64 :64; /* second unnamed 64-bit bit-field */
+} __attribute__((aligned(8)));
+
+struct bpf_list_head {
+ __u64 :64; /* unnamed 64-bit bit-field: occupies space but has no member name */
+ __u64 :64; /* second unnamed 64-bit bit-field */
+} __attribute__((aligned(8)));
+
+struct bpf_list_node {
+ __u64 :64; /* unnamed 64-bit bit-field: occupies space but has no member name */
+ __u64 :64; /* second unnamed 64-bit bit-field */
+} __attribute__((aligned(8)));
+
+struct bpf_sysctl {
+ __u32 write; /* Sysctl is being read (= 0) or written (= 1).
+ * Allows 1,2,4-byte read, but no write.
+ */
+ __u32 file_pos; /* Sysctl file position to read from, write to.
+ * Allows 1,2,4-byte read and 4-byte write.
+ */
+};
+
+struct bpf_sockopt {
+ __bpf_md_ptr(struct bpf_sock *, sk);
+ __bpf_md_ptr(void *, optval);
+ __bpf_md_ptr(void *, optval_end);
+
+ __s32 level;
+ __s32 optname;
+ __s32 optlen;
+ __s32 retval;
+};
+
+struct bpf_pidns_info {
+ __u32 pid;
+ __u32 tgid;
+};
+
+/* User accessible data for SK_LOOKUP programs. Add new fields at the end. */
+struct bpf_sk_lookup {
+ union {
+ __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */
+ __u64 cookie; /* Non-zero if socket was selected in PROG_TEST_RUN */
+ };
+
+ __u32 family; /* Protocol family (AF_INET, AF_INET6) */
+ __u32 protocol; /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */
+ __u32 remote_ip4; /* Network byte order */
+ __u32 remote_ip6[4]; /* Network byte order */
+ __be16 remote_port; /* Network byte order */
+ __u16 :16; /* Zero padding */
+ __u32 local_ip4; /* Network byte order */
+ __u32 local_ip6[4]; /* Network byte order */
+ __u32 local_port; /* Host byte order */
+ __u32 ingress_ifindex; /* The arriving interface. Determined by inet_iif. */
+};
+
+/*
+ * struct btf_ptr is used for typed pointer representation; the
+ * type id is used to render the pointer data as the appropriate type
+ * via the bpf_snprintf_btf() helper described above. A flags field -
+ * potentially to specify additional details about the BTF pointer
+ * (rather than its mode of display) - is included for future use.
+ * Display flags - BTF_F_* - are passed to bpf_snprintf_btf separately.
+ */
+struct btf_ptr {
+ void *ptr;
+ __u32 type_id;
+ __u32 flags; /* BTF ptr flags; unused at present. */
+};
+
+/*
+ * Flags to control bpf_snprintf_btf() behaviour.
+ * - BTF_F_COMPACT: no formatting around type information
+ * - BTF_F_NONAME: no struct/union member names/types
+ * - BTF_F_PTR_RAW: show raw (unobfuscated) pointer values;
+ * equivalent to %px.
+ * - BTF_F_ZERO: show zero-valued struct/union members; they
+ * are not displayed by default
+ */
+enum {
+ BTF_F_COMPACT = (1ULL << 0),
+ BTF_F_NONAME = (1ULL << 1),
+ BTF_F_PTR_RAW = (1ULL << 2),
+ BTF_F_ZERO = (1ULL << 3),
+};
+
+/* bpf_core_relo_kind encodes which aspect of captured field/type/enum value
+ * has to be adjusted by relocations. It is emitted by llvm and passed to
+ * libbpf and later to the kernel.
+ */
+enum bpf_core_relo_kind {
+ BPF_CORE_FIELD_BYTE_OFFSET = 0, /* field byte offset */
+ BPF_CORE_FIELD_BYTE_SIZE = 1, /* field size in bytes */
+ BPF_CORE_FIELD_EXISTS = 2, /* field existence in target kernel */
+ BPF_CORE_FIELD_SIGNED = 3, /* field signedness (0 - unsigned, 1 - signed) */
+ BPF_CORE_FIELD_LSHIFT_U64 = 4, /* bitfield-specific left bitshift */
+ BPF_CORE_FIELD_RSHIFT_U64 = 5, /* bitfield-specific right bitshift */
+ BPF_CORE_TYPE_ID_LOCAL = 6, /* type ID in local BPF object */
+ BPF_CORE_TYPE_ID_TARGET = 7, /* type ID in target kernel */
+ BPF_CORE_TYPE_EXISTS = 8, /* type existence in target kernel */
+ BPF_CORE_TYPE_SIZE = 9, /* type size in bytes */
+ BPF_CORE_ENUMVAL_EXISTS = 10, /* enum value existence in target kernel */
+ BPF_CORE_ENUMVAL_VALUE = 11, /* enum value integer value */
+ BPF_CORE_TYPE_MATCHES = 12, /* type match in target kernel */
+};
+
+/*
+ * "struct bpf_core_relo" is used to pass relocation data from LLVM to libbpf
+ * and from libbpf to the kernel.
+ *
+ * CO-RE relocation captures the following data:
+ * - insn_off - instruction offset (in bytes) within a BPF program that needs
+ * its insn->imm field to be relocated with actual field info;
+ * - type_id - BTF type ID of the "root" (containing) entity of a relocatable
+ * type or field;
+ * - access_str_off - offset into corresponding .BTF string section. String
+ * interpretation depends on specific relocation kind:
+ * - for field-based relocations, string encodes an accessed field using
+ * a sequence of field and array indices, separated by colon (:). It's
+ * conceptually very close to LLVM's getelementptr ([0]) instruction's
+ * arguments for identifying offset to a field.
+ * - for type-based relocations, the string is expected to be just "0";
+ * - for enum value-based relocations, string contains an index of enum
+ * value within its enum type;
+ * - kind - one of enum bpf_core_relo_kind;
+ *
+ * Example:
+ * struct sample {
+ * int a;
+ * struct {
+ * int b[10];
+ * };
+ * };
+ *
+ * struct sample *s = ...;
+ * int *x = &s->a; // encoded as "0:0" (a is field #0)
+ * int *y = &s->b[5]; // encoded as "0:1:0:5" (anon struct is field #1,
+ * // b is field #0 inside anon struct, accessing elem #5)
+ * int *z = &s[10]->b; // encoded as "10:1" (ptr is used as an array)
+ *
+ * type_id for all relocs in this example will capture BTF type id of
+ * `struct sample`.
+ *
+ * Such relocation is emitted when using __builtin_preserve_access_index()
+ * Clang built-in, passing expression that captures field address, e.g.:
+ *
+ * bpf_probe_read(&dst, sizeof(dst),
+ * __builtin_preserve_access_index(&src->a.b.c));
+ *
+ * In this case Clang will emit field relocation recording necessary data to
+ * be able to find offset of embedded `a.b.c` field within `src` struct.
+ *
+ * [0] https://llvm.org/docs/LangRef.html#getelementptr-instruction
+ */
+struct bpf_core_relo {
+ __u32 insn_off;
+ __u32 type_id;
+ __u32 access_str_off;
+ enum bpf_core_relo_kind kind;
+};
+
+#endif /* __LINUX_BPF_H__ */
diff --git a/src/shared/linux/bpf_common.h b/src/shared/linux/bpf_common.h
new file mode 100644
index 0000000..f0fe139
--- /dev/null
+++ b/src/shared/linux/bpf_common.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef __LINUX_BPF_COMMON_H__
+#define __LINUX_BPF_COMMON_H__
+
+/* Instruction classes: BPF_CLASS() keeps the low three bits of the opcode (mask 0x07) */
+#define BPF_CLASS(code) ((code) & 0x07)
+#define BPF_LD 0x00
+#define BPF_LDX 0x01
+#define BPF_ST 0x02
+#define BPF_STX 0x03
+#define BPF_ALU 0x04
+#define BPF_JMP 0x05
+#define BPF_RET 0x06
+#define BPF_MISC 0x07
+
+/* ld/ldx fields: BPF_SIZE() keeps bits 3-4 (mask 0x18), BPF_MODE() keeps bits 5-7 (mask 0xe0) */
+#define BPF_SIZE(code) ((code) & 0x18)
+#define BPF_W 0x00 /* 32-bit */
+#define BPF_H 0x08 /* 16-bit */
+#define BPF_B 0x10 /* 8-bit */
+/* eBPF BPF_DW 0x18 64-bit */
+#define BPF_MODE(code) ((code) & 0xe0)
+#define BPF_IMM 0x00
+#define BPF_ABS 0x20
+#define BPF_IND 0x40
+#define BPF_MEM 0x60
+#define BPF_LEN 0x80
+#define BPF_MSH 0xa0
+
+/* alu/jmp fields: BPF_OP() keeps bits 4-7 (mask 0xf0), BPF_SRC() keeps bit 3 (mask 0x08) */
+#define BPF_OP(code) ((code) & 0xf0)
+#define BPF_ADD 0x00
+#define BPF_SUB 0x10
+#define BPF_MUL 0x20
+#define BPF_DIV 0x30
+#define BPF_OR 0x40
+#define BPF_AND 0x50
+#define BPF_LSH 0x60
+#define BPF_RSH 0x70
+#define BPF_NEG 0x80
+#define BPF_MOD 0x90
+#define BPF_XOR 0xa0
+
+#define BPF_JA 0x00
+#define BPF_JEQ 0x10
+#define BPF_JGT 0x20
+#define BPF_JGE 0x30
+#define BPF_JSET 0x40
+#define BPF_SRC(code) ((code) & 0x08)
+#define BPF_K 0x00
+#define BPF_X 0x08
+
+#ifndef BPF_MAXINSNS
+#define BPF_MAXINSNS 4096
+#endif
+
+#endif /* __LINUX_BPF_COMMON_H__ */
diff --git a/src/shared/linux/bpf_insn.h b/src/shared/linux/bpf_insn.h
new file mode 100644
index 0000000..92ec06b
--- /dev/null
+++ b/src/shared/linux/bpf_insn.h
@@ -0,0 +1,241 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/* eBPF instruction mini library */
+#ifndef __BPF_INSN_H
+#define __BPF_INSN_H
+
+struct bpf_insn;
+
+/* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */
+
+#define BPF_ALU64_REG(OP, DST, SRC) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU64 | BPF_OP(OP) | BPF_X, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = 0, \
+ .imm = 0 })
+
+#define BPF_ALU32_REG(OP, DST, SRC) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU | BPF_OP(OP) | BPF_X, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = 0, \
+ .imm = 0 })
+
+/* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */
+
+#define BPF_ALU64_IMM(OP, DST, IMM) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU64 | BPF_OP(OP) | BPF_K, \
+ .dst_reg = DST, \
+ .src_reg = 0, \
+ .off = 0, \
+ .imm = IMM })
+
+#define BPF_ALU32_IMM(OP, DST, IMM) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU | BPF_OP(OP) | BPF_K, \
+ .dst_reg = DST, \
+ .src_reg = 0, \
+ .off = 0, \
+ .imm = IMM })
+
+/* Short form of mov, dst_reg = src_reg */
+
+#define BPF_MOV64_REG(DST, SRC) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU64 | BPF_MOV | BPF_X, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = 0, \
+ .imm = 0 })
+
+#define BPF_MOV32_REG(DST, SRC) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU | BPF_MOV | BPF_X, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = 0, \
+ .imm = 0 })
+
+/* Short form of mov, dst_reg = imm32 */
+
+#define BPF_MOV64_IMM(DST, IMM) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU64 | BPF_MOV | BPF_K, \
+ .dst_reg = DST, \
+ .src_reg = 0, \
+ .off = 0, \
+ .imm = IMM })
+
+#define BPF_MOV32_IMM(DST, IMM) \
+ ((struct bpf_insn) { \
+ .code = BPF_ALU | BPF_MOV | BPF_K, \
+ .dst_reg = DST, \
+ .src_reg = 0, \
+ .off = 0, \
+ .imm = IMM })
+
+/* BPF_LD_IMM64 encodes one 'load 64-bit immediate'; it expands to two comma-separated struct bpf_insn initializers */
+#define BPF_LD_IMM64(DST, IMM) \
+ BPF_LD_IMM64_RAW(DST, 0, IMM)
+
+#define BPF_LD_IMM64_RAW(DST, SRC, IMM) \
+ ((struct bpf_insn) { \
+ .code = BPF_LD | BPF_DW | BPF_IMM, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = 0, \
+ .imm = (__u32) (IMM) }), /* first initializer: low 32 bits of IMM */ \
+ ((struct bpf_insn) { \
+ .code = 0, /* zero is reserved opcode */ \
+ .dst_reg = 0, \
+ .src_reg = 0, \
+ .off = 0, \
+ .imm = ((__u64) (IMM)) >> 32 }) /* second initializer: high 32 bits of IMM */
+
+#ifndef BPF_PSEUDO_MAP_FD
+# define BPF_PSEUDO_MAP_FD 1
+#endif
+
+/* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */
+#define BPF_LD_MAP_FD(DST, MAP_FD) \
+ BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD)
+
+
+/* Direct packet access, R0 = *(uint *) (skb->data + imm32) */
+
+#define BPF_LD_ABS(SIZE, IMM) \
+ ((struct bpf_insn) { \
+ .code = BPF_LD | BPF_SIZE(SIZE) | BPF_ABS, \
+ .dst_reg = 0, \
+ .src_reg = 0, \
+ .off = 0, \
+ .imm = IMM })
+
+/* Memory load, dst_reg = *(uint *) (src_reg + off16) */
+
+#define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \
+ ((struct bpf_insn) { \
+ .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = OFF, \
+ .imm = 0 })
+
+/* Memory store, *(uint *) (dst_reg + off16) = src_reg */
+
+#define BPF_STX_MEM(SIZE, DST, SRC, OFF) \
+ ((struct bpf_insn) { \
+ .code = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = OFF, \
+ .imm = 0 })
+
+/*
+ * Atomic operations:
+ *
+ * BPF_ADD *(uint *) (dst_reg + off16) += src_reg
+ * BPF_AND *(uint *) (dst_reg + off16) &= src_reg
+ * BPF_OR *(uint *) (dst_reg + off16) |= src_reg
+ * BPF_XOR *(uint *) (dst_reg + off16) ^= src_reg
+ * BPF_ADD | BPF_FETCH src_reg = atomic_fetch_add(dst_reg + off16, src_reg);
+ * BPF_AND | BPF_FETCH src_reg = atomic_fetch_and(dst_reg + off16, src_reg);
+ * BPF_OR | BPF_FETCH src_reg = atomic_fetch_or(dst_reg + off16, src_reg);
+ * BPF_XOR | BPF_FETCH src_reg = atomic_fetch_xor(dst_reg + off16, src_reg);
+ * BPF_XCHG src_reg = atomic_xchg(dst_reg + off16, src_reg)
+ * BPF_CMPXCHG r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg)
+ */
+
+#define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF) \
+ ((struct bpf_insn) { \
+ .code = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = OFF, \
+ .imm = OP })
+
+/* Legacy alias */
+#define BPF_STX_XADD(SIZE, DST, SRC, OFF) BPF_ATOMIC_OP(SIZE, BPF_ADD, DST, SRC, OFF)
+
+/* Memory store, *(uint *) (dst_reg + off16) = imm32 */
+
+#define BPF_ST_MEM(SIZE, DST, OFF, IMM) \
+ ((struct bpf_insn) { \
+ .code = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM, \
+ .dst_reg = DST, \
+ .src_reg = 0, \
+ .off = OFF, \
+ .imm = IMM })
+
+/* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16 */
+
+#define BPF_JMP_REG(OP, DST, SRC, OFF) \
+ ((struct bpf_insn) { \
+ .code = BPF_JMP | BPF_OP(OP) | BPF_X, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = OFF, \
+ .imm = 0 })
+
+/* Like BPF_JMP_REG, but with 32-bit wide operands for comparison. */
+
+#define BPF_JMP32_REG(OP, DST, SRC, OFF) \
+ ((struct bpf_insn) { \
+ .code = BPF_JMP32 | BPF_OP(OP) | BPF_X, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = OFF, \
+ .imm = 0 })
+
+/* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */
+
+#define BPF_JMP_IMM(OP, DST, IMM, OFF) \
+ ((struct bpf_insn) { \
+ .code = BPF_JMP | BPF_OP(OP) | BPF_K, \
+ .dst_reg = DST, \
+ .src_reg = 0, \
+ .off = OFF, \
+ .imm = IMM })
+
+/* Like BPF_JMP_IMM, but with 32-bit wide operands for comparison. */
+
+#define BPF_JMP32_IMM(OP, DST, IMM, OFF) \
+ ((struct bpf_insn) { \
+ .code = BPF_JMP32 | BPF_OP(OP) | BPF_K, \
+ .dst_reg = DST, \
+ .src_reg = 0, \
+ .off = OFF, \
+ .imm = IMM })
+
+#define BPF_JMP_A(OFF) \
+ ((struct bpf_insn) { \
+ .code = BPF_JMP | BPF_JA, \
+ .dst_reg = 0, \
+ .src_reg = 0, \
+ .off = OFF, \
+ .imm = 0 })
+
+/* Raw code statement block */
+
+#define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM) \
+ ((struct bpf_insn) { \
+ .code = CODE, \
+ .dst_reg = DST, \
+ .src_reg = SRC, \
+ .off = OFF, \
+ .imm = IMM })
+
+/* Program exit */
+
+#define BPF_EXIT_INSN() \
+ ((struct bpf_insn) { \
+ .code = BPF_JMP | BPF_EXIT, \
+ .dst_reg = 0, \
+ .src_reg = 0, \
+ .off = 0, \
+ .imm = 0 })
+
+#endif
diff --git a/src/shared/linux/dm-ioctl.h b/src/shared/linux/dm-ioctl.h
new file mode 100644
index 0000000..19a64fc
--- /dev/null
+++ b/src/shared/linux/dm-ioctl.h
@@ -0,0 +1,385 @@
+/* SPDX-License-Identifier: LGPL-2.0+ WITH Linux-syscall-note */
+/*
+ * Copyright (C) 2001 - 2003 Sistina Software (UK) Limited.
+ * Copyright (C) 2004 - 2009 Red Hat, Inc. All rights reserved.
+ *
+ * This file is released under the LGPL.
+ */
+
+#ifndef _LINUX_DM_IOCTL_V4_H
+#define _LINUX_DM_IOCTL_V4_H
+
+#include <linux/types.h>
+
+#define DM_DIR "mapper" /* Slashes not supported */
+#define DM_CONTROL_NODE "control"
+#define DM_MAX_TYPE_NAME 16
+#define DM_NAME_LEN 128
+#define DM_UUID_LEN 129
+
+/*
+ * A traditional ioctl interface for the device mapper.
+ *
+ * Each device can have two tables associated with it, an
+ * 'active' table which is the one currently used by io passing
+ * through the device, and an 'inactive' one which is a table
+ * that is being prepared as a replacement for the 'active' one.
+ *
+ * DM_VERSION:
+ * Just get the version information for the ioctl interface.
+ *
+ * DM_REMOVE_ALL:
+ * Remove all dm devices, destroy all tables. Only really used
+ * for debug.
+ *
+ * DM_LIST_DEVICES:
+ * Get a list of all the dm device names.
+ *
+ * DM_DEV_CREATE:
+ * Create a new device; neither the 'active' nor the 'inactive' table
+ * slot will be filled. The device will be in suspended state
+ * after creation; however, any io to the device will get errored
+ * since it will be out-of-bounds.
+ *
+ * DM_DEV_REMOVE:
+ * Remove a device, destroy any tables.
+ *
+ * DM_DEV_RENAME:
+ * Rename a device or set its uuid if none was previously supplied.
+ *
+ * DM_SUSPEND:
+ * This performs both suspend and resume, depending which flag is
+ * passed in.
+ * Suspend: This command will not return until all pending io to
+ * the device has completed. Further io will be deferred until
+ * the device is resumed.
+ * Resume: It is no longer an error to issue this command on an
+ * unsuspended device. If a table is present in the 'inactive'
+ * slot, it will be moved to the active slot, then the old table
+ * from the active slot will be _destroyed_. Finally the device
+ * is resumed.
+ *
+ * DM_DEV_STATUS:
+ * Retrieves the status for the table in the 'active' slot.
+ *
+ * DM_DEV_WAIT:
+ * Wait for a significant event to occur to the device. This
+ * could either be caused by an event triggered by one of the
+ * targets of the table in the 'active' slot, or a table change.
+ *
+ * DM_TABLE_LOAD:
+ * Load a table into the 'inactive' slot for the device. The
+ * device does _not_ need to be suspended prior to this command.
+ *
+ * DM_TABLE_CLEAR:
+ * Destroy any table in the 'inactive' slot (ie. abort).
+ *
+ * DM_TABLE_DEPS:
+ * Return a set of device dependencies for the 'active' table.
+ *
+ * DM_TABLE_STATUS:
+ * Return the targets status for the 'active' table.
+ *
+ * DM_TARGET_MSG:
+ * Pass a message string to the target at a specific offset of a device.
+ *
+ * DM_DEV_SET_GEOMETRY:
+ * Set the geometry of a device by passing in a string in this format:
+ *
+ * "cylinders heads sectors_per_track start_sector"
+ *
+ * Beware that CHS geometry is nearly obsolete and only provided
+ * for compatibility with dm devices that can be booted by a PC
+ * BIOS. See struct hd_geometry for range limits. Also note that
+ * the geometry is erased if the device size changes.
+ */
+
+/*
+ * All ioctl arguments consist of a single chunk of memory, with
+ * this structure at the start. If a uuid is specified any
+ * lookup (eg. for a DM_INFO) will be done on that, *not* the
+ * name.
+ */
+struct dm_ioctl {
+ /*
+ * The version number is made up of three parts:
+ * major - no backward or forward compatibility,
+ * minor - only backwards compatible,
+ * patch - both backwards and forwards compatible.
+ *
+ * All clients of the ioctl interface should fill in the
+ * version number of the interface that they were
+ * compiled with.
+ *
+ * All recognised ioctl commands (ie. those that don't
+ * return -ENOTTY) fill out this field, even if the
+ * command failed.
+ */
+ __u32 version[3]; /* in/out */
+ __u32 data_size; /* total size of data passed in
+ * including this struct */
+
+ __u32 data_start; /* offset to start of data
+ * relative to start of this struct */
+
+ __u32 target_count; /* in/out */
+ __s32 open_count; /* out */
+ __u32 flags; /* in/out */
+
+ /*
+ * event_nr holds either the event number (input and output) or the
+ * udev cookie value (input only).
+ * The DM_DEV_WAIT ioctl takes an event number as input.
+ * The DM_SUSPEND, DM_DEV_REMOVE and DM_DEV_RENAME ioctls
+ * use the field as a cookie to return in the DM_COOKIE
+ * variable with the uevents they issue.
+ * For output, the ioctls return the event number, not the cookie.
+ */
+ __u32 event_nr; /* in/out */
+ __u32 padding;
+
+ __u64 dev; /* in/out */
+
+ char name[DM_NAME_LEN]; /* device name */
+ char uuid[DM_UUID_LEN]; /* unique identifier for
+ * the block device */
+ char data[7]; /* padding or data */
+};
+
+/*
+ * Used to specify tables. These structures appear after the
+ * dm_ioctl.
+ */
+struct dm_target_spec {
+ __u64 sector_start;
+ __u64 length;
+ __s32 status; /* used when reading from kernel only */
+
+ /*
+ * Location of the next dm_target_spec.
+ * - When specifying targets on a DM_TABLE_LOAD command, this value is
+ * the number of bytes from the start of the "current" dm_target_spec
+ * to the start of the "next" dm_target_spec.
+ * - When retrieving targets on a DM_TABLE_STATUS command, this value
+ * is the number of bytes from the start of the first dm_target_spec
+ * (that follows the dm_ioctl struct) to the start of the "next"
+ * dm_target_spec.
+ */
+ __u32 next;
+
+ char target_type[DM_MAX_TYPE_NAME];
+
+ /*
+ * Parameter string starts immediately after this object.
+ * Be careful to add padding after string to ensure correct
+ * alignment of subsequent dm_target_spec.
+ */
+};
+
+/*
+ * Used to retrieve the target dependencies.
+ */
+struct dm_target_deps {
+ __u32 count; /* Array size */
+ __u32 padding; /* unused */
+ __u64 dev[]; /* out */
+};
+
+/*
+ * Used to get a list of all dm devices.
+ */
+struct dm_name_list {
+ __u64 dev;
+ __u32 next; /* offset to the next record from
+ the _start_ of this */
+ char name[];
+
+ /*
+ * The following members can be accessed by taking a pointer that
+ * points immediately after the terminating zero character in "name"
+ * and aligning this pointer to next 8-byte boundary.
+ * Uuid is present if the flag DM_NAME_LIST_FLAG_HAS_UUID is set.
+ *
+ * __u32 event_nr;
+ * __u32 flags;
+ * char uuid[0];
+ */
+};
+
+#define DM_NAME_LIST_FLAG_HAS_UUID 1
+#define DM_NAME_LIST_FLAG_DOESNT_HAVE_UUID 2
+
+/*
+ * Used to retrieve the target versions
+ */
+struct dm_target_versions {
+ __u32 next;
+ __u32 version[3];
+
+ char name[];
+};
+
+/*
+ * Used to pass message to a target
+ */
+struct dm_target_msg {
+ __u64 sector; /* Device sector */
+
+ char message[];
+};
+
+/*
+ * If you change this make sure you make the corresponding change
+ * to dm-ioctl.c:lookup_ioctl()
+ */
+enum {
+ /* Top level cmds */
+ DM_VERSION_CMD = 0,
+ DM_REMOVE_ALL_CMD,
+ DM_LIST_DEVICES_CMD,
+
+ /* device level cmds */
+ DM_DEV_CREATE_CMD,
+ DM_DEV_REMOVE_CMD,
+ DM_DEV_RENAME_CMD,
+ DM_DEV_SUSPEND_CMD,
+ DM_DEV_STATUS_CMD,
+ DM_DEV_WAIT_CMD,
+
+ /* Table level cmds */
+ DM_TABLE_LOAD_CMD,
+ DM_TABLE_CLEAR_CMD,
+ DM_TABLE_DEPS_CMD,
+ DM_TABLE_STATUS_CMD,
+
+ /* Added later */
+ DM_LIST_VERSIONS_CMD,
+ DM_TARGET_MSG_CMD,
+ DM_DEV_SET_GEOMETRY_CMD,
+ DM_DEV_ARM_POLL_CMD,
+ DM_GET_TARGET_VERSION_CMD,
+};
+
+#define DM_IOCTL 0xfd
+
+#define DM_VERSION _IOWR(DM_IOCTL, DM_VERSION_CMD, struct dm_ioctl)
+#define DM_REMOVE_ALL _IOWR(DM_IOCTL, DM_REMOVE_ALL_CMD, struct dm_ioctl)
+#define DM_LIST_DEVICES _IOWR(DM_IOCTL, DM_LIST_DEVICES_CMD, struct dm_ioctl)
+
+#define DM_DEV_CREATE _IOWR(DM_IOCTL, DM_DEV_CREATE_CMD, struct dm_ioctl)
+#define DM_DEV_REMOVE _IOWR(DM_IOCTL, DM_DEV_REMOVE_CMD, struct dm_ioctl)
+#define DM_DEV_RENAME _IOWR(DM_IOCTL, DM_DEV_RENAME_CMD, struct dm_ioctl)
+#define DM_DEV_SUSPEND _IOWR(DM_IOCTL, DM_DEV_SUSPEND_CMD, struct dm_ioctl)
+#define DM_DEV_STATUS _IOWR(DM_IOCTL, DM_DEV_STATUS_CMD, struct dm_ioctl)
+#define DM_DEV_WAIT _IOWR(DM_IOCTL, DM_DEV_WAIT_CMD, struct dm_ioctl)
+#define DM_DEV_ARM_POLL _IOWR(DM_IOCTL, DM_DEV_ARM_POLL_CMD, struct dm_ioctl)
+
+#define DM_TABLE_LOAD _IOWR(DM_IOCTL, DM_TABLE_LOAD_CMD, struct dm_ioctl)
+#define DM_TABLE_CLEAR _IOWR(DM_IOCTL, DM_TABLE_CLEAR_CMD, struct dm_ioctl)
+#define DM_TABLE_DEPS _IOWR(DM_IOCTL, DM_TABLE_DEPS_CMD, struct dm_ioctl)
+#define DM_TABLE_STATUS _IOWR(DM_IOCTL, DM_TABLE_STATUS_CMD, struct dm_ioctl)
+
+#define DM_LIST_VERSIONS _IOWR(DM_IOCTL, DM_LIST_VERSIONS_CMD, struct dm_ioctl)
+#define DM_GET_TARGET_VERSION _IOWR(DM_IOCTL, DM_GET_TARGET_VERSION_CMD, struct dm_ioctl)
+
+#define DM_TARGET_MSG _IOWR(DM_IOCTL, DM_TARGET_MSG_CMD, struct dm_ioctl)
+#define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
+
+#define DM_VERSION_MAJOR 4
+#define DM_VERSION_MINOR 27
+#define DM_VERSION_PATCHLEVEL 0
+#define DM_VERSION_EXTRA "-ioctl (2022-02-22)"
+
+/* Status bits */
+#define DM_READONLY_FLAG (1 << 0) /* In/Out */
+#define DM_SUSPEND_FLAG (1 << 1) /* In/Out */
+#define DM_PERSISTENT_DEV_FLAG (1 << 3) /* In */
+
+/*
+ * Flag passed into ioctl STATUS command to get table information
+ * rather than current status.
+ */
+#define DM_STATUS_TABLE_FLAG (1 << 4) /* In */
+
+/*
+ * Flags that indicate whether a table is present in either of
+ * the two table slots that a device has.
+ */
+#define DM_ACTIVE_PRESENT_FLAG (1 << 5) /* Out */
+#define DM_INACTIVE_PRESENT_FLAG (1 << 6) /* Out */
+
+/*
+ * Indicates that the buffer passed in wasn't big enough for the
+ * results.
+ */
+#define DM_BUFFER_FULL_FLAG (1 << 8) /* Out */
+
+/*
+ * This flag is now ignored.
+ */
+#define DM_SKIP_BDGET_FLAG (1 << 9) /* In */
+
+/*
+ * Set this to avoid attempting to freeze any filesystem when suspending.
+ */
+#define DM_SKIP_LOCKFS_FLAG (1 << 10) /* In */
+
+/*
+ * Set this to suspend without flushing queued ios.
+ * Also disables flushing uncommitted changes in the thin target before
+ * generating statistics for DM_TABLE_STATUS and DM_DEV_WAIT.
+ */
+#define DM_NOFLUSH_FLAG (1 << 11) /* In */
+
+/*
+ * If set, any table information returned will relate to the inactive
+ * table instead of the live one. Always check DM_INACTIVE_PRESENT_FLAG
+ * is set before using the data returned.
+ */
+#define DM_QUERY_INACTIVE_TABLE_FLAG (1 << 12) /* In */
+
+/*
+ * If set, a uevent was generated for which the caller may need to wait.
+ */
+#define DM_UEVENT_GENERATED_FLAG (1 << 13) /* Out */
+
+/*
+ * If set, rename changes the uuid not the name. Only permitted
+ * if no uuid was previously supplied: an existing uuid cannot be changed.
+ */
+#define DM_UUID_FLAG (1 << 14) /* In */
+
+/*
+ * If set, all buffers are wiped after use. Use when sending
+ * or requesting sensitive data such as an encryption key.
+ */
+#define DM_SECURE_DATA_FLAG (1 << 15) /* In */
+
+/*
+ * If set, a message generated output data.
+ */
+#define DM_DATA_OUT_FLAG (1 << 16) /* Out */
+
+/*
+ * If set with DM_DEV_REMOVE or DM_REMOVE_ALL this indicates that if
+ * the device cannot be removed immediately because it is still in use
+ * it should instead be scheduled for removal when it gets closed.
+ *
+ * On return from DM_DEV_REMOVE, DM_DEV_STATUS or other ioctls, this
+ * flag indicates that the device is scheduled to be removed when it
+ * gets closed.
+ */
+#define DM_DEFERRED_REMOVE (1 << 17) /* In/Out */
+
+/*
+ * If set, the device is suspended internally.
+ */
+#define DM_INTERNAL_SUSPEND_FLAG (1 << 18) /* Out */
+
+/*
+ * If set, returns, in the input buffer passed by user mode, the raw table
+ * information that would be measured by the IMA subsystem on device state change.
+ */
+#define DM_IMA_MEASUREMENT_FLAG (1 << 19) /* In */
+
+#endif /* _LINUX_DM_IOCTL_V4_H */
diff --git a/src/shared/linux/ethtool.h b/src/shared/linux/ethtool.h
new file mode 100644
index 0000000..3d1da51
--- /dev/null
+++ b/src/shared/linux/ethtool.h
@@ -0,0 +1,2164 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * ethtool.h: Defines for Linux ethtool.
+ *
+ * Copyright (C) 1998 David S. Miller (davem@redhat.com)
+ * Copyright 2001 Jeff Garzik <jgarzik@pobox.com>
+ * Portions Copyright 2001 Sun Microsystems (thockin@sun.com)
+ * Portions Copyright 2002 Intel (eli.kupermann@intel.com,
+ * christopher.leech@intel.com,
+ * scott.feldman@intel.com)
+ * Portions Copyright (C) Sun Microsystems 2008
+ */
+
+#ifndef _LINUX_ETHTOOL_H
+#define _LINUX_ETHTOOL_H
+
+#include <linux/const.h>
+#include <linux/types.h>
+#include <linux/if_ether.h>
+
+#include <limits.h> /* for INT_MAX */
+
+#ifndef __KERNEL_DIV_ROUND_UP
+#define __KERNEL_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
+#endif
+
+/* All structures exposed to userland should be defined such that they
+ * have the same layout for 32-bit and 64-bit userland.
+ */
+
+/* Note on reserved space.
+ * Reserved fields must not be accessed directly by user space because
+ * they may be replaced by a different field in the future. They must
+ * be initialized to zero before making the request, e.g. via memset
+ * of the entire structure or implicitly by not being set in a structure
+ * initializer.
+ */
+
+/**
+ * struct ethtool_cmd - DEPRECATED, link control and status
+ * This structure is DEPRECATED, please use struct ethtool_link_settings.
+ * @cmd: Command number = %ETHTOOL_GSET or %ETHTOOL_SSET
+ * @supported: Bitmask of %SUPPORTED_* flags for the link modes,
+ * physical connectors and other link features for which the
+ * interface supports autonegotiation or auto-detection.
+ * Read-only.
+ * @advertising: Bitmask of %ADVERTISED_* flags for the link modes,
+ * physical connectors and other link features that are
+ * advertised through autonegotiation or enabled for
+ * auto-detection.
+ * @speed: Low bits of the speed, 1Mb units, 0 to INT_MAX or SPEED_UNKNOWN
+ * @duplex: Duplex mode; one of %DUPLEX_*
+ * @port: Physical connector type; one of %PORT_*
+ * @phy_address: MDIO address of PHY (transceiver); 0 or 255 if not
+ * applicable. For clause 45 PHYs this is the PRTAD.
+ * @transceiver: Historically used to distinguish different possible
+ * PHY types, but not in a consistent way. Deprecated.
+ * @autoneg: Enable/disable autonegotiation and auto-detection;
+ * either %AUTONEG_DISABLE or %AUTONEG_ENABLE
+ * @mdio_support: Bitmask of %ETH_MDIO_SUPPORTS_* flags for the MDIO
+ * protocols supported by the interface; 0 if unknown.
+ * Read-only.
+ * @maxtxpkt: Historically used to report TX IRQ coalescing; now
+ * obsoleted by &struct ethtool_coalesce. Read-only; deprecated.
+ * @maxrxpkt: Historically used to report RX IRQ coalescing; now
+ * obsoleted by &struct ethtool_coalesce. Read-only; deprecated.
+ * @speed_hi: High bits of the speed, 1Mb units, 0 to INT_MAX or SPEED_UNKNOWN
+ * @eth_tp_mdix: Ethernet twisted-pair MDI(-X) status; one of
+ * %ETH_TP_MDI_*. If the status is unknown or not applicable, the
+ * value will be %ETH_TP_MDI_INVALID. Read-only.
+ * @eth_tp_mdix_ctrl: Ethernet twisted pair MDI(-X) control; one of
+ * %ETH_TP_MDI_*. If MDI(-X) control is not implemented, reads
+ * yield %ETH_TP_MDI_INVALID and writes may be ignored or rejected.
+ * When written successfully, the link should be renegotiated if
+ * necessary.
+ * @lp_advertising: Bitmask of %ADVERTISED_* flags for the link modes
+ * and other link features that the link partner advertised
+ * through autonegotiation; 0 if unknown or not applicable.
+ * Read-only.
+ * @reserved: Reserved for future use; see the note on reserved space.
+ *
+ * The link speed in Mbps is split between @speed and @speed_hi. Use
+ * the ethtool_cmd_speed() and ethtool_cmd_speed_set() functions to
+ * access it.
+ *
+ * If autonegotiation is disabled, the speed and @duplex represent the
+ * fixed link mode and are writable if the driver supports multiple
+ * link modes. If it is enabled then they are read-only; if the link
+ * is up they represent the negotiated link mode; if the link is down,
+ * the speed is 0, %SPEED_UNKNOWN or the highest enabled speed and
+ * @duplex is %DUPLEX_UNKNOWN or the best enabled duplex mode.
+ *
+ * Some hardware interfaces may have multiple PHYs and/or physical
+ * connectors fitted or do not allow the driver to detect which are
+ * fitted. For these interfaces @port and/or @phy_address may be
+ * writable, possibly dependent on @autoneg being %AUTONEG_DISABLE.
+ * Otherwise, attempts to write different values may be ignored or
+ * rejected.
+ *
+ * Users should assume that all fields not marked read-only are
+ * writable and subject to validation by the driver. They should use
+ * %ETHTOOL_GSET to get the current values before making specific
+ * changes and then applying them with %ETHTOOL_SSET.
+ *
+ * Deprecated fields should be ignored by both users and drivers.
+ */
+struct ethtool_cmd {
+ __u32 cmd;
+ __u32 supported;
+ __u32 advertising;
+ __u16 speed;
+ __u8 duplex;
+ __u8 port;
+ __u8 phy_address;
+ __u8 transceiver;
+ __u8 autoneg;
+ __u8 mdio_support;
+ __u32 maxtxpkt;
+ __u32 maxrxpkt;
+ __u16 speed_hi;
+ __u8 eth_tp_mdix;
+ __u8 eth_tp_mdix_ctrl;
+ __u32 lp_advertising;
+ __u32 reserved[2];
+};
+
+static __inline__ void ethtool_cmd_speed_set(struct ethtool_cmd *ep,
+ __u32 speed) /* stores a 32-bit link speed into ep, split across two 16-bit fields */
+{
+ ep->speed = (__u16)(speed & 0xFFFF); /* low 16 bits of the speed */
+ ep->speed_hi = (__u16)(speed >> 16); /* high 16 bits of the speed */
+}
+
+static __inline__ __u32 ethtool_cmd_speed(const struct ethtool_cmd *ep)
+{
+ return ((__u32) ep->speed_hi << 16) | (__u32) ep->speed; /* recombine the high and low halves; both are widened to 32 bits first */
+}
+
+/* Device supports clause 22 register access to PHY or peripherals
+ * using the interface defined in <linux/mii.h>. This should not be
+ * set if there are known to be no such peripherals present or if
+ * the driver only emulates clause 22 registers for compatibility.
+ */
+#define ETH_MDIO_SUPPORTS_C22 1
+
+/* Device supports clause 45 register access to PHY or peripherals
+ * using the interface defined in <linux/mii.h> and <linux/mdio.h>.
+ * This should not be set if there are known to be no such peripherals
+ * present.
+ */
+#define ETH_MDIO_SUPPORTS_C45 2
+
+#define ETHTOOL_FWVERS_LEN 32
+#define ETHTOOL_BUSINFO_LEN 32
+#define ETHTOOL_EROMVERS_LEN 32
+
+/**
+ * struct ethtool_drvinfo - general driver and device information
+ * @cmd: Command number = %ETHTOOL_GDRVINFO
+ * @driver: Driver short name. This should normally match the name
+ * in its bus driver structure (e.g. pci_driver::name). Must
+ * not be an empty string.
+ * @version: Driver version string; may be an empty string
+ * @fw_version: Firmware version string; driver defined; may be an
+ * empty string
+ * @erom_version: Expansion ROM version string; driver defined; may be
+ * an empty string
+ * @bus_info: Device bus address. This should match the dev_name()
+ * string for the underlying bus device, if there is one. May be
+ * an empty string.
+ * @reserved2: Reserved for future use; see the note on reserved space.
+ * @n_priv_flags: Number of flags valid for %ETHTOOL_GPFLAGS and
+ * %ETHTOOL_SPFLAGS commands; also the number of strings in the
+ * %ETH_SS_PRIV_FLAGS set
+ * @n_stats: Number of u64 statistics returned by the %ETHTOOL_GSTATS
+ * command; also the number of strings in the %ETH_SS_STATS set
+ * @testinfo_len: Number of results returned by the %ETHTOOL_TEST
+ * command; also the number of strings in the %ETH_SS_TEST set
+ * @eedump_len: Size of EEPROM accessible through the %ETHTOOL_GEEPROM
+ * and %ETHTOOL_SEEPROM commands, in bytes
+ * @regdump_len: Size of register dump returned by the %ETHTOOL_GREGS
+ * command, in bytes
+ *
+ * Users can use the %ETHTOOL_GSSET_INFO command to get the number of
+ * strings in any string set (from Linux 2.6.34).
+ */
+struct ethtool_drvinfo {
+ __u32 cmd;
+ char driver[32];
+ char version[32];
+ char fw_version[ETHTOOL_FWVERS_LEN];
+ char bus_info[ETHTOOL_BUSINFO_LEN];
+ char erom_version[ETHTOOL_EROMVERS_LEN];
+ char reserved2[12];
+ __u32 n_priv_flags;
+ __u32 n_stats;
+ __u32 testinfo_len;
+ __u32 eedump_len;
+ __u32 regdump_len;
+};
+
+#define SOPASS_MAX 6
+
+/**
+ * struct ethtool_wolinfo - Wake-On-Lan configuration
+ * @cmd: Command number = %ETHTOOL_GWOL or %ETHTOOL_SWOL
+ * @supported: Bitmask of %WAKE_* flags for supported Wake-On-Lan modes.
+ * Read-only.
+ * @wolopts: Bitmask of %WAKE_* flags for enabled Wake-On-Lan modes.
+ * @sopass: SecureOn(tm) password; meaningful only if %WAKE_MAGICSECURE
+ * is set in @wolopts.
+ */
+struct ethtool_wolinfo {
+ __u32 cmd;
+ __u32 supported;
+ __u32 wolopts;
+ __u8 sopass[SOPASS_MAX];
+};
+
+/* for passing single values */
+struct ethtool_value {
+ __u32 cmd;
+ __u32 data;
+};
+
+#define PFC_STORM_PREVENTION_AUTO 0xffff
+#define PFC_STORM_PREVENTION_DISABLE 0
+
+enum tunable_id {
+ ETHTOOL_ID_UNSPEC,
+ ETHTOOL_RX_COPYBREAK,
+ ETHTOOL_TX_COPYBREAK,
+ ETHTOOL_PFC_PREVENTION_TOUT, /* timeout in msecs */
+ ETHTOOL_TX_COPYBREAK_BUF_SIZE,
+ /*
+ * Add your fresh new tunable attribute above and remember to update
+ * tunable_strings[] in net/ethtool/common.c
+ */
+ __ETHTOOL_TUNABLE_COUNT,
+};
+
+enum tunable_type_id {
+ ETHTOOL_TUNABLE_UNSPEC,
+ ETHTOOL_TUNABLE_U8,
+ ETHTOOL_TUNABLE_U16,
+ ETHTOOL_TUNABLE_U32,
+ ETHTOOL_TUNABLE_U64,
+ ETHTOOL_TUNABLE_STRING,
+ ETHTOOL_TUNABLE_S8,
+ ETHTOOL_TUNABLE_S16,
+ ETHTOOL_TUNABLE_S32,
+ ETHTOOL_TUNABLE_S64,
+};
+
+struct ethtool_tunable {
+ __u32 cmd;
+ __u32 id;
+ __u32 type_id;
+ __u32 len;
+ void *data[];
+};
+
+#define DOWNSHIFT_DEV_DEFAULT_COUNT 0xff
+#define DOWNSHIFT_DEV_DISABLE 0
+
+/* Time in msecs after which link is reported as down
+ * 0 = lowest time supported by the PHY
+ * 0xff = off, link down detection according to standard
+ */
+#define ETHTOOL_PHY_FAST_LINK_DOWN_ON 0
+#define ETHTOOL_PHY_FAST_LINK_DOWN_OFF 0xff
+
+/* Energy Detect Power Down (EDPD) is a feature supported by some PHYs, where
+ * the PHY's RX & TX blocks are put into a low-power mode when there is no
+ * link detected (typically cable is un-plugged). For RX, only a minimal
+ * link-detection is available, and for TX the PHY wakes up to send link pulses
+ * to avoid any lock-ups in case the peer PHY may also be running in EDPD mode.
+ *
+ * Some PHYs may support configuration of the wake-up interval for TX pulses,
+ * and some PHYs may support only disabling TX pulses entirely. For the latter
+ * a special value is required (ETHTOOL_PHY_EDPD_NO_TX) so that this can be
+ * configured from userspace (should the user want it).
+ *
+ * The interval units for TX wake-up are in milliseconds, since this should
+ * cover a reasonable range of intervals:
+ * - from 1 millisecond, which does not sound like much of a power-saver
+ * - to ~65 seconds which is quite a lot to wait for a link to come up when
+ * plugging a cable
+ */
+#define ETHTOOL_PHY_EDPD_DFLT_TX_MSECS 0xffff
+#define ETHTOOL_PHY_EDPD_NO_TX 0xfffe
+#define ETHTOOL_PHY_EDPD_DISABLE 0
+
+enum phy_tunable_id {
+ ETHTOOL_PHY_ID_UNSPEC,
+ ETHTOOL_PHY_DOWNSHIFT,
+ ETHTOOL_PHY_FAST_LINK_DOWN,
+ ETHTOOL_PHY_EDPD,
+ /*
+ * Add your fresh new phy tunable attribute above and remember to update
+ * phy_tunable_strings[] in net/ethtool/common.c
+ */
+ __ETHTOOL_PHY_TUNABLE_COUNT,
+};
+
+/**
+ * struct ethtool_regs - hardware register dump
+ * @cmd: Command number = %ETHTOOL_GREGS
+ * @version: Dump format version. This is driver-specific and may
+ * distinguish different chips/revisions. Drivers must use new
+ * version numbers whenever the dump format changes in an
+ * incompatible way.
+ * @len: On entry, the real length of @data. On return, the number of
+ * bytes used.
+ * @data: Buffer for the register dump
+ *
+ * Users should use %ETHTOOL_GDRVINFO to find the maximum length of
+ * a register dump for the interface. They must allocate the buffer
+ * immediately following this structure.
+ */
+struct ethtool_regs {
+ __u32 cmd;
+ __u32 version;
+ __u32 len;
+ __u8 data[];
+};
+
+/**
+ * struct ethtool_eeprom - EEPROM dump
+ * @cmd: Command number = %ETHTOOL_GEEPROM, %ETHTOOL_GMODULEEEPROM or
+ * %ETHTOOL_SEEPROM
+ * @magic: A 'magic cookie' value to guard against accidental changes.
+ * The value passed in to %ETHTOOL_SEEPROM must match the value
+ * returned by %ETHTOOL_GEEPROM for the same device. This is
+ * unused when @cmd is %ETHTOOL_GMODULEEEPROM.
+ * @offset: Offset within the EEPROM to begin reading/writing, in bytes
+ * @len: On entry, number of bytes to read/write. On successful
+ * return, number of bytes actually read/written. In case of
+ * error, this may indicate at what point the error occurred.
+ * @data: Buffer to read/write from
+ *
+ * Users may use %ETHTOOL_GDRVINFO or %ETHTOOL_GMODULEINFO to find
+ * the length of an on-board or module EEPROM, respectively. They
+ * must allocate the buffer immediately following this structure.
+ */
+struct ethtool_eeprom {
+ __u32 cmd;
+ __u32 magic;
+ __u32 offset;
+ __u32 len;
+ __u8 data[];
+};
+
+/**
+ * struct ethtool_eee - Energy Efficient Ethernet information
+ * @cmd: ETHTOOL_{G,S}EEE
+ * @supported: Mask of %SUPPORTED_* flags for the speed/duplex combinations
+ * for which there is EEE support.
+ * @advertised: Mask of %ADVERTISED_* flags for the speed/duplex combinations
+ * advertised as eee capable.
+ * @lp_advertised: Mask of %ADVERTISED_* flags for the speed/duplex
+ * combinations advertised by the link partner as eee capable.
+ * @eee_active: Result of the eee auto negotiation.
+ * @eee_enabled: EEE configured mode (enabled/disabled).
+ * @tx_lpi_enabled: Whether the interface should assert its tx lpi, given
+ * that eee was negotiated.
+ * @tx_lpi_timer: Time in microseconds the interface delays prior to asserting
+ * its tx lpi (after reaching 'idle' state). Effective only when eee
+ * was negotiated and tx_lpi_enabled was set.
+ * @reserved: Reserved for future use; see the note on reserved space.
+ */
+struct ethtool_eee {
+ __u32 cmd;
+ __u32 supported;
+ __u32 advertised;
+ __u32 lp_advertised;
+ __u32 eee_active;
+ __u32 eee_enabled;
+ __u32 tx_lpi_enabled;
+ __u32 tx_lpi_timer;
+ __u32 reserved[2];
+};
+
+/**
+ * struct ethtool_modinfo - plugin module eeprom information
+ * @cmd: %ETHTOOL_GMODULEINFO
+ * @type: Standard the module information conforms to %ETH_MODULE_SFF_xxxx
+ * @eeprom_len: Length of the eeprom
+ * @reserved: Reserved for future use; see the note on reserved space.
+ *
+ * This structure is used to return the information to
+ * properly size memory for a subsequent call to %ETHTOOL_GMODULEEEPROM.
+ * The type code indicates the eeprom data format
+ */
+struct ethtool_modinfo {
+ __u32 cmd;
+ __u32 type;
+ __u32 eeprom_len;
+ __u32 reserved[8];
+};
+
+/**
+ * struct ethtool_coalesce - coalescing parameters for IRQs and stats updates
+ * @cmd: ETHTOOL_{G,S}COALESCE
+ * @rx_coalesce_usecs: How many usecs to delay an RX interrupt after
+ * a packet arrives.
+ * @rx_max_coalesced_frames: Maximum number of packets to receive
+ * before an RX interrupt.
+ * @rx_coalesce_usecs_irq: Same as @rx_coalesce_usecs, except that
+ * this value applies while an IRQ is being serviced by the host.
+ * @rx_max_coalesced_frames_irq: Same as @rx_max_coalesced_frames,
+ * except that this value applies while an IRQ is being serviced
+ * by the host.
+ * @tx_coalesce_usecs: How many usecs to delay a TX interrupt after
+ * a packet is sent.
+ * @tx_max_coalesced_frames: Maximum number of packets to be sent
+ * before a TX interrupt.
+ * @tx_coalesce_usecs_irq: Same as @tx_coalesce_usecs, except that
+ * this value applies while an IRQ is being serviced by the host.
+ * @tx_max_coalesced_frames_irq: Same as @tx_max_coalesced_frames,
+ * except that this value applies while an IRQ is being serviced
+ * by the host.
+ * @stats_block_coalesce_usecs: How many usecs to delay in-memory
+ * statistics block updates. Some drivers do not have an
+ * in-memory statistic block, and in such cases this value is
+ * ignored. This value must not be zero.
+ * @use_adaptive_rx_coalesce: Enable adaptive RX coalescing.
+ * @use_adaptive_tx_coalesce: Enable adaptive TX coalescing.
+ * @pkt_rate_low: Threshold for low packet rate (packets per second).
+ * @rx_coalesce_usecs_low: How many usecs to delay an RX interrupt after
+ * a packet arrives, when the packet rate is below @pkt_rate_low.
+ * @rx_max_coalesced_frames_low: Maximum number of packets to be received
+ * before an RX interrupt, when the packet rate is below @pkt_rate_low.
+ * @tx_coalesce_usecs_low: How many usecs to delay a TX interrupt after
+ * a packet is sent, when the packet rate is below @pkt_rate_low.
+ * @tx_max_coalesced_frames_low: Maximum number of packets to be sent before
+ * a TX interrupt, when the packet rate is below @pkt_rate_low.
+ * @pkt_rate_high: Threshold for high packet rate (packets per second).
+ * @rx_coalesce_usecs_high: How many usecs to delay an RX interrupt after
+ * a packet arrives, when the packet rate is above @pkt_rate_high.
+ * @rx_max_coalesced_frames_high: Maximum number of packets to be received
+ * before an RX interrupt, when the packet rate is above @pkt_rate_high.
+ * @tx_coalesce_usecs_high: How many usecs to delay a TX interrupt after
+ * a packet is sent, when the packet rate is above @pkt_rate_high.
+ * @tx_max_coalesced_frames_high: Maximum number of packets to be sent before
+ * a TX interrupt, when the packet rate is above @pkt_rate_high.
+ * @rate_sample_interval: How often to do adaptive coalescing packet rate
+ * sampling, measured in seconds. Must not be zero.
+ *
+ * Each pair of (usecs, max_frames) fields specifies that interrupts
+ * should be coalesced until
+ * (usecs > 0 && time_since_first_completion >= usecs) ||
+ * (max_frames > 0 && completed_frames >= max_frames)
+ *
+ * It is illegal to set both usecs and max_frames to zero as this
+ * would cause interrupts to never be generated. To disable
+ * coalescing, set usecs = 0 and max_frames = 1.
+ *
+ * Some implementations ignore the value of max_frames and use the
+ * condition time_since_first_completion >= usecs
+ *
+ * This is deprecated. Drivers for hardware that does not support
+ * counting completions should validate that max_frames == !rx_usecs.
+ *
+ * Adaptive RX/TX coalescing is an algorithm implemented by some
+ * drivers to improve latency under low packet rates and improve
+ * throughput under high packet rates. Some drivers only implement
+ * one of RX or TX adaptive coalescing. Anything not implemented by
+ * the driver causes these values to be silently ignored.
+ *
+ * When the packet rate is below @pkt_rate_high but above
+ * @pkt_rate_low (both measured in packets per second) the
+ * normal {rx,tx}_* coalescing parameters are used.
+ */
+struct ethtool_coalesce {
+ __u32 cmd;
+ __u32 rx_coalesce_usecs;
+ __u32 rx_max_coalesced_frames;
+ __u32 rx_coalesce_usecs_irq;
+ __u32 rx_max_coalesced_frames_irq;
+ __u32 tx_coalesce_usecs;
+ __u32 tx_max_coalesced_frames;
+ __u32 tx_coalesce_usecs_irq;
+ __u32 tx_max_coalesced_frames_irq;
+ __u32 stats_block_coalesce_usecs;
+ __u32 use_adaptive_rx_coalesce;
+ __u32 use_adaptive_tx_coalesce;
+ __u32 pkt_rate_low;
+ __u32 rx_coalesce_usecs_low;
+ __u32 rx_max_coalesced_frames_low;
+ __u32 tx_coalesce_usecs_low;
+ __u32 tx_max_coalesced_frames_low;
+ __u32 pkt_rate_high;
+ __u32 rx_coalesce_usecs_high;
+ __u32 rx_max_coalesced_frames_high;
+ __u32 tx_coalesce_usecs_high;
+ __u32 tx_max_coalesced_frames_high;
+ __u32 rate_sample_interval;
+};
+
+/**
+ * struct ethtool_ringparam - RX/TX ring parameters
+ * @cmd: Command number = %ETHTOOL_GRINGPARAM or %ETHTOOL_SRINGPARAM
+ * @rx_max_pending: Maximum supported number of pending entries per
+ * RX ring. Read-only.
+ * @rx_mini_max_pending: Maximum supported number of pending entries
+ * per RX mini ring. Read-only.
+ * @rx_jumbo_max_pending: Maximum supported number of pending entries
+ * per RX jumbo ring. Read-only.
+ * @tx_max_pending: Maximum supported number of pending entries per
+ * TX ring. Read-only.
+ * @rx_pending: Current maximum number of pending entries per RX ring
+ * @rx_mini_pending: Current maximum number of pending entries per RX
+ * mini ring
+ * @rx_jumbo_pending: Current maximum number of pending entries per RX
+ * jumbo ring
+ * @tx_pending: Current maximum supported number of pending entries
+ * per TX ring
+ *
+ * If the interface does not have separate RX mini and/or jumbo rings,
+ * @rx_mini_max_pending and/or @rx_jumbo_max_pending will be 0.
+ *
+ * There may also be driver-dependent minimum values for the number
+ * of entries per ring.
+ */
+struct ethtool_ringparam {
+ __u32 cmd;
+ __u32 rx_max_pending;
+ __u32 rx_mini_max_pending;
+ __u32 rx_jumbo_max_pending;
+ __u32 tx_max_pending;
+ __u32 rx_pending;
+ __u32 rx_mini_pending;
+ __u32 rx_jumbo_pending;
+ __u32 tx_pending;
+};
+
+/**
+ * struct ethtool_channels - configuring number of network channels
+ * @cmd: ETHTOOL_{G,S}CHANNELS
+ * @max_rx: Read only. Maximum number of receive channels the driver supports.
+ * @max_tx: Read only. Maximum number of transmit channels the driver supports.
+ * @max_other: Read only. Maximum number of other channels the driver supports.
+ * @max_combined: Read only. Maximum number of combined channels the driver
+ * supports. Set of queues RX, TX or other.
+ * @rx_count: Valid values are in the range 1 to the max_rx.
+ * @tx_count: Valid values are in the range 1 to the max_tx.
+ * @other_count: Valid values are in the range 1 to the max_other.
+ * @combined_count: Valid values are in the range 1 to the max_combined.
+ *
+ * This can be used to configure RX, TX and other channels.
+ */
+
+struct ethtool_channels {
+ __u32 cmd;
+ __u32 max_rx;
+ __u32 max_tx;
+ __u32 max_other;
+ __u32 max_combined;
+ __u32 rx_count;
+ __u32 tx_count;
+ __u32 other_count;
+ __u32 combined_count;
+};
+
+/**
+ * struct ethtool_pauseparam - Ethernet pause (flow control) parameters
+ * @cmd: Command number = %ETHTOOL_GPAUSEPARAM or %ETHTOOL_SPAUSEPARAM
+ * @autoneg: Flag to enable autonegotiation of pause frame use
+ * @rx_pause: Flag to enable reception of pause frames
+ * @tx_pause: Flag to enable transmission of pause frames
+ *
+ * Drivers should reject a non-zero setting of @autoneg when
+ * autonegotiation is disabled (or not supported) for the link.
+ *
+ * If the link is autonegotiated, drivers should use
+ * mii_advertise_flowctrl() or similar code to set the advertised
+ * pause frame capabilities based on the @rx_pause and @tx_pause flags,
+ * even if @autoneg is zero. They should also allow the advertised
+ * pause frame capabilities to be controlled directly through the
+ * advertising field of &struct ethtool_cmd.
+ *
+ * If @autoneg is non-zero, the MAC is configured to send and/or
+ * receive pause frames according to the result of autonegotiation.
+ * Otherwise, it is configured directly based on the @rx_pause and
+ * @tx_pause flags.
+ */
+struct ethtool_pauseparam {
+ __u32 cmd;
+ __u32 autoneg;
+ __u32 rx_pause;
+ __u32 tx_pause;
+};
+
+/* Link extended state. Enumerators take the implicit values 0, 1, 2, ...;
+ * a trailing comment names the companion substate enum where one is
+ * declared right after this enum.
+ */
+enum ethtool_link_ext_state {
+ ETHTOOL_LINK_EXT_STATE_AUTONEG, /* see enum ethtool_link_ext_substate_autoneg */
+ ETHTOOL_LINK_EXT_STATE_LINK_TRAINING_FAILURE, /* see enum ethtool_link_ext_substate_link_training */
+ ETHTOOL_LINK_EXT_STATE_LINK_LOGICAL_MISMATCH, /* see enum ethtool_link_ext_substate_link_logical_mismatch */
+ ETHTOOL_LINK_EXT_STATE_BAD_SIGNAL_INTEGRITY, /* see enum ethtool_link_ext_substate_bad_signal_integrity */
+ ETHTOOL_LINK_EXT_STATE_NO_CABLE,
+ ETHTOOL_LINK_EXT_STATE_CABLE_ISSUE, /* see enum ethtool_link_ext_substate_cable_issue */
+ ETHTOOL_LINK_EXT_STATE_EEPROM_ISSUE,
+ ETHTOOL_LINK_EXT_STATE_CALIBRATION_FAILURE,
+ ETHTOOL_LINK_EXT_STATE_POWER_BUDGET_EXCEEDED,
+ ETHTOOL_LINK_EXT_STATE_OVERHEAT,
+ ETHTOOL_LINK_EXT_STATE_MODULE, /* see enum ethtool_link_ext_substate_module */
+};
+
+/* More information in addition to ETHTOOL_LINK_EXT_STATE_AUTONEG. */
+enum ethtool_link_ext_substate_autoneg {
+ ETHTOOL_LINK_EXT_SUBSTATE_AN_NO_PARTNER_DETECTED = 1,
+ ETHTOOL_LINK_EXT_SUBSTATE_AN_ACK_NOT_RECEIVED,
+ ETHTOOL_LINK_EXT_SUBSTATE_AN_NEXT_PAGE_EXCHANGE_FAILED,
+ ETHTOOL_LINK_EXT_SUBSTATE_AN_NO_PARTNER_DETECTED_FORCE_MODE,
+ ETHTOOL_LINK_EXT_SUBSTATE_AN_FEC_MISMATCH_DURING_OVERRIDE,
+ ETHTOOL_LINK_EXT_SUBSTATE_AN_NO_HCD,
+};
+
+/* More information in addition to ETHTOOL_LINK_EXT_STATE_LINK_TRAINING_FAILURE.
+ */
+enum ethtool_link_ext_substate_link_training {
+ ETHTOOL_LINK_EXT_SUBSTATE_LT_KR_FRAME_LOCK_NOT_ACQUIRED = 1,
+ ETHTOOL_LINK_EXT_SUBSTATE_LT_KR_LINK_INHIBIT_TIMEOUT,
+ ETHTOOL_LINK_EXT_SUBSTATE_LT_KR_LINK_PARTNER_DID_NOT_SET_RECEIVER_READY,
+ ETHTOOL_LINK_EXT_SUBSTATE_LT_REMOTE_FAULT,
+};
+
+/* More information in addition to ETHTOOL_LINK_EXT_STATE_LINK_LOGICAL_MISMATCH.
+ */
+enum ethtool_link_ext_substate_link_logical_mismatch {
+ ETHTOOL_LINK_EXT_SUBSTATE_LLM_PCS_DID_NOT_ACQUIRE_BLOCK_LOCK = 1,
+ ETHTOOL_LINK_EXT_SUBSTATE_LLM_PCS_DID_NOT_ACQUIRE_AM_LOCK,
+ ETHTOOL_LINK_EXT_SUBSTATE_LLM_PCS_DID_NOT_GET_ALIGN_STATUS,
+ ETHTOOL_LINK_EXT_SUBSTATE_LLM_FC_FEC_IS_NOT_LOCKED,
+ ETHTOOL_LINK_EXT_SUBSTATE_LLM_RS_FEC_IS_NOT_LOCKED,
+};
+
+/* More information in addition to ETHTOOL_LINK_EXT_STATE_BAD_SIGNAL_INTEGRITY.
+ */
+enum ethtool_link_ext_substate_bad_signal_integrity {
+ ETHTOOL_LINK_EXT_SUBSTATE_BSI_LARGE_NUMBER_OF_PHYSICAL_ERRORS = 1,
+ ETHTOOL_LINK_EXT_SUBSTATE_BSI_UNSUPPORTED_RATE,
+ ETHTOOL_LINK_EXT_SUBSTATE_BSI_SERDES_REFERENCE_CLOCK_LOST,
+ ETHTOOL_LINK_EXT_SUBSTATE_BSI_SERDES_ALOS,
+};
+
+/* More information in addition to ETHTOOL_LINK_EXT_STATE_CABLE_ISSUE. */
+enum ethtool_link_ext_substate_cable_issue {
+ ETHTOOL_LINK_EXT_SUBSTATE_CI_UNSUPPORTED_CABLE = 1,
+ ETHTOOL_LINK_EXT_SUBSTATE_CI_CABLE_TEST_FAILURE,
+};
+
+/* More information in addition to ETHTOOL_LINK_EXT_STATE_MODULE. */
+enum ethtool_link_ext_substate_module {
+ ETHTOOL_LINK_EXT_SUBSTATE_MODULE_CMIS_NOT_READY = 1,
+};
+
+#define ETH_GSTRING_LEN 32
+
+/**
+ * enum ethtool_stringset - string set ID
+ * @ETH_SS_TEST: Self-test result names, for use with %ETHTOOL_TEST
+ * @ETH_SS_STATS: Statistic names, for use with %ETHTOOL_GSTATS
+ * @ETH_SS_PRIV_FLAGS: Driver private flag names, for use with
+ * %ETHTOOL_GPFLAGS and %ETHTOOL_SPFLAGS
+ * @ETH_SS_NTUPLE_FILTERS: Previously used with %ETHTOOL_GRXNTUPLE;
+ * now deprecated
+ * @ETH_SS_FEATURES: Device feature names
+ * @ETH_SS_RSS_HASH_FUNCS: RSS hash function names
+ * @ETH_SS_TUNABLES: tunable names
+ * @ETH_SS_PHY_STATS: Statistic names, for use with %ETHTOOL_GPHYSTATS
+ * @ETH_SS_PHY_TUNABLES: PHY tunable names
+ * @ETH_SS_LINK_MODES: link mode names
+ * @ETH_SS_MSG_CLASSES: debug message class names
+ * @ETH_SS_WOL_MODES: wake-on-lan modes
+ * @ETH_SS_SOF_TIMESTAMPING: SOF_TIMESTAMPING_* flags
+ * @ETH_SS_TS_TX_TYPES: timestamping Tx types
+ * @ETH_SS_TS_RX_FILTERS: timestamping Rx filters
+ * @ETH_SS_UDP_TUNNEL_TYPES: UDP tunnel types
+ * @ETH_SS_STATS_STD: standardized stats
+ * @ETH_SS_STATS_ETH_PHY: names of IEEE 802.3 PHY statistics
+ * @ETH_SS_STATS_ETH_MAC: names of IEEE 802.3 MAC statistics
+ * @ETH_SS_STATS_ETH_CTRL: names of IEEE 802.3 MAC Control statistics
+ * @ETH_SS_STATS_RMON: names of RMON statistics
+ *
+ * @ETH_SS_COUNT: number of defined string sets
+ */
+enum ethtool_stringset {
+ ETH_SS_TEST = 0,
+ ETH_SS_STATS,
+ ETH_SS_PRIV_FLAGS,
+ ETH_SS_NTUPLE_FILTERS,
+ ETH_SS_FEATURES,
+ ETH_SS_RSS_HASH_FUNCS,
+ ETH_SS_TUNABLES,
+ ETH_SS_PHY_STATS,
+ ETH_SS_PHY_TUNABLES,
+ ETH_SS_LINK_MODES,
+ ETH_SS_MSG_CLASSES,
+ ETH_SS_WOL_MODES,
+ ETH_SS_SOF_TIMESTAMPING,
+ ETH_SS_TS_TX_TYPES,
+ ETH_SS_TS_RX_FILTERS,
+ ETH_SS_UDP_TUNNEL_TYPES,
+ ETH_SS_STATS_STD,
+ ETH_SS_STATS_ETH_PHY,
+ ETH_SS_STATS_ETH_MAC,
+ ETH_SS_STATS_ETH_CTRL,
+ ETH_SS_STATS_RMON,
+
+ /* add new constants above here */
+ ETH_SS_COUNT
+};
+
+/**
+ * enum ethtool_module_power_mode_policy - plug-in module power mode policy
+ * @ETHTOOL_MODULE_POWER_MODE_POLICY_HIGH: Module is always in high power mode.
+ * @ETHTOOL_MODULE_POWER_MODE_POLICY_AUTO: Module is transitioned by the host
+ * to high power mode when the first port using it is put administratively
+ * up and to low power mode when the last port using it is put
+ * administratively down.
+ */
+enum ethtool_module_power_mode_policy {
+ ETHTOOL_MODULE_POWER_MODE_POLICY_HIGH = 1,
+ ETHTOOL_MODULE_POWER_MODE_POLICY_AUTO,
+};
+
+/**
+ * enum ethtool_module_power_mode - plug-in module power mode
+ * @ETHTOOL_MODULE_POWER_MODE_LOW: Module is in low power mode.
+ * @ETHTOOL_MODULE_POWER_MODE_HIGH: Module is in high power mode.
+ */
+enum ethtool_module_power_mode {
+ ETHTOOL_MODULE_POWER_MODE_LOW = 1,
+ ETHTOOL_MODULE_POWER_MODE_HIGH,
+};
+
+/**
+ * enum ethtool_podl_pse_admin_state - operational state of the PoDL PSE
+ * functions. IEEE 802.3-2018 30.15.1.1.2 aPoDLPSEAdminState
+ * @ETHTOOL_PODL_PSE_ADMIN_STATE_UNKNOWN: state of PoDL PSE functions is
+ * unknown
+ * @ETHTOOL_PODL_PSE_ADMIN_STATE_DISABLED: PoDL PSE functions are disabled
+ * @ETHTOOL_PODL_PSE_ADMIN_STATE_ENABLED: PoDL PSE functions are enabled
+ */
+enum ethtool_podl_pse_admin_state {
+ ETHTOOL_PODL_PSE_ADMIN_STATE_UNKNOWN = 1,
+ ETHTOOL_PODL_PSE_ADMIN_STATE_DISABLED,
+ ETHTOOL_PODL_PSE_ADMIN_STATE_ENABLED,
+};
+
+/**
+ * enum ethtool_podl_pse_pw_d_status - power detection status of the PoDL PSE.
+ * IEEE 802.3-2018 30.15.1.1.3 aPoDLPSEPowerDetectionStatus:
+ * @ETHTOOL_PODL_PSE_PW_D_STATUS_UNKNOWN: PoDL PSE power detection status is
+ * unknown
+ * @ETHTOOL_PODL_PSE_PW_D_STATUS_DISABLED: "The enumeration “disabled” is
+ * asserted true when the PoDL PSE state diagram variable mr_pse_enable is
+ * false"
+ * @ETHTOOL_PODL_PSE_PW_D_STATUS_SEARCHING: "The enumeration “searching” is
+ * asserted true when either of the PSE state diagram variables
+ * pi_detecting or pi_classifying is true."
+ * @ETHTOOL_PODL_PSE_PW_D_STATUS_DELIVERING: "The enumeration “deliveringPower”
+ * is asserted true when the PoDL PSE state diagram variable pi_powered is
+ * true."
+ * @ETHTOOL_PODL_PSE_PW_D_STATUS_SLEEP: "The enumeration “sleep” is asserted
+ * true when the PoDL PSE state diagram variable pi_sleeping is true."
+ * @ETHTOOL_PODL_PSE_PW_D_STATUS_IDLE: "The enumeration “idle” is asserted true
+ * when the logical combination of the PoDL PSE state diagram variables
+ * pi_prebiased*!pi_sleeping is true."
+ * @ETHTOOL_PODL_PSE_PW_D_STATUS_ERROR: "The enumeration “error” is asserted
+ * true when the PoDL PSE state diagram variable overload_held is true."
+ */
+enum ethtool_podl_pse_pw_d_status {
+ ETHTOOL_PODL_PSE_PW_D_STATUS_UNKNOWN = 1,
+ ETHTOOL_PODL_PSE_PW_D_STATUS_DISABLED,
+ ETHTOOL_PODL_PSE_PW_D_STATUS_SEARCHING,
+ ETHTOOL_PODL_PSE_PW_D_STATUS_DELIVERING,
+ ETHTOOL_PODL_PSE_PW_D_STATUS_SLEEP,
+ ETHTOOL_PODL_PSE_PW_D_STATUS_IDLE,
+ ETHTOOL_PODL_PSE_PW_D_STATUS_ERROR,
+};
+
+/**
+ * struct ethtool_gstrings - string set for data tagging
+ * @cmd: Command number = %ETHTOOL_GSTRINGS
+ * @string_set: String set ID; one of &enum ethtool_stringset
+ * @len: On return, the number of strings in the string set
+ * @data: Buffer for strings. Each string is null-padded to a size of
+ * %ETH_GSTRING_LEN.
+ *
+ * Users must use %ETHTOOL_GSSET_INFO to find the number of strings in
+ * the string set. They must allocate a buffer of the appropriate
+ * size immediately following this structure.
+ */
+struct ethtool_gstrings {
+ __u32 cmd;
+ __u32 string_set;
+ __u32 len;
+ __u8 data[];
+};
+
+/**
+ * struct ethtool_sset_info - string set information
+ * @cmd: Command number = %ETHTOOL_GSSET_INFO
+ * @reserved: Reserved for future use; see the note on reserved space.
+ * @sset_mask: On entry, a bitmask of string sets to query, with bits
+ * numbered according to &enum ethtool_stringset. On return, a
+ * bitmask of those string sets queried that are supported.
+ * @data: Buffer for string set sizes. On return, this contains the
+ * size of each string set that was queried and supported, in
+ * order of ID.
+ *
+ * Example: The user passes in @sset_mask = 0x7 (sets 0, 1, 2) and on
+ * return @sset_mask == 0x6 (sets 1, 2). Then @data[0] contains the
+ * size of set 1 and @data[1] contains the size of set 2.
+ *
+ * Users must allocate a buffer of the appropriate size (4 * number of
+ * sets queried) immediately following this structure.
+ */
+struct ethtool_sset_info {
+ __u32 cmd;
+ __u32 reserved;
+ __u64 sset_mask;
+ __u32 data[];
+};
+
+/**
+ * enum ethtool_test_flags - flags definition of ethtool_test
+ * @ETH_TEST_FL_OFFLINE: if set perform online and offline tests, otherwise
+ * only online tests.
+ * @ETH_TEST_FL_FAILED: Driver sets this flag if the test fails.
+ * @ETH_TEST_FL_EXTERNAL_LB: Application request to perform external loopback
+ * test.
+ * @ETH_TEST_FL_EXTERNAL_LB_DONE: Driver performed the external loopback test
+ */
+
+enum ethtool_test_flags {
+ ETH_TEST_FL_OFFLINE = (1 << 0),
+ ETH_TEST_FL_FAILED = (1 << 1),
+ ETH_TEST_FL_EXTERNAL_LB = (1 << 2),
+ ETH_TEST_FL_EXTERNAL_LB_DONE = (1 << 3),
+};
+
+/**
+ * struct ethtool_test - device self-test invocation
+ * @cmd: Command number = %ETHTOOL_TEST
+ * @flags: A bitmask of flags from &enum ethtool_test_flags. Some
+ * flags may be set by the user on entry; others may be set by
+ * the driver on return.
+ * @reserved: Reserved for future use; see the note on reserved space.
+ * @len: On return, the number of test results
+ * @data: Array of test results
+ *
+ * Users must use %ETHTOOL_GSSET_INFO or %ETHTOOL_GDRVINFO to find the
+ * number of test results that will be returned. They must allocate a
+ * buffer of the appropriate size (8 * number of results) immediately
+ * following this structure.
+ */
+struct ethtool_test {
+ __u32 cmd;
+ __u32 flags;
+ __u32 reserved;
+ __u32 len;
+ __u64 data[];
+};
+
+/**
+ * struct ethtool_stats - device-specific statistics
+ * @cmd: Command number = %ETHTOOL_GSTATS
+ * @n_stats: On return, the number of statistics
+ * @data: Array of statistics
+ *
+ * Users must use %ETHTOOL_GSSET_INFO or %ETHTOOL_GDRVINFO to find the
+ * number of statistics that will be returned. They must allocate a
+ * buffer of the appropriate size (8 * number of statistics)
+ * immediately following this structure.
+ */
+struct ethtool_stats {
+ __u32 cmd;
+ __u32 n_stats;
+ __u64 data[];
+};
+
+/**
+ * struct ethtool_perm_addr - permanent hardware address
+ * @cmd: Command number = %ETHTOOL_GPERMADDR
+ * @size: On entry, the size of the buffer. On return, the size of the
+ * address. The command fails if the buffer is too small.
+ * @data: Buffer for the address
+ *
+ * Users must allocate the buffer immediately following this structure.
+ * A buffer size of %MAX_ADDR_LEN should be sufficient for any address
+ * type.
+ */
+struct ethtool_perm_addr {
+ __u32 cmd;
+ __u32 size;
+ __u8 data[];
+};
+
+/* boolean flags controlling per-interface behavior characteristics.
+ * When reading, the flag indicates whether or not a certain behavior
+ * is enabled/present. When writing, the flag indicates whether
+ * or not the driver should turn on (set) or off (clear) a behavior.
+ *
+ * Some behaviors may be read-only (unconditionally absent or present).
+ * If such is the case, return EINVAL in the set-flags operation if the
+ * flag differs from the read-only value.
+ */
+enum ethtool_flags {
+ ETH_FLAG_TXVLAN = (1 << 7), /* TX VLAN offload enabled */
+ ETH_FLAG_RXVLAN = (1 << 8), /* RX VLAN offload enabled */
+ ETH_FLAG_LRO = (1 << 15), /* LRO is enabled */
+ ETH_FLAG_NTUPLE = (1 << 27), /* N-tuple filters enabled */
+ ETH_FLAG_RXHASH = (1 << 28),
+};
+
+/* The following structures are for supporting RX network flow
+ * classification and RX n-tuple configuration. Note, all multibyte
+ * fields, e.g., ip4src, ip4dst, psrc, pdst, spi, etc. are expected to
+ * be in network byte order.
+ */
+
+/**
+ * struct ethtool_tcpip4_spec - flow specification for TCP/IPv4 etc.
+ * @ip4src: Source host
+ * @ip4dst: Destination host
+ * @psrc: Source port
+ * @pdst: Destination port
+ * @tos: Type-of-service
+ *
+ * This can be used to specify a TCP/IPv4, UDP/IPv4 or SCTP/IPv4 flow.
+ */
+struct ethtool_tcpip4_spec {
+ __be32 ip4src;
+ __be32 ip4dst;
+ __be16 psrc;
+ __be16 pdst;
+ __u8 tos;
+};
+
+/**
+ * struct ethtool_ah_espip4_spec - flow specification for IPsec/IPv4
+ * @ip4src: Source host
+ * @ip4dst: Destination host
+ * @spi: Security parameters index
+ * @tos: Type-of-service
+ *
+ * This can be used to specify an IPsec transport or tunnel over IPv4.
+ */
+struct ethtool_ah_espip4_spec {
+ __be32 ip4src;
+ __be32 ip4dst;
+ __be32 spi;
+ __u8 tos;
+};
+
+#define ETH_RX_NFC_IP4 1
+
+/**
+ * struct ethtool_usrip4_spec - general flow specification for IPv4
+ * @ip4src: Source host
+ * @ip4dst: Destination host
+ * @l4_4_bytes: First 4 bytes of transport (layer 4) header
+ * @tos: Type-of-service
+ * @ip_ver: Value must be %ETH_RX_NFC_IP4; mask must be 0
+ * @proto: Transport protocol number; mask must be 0
+ */
+struct ethtool_usrip4_spec {
+ __be32 ip4src;
+ __be32 ip4dst;
+ __be32 l4_4_bytes;
+ __u8 tos;
+ __u8 ip_ver;
+ __u8 proto;
+};
+
+/**
+ * struct ethtool_tcpip6_spec - flow specification for TCP/IPv6 etc.
+ * @ip6src: Source host
+ * @ip6dst: Destination host
+ * @psrc: Source port
+ * @pdst: Destination port
+ * @tclass: Traffic Class
+ *
+ * This can be used to specify a TCP/IPv6, UDP/IPv6 or SCTP/IPv6 flow.
+ */
+struct ethtool_tcpip6_spec {
+ __be32 ip6src[4];
+ __be32 ip6dst[4];
+ __be16 psrc;
+ __be16 pdst;
+ __u8 tclass;
+};
+
+/**
+ * struct ethtool_ah_espip6_spec - flow specification for IPsec/IPv6
+ * @ip6src: Source host
+ * @ip6dst: Destination host
+ * @spi: Security parameters index
+ * @tclass: Traffic Class
+ *
+ * This can be used to specify an IPsec transport or tunnel over IPv6.
+ */
+struct ethtool_ah_espip6_spec {
+ __be32 ip6src[4];
+ __be32 ip6dst[4];
+ __be32 spi;
+ __u8 tclass;
+};
+
+/**
+ * struct ethtool_usrip6_spec - general flow specification for IPv6
+ * @ip6src: Source host
+ * @ip6dst: Destination host
+ * @l4_4_bytes: First 4 bytes of transport (layer 4) header
+ * @tclass: Traffic Class
+ * @l4_proto: Transport protocol number (nexthdr after any Extension Headers)
+ */
+struct ethtool_usrip6_spec {
+ __be32 ip6src[4];
+ __be32 ip6dst[4];
+ __be32 l4_4_bytes;
+ __u8 tclass;
+ __u8 l4_proto;
+};
+
+union ethtool_flow_union { /* flow match fields; used as h_u (values) and m_u (masks) in struct ethtool_rx_flow_spec */
+ struct ethtool_tcpip4_spec tcp_ip4_spec; /* TCP/IPv4 flow */
+ struct ethtool_tcpip4_spec udp_ip4_spec; /* UDP/IPv4 flow */
+ struct ethtool_tcpip4_spec sctp_ip4_spec; /* SCTP/IPv4 flow */
+ struct ethtool_ah_espip4_spec ah_ip4_spec; /* IPsec/IPv4 flow (AH variant, going by the member name) */
+ struct ethtool_ah_espip4_spec esp_ip4_spec; /* IPsec/IPv4 flow (ESP variant, going by the member name) */
+ struct ethtool_usrip4_spec usr_ip4_spec; /* general IPv4 flow */
+ struct ethtool_tcpip6_spec tcp_ip6_spec; /* TCP/IPv6 flow */
+ struct ethtool_tcpip6_spec udp_ip6_spec; /* UDP/IPv6 flow */
+ struct ethtool_tcpip6_spec sctp_ip6_spec; /* SCTP/IPv6 flow */
+ struct ethtool_ah_espip6_spec ah_ip6_spec; /* IPsec/IPv6 flow (AH variant, going by the member name) */
+ struct ethtool_ah_espip6_spec esp_ip6_spec; /* IPsec/IPv6 flow (ESP variant, going by the member name) */
+ struct ethtool_usrip6_spec usr_ip6_spec; /* general IPv6 flow */
+ struct ethhdr ether_spec; /* struct ethhdr is declared outside this header section */
+ __u8 hdata[52]; /* raw 52-byte view overlaying the spec members above */
+};
+
+/**
+ * struct ethtool_flow_ext - additional RX flow fields
+ * @h_dest: destination MAC address
+ * @vlan_etype: VLAN EtherType
+ * @vlan_tci: VLAN tag control information
+ * @data: user defined data
+ * @padding: Reserved for future use; see the note on reserved space.
+ *
+ * Note, @vlan_etype, @vlan_tci, and @data are only valid if %FLOW_EXT
+ * is set in &struct ethtool_rx_flow_spec @flow_type.
+ * @h_dest is valid if %FLOW_MAC_EXT is set.
+ */
+struct ethtool_flow_ext {
+ __u8 padding[2];
+ unsigned char h_dest[ETH_ALEN];
+ __be16 vlan_etype;
+ __be16 vlan_tci;
+ __be32 data[2];
+};
+
+/**
+ * struct ethtool_rx_flow_spec - classification rule for RX flows
+ * @flow_type: Type of match to perform, e.g. %TCP_V4_FLOW
+ * @h_u: Flow fields to match (dependent on @flow_type)
+ * @h_ext: Additional fields to match
+ * @m_u: Masks for flow field bits to be matched
+ * @m_ext: Masks for additional field bits to be matched
+ * Note, all additional fields must be ignored unless @flow_type
+ * includes the %FLOW_EXT or %FLOW_MAC_EXT flag
+ * (see &struct ethtool_flow_ext description).
+ * @ring_cookie: RX ring/queue index to deliver to, or %RX_CLS_FLOW_DISC
+ * if packets should be discarded, or %RX_CLS_FLOW_WAKE if the
+ * packets should be used for Wake-on-LAN with %WAKE_FILTER
+ * @location: Location of rule in the table. Locations must be
+ * numbered such that a flow matching multiple rules will be
+ * classified according to the first (lowest numbered) rule.
+ */
+struct ethtool_rx_flow_spec {
+ __u32 flow_type;
+ union ethtool_flow_union h_u;
+ struct ethtool_flow_ext h_ext;
+ union ethtool_flow_union m_u;
+ struct ethtool_flow_ext m_ext;
+ __u64 ring_cookie;
+ __u32 location;
+};
+
+/* How rings are laid out when accessing virtual functions or
+ * offloaded queues is device specific. To allow users to do flow
+ * steering and specify these queues the ring cookie is partitioned
+ * into a 32-bit queue index with an 8-bit virtual function id.
+ * This also leaves 3 bytes for further specifiers. It is possible
+ * future devices may support more than 256 virtual functions if
+ * devices start supporting PCIe w/ARI. However at the moment I
+ * do not know of any devices that support this so I do not reserve
+ * space for this at this time. If a future patch consumes the next
+ * byte it should be aware of this possibility.
+ */
+#define ETHTOOL_RX_FLOW_SPEC_RING 0x00000000FFFFFFFFLL
+#define ETHTOOL_RX_FLOW_SPEC_RING_VF 0x000000FF00000000LL
+#define ETHTOOL_RX_FLOW_SPEC_RING_VF_OFF 32
+/* Return the ring/queue index part of @ring_cookie (the bits selected by
+ * ETHTOOL_RX_FLOW_SPEC_RING, i.e. the low 32 bits).
+ */
+static __inline__ __u64 ethtool_get_flow_spec_ring(__u64 ring_cookie)
+{
+ __u64 queue_index = ring_cookie & ETHTOOL_RX_FLOW_SPEC_RING;
+
+ return queue_index;
+}
+
+/* Return the virtual function id part of @ring_cookie: the bits selected by
+ * ETHTOOL_RX_FLOW_SPEC_RING_VF, shifted down by
+ * ETHTOOL_RX_FLOW_SPEC_RING_VF_OFF so the id starts at bit 0.
+ */
+static __inline__ __u64 ethtool_get_flow_spec_ring_vf(__u64 ring_cookie)
+{
+ __u64 vf_bits = ring_cookie & ETHTOOL_RX_FLOW_SPEC_RING_VF;
+
+ return vf_bits >> ETHTOOL_RX_FLOW_SPEC_RING_VF_OFF;
+}
+
+/**
+ * struct ethtool_rxnfc - command to get or set RX flow classification rules
+ * @cmd: Specific command number - %ETHTOOL_GRXFH, %ETHTOOL_SRXFH,
+ * %ETHTOOL_GRXRINGS, %ETHTOOL_GRXCLSRLCNT, %ETHTOOL_GRXCLSRULE,
+ * %ETHTOOL_GRXCLSRLALL, %ETHTOOL_SRXCLSRLDEL or %ETHTOOL_SRXCLSRLINS
+ * @flow_type: Type of flow to be affected, e.g. %TCP_V4_FLOW
+ * @data: Command-dependent value
+ * @fs: Flow classification rule
+ * @rss_context: RSS context to be affected
+ * @rule_cnt: Number of rules to be affected
+ * @rule_locs: Array of used rule locations
+ *
+ * For %ETHTOOL_GRXFH and %ETHTOOL_SRXFH, @data is a bitmask indicating
+ * the fields included in the flow hash, e.g. %RXH_IP_SRC. The following
+ * structure fields must not be used, except that if @flow_type includes
+ * the %FLOW_RSS flag, then @rss_context determines which RSS context to
+ * act on.
+ *
+ * For %ETHTOOL_GRXRINGS, @data is set to the number of RX rings/queues
+ * on return.
+ *
+ * For %ETHTOOL_GRXCLSRLCNT, @rule_cnt is set to the number of defined
+ * rules on return. If @data is non-zero on return then it is the
+ * size of the rule table, plus the flag %RX_CLS_LOC_SPECIAL if the
+ * driver supports any special location values. If that flag is not
+ * set in @data then special location values should not be used.
+ *
+ * For %ETHTOOL_GRXCLSRULE, @fs.@location specifies the location of an
+ * existing rule on entry and @fs contains the rule on return; if
+ * @fs.@flow_type includes the %FLOW_RSS flag, then @rss_context is
+ * filled with the RSS context ID associated with the rule.
+ *
+ * For %ETHTOOL_GRXCLSRLALL, @rule_cnt specifies the array size of the
+ * user buffer for @rule_locs on entry. On return, @data is the size
+ * of the rule table, @rule_cnt is the number of defined rules, and
+ * @rule_locs contains the locations of the defined rules. Drivers
+ * must use the second parameter to get_rxnfc() instead of @rule_locs.
+ *
+ * For %ETHTOOL_SRXCLSRLINS, @fs specifies the rule to add or update.
+ * @fs.@location either specifies the location to use or is a special
+ * location value with %RX_CLS_LOC_SPECIAL flag set. On return,
+ * @fs.@location is the actual rule location. If @fs.@flow_type
+ * includes the %FLOW_RSS flag, @rss_context is the RSS context ID to
+ * use for flow spreading traffic which matches this rule. The value
+ * from the rxfh indirection table will be added to @fs.@ring_cookie
+ * to choose which ring to deliver to.
+ *
+ * For %ETHTOOL_SRXCLSRLDEL, @fs.@location specifies the location of an
+ * existing rule on entry.
+ *
+ * A driver supporting the special location values for
+ * %ETHTOOL_SRXCLSRLINS may add the rule at any suitable unused
+ * location, and may remove a rule at a later location (lower
+ * priority) that matches exactly the same set of flows. The special
+ * values are %RX_CLS_LOC_ANY, selecting any location;
+ * %RX_CLS_LOC_FIRST, selecting the first suitable location (maximum
+ * priority); and %RX_CLS_LOC_LAST, selecting the last suitable
+ * location (minimum priority). Additional special values may be
+ * defined in future and drivers must return -%EINVAL for any
+ * unrecognised value.
+ */
+struct ethtool_rxnfc {
+ __u32 cmd; /* selects the operation, e.g. %ETHTOOL_GRXFH or %ETHTOOL_SRXCLSRLINS */
+ __u32 flow_type; /* type of flow to be affected, e.g. %TCP_V4_FLOW */
+ __u64 data; /* command-dependent value */
+ struct ethtool_rx_flow_spec fs; /* flow classification rule */
+ union { /* both members share one 32-bit slot; the command decides which applies */
+ __u32 rule_cnt; /* number of rules to be affected */
+ __u32 rss_context; /* RSS context to be affected */
+ };
+ /* Flexible array member rather than a GNU zero-length array, so it
+ * matches the other trailing buffers in this header (data[],
+ * ring_index[], rss_config[], features[]). Size and layout of the
+ * structure are unchanged; the caller allocates the rule locations
+ * directly after it.
+ */
+ __u32 rule_locs[];
+};
+
+
+/**
+ * struct ethtool_rxfh_indir - command to get or set RX flow hash indirection
+ * @cmd: Specific command number - %ETHTOOL_GRXFHINDIR or %ETHTOOL_SRXFHINDIR
+ * @size: On entry, the array size of the user buffer, which may be zero.
+ * On return from %ETHTOOL_GRXFHINDIR, the array size of the hardware
+ * indirection table.
+ * @ring_index: RX ring/queue index for each hash value
+ *
+ * For %ETHTOOL_GRXFHINDIR, a @size of zero means that only the size
+ * should be returned. For %ETHTOOL_SRXFHINDIR, a @size of zero means
+ * the table should be reset to default values. This last feature
+ * is not supported by the original implementations.
+ */
+struct ethtool_rxfh_indir {
+ __u32 cmd;
+ __u32 size;
+ __u32 ring_index[];
+};
+
+/**
+ * struct ethtool_rxfh - command to get/set RX flow hash indir or/and hash key.
+ * @cmd: Specific command number - %ETHTOOL_GRSSH or %ETHTOOL_SRSSH
+ * @rss_context: RSS context identifier. Context 0 is the default for normal
+ * traffic; other contexts can be referenced as the destination for RX flow
+ * classification rules. %ETH_RXFH_CONTEXT_ALLOC is used with command
+ * %ETHTOOL_SRSSH to allocate a new RSS context; on return this field will
+ * contain the ID of the newly allocated context.
+ * @indir_size: On entry, the array size of the user buffer for the
+ * indirection table, which may be zero, or (for %ETHTOOL_SRSSH),
+ * %ETH_RXFH_INDIR_NO_CHANGE. On return from %ETHTOOL_GRSSH,
+ * the array size of the hardware indirection table.
+ * @key_size: On entry, the array size of the user buffer for the hash key,
+ * which may be zero. On return from %ETHTOOL_GRSSH, the size of the
+ * hardware hash key.
+ * @hfunc: Defines the current RSS hash function used by HW (or to be set to).
+ * Valid values are one of the %ETH_RSS_HASH_*.
+ * @rsvd8: Reserved for future use; see the note on reserved space.
+ * @rsvd32: Reserved for future use; see the note on reserved space.
+ * @rss_config: RX ring/queue index for each hash value i.e., indirection table
+ * of @indir_size __u32 elements, followed by hash key of @key_size
+ * bytes.
+ *
+ * For %ETHTOOL_GRSSH, an @indir_size and @key_size of zero mean that only the
+ * size should be returned. For %ETHTOOL_SRSSH, an @indir_size of
+ * %ETH_RXFH_INDIR_NO_CHANGE means that indir table setting is not requested
+ * and a @indir_size of zero means the indir table should be reset to default
+ * values (if @rss_context == 0) or that the RSS context should be deleted.
+ * An hfunc of zero means that hash function setting is not requested.
+ */
+struct ethtool_rxfh {
+ __u32 cmd;
+ __u32 rss_context;
+ __u32 indir_size;
+ __u32 key_size;
+ __u8 hfunc;
+ __u8 rsvd8[3];
+ __u32 rsvd32;
+ __u32 rss_config[];
+};
+#define ETH_RXFH_CONTEXT_ALLOC 0xffffffff
+#define ETH_RXFH_INDIR_NO_CHANGE 0xffffffff
+
+/**
+ * struct ethtool_rx_ntuple_flow_spec - specification for RX flow filter
+ * @flow_type: Type of match to perform, e.g. %TCP_V4_FLOW
+ * @h_u: Flow field values to match (dependent on @flow_type)
+ * @m_u: Masks for flow field value bits to be ignored
+ * @vlan_tag: VLAN tag to match
+ * @vlan_tag_mask: Mask for VLAN tag bits to be ignored
+ * @data: Driver-dependent data to match
+ * @data_mask: Mask for driver-dependent data bits to be ignored
+ * @action: RX ring/queue index to deliver to (non-negative) or other action
+ * (negative, e.g. %ETHTOOL_RXNTUPLE_ACTION_DROP)
+ *
+ * For flow types %TCP_V4_FLOW, %UDP_V4_FLOW and %SCTP_V4_FLOW, where
+ * a field value and mask are both zero this is treated as if all mask
+ * bits are set i.e. the field is ignored.
+ */
+struct ethtool_rx_ntuple_flow_spec {
+ __u32 flow_type;
+ union {
+ struct ethtool_tcpip4_spec tcp_ip4_spec;
+ struct ethtool_tcpip4_spec udp_ip4_spec;
+ struct ethtool_tcpip4_spec sctp_ip4_spec;
+ struct ethtool_ah_espip4_spec ah_ip4_spec;
+ struct ethtool_ah_espip4_spec esp_ip4_spec;
+ struct ethtool_usrip4_spec usr_ip4_spec;
+ struct ethhdr ether_spec;
+ __u8 hdata[72];
+ } h_u, m_u;
+
+ __u16 vlan_tag;
+ __u16 vlan_tag_mask;
+ __u64 data;
+ __u64 data_mask;
+
+ __s32 action;
+#define ETHTOOL_RXNTUPLE_ACTION_DROP (-1) /* drop packet */
+#define ETHTOOL_RXNTUPLE_ACTION_CLEAR (-2) /* clear filter */
+};
+
+/**
+ * struct ethtool_rx_ntuple - command to set or clear RX flow filter
+ * @cmd: Command number - %ETHTOOL_SRXNTUPLE
+ * @fs: Flow filter specification
+ */
+struct ethtool_rx_ntuple {
+ __u32 cmd;
+ struct ethtool_rx_ntuple_flow_spec fs;
+};
+
+#define ETHTOOL_FLASH_MAX_FILENAME 128 /* size in bytes of ethtool_flash.data */
+enum ethtool_flash_op_type {
+ ETHTOOL_FLASH_ALL_REGIONS = 0,
+};
+
+/* for passing firmware flashing related parameters */
+struct ethtool_flash {
+ __u32 cmd;
+ __u32 region; /* NOTE(review): presumably ETHTOOL_FLASH_ALL_REGIONS or a device-specific region id; confirm */
+ char data[ETHTOOL_FLASH_MAX_FILENAME]; /* NOTE(review): presumably the firmware file name; confirm whether NUL termination is required */
+};
+
+/**
+ * struct ethtool_dump - used for retrieving, setting device dump
+ * @cmd: Command number - %ETHTOOL_GET_DUMP_FLAG, %ETHTOOL_GET_DUMP_DATA, or
+ * %ETHTOOL_SET_DUMP
+ * @version: FW version of the dump, filled in by driver
+ * @flag: driver dependent flag for dump setting, filled in by driver during
+ * get and filled in by ethtool for set operation.
+ * flag must be initialized by macro ETH_FW_DUMP_DISABLE value when
+ * firmware dump is disabled.
+ * @len: length of dump data, used as the length of the user buffer on entry to
+ * %ETHTOOL_GET_DUMP_DATA and this is returned as dump length by driver
+ * for %ETHTOOL_GET_DUMP_FLAG command
+ * @data: data collected for get dump data operation
+ */
+struct ethtool_dump {
+ __u32 cmd;
+ __u32 version;
+ __u32 flag;
+ __u32 len;
+ __u8 data[];
+};
+
+#define ETH_FW_DUMP_DISABLE 0
+
+/* for returning and changing feature sets */
+
+/**
+ * struct ethtool_get_features_block - block with state of 32 features
+ * @available: mask of changeable features
+ * @requested: mask of features requested to be enabled if possible
+ * @active: mask of currently enabled features
+ * @never_changed: mask of features not changeable for any device
+ */
+struct ethtool_get_features_block {
+ __u32 available;
+ __u32 requested;
+ __u32 active;
+ __u32 never_changed;
+};
+
+/**
+ * struct ethtool_gfeatures - command to get state of device's features
+ * @cmd: command number = %ETHTOOL_GFEATURES
+ * @size: On entry, the number of elements in the features[] array;
+ * on return, the number of elements in features[] needed to hold
+ * all features
+ * @features: state of features
+ */
+struct ethtool_gfeatures {
+ __u32 cmd;
+ __u32 size;
+ struct ethtool_get_features_block features[];
+};
+
+/**
+ * struct ethtool_set_features_block - block with request for 32 features
+ * @valid: mask of features to be changed
+ * @requested: values of features to be changed
+ */
+struct ethtool_set_features_block {
+ __u32 valid;
+ __u32 requested;
+};
+
+/**
+ * struct ethtool_sfeatures - command to request change in device's features
+ * @cmd: command number = %ETHTOOL_SFEATURES
+ * @size: array size of the features[] array
+ * @features: feature change masks
+ */
+struct ethtool_sfeatures {
+ __u32 cmd;
+ __u32 size;
+ struct ethtool_set_features_block features[];
+};
+
+/**
+ * struct ethtool_ts_info - holds a device's timestamping and PHC association
+ * @cmd: command number = %ETHTOOL_GET_TS_INFO
+ * @so_timestamping: bit mask of the sum of the supported SO_TIMESTAMPING flags
+ * @phc_index: device index of the associated PHC, or -1 if there is none
+ * @tx_types: bit mask of the supported hwtstamp_tx_types enumeration values
+ * @tx_reserved: Reserved for future use; see the note on reserved space.
+ * @rx_filters: bit mask of the supported hwtstamp_rx_filters enumeration values
+ * @rx_reserved: Reserved for future use; see the note on reserved space.
+ *
+ * The bits in the 'tx_types' and 'rx_filters' fields correspond to
+ * the 'hwtstamp_tx_types' and 'hwtstamp_rx_filters' enumeration values,
+ * respectively. For example, if the device supports HWTSTAMP_TX_ON,
+ * then (1 << HWTSTAMP_TX_ON) in 'tx_types' will be set.
+ *
+ * Drivers should only report the filters they actually support without
+ * upscaling in the SIOCSHWTSTAMP ioctl. If the SIOCSHWTSTAMP request for
+ * HWTSTAMP_FILTER_V1_SYNC is supported by HWTSTAMP_FILTER_V1_EVENT, then the
+ * driver should only report HWTSTAMP_FILTER_V1_EVENT in this op.
+ */
+struct ethtool_ts_info {
+ __u32 cmd;
+ __u32 so_timestamping;
+ __s32 phc_index; /* the only signed member: -1 means no associated PHC */
+ __u32 tx_types;
+ __u32 tx_reserved[3];
+ __u32 rx_filters;
+ __u32 rx_reserved[3];
+};
+
+/*
+ * %ETHTOOL_SFEATURES changes features present in features[].valid to the
+ * values of corresponding bits in features[].requested. Bits in .requested
+ * not set in .valid or not changeable are ignored.
+ *
+ * Returns %EINVAL when .valid contains undefined or never-changeable bits
+ * or size is not equal to required number of features words (32-bit blocks).
+ * Returns >= 0 if request was completed; bits set in the value mean:
+ * %ETHTOOL_F_UNSUPPORTED - there were bits set in .valid that are not
+ * changeable (not present in %ETHTOOL_GFEATURES' features[].available)
+ * those bits were ignored.
+ * %ETHTOOL_F_WISH - some or all changes requested were recorded but the
+ * resulting state of bits masked by .valid is not equal to .requested.
+ * Probably there are other device-specific constraints on some features
+ * in the set. When %ETHTOOL_F_UNSUPPORTED is set, .valid is considered
+ * here as though ignored bits were cleared.
+ * %ETHTOOL_F_COMPAT - some or all changes requested were made by calling
+ * compatibility functions. Requested offload state cannot be properly
+ * managed by kernel.
+ *
+ * Meaning of bits in the masks are obtained by %ETHTOOL_GSSET_INFO (number of
+ * bits in the arrays - always multiple of 32) and %ETHTOOL_GSTRINGS commands
+ * for ETH_SS_FEATURES string set. First entry in the table corresponds to least
+ * significant bit in features[0] fields. Empty strings mark undefined features.
+ *
+ * The enumerators below are bit positions (0, 1, 2); the ETHTOOL_F_* macros
+ * that follow turn each position into the corresponding single-bit mask.
+ */
+enum ethtool_sfeatures_retval_bits {
+ ETHTOOL_F_UNSUPPORTED__BIT, /* bit 0 */
+ ETHTOOL_F_WISH__BIT, /* bit 1 */
+ ETHTOOL_F_COMPAT__BIT, /* bit 2 */
+};
+
+#define ETHTOOL_F_UNSUPPORTED (1 << ETHTOOL_F_UNSUPPORTED__BIT) /* 0x1 */
+#define ETHTOOL_F_WISH (1 << ETHTOOL_F_WISH__BIT) /* 0x2 */
+#define ETHTOOL_F_COMPAT (1 << ETHTOOL_F_COMPAT__BIT) /* 0x4 */
+
+#define MAX_NUM_QUEUE 4096 /* queue count used to size ethtool_per_queue_op.queue_mask */
+
+/**
+ * struct ethtool_per_queue_op - apply sub command to the queues in mask.
+ * @cmd: ETHTOOL_PERQUEUE
+ * @sub_command: the sub command which applies to each queue
+ * @queue_mask: Bitmap of the queues to which the sub command applies
+ * @data: A complete command structure following for each of the queues addressed
+ *
+ * @queue_mask is an array of 32-bit words whose length is
+ * __KERNEL_DIV_ROUND_UP(MAX_NUM_QUEUE, 32); @data is a flexible array
+ * member that follows it.
+ */
+struct ethtool_per_queue_op {
+ __u32 cmd;
+ __u32 sub_command;
+ __u32 queue_mask[__KERNEL_DIV_ROUND_UP(MAX_NUM_QUEUE, 32)]; /* presumably 128 words (4096 / 32) - confirm against the macro's definition */
+ char data[];
+};
+
+/**
+ * struct ethtool_fecparam - Ethernet Forward Error Correction parameters
+ * @cmd: Command number = %ETHTOOL_GFECPARAM or %ETHTOOL_SFECPARAM
+ * @active_fec: FEC mode which is active on the port, single bit set, GET only.
+ * @fec: Bitmask of configured FEC modes.
+ * @reserved: Reserved for future extensions, ignore on GET, write 0 for SET.
+ *
+ * Note that @reserved was never validated on input and ethtool user space
+ * left it uninitialized when calling SET. Hence going forward it can only be
+ * used to return a value to userspace with GET.
+ *
+ * FEC modes supported by the device can be read via %ETHTOOL_GLINKSETTINGS.
+ * FEC settings are configured by link autonegotiation whenever it's enabled.
+ * With autoneg on %ETHTOOL_GFECPARAM can be used to read the current mode.
+ *
+ * When autoneg is disabled %ETHTOOL_SFECPARAM controls the FEC settings.
+ * It is recommended that drivers only accept a single bit set in @fec.
+ * When multiple bits are set in @fec drivers may pick mode in an implementation
+ * dependent way. Drivers should reject mixing %ETHTOOL_FEC_AUTO_BIT with other
+ * FEC modes, because it's unclear whether in this case other modes constrain
+ * AUTO or are independent choices.
+ * Drivers must reject SET requests if they support none of the requested modes.
+ *
+ * If device does not support FEC drivers may use %ETHTOOL_FEC_NONE instead
+ * of returning %EOPNOTSUPP from %ETHTOOL_GFECPARAM.
+ *
+ * See enum ethtool_fec_config_bits for definition of valid bits for both
+ * @fec and @active_fec.
+ */
+struct ethtool_fecparam {
+ __u32 cmd;
+ /* bitmask of FEC modes; applies to both active_fec and fec below */
+ __u32 active_fec; /* GET only, single bit set */
+ __u32 fec;
+ __u32 reserved; /* ignore on GET, write 0 for SET */
+};
+
+/**
+ * enum ethtool_fec_config_bits - flags definition of ethtool_fec_configuration
+ * @ETHTOOL_FEC_NONE_BIT: FEC mode configuration is not supported. Should not
+ * be used together with other bits. GET only.
+ * @ETHTOOL_FEC_AUTO_BIT: Select default/best FEC mode automatically, usually
+ * based on link mode and SFP parameters read from module's
+ * EEPROM. This bit does _not_ mean autonegotiation.
+ * @ETHTOOL_FEC_OFF_BIT: No FEC Mode
+ * @ETHTOOL_FEC_RS_BIT: Reed-Solomon FEC Mode
+ * @ETHTOOL_FEC_BASER_BIT: Base-R/Reed-Solomon FEC Mode
+ * @ETHTOOL_FEC_LLRS_BIT: Low Latency Reed Solomon FEC Mode (25G/50G Ethernet
+ * Consortium)
+ *
+ * The enumerators are bit positions counted from 0; the ETHTOOL_FEC_* macros
+ * below convert each position into the single-bit mask used in
+ * struct ethtool_fecparam.
+ */
+enum ethtool_fec_config_bits {
+ ETHTOOL_FEC_NONE_BIT, /* bit 0 */
+ ETHTOOL_FEC_AUTO_BIT, /* bit 1 */
+ ETHTOOL_FEC_OFF_BIT, /* bit 2 */
+ ETHTOOL_FEC_RS_BIT, /* bit 3 */
+ ETHTOOL_FEC_BASER_BIT, /* bit 4 */
+ ETHTOOL_FEC_LLRS_BIT, /* bit 5 */
+};
+
+#define ETHTOOL_FEC_NONE (1 << ETHTOOL_FEC_NONE_BIT) /* 0x01 */
+#define ETHTOOL_FEC_AUTO (1 << ETHTOOL_FEC_AUTO_BIT) /* 0x02 */
+#define ETHTOOL_FEC_OFF (1 << ETHTOOL_FEC_OFF_BIT) /* 0x04 */
+#define ETHTOOL_FEC_RS (1 << ETHTOOL_FEC_RS_BIT) /* 0x08 */
+#define ETHTOOL_FEC_BASER (1 << ETHTOOL_FEC_BASER_BIT) /* 0x10 */
+#define ETHTOOL_FEC_LLRS (1 << ETHTOOL_FEC_LLRS_BIT) /* 0x20 */
+
+/* CMDs currently supported
+ *
+ * Each value is a command number, as stored in the leading 'cmd' member of
+ * the request structures in this header. Numbers run from 0x00000001 to
+ * 0x00000051; 0x0000000d is not assigned in this list.
+ */
+#define ETHTOOL_GSET 0x00000001 /* DEPRECATED, Get settings.
+ * Please use ETHTOOL_GLINKSETTINGS
+ */
+#define ETHTOOL_SSET 0x00000002 /* DEPRECATED, Set settings.
+ * Please use ETHTOOL_SLINKSETTINGS
+ */
+#define ETHTOOL_GDRVINFO 0x00000003 /* Get driver info. */
+#define ETHTOOL_GREGS 0x00000004 /* Get NIC registers. */
+#define ETHTOOL_GWOL 0x00000005 /* Get wake-on-lan options. */
+#define ETHTOOL_SWOL 0x00000006 /* Set wake-on-lan options. */
+#define ETHTOOL_GMSGLVL 0x00000007 /* Get driver message level */
+#define ETHTOOL_SMSGLVL 0x00000008 /* Set driver msg level. */
+#define ETHTOOL_NWAY_RST 0x00000009 /* Restart autonegotiation. */
+/* Get link status for host, i.e. whether the interface *and* the
+ * physical port (if there is one) are up (ethtool_value). */
+#define ETHTOOL_GLINK 0x0000000a
+#define ETHTOOL_GEEPROM 0x0000000b /* Get EEPROM data */
+#define ETHTOOL_SEEPROM 0x0000000c /* Set EEPROM data. */
+#define ETHTOOL_GCOALESCE 0x0000000e /* Get coalesce config */
+#define ETHTOOL_SCOALESCE 0x0000000f /* Set coalesce config. */
+#define ETHTOOL_GRINGPARAM 0x00000010 /* Get ring parameters */
+#define ETHTOOL_SRINGPARAM 0x00000011 /* Set ring parameters. */
+#define ETHTOOL_GPAUSEPARAM 0x00000012 /* Get pause parameters */
+#define ETHTOOL_SPAUSEPARAM 0x00000013 /* Set pause parameters. */
+#define ETHTOOL_GRXCSUM 0x00000014 /* Get RX hw csum enable (ethtool_value) */
+#define ETHTOOL_SRXCSUM 0x00000015 /* Set RX hw csum enable (ethtool_value) */
+#define ETHTOOL_GTXCSUM 0x00000016 /* Get TX hw csum enable (ethtool_value) */
+#define ETHTOOL_STXCSUM 0x00000017 /* Set TX hw csum enable (ethtool_value) */
+#define ETHTOOL_GSG 0x00000018 /* Get scatter-gather enable
+ * (ethtool_value) */
+#define ETHTOOL_SSG 0x00000019 /* Set scatter-gather enable
+ * (ethtool_value). */
+#define ETHTOOL_TEST 0x0000001a /* execute NIC self-test. */
+#define ETHTOOL_GSTRINGS 0x0000001b /* get specified string set */
+#define ETHTOOL_PHYS_ID 0x0000001c /* identify the NIC */
+#define ETHTOOL_GSTATS 0x0000001d /* get NIC-specific statistics */
+#define ETHTOOL_GTSO 0x0000001e /* Get TSO enable (ethtool_value) */
+#define ETHTOOL_STSO 0x0000001f /* Set TSO enable (ethtool_value) */
+#define ETHTOOL_GPERMADDR 0x00000020 /* Get permanent hardware address */
+#define ETHTOOL_GUFO 0x00000021 /* Get UFO enable (ethtool_value) */
+#define ETHTOOL_SUFO 0x00000022 /* Set UFO enable (ethtool_value) */
+#define ETHTOOL_GGSO 0x00000023 /* Get GSO enable (ethtool_value) */
+#define ETHTOOL_SGSO 0x00000024 /* Set GSO enable (ethtool_value) */
+#define ETHTOOL_GFLAGS 0x00000025 /* Get flags bitmap(ethtool_value) */
+#define ETHTOOL_SFLAGS 0x00000026 /* Set flags bitmap(ethtool_value) */
+#define ETHTOOL_GPFLAGS 0x00000027 /* Get driver-private flags bitmap */
+#define ETHTOOL_SPFLAGS 0x00000028 /* Set driver-private flags bitmap */
+
+#define ETHTOOL_GRXFH 0x00000029 /* Get RX flow hash configuration */
+#define ETHTOOL_SRXFH 0x0000002a /* Set RX flow hash configuration */
+#define ETHTOOL_GGRO 0x0000002b /* Get GRO enable (ethtool_value) */
+#define ETHTOOL_SGRO 0x0000002c /* Set GRO enable (ethtool_value) */
+#define ETHTOOL_GRXRINGS 0x0000002d /* Get RX rings available for LB */
+#define ETHTOOL_GRXCLSRLCNT 0x0000002e /* Get RX class rule count */
+#define ETHTOOL_GRXCLSRULE 0x0000002f /* Get RX classification rule */
+#define ETHTOOL_GRXCLSRLALL 0x00000030 /* Get all RX classification rules */
+#define ETHTOOL_SRXCLSRLDEL 0x00000031 /* Delete RX classification rule */
+#define ETHTOOL_SRXCLSRLINS 0x00000032 /* Insert RX classification rule */
+#define ETHTOOL_FLASHDEV 0x00000033 /* Flash firmware to device */
+#define ETHTOOL_RESET 0x00000034 /* Reset hardware */
+#define ETHTOOL_SRXNTUPLE 0x00000035 /* Add an n-tuple filter to device */
+#define ETHTOOL_GRXNTUPLE 0x00000036 /* deprecated */
+#define ETHTOOL_GSSET_INFO 0x00000037 /* Get string set info */
+#define ETHTOOL_GRXFHINDIR 0x00000038 /* Get RX flow hash indir'n table */
+#define ETHTOOL_SRXFHINDIR 0x00000039 /* Set RX flow hash indir'n table */
+
+#define ETHTOOL_GFEATURES 0x0000003a /* Get device offload settings */
+#define ETHTOOL_SFEATURES 0x0000003b /* Change device offload settings */
+#define ETHTOOL_GCHANNELS 0x0000003c /* Get no of channels */
+#define ETHTOOL_SCHANNELS 0x0000003d /* Set no of channels */
+#define ETHTOOL_SET_DUMP 0x0000003e /* Set dump settings */
+#define ETHTOOL_GET_DUMP_FLAG 0x0000003f /* Get dump settings */
+#define ETHTOOL_GET_DUMP_DATA 0x00000040 /* Get dump data */
+#define ETHTOOL_GET_TS_INFO 0x00000041 /* Get time stamping and PHC info */
+#define ETHTOOL_GMODULEINFO 0x00000042 /* Get plug-in module information */
+#define ETHTOOL_GMODULEEEPROM 0x00000043 /* Get plug-in module eeprom */
+#define ETHTOOL_GEEE 0x00000044 /* Get EEE settings */
+#define ETHTOOL_SEEE 0x00000045 /* Set EEE settings */
+
+#define ETHTOOL_GRSSH 0x00000046 /* Get RX flow hash configuration */
+#define ETHTOOL_SRSSH 0x00000047 /* Set RX flow hash configuration */
+#define ETHTOOL_GTUNABLE 0x00000048 /* Get tunable configuration */
+#define ETHTOOL_STUNABLE 0x00000049 /* Set tunable configuration */
+#define ETHTOOL_GPHYSTATS 0x0000004a /* get PHY-specific statistics */
+
+#define ETHTOOL_PERQUEUE 0x0000004b /* Set per queue options */
+
+#define ETHTOOL_GLINKSETTINGS 0x0000004c /* Get ethtool_link_settings */
+#define ETHTOOL_SLINKSETTINGS 0x0000004d /* Set ethtool_link_settings */
+#define ETHTOOL_PHY_GTUNABLE 0x0000004e /* Get PHY tunable configuration */
+#define ETHTOOL_PHY_STUNABLE 0x0000004f /* Set PHY tunable configuration */
+#define ETHTOOL_GFECPARAM 0x00000050 /* Get FEC settings */
+#define ETHTOOL_SFECPARAM 0x00000051 /* Set FEC settings */
+
+/* compatibility with older code: plain aliases of the deprecated
+ * ETHTOOL_GSET / ETHTOOL_SSET command numbers
+ */
+#define SPARC_ETH_GSET ETHTOOL_GSET
+#define SPARC_ETH_SSET ETHTOOL_SSET
+
+/* Link mode bit indices
+ *
+ * Each enumerator is a bit position in the link mode bitmaps (the
+ * supported / advertising / lp_advertising bitmaps of
+ * struct ethtool_link_settings give each bit the meaning listed here).
+ * Every value is spelled out explicitly; indices 0-31 can also be used with
+ * the legacy 32-bit SUPPORTED_* / ADVERTISED_* masks, higher indices cannot.
+ */
+enum ethtool_link_mode_bit_indices {
+ ETHTOOL_LINK_MODE_10baseT_Half_BIT = 0,
+ ETHTOOL_LINK_MODE_10baseT_Full_BIT = 1,
+ ETHTOOL_LINK_MODE_100baseT_Half_BIT = 2,
+ ETHTOOL_LINK_MODE_100baseT_Full_BIT = 3,
+ ETHTOOL_LINK_MODE_1000baseT_Half_BIT = 4,
+ ETHTOOL_LINK_MODE_1000baseT_Full_BIT = 5,
+ ETHTOOL_LINK_MODE_Autoneg_BIT = 6,
+ ETHTOOL_LINK_MODE_TP_BIT = 7,
+ ETHTOOL_LINK_MODE_AUI_BIT = 8,
+ ETHTOOL_LINK_MODE_MII_BIT = 9,
+ ETHTOOL_LINK_MODE_FIBRE_BIT = 10,
+ ETHTOOL_LINK_MODE_BNC_BIT = 11,
+ ETHTOOL_LINK_MODE_10000baseT_Full_BIT = 12,
+ ETHTOOL_LINK_MODE_Pause_BIT = 13,
+ ETHTOOL_LINK_MODE_Asym_Pause_BIT = 14,
+ ETHTOOL_LINK_MODE_2500baseX_Full_BIT = 15,
+ ETHTOOL_LINK_MODE_Backplane_BIT = 16,
+ ETHTOOL_LINK_MODE_1000baseKX_Full_BIT = 17,
+ ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT = 18,
+ ETHTOOL_LINK_MODE_10000baseKR_Full_BIT = 19,
+ ETHTOOL_LINK_MODE_10000baseR_FEC_BIT = 20,
+ ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT = 21,
+ ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT = 22,
+ ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT = 23,
+ ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT = 24,
+ ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT = 25,
+ ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT = 26,
+ ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT = 27,
+ ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT = 28,
+ ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT = 29,
+ ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT = 30,
+ ETHTOOL_LINK_MODE_25000baseCR_Full_BIT = 31, /* last index that fits a legacy 32-bit mask */
+
+ /* Last allowed bit for __ETHTOOL_LINK_MODE_LEGACY_MASK is bit
+ * 31. Please do NOT define any SUPPORTED_* or ADVERTISED_*
+ * macro for bits > 31. The only way to use indices > 31 is to
+ * use the new ETHTOOL_GLINKSETTINGS/ETHTOOL_SLINKSETTINGS API.
+ */
+
+ ETHTOOL_LINK_MODE_25000baseKR_Full_BIT = 32,
+ ETHTOOL_LINK_MODE_25000baseSR_Full_BIT = 33,
+ ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT = 34,
+ ETHTOOL_LINK_MODE_50000baseKR2_Full_BIT = 35,
+ ETHTOOL_LINK_MODE_100000baseKR4_Full_BIT = 36,
+ ETHTOOL_LINK_MODE_100000baseSR4_Full_BIT = 37,
+ ETHTOOL_LINK_MODE_100000baseCR4_Full_BIT = 38,
+ ETHTOOL_LINK_MODE_100000baseLR4_ER4_Full_BIT = 39,
+ ETHTOOL_LINK_MODE_50000baseSR2_Full_BIT = 40,
+ ETHTOOL_LINK_MODE_1000baseX_Full_BIT = 41,
+ ETHTOOL_LINK_MODE_10000baseCR_Full_BIT = 42,
+ ETHTOOL_LINK_MODE_10000baseSR_Full_BIT = 43,
+ ETHTOOL_LINK_MODE_10000baseLR_Full_BIT = 44,
+ ETHTOOL_LINK_MODE_10000baseLRM_Full_BIT = 45,
+ ETHTOOL_LINK_MODE_10000baseER_Full_BIT = 46,
+ ETHTOOL_LINK_MODE_2500baseT_Full_BIT = 47,
+ ETHTOOL_LINK_MODE_5000baseT_Full_BIT = 48,
+
+ ETHTOOL_LINK_MODE_FEC_NONE_BIT = 49,
+ ETHTOOL_LINK_MODE_FEC_RS_BIT = 50,
+ ETHTOOL_LINK_MODE_FEC_BASER_BIT = 51,
+ ETHTOOL_LINK_MODE_50000baseKR_Full_BIT = 52,
+ ETHTOOL_LINK_MODE_50000baseSR_Full_BIT = 53,
+ ETHTOOL_LINK_MODE_50000baseCR_Full_BIT = 54,
+ ETHTOOL_LINK_MODE_50000baseLR_ER_FR_Full_BIT = 55,
+ ETHTOOL_LINK_MODE_50000baseDR_Full_BIT = 56,
+ ETHTOOL_LINK_MODE_100000baseKR2_Full_BIT = 57,
+ ETHTOOL_LINK_MODE_100000baseSR2_Full_BIT = 58,
+ ETHTOOL_LINK_MODE_100000baseCR2_Full_BIT = 59,
+ ETHTOOL_LINK_MODE_100000baseLR2_ER2_FR2_Full_BIT = 60,
+ ETHTOOL_LINK_MODE_100000baseDR2_Full_BIT = 61,
+ ETHTOOL_LINK_MODE_200000baseKR4_Full_BIT = 62,
+ ETHTOOL_LINK_MODE_200000baseSR4_Full_BIT = 63,
+ ETHTOOL_LINK_MODE_200000baseLR4_ER4_FR4_Full_BIT = 64,
+ ETHTOOL_LINK_MODE_200000baseDR4_Full_BIT = 65,
+ ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT = 66,
+ ETHTOOL_LINK_MODE_100baseT1_Full_BIT = 67,
+ ETHTOOL_LINK_MODE_1000baseT1_Full_BIT = 68,
+ ETHTOOL_LINK_MODE_400000baseKR8_Full_BIT = 69,
+ ETHTOOL_LINK_MODE_400000baseSR8_Full_BIT = 70,
+ ETHTOOL_LINK_MODE_400000baseLR8_ER8_FR8_Full_BIT = 71,
+ ETHTOOL_LINK_MODE_400000baseDR8_Full_BIT = 72,
+ ETHTOOL_LINK_MODE_400000baseCR8_Full_BIT = 73,
+ ETHTOOL_LINK_MODE_FEC_LLRS_BIT = 74,
+ ETHTOOL_LINK_MODE_100000baseKR_Full_BIT = 75,
+ ETHTOOL_LINK_MODE_100000baseSR_Full_BIT = 76,
+ ETHTOOL_LINK_MODE_100000baseLR_ER_FR_Full_BIT = 77,
+ ETHTOOL_LINK_MODE_100000baseCR_Full_BIT = 78,
+ ETHTOOL_LINK_MODE_100000baseDR_Full_BIT = 79,
+ ETHTOOL_LINK_MODE_200000baseKR2_Full_BIT = 80,
+ ETHTOOL_LINK_MODE_200000baseSR2_Full_BIT = 81,
+ ETHTOOL_LINK_MODE_200000baseLR2_ER2_FR2_Full_BIT = 82,
+ ETHTOOL_LINK_MODE_200000baseDR2_Full_BIT = 83,
+ ETHTOOL_LINK_MODE_200000baseCR2_Full_BIT = 84,
+ ETHTOOL_LINK_MODE_400000baseKR4_Full_BIT = 85,
+ ETHTOOL_LINK_MODE_400000baseSR4_Full_BIT = 86,
+ ETHTOOL_LINK_MODE_400000baseLR4_ER4_FR4_Full_BIT = 87,
+ ETHTOOL_LINK_MODE_400000baseDR4_Full_BIT = 88,
+ ETHTOOL_LINK_MODE_400000baseCR4_Full_BIT = 89,
+ ETHTOOL_LINK_MODE_100baseFX_Half_BIT = 90,
+ ETHTOOL_LINK_MODE_100baseFX_Full_BIT = 91,
+ ETHTOOL_LINK_MODE_10baseT1L_Full_BIT = 92,
+ ETHTOOL_LINK_MODE_800000baseCR8_Full_BIT = 93,
+ ETHTOOL_LINK_MODE_800000baseKR8_Full_BIT = 94,
+ ETHTOOL_LINK_MODE_800000baseDR8_Full_BIT = 95,
+ ETHTOOL_LINK_MODE_800000baseDR8_2_Full_BIT = 96,
+ ETHTOOL_LINK_MODE_800000baseSR8_Full_BIT = 97,
+ ETHTOOL_LINK_MODE_800000baseVR8_Full_BIT = 98,
+
+ /* must be last entry: it has no explicit value, so it evaluates to
+ * one more than the preceding index (99 in this list), i.e. the
+ * number of link mode bits defined above
+ */
+ __ETHTOOL_LINK_MODE_MASK_NBITS
+};
+
+#define __ETHTOOL_LINK_MODE_LEGACY_MASK(base_name) \
+ (1UL << (ETHTOOL_LINK_MODE_ ## base_name ## _BIT)) /* pastes base_name into ETHTOOL_LINK_MODE_<base_name>_BIT and yields that bit as an unsigned long mask; intended for indices 0-31 only */
+
+/* DEPRECATED macros. Please migrate to
+ * ETHTOOL_GLINKSETTINGS/ETHTOOL_SLINKSETTINGS API. Please do NOT
+ * define any new SUPPORTED_* macro for bits > 31.
+ *
+ * Each SUPPORTED_<mode> expands, through __ETHTOOL_LINK_MODE_LEGACY_MASK,
+ * to the single-bit mask of ETHTOOL_LINK_MODE_<mode>_BIT. The list covers
+ * link mode indices 0 through 30.
+ */
+#define SUPPORTED_10baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Half)
+#define SUPPORTED_10baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Full)
+#define SUPPORTED_100baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Half)
+#define SUPPORTED_100baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Full)
+#define SUPPORTED_1000baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Half)
+#define SUPPORTED_1000baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Full)
+#define SUPPORTED_Autoneg __ETHTOOL_LINK_MODE_LEGACY_MASK(Autoneg)
+#define SUPPORTED_TP __ETHTOOL_LINK_MODE_LEGACY_MASK(TP)
+#define SUPPORTED_AUI __ETHTOOL_LINK_MODE_LEGACY_MASK(AUI)
+#define SUPPORTED_MII __ETHTOOL_LINK_MODE_LEGACY_MASK(MII)
+#define SUPPORTED_FIBRE __ETHTOOL_LINK_MODE_LEGACY_MASK(FIBRE)
+#define SUPPORTED_BNC __ETHTOOL_LINK_MODE_LEGACY_MASK(BNC)
+#define SUPPORTED_10000baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseT_Full)
+#define SUPPORTED_Pause __ETHTOOL_LINK_MODE_LEGACY_MASK(Pause)
+#define SUPPORTED_Asym_Pause __ETHTOOL_LINK_MODE_LEGACY_MASK(Asym_Pause)
+#define SUPPORTED_2500baseX_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(2500baseX_Full)
+#define SUPPORTED_Backplane __ETHTOOL_LINK_MODE_LEGACY_MASK(Backplane)
+#define SUPPORTED_1000baseKX_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseKX_Full)
+#define SUPPORTED_10000baseKX4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKX4_Full)
+#define SUPPORTED_10000baseKR_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKR_Full)
+#define SUPPORTED_10000baseR_FEC __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseR_FEC)
+#define SUPPORTED_20000baseMLD2_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseMLD2_Full)
+#define SUPPORTED_20000baseKR2_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseKR2_Full)
+#define SUPPORTED_40000baseKR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseKR4_Full)
+#define SUPPORTED_40000baseCR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseCR4_Full)
+#define SUPPORTED_40000baseSR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseSR4_Full)
+#define SUPPORTED_40000baseLR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseLR4_Full)
+#define SUPPORTED_56000baseKR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseKR4_Full)
+#define SUPPORTED_56000baseCR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseCR4_Full)
+#define SUPPORTED_56000baseSR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseSR4_Full)
+#define SUPPORTED_56000baseLR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseLR4_Full)
+/* Please do not define any new SUPPORTED_* macro for bits > 31, see
+ * notice above.
+ */
+
+/*
+ * DEPRECATED macros. Please migrate to
+ * ETHTOOL_GLINKSETTINGS/ETHTOOL_SLINKSETTINGS API. Please do NOT
+ * define any new ADVERTISED_* macro for bits > 31.
+ *
+ * Each ADVERTISED_<mode> expands, through __ETHTOOL_LINK_MODE_LEGACY_MASK,
+ * to the single-bit mask of ETHTOOL_LINK_MODE_<mode>_BIT, so it has the
+ * same value as the SUPPORTED_<mode> macro of the same name.
+ */
+#define ADVERTISED_10baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Half)
+#define ADVERTISED_10baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Full)
+#define ADVERTISED_100baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Half)
+#define ADVERTISED_100baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Full)
+#define ADVERTISED_1000baseT_Half __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Half)
+#define ADVERTISED_1000baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Full)
+#define ADVERTISED_Autoneg __ETHTOOL_LINK_MODE_LEGACY_MASK(Autoneg)
+#define ADVERTISED_TP __ETHTOOL_LINK_MODE_LEGACY_MASK(TP)
+#define ADVERTISED_AUI __ETHTOOL_LINK_MODE_LEGACY_MASK(AUI)
+#define ADVERTISED_MII __ETHTOOL_LINK_MODE_LEGACY_MASK(MII)
+#define ADVERTISED_FIBRE __ETHTOOL_LINK_MODE_LEGACY_MASK(FIBRE)
+#define ADVERTISED_BNC __ETHTOOL_LINK_MODE_LEGACY_MASK(BNC)
+#define ADVERTISED_10000baseT_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseT_Full)
+#define ADVERTISED_Pause __ETHTOOL_LINK_MODE_LEGACY_MASK(Pause)
+#define ADVERTISED_Asym_Pause __ETHTOOL_LINK_MODE_LEGACY_MASK(Asym_Pause)
+#define ADVERTISED_2500baseX_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(2500baseX_Full)
+#define ADVERTISED_Backplane __ETHTOOL_LINK_MODE_LEGACY_MASK(Backplane)
+#define ADVERTISED_1000baseKX_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseKX_Full)
+#define ADVERTISED_10000baseKX4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKX4_Full)
+#define ADVERTISED_10000baseKR_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKR_Full)
+#define ADVERTISED_10000baseR_FEC __ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseR_FEC)
+#define ADVERTISED_20000baseMLD2_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseMLD2_Full)
+#define ADVERTISED_20000baseKR2_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseKR2_Full)
+#define ADVERTISED_40000baseKR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseKR4_Full)
+#define ADVERTISED_40000baseCR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseCR4_Full)
+#define ADVERTISED_40000baseSR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseSR4_Full)
+#define ADVERTISED_40000baseLR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseLR4_Full)
+#define ADVERTISED_56000baseKR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseKR4_Full)
+#define ADVERTISED_56000baseCR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseCR4_Full)
+#define ADVERTISED_56000baseSR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseSR4_Full)
+#define ADVERTISED_56000baseLR4_Full __ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseLR4_Full)
+/* Please do not define any new ADVERTISED_* macro for bits > 31, see
+ * notice above.
+ */
+
+/* The following are all involved in forcing a particular link
+ * mode for the device for setting things. When getting the
+ * devices settings, these indicate the current mode and whether
+ * it was forced up into this mode or autonegotiated.
+ */
+
+/* The forced speed, in units of 1Mb. All values 0 to INT_MAX are legal.
+ * Update drivers/net/phy/phy.c:phy_speed_to_str() and
+ * drivers/net/bonding/bond_3ad.c:__get_link_speed() when adding new values.
+ *
+ * Each SPEED_<n> macro simply expands to the number <n>.
+ */
+#define SPEED_10 10
+#define SPEED_100 100
+#define SPEED_1000 1000
+#define SPEED_2500 2500
+#define SPEED_5000 5000
+#define SPEED_10000 10000
+#define SPEED_14000 14000
+#define SPEED_20000 20000
+#define SPEED_25000 25000
+#define SPEED_40000 40000
+#define SPEED_50000 50000
+#define SPEED_56000 56000
+#define SPEED_100000 100000
+#define SPEED_200000 200000
+#define SPEED_400000 400000
+#define SPEED_800000 800000
+
+#define SPEED_UNKNOWN -1 /* ethtool_validate_speed() compares against this after a cast to __u32 */
+
+/**
+ * ethtool_validate_speed - check whether a link speed value is acceptable
+ * @speed: speed value to check
+ *
+ * Returns 1 if @speed equals SPEED_UNKNOWN converted to __u32, or if it
+ * does not exceed INT_MAX; returns 0 otherwise.
+ */
+static __inline__ int ethtool_validate_speed(__u32 speed)
+{
+	/* The "unknown" marker is accepted explicitly. */
+	if (speed == (__u32)SPEED_UNKNOWN)
+		return 1;
+
+	/* Everything else must fit into a non-negative int. */
+	return speed <= INT_MAX;
+}
+
+/* Duplex, half or full. These three values are exactly the ones accepted
+ * by ethtool_validate_duplex() below.
+ */
+#define DUPLEX_HALF 0x00
+#define DUPLEX_FULL 0x01
+#define DUPLEX_UNKNOWN 0xff
+
+/**
+ * ethtool_validate_duplex - check whether a duplex value is one of DUPLEX_*
+ * @duplex: duplex value to check
+ *
+ * Returns 1 if @duplex is DUPLEX_HALF, DUPLEX_FULL or DUPLEX_UNKNOWN,
+ * and 0 for any other value.
+ */
+static __inline__ int ethtool_validate_duplex(__u8 duplex)
+{
+	return duplex == DUPLEX_HALF ||
+	       duplex == DUPLEX_FULL ||
+	       duplex == DUPLEX_UNKNOWN;
+}
+
+#define MASTER_SLAVE_CFG_UNSUPPORTED 0 /* MASTER_SLAVE_CFG_*: presumably values of ethtool_link_settings.master_slave_cfg ("Master/slave port mode") - confirm */
+#define MASTER_SLAVE_CFG_UNKNOWN 1
+#define MASTER_SLAVE_CFG_MASTER_PREFERRED 2
+#define MASTER_SLAVE_CFG_SLAVE_PREFERRED 3
+#define MASTER_SLAVE_CFG_MASTER_FORCE 4
+#define MASTER_SLAVE_CFG_SLAVE_FORCE 5
+#define MASTER_SLAVE_STATE_UNSUPPORTED 0 /* MASTER_SLAVE_STATE_*: presumably values of ethtool_link_settings.master_slave_state ("Master/slave port state") - confirm */
+#define MASTER_SLAVE_STATE_UNKNOWN 1
+#define MASTER_SLAVE_STATE_MASTER 2
+#define MASTER_SLAVE_STATE_SLAVE 3
+#define MASTER_SLAVE_STATE_ERR 4
+
+/* These are used to throttle the rate of data on the phy interface when the
+ * native speed of the interface is higher than the link speed. These should
+ * not be used for phy interfaces which natively support multiple speeds (e.g.
+ * MII or SGMII).
+ *
+ * NOTE(review): presumably the values of ethtool_link_settings.rate_matching
+ * ("Rate adaptation performed by the PHY") - confirm.
+ */
+/* No rate matching performed. */
+#define RATE_MATCH_NONE 0
+/* The phy sends pause frames to throttle the MAC. */
+#define RATE_MATCH_PAUSE 1
+/* The phy asserts CRS to prevent the MAC from transmitting. */
+#define RATE_MATCH_CRS 2
+/* The MAC is programmed with a sufficiently-large IPG. */
+#define RATE_MATCH_OPEN_LOOP 3
+
+/* Which connector port. These are the %PORT_* values referred to by the
+ * @port member ("Physical connector type") of struct ethtool_link_settings.
+ * 0x00-0x05 are consecutive; PORT_NONE and PORT_OTHER use 0xef and 0xff.
+ */
+#define PORT_TP 0x00
+#define PORT_AUI 0x01
+#define PORT_MII 0x02
+#define PORT_FIBRE 0x03
+#define PORT_BNC 0x04
+#define PORT_DA 0x05
+#define PORT_NONE 0xef
+#define PORT_OTHER 0xff
+
+/* Which transceiver to use.
+ * NOTE(review): presumably the values of the @transceiver member of
+ * struct ethtool_link_settings - confirm.
+ */
+#define XCVR_INTERNAL 0x00 /* PHY and MAC are in the same package */
+#define XCVR_EXTERNAL 0x01 /* PHY and MAC are in different packages */
+#define XCVR_DUMMY1 0x02
+#define XCVR_DUMMY2 0x03
+#define XCVR_DUMMY3 0x04
+
+/* Enable or disable autonegotiation. These are the two values allowed for
+ * the @autoneg member of struct ethtool_link_settings.
+ */
+#define AUTONEG_DISABLE 0x00
+#define AUTONEG_ENABLE 0x01
+
+/* MDI or MDI-X status/control - if MDI/MDI_X/AUTO is set then
+ * the driver is required to renegotiate link
+ *
+ * The same values serve both the read-only status member (@eth_tp_mdix)
+ * and the control member (@eth_tp_mdix_ctrl) of struct ethtool_link_settings;
+ * the per-value comments give the meaning in each role.
+ */
+#define ETH_TP_MDI_INVALID 0x00 /* status: unknown; control: unsupported */
+#define ETH_TP_MDI 0x01 /* status: MDI; control: force MDI */
+#define ETH_TP_MDI_X 0x02 /* status: MDI-X; control: force MDI-X */
+#define ETH_TP_MDI_AUTO 0x03 /* control: auto-select */
+
+/* Wake-On-Lan options. Each WAKE_* macro is a single-bit mask (bits 0-7). */
+#define WAKE_PHY (1 << 0)
+#define WAKE_UCAST (1 << 1)
+#define WAKE_MCAST (1 << 2)
+#define WAKE_BCAST (1 << 3)
+#define WAKE_ARP (1 << 4)
+#define WAKE_MAGIC (1 << 5)
+#define WAKE_MAGICSECURE (1 << 6) /* only meaningful if WAKE_MAGIC */
+#define WAKE_FILTER (1 << 7)
+
+#define WOL_MODE_COUNT 8 /* matches the number of WAKE_* bits defined above */
+
+/* L2-L4 network traffic flow types
+ *
+ * The small values (0x01-0x12) each name one flow type; 0x0f is not
+ * assigned in this list. The per-value comments say whether the type can be
+ * used for hashing, for a flow spec, or both. FLOW_EXT, FLOW_MAC_EXT and
+ * FLOW_RSS are separate single-bit flags in bits 31, 30 and 29
+ * (NOTE(review): presumably OR-ed into the same field as the flow type -
+ * confirm).
+ */
+#define TCP_V4_FLOW 0x01 /* hash or spec (tcp_ip4_spec) */
+#define UDP_V4_FLOW 0x02 /* hash or spec (udp_ip4_spec) */
+#define SCTP_V4_FLOW 0x03 /* hash or spec (sctp_ip4_spec) */
+#define AH_ESP_V4_FLOW 0x04 /* hash only */
+#define TCP_V6_FLOW 0x05 /* hash or spec (tcp_ip6_spec; nfc only) */
+#define UDP_V6_FLOW 0x06 /* hash or spec (udp_ip6_spec; nfc only) */
+#define SCTP_V6_FLOW 0x07 /* hash or spec (sctp_ip6_spec; nfc only) */
+#define AH_ESP_V6_FLOW 0x08 /* hash only */
+#define AH_V4_FLOW 0x09 /* hash or spec (ah_ip4_spec) */
+#define ESP_V4_FLOW 0x0a /* hash or spec (esp_ip4_spec) */
+#define AH_V6_FLOW 0x0b /* hash or spec (ah_ip6_spec; nfc only) */
+#define ESP_V6_FLOW 0x0c /* hash or spec (esp_ip6_spec; nfc only) */
+#define IPV4_USER_FLOW 0x0d /* spec only (usr_ip4_spec) */
+#define IP_USER_FLOW IPV4_USER_FLOW /* alias of IPV4_USER_FLOW */
+#define IPV6_USER_FLOW 0x0e /* spec only (usr_ip6_spec; nfc only) */
+#define IPV4_FLOW 0x10 /* hash only */
+#define IPV6_FLOW 0x11 /* hash only */
+#define ETHER_FLOW 0x12 /* spec only (ether_spec) */
+/* Flag to enable additional fields in struct ethtool_rx_flow_spec */
+#define FLOW_EXT 0x80000000
+#define FLOW_MAC_EXT 0x40000000
+/* Flag to enable RSS spreading of traffic matching rule (nfc only) */
+#define FLOW_RSS 0x20000000
+
+/* L3-L4 network traffic flow hash options
+ *
+ * Single-bit flags occupying bits 1-7 plus bit 31; bit 0 is not defined
+ * here.
+ */
+#define RXH_L2DA (1 << 1)
+#define RXH_VLAN (1 << 2)
+#define RXH_L3_PROTO (1 << 3)
+#define RXH_IP_SRC (1 << 4)
+#define RXH_IP_DST (1 << 5)
+#define RXH_L4_B_0_1 (1 << 6) /* src port in case of TCP/UDP/SCTP */
+#define RXH_L4_B_2_3 (1 << 7) /* dst port in case of TCP/UDP/SCTP */
+#define RXH_DISCARD (1 << 31) /* NOTE(review): shifts a signed int constant into its sign bit; the resulting value and how it widens to 64 bits depend on the compiler - confirm this is intended */
+
+#define RX_CLS_FLOW_DISC 0xffffffffffffffffULL /* all 64 bits set */
+#define RX_CLS_FLOW_WAKE 0xfffffffffffffffeULL /* one less than RX_CLS_FLOW_DISC */
+
+/* Special RX classification rule insert location values.
+ * ANY, FIRST and LAST are the three highest 32-bit values and all have the
+ * RX_CLS_LOC_SPECIAL bit (bit 31) set.
+ */
+#define RX_CLS_LOC_SPECIAL 0x80000000 /* flag */
+#define RX_CLS_LOC_ANY 0xffffffff
+#define RX_CLS_LOC_FIRST 0xfffffffe
+#define RX_CLS_LOC_LAST 0xfffffffd
+
+/* EEPROM Standards for plug in modules
+ *
+ * Each standard gets an identifier (0x1-0x4) paired with a *_LEN constant
+ * (NOTE(review): presumably the EEPROM size in bytes - confirm). SFF-8636
+ * and SFF-8436 additionally have a larger *_MAX_LEN of 640.
+ */
+#define ETH_MODULE_SFF_8079 0x1
+#define ETH_MODULE_SFF_8079_LEN 256
+#define ETH_MODULE_SFF_8472 0x2
+#define ETH_MODULE_SFF_8472_LEN 512
+#define ETH_MODULE_SFF_8636 0x3
+#define ETH_MODULE_SFF_8636_LEN 256
+#define ETH_MODULE_SFF_8436 0x4
+#define ETH_MODULE_SFF_8436_LEN 256
+
+#define ETH_MODULE_SFF_8636_MAX_LEN 640
+#define ETH_MODULE_SFF_8436_MAX_LEN 640
+
+/* Reset flags */
+/* The reset() operation must clear the flags for the components which
+ * were actually reset. On successful return, the flags indicate the
+ * components which were not reset, either because they do not exist
+ * in the hardware or because they cannot be reset independently. The
+ * driver must never reset any components that were not requested.
+ *
+ * Layout: the individual component flags use bits 0-8, ETH_RESET_DEDICATED
+ * covers the low 16 bits, and ETH_RESET_ALL covers all 32 bits.
+ */
+enum ethtool_reset_flags {
+ /* These flags represent components dedicated to the interface
+ * the command is addressed to. Shift any flag left by
+ * ETH_RESET_SHARED_SHIFT to reset a shared component of the
+ * same type.
+ */
+ ETH_RESET_MGMT = 1 << 0, /* Management processor */
+ ETH_RESET_IRQ = 1 << 1, /* Interrupt requester */
+ ETH_RESET_DMA = 1 << 2, /* DMA engine */
+ ETH_RESET_FILTER = 1 << 3, /* Filtering/flow direction */
+ ETH_RESET_OFFLOAD = 1 << 4, /* Protocol offload */
+ ETH_RESET_MAC = 1 << 5, /* Media access controller */
+ ETH_RESET_PHY = 1 << 6, /* Transceiver/PHY */
+ ETH_RESET_RAM = 1 << 7, /* RAM shared between
+ * multiple components */
+ ETH_RESET_AP = 1 << 8, /* Application processor */
+
+ ETH_RESET_DEDICATED = 0x0000ffff, /* All components dedicated to
+ * this interface */
+ ETH_RESET_ALL = 0xffffffff, /* All components used by this
+ * interface, even if shared.
+ * NOTE(review): this value does
+ * not fit a 32-bit int, so it
+ * relies on the compiler accepting
+ * such an enumerator - confirm */
+};
+#define ETH_RESET_SHARED_SHIFT 16 /* moves a dedicated flag (bits 0-15) to the shared half (bits 16-31) */
+
+
+/**
+ * struct ethtool_link_settings - link control and status
+ *
+ * IMPORTANT, Backward compatibility notice: When implementing new
+ * user-space tools, please first try %ETHTOOL_GLINKSETTINGS, and
+ * if it succeeds use %ETHTOOL_SLINKSETTINGS to change link
+ * settings; do not use %ETHTOOL_SSET if %ETHTOOL_GLINKSETTINGS
+ * succeeded: stick to %ETHTOOL_GLINKSETTINGS/%ETHTOOL_SLINKSETTINGS in
+ * that case. Conversely, if %ETHTOOL_GLINKSETTINGS fails, use
+ * %ETHTOOL_GSET to query and %ETHTOOL_SSET to change link
+ * settings; do not use %ETHTOOL_SLINKSETTINGS if
+ * %ETHTOOL_GLINKSETTINGS failed: stick to
+ * %ETHTOOL_GSET/%ETHTOOL_SSET in that case.
+ *
+ * @cmd: Command number = %ETHTOOL_GLINKSETTINGS or %ETHTOOL_SLINKSETTINGS
+ * @speed: Link speed (Mbps)
+ * @duplex: Duplex mode; one of %DUPLEX_*
+ * @port: Physical connector type; one of %PORT_*
+ * @phy_address: MDIO address of PHY (transceiver); 0 or 255 if not
+ * applicable. For clause 45 PHYs this is the PRTAD.
+ * @autoneg: Enable/disable autonegotiation and auto-detection;
+ * either %AUTONEG_DISABLE or %AUTONEG_ENABLE
+ * @mdio_support: Bitmask of %ETH_MDIO_SUPPORTS_* flags for the MDIO
+ * protocols supported by the interface; 0 if unknown.
+ * Read-only.
+ * @eth_tp_mdix: Ethernet twisted-pair MDI(-X) status; one of
+ * %ETH_TP_MDI_*. If the status is unknown or not applicable, the
+ * value will be %ETH_TP_MDI_INVALID. Read-only.
+ * @eth_tp_mdix_ctrl: Ethernet twisted pair MDI(-X) control; one of
+ * %ETH_TP_MDI_*. If MDI(-X) control is not implemented, reads
+ * yield %ETH_TP_MDI_INVALID and writes may be ignored or rejected.
+ * When written successfully, the link should be renegotiated if
+ * necessary.
+ * @link_mode_masks_nwords: Number of 32-bit words for each of the
+ * supported, advertising, lp_advertising link mode bitmaps. For
+ * %ETHTOOL_GLINKSETTINGS: on entry, number of words passed by user
+ * (>= 0); on return, if handshake in progress, negative if
+ * request size unsupported by kernel: absolute value indicates
+ * kernel expected size and all the other fields but cmd
+ * are 0; otherwise (handshake completed), strictly positive
+ * to indicate size used by kernel and cmd field stays
+ * %ETHTOOL_GLINKSETTINGS, all other fields populated by driver. For
+ * %ETHTOOL_SLINKSETTINGS: must be valid on entry, ie. a positive
+ * value returned previously by %ETHTOOL_GLINKSETTINGS, otherwise
+ * refused. For drivers: ignore this field (use kernel's
+ * __ETHTOOL_LINK_MODE_MASK_NBITS instead), any change to it will
+ * be overwritten by kernel.
+ * @supported: Bitmap with each bit meaning given by
+ * %ethtool_link_mode_bit_indices for the link modes, physical
+ * connectors and other link features for which the interface
+ * supports autonegotiation or auto-detection. Read-only.
+ * @advertising: Bitmap with each bit meaning given by
+ * %ethtool_link_mode_bit_indices for the link modes, physical
+ * connectors and other link features that are advertised through
+ * autonegotiation or enabled for auto-detection.
+ * @lp_advertising: Bitmap with each bit meaning given by
+ * %ethtool_link_mode_bit_indices for the link modes, and other
+ * link features that the link partner advertised through
+ * autonegotiation; 0 if unknown or not applicable. Read-only.
+ * @transceiver: Used to distinguish different possible PHY types,
+ * reported consistently by PHYLIB. Read-only.
+ * @master_slave_cfg: Master/slave port mode.
+ * @master_slave_state: Master/slave port state.
+ * @rate_matching: Rate adaptation performed by the PHY
+ * @reserved: Reserved for future use; see the note on reserved space.
+ * @link_mode_masks: Variable length bitmaps.
+ *
+ * If autonegotiation is disabled, the speed and @duplex represent the
+ * fixed link mode and are writable if the driver supports multiple
+ * link modes. If it is enabled then they are read-only; if the link
+ * is up they represent the negotiated link mode; if the link is down,
+ * the speed is 0, %SPEED_UNKNOWN or the highest enabled speed and
+ * @duplex is %DUPLEX_UNKNOWN or the best enabled duplex mode.
+ *
+ * Some hardware interfaces may have multiple PHYs and/or physical
+ * connectors fitted or do not allow the driver to detect which are
+ * fitted. For these interfaces @port and/or @phy_address may be
+ * writable, possibly dependent on @autoneg being %AUTONEG_DISABLE.
+ * Otherwise, attempts to write different values may be ignored or
+ * rejected.
+ *
+ * Deprecated %ethtool_cmd fields transceiver, maxtxpkt and maxrxpkt
+ * are not available in %ethtool_link_settings. These fields will be
+ * always set to zero in %ETHTOOL_GSET reply and %ETHTOOL_SSET will
+ * fail if any of them is set to non-zero value.
+ *
+ * Users should assume that all fields not marked read-only are
+ * writable and subject to validation by the driver. They should use
+ * %ETHTOOL_GLINKSETTINGS to get the current values before making specific
+ * changes and then applying them with %ETHTOOL_SLINKSETTINGS.
+ *
+ * Drivers that implement %get_link_ksettings and/or
+ * %set_link_ksettings should ignore the @cmd
+ * and @link_mode_masks_nwords fields (any change to them overwritten
+ * by kernel), and rely only on kernel's internal
+ * %__ETHTOOL_LINK_MODE_MASK_NBITS and
+ * %ethtool_link_mode_mask_t. Drivers that implement
+ * %set_link_ksettings() should validate all fields other than @cmd
+ * and @link_mode_masks_nwords that are not described as read-only or
+ * deprecated, and must ignore all fields described as read-only.
+ */
+struct ethtool_link_settings {
+ __u32 cmd;
+ __u32 speed;
+ __u8 duplex;
+ __u8 port;
+ __u8 phy_address;
+ __u8 autoneg;
+ __u8 mdio_support;
+ __u8 eth_tp_mdix;
+ __u8 eth_tp_mdix_ctrl;
+ __s8 link_mode_masks_nwords; /* signed: negative on return while the size handshake is in progress */
+ __u8 transceiver;
+ __u8 master_slave_cfg;
+ __u8 master_slave_state;
+ __u8 rate_matching;
+ __u32 reserved[7];
+ __u32 link_mode_masks[]; /* three consecutive bitmaps, each link_mode_masks_nwords words long */
+ /* layout of link_mode_masks fields:
+ * __u32 map_supported[link_mode_masks_nwords];
+ * __u32 map_advertising[link_mode_masks_nwords];
+ * __u32 map_lp_advertising[link_mode_masks_nwords];
+ */
+};
+#endif /* _LINUX_ETHTOOL_H */
diff --git a/src/shared/local-addresses.c b/src/shared/local-addresses.c
new file mode 100644
index 0000000..a1577de
--- /dev/null
+++ b/src/shared/local-addresses.c
@@ -0,0 +1,506 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <net/if_arp.h>
+
+#include "sd-netlink.h"
+
+#include "alloc-util.h"
+#include "fd-util.h"
+#include "local-addresses.h"
+#include "macro.h"
+#include "netlink-util.h"
+#include "sort-util.h"
+
+/* Ordering function for struct local_address, usable with typesafe_qsort().
+ *
+ * Sort keys, most significant first: address family (IPv4 before IPv6), then scope, metric and
+ * interface index (each lowest first), and finally the raw address bytes. Returns a negative value,
+ * zero or a positive value like memcmp(). */
+static int address_compare(const struct local_address *a, const struct local_address *b) {
+        int r;
+
+        if (a->family == AF_INET && b->family == AF_INET6)
+                return -1;
+        if (a->family == AF_INET6 && b->family == AF_INET)
+                return 1;
+
+        /* Walk down the remaining keys, stopping at the first one that differs. */
+        r = CMP(a->scope, b->scope);
+        if (r == 0)
+                r = CMP(a->metric, b->metric);
+        if (r == 0)
+                r = CMP(a->ifindex, b->ifindex);
+        if (r == 0)
+                r = memcmp(&a->address, &b->address, FAMILY_ADDRESS_SIZE(a->family));
+
+        return r;
+}
+
+/* Compacts an already sorted array in place so that entries comparing equal under address_compare()
+ * appear only once. *n_list is updated to the number of entries kept. */
+static void suppress_duplicates(struct local_address *list, size_t *n_list) {
+        const size_t n_total = *n_list;
+        size_t n_kept = 1;
+
+        /* Zero or one entry: nothing can be duplicated. */
+        if (n_total < 2)
+                return;
+
+        for (size_t i = 1; i < n_total; i++)
+                /* Keep an entry only if it differs from the last one we kept. */
+                if (address_compare(&list[i], &list[n_kept - 1]) != 0)
+                        list[n_kept++] = list[i];
+
+        *n_list = n_kept;
+}
+
+/* Enumerates the IPv4/IPv6 addresses configured on the local host via an RTM_GETADDR netlink dump.
+ *
+ * context: netlink connection to use; if NULL a new one is opened for the duration of the call.
+ * ifindex: if > 0 only addresses on that interface are collected; if 0, addresses of host or nowhere
+ * scope are skipped.
+ * af: AF_UNSPEC to accept any family, otherwise only addresses of that family are collected.
+ * ret: if non-NULL, receives the sorted and de-duplicated array (to be freed by the caller).
+ *
+ * Returns the number of entries on success, or a negative errno-style value on failure. If ret is NULL
+ * the entries are only counted and duplicates are not removed. Deprecated addresses are ignored. */
+int local_addresses(
+ sd_netlink *context,
+ int ifindex,
+ int af,
+ struct local_address **ret) {
+
+ _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL, *reply = NULL;
+ _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
+ _cleanup_free_ struct local_address *list = NULL;
+ size_t n_list = 0;
+ int r;
+
+ if (context)
+ rtnl = sd_netlink_ref(context);
+ else {
+ r = sd_netlink_open(&rtnl);
+ if (r < 0)
+ return r;
+ }
+
+ r = sd_rtnl_message_new_addr(rtnl, &req, RTM_GETADDR, ifindex, af);
+ if (r < 0)
+ return r;
+
+ r = sd_netlink_message_set_request_dump(req, true);
+ if (r < 0)
+ return r;
+
+ r = sd_netlink_call(rtnl, req, 0, &reply);
+ if (r < 0)
+ return r;
+
+ for (sd_netlink_message *m = reply; m; m = sd_netlink_message_next(m)) {
+ struct local_address *a;
+ unsigned char flags;
+ uint16_t type;
+ int ifi, family;
+
+ r = sd_netlink_message_get_errno(m);
+ if (r < 0)
+ return r;
+
+ r = sd_netlink_message_get_type(m, &type);
+ if (r < 0)
+ return r;
+ if (type != RTM_NEWADDR)
+ continue;
+
+ /* Apply the interface and family filters ourselves as well, regardless of what the request asked for. */
+ r = sd_rtnl_message_addr_get_ifindex(m, &ifi);
+ if (r < 0)
+ return r;
+ if (ifindex > 0 && ifi != ifindex)
+ continue;
+
+ r = sd_rtnl_message_addr_get_family(m, &family);
+ if (r < 0)
+ return r;
+ if (af != AF_UNSPEC && af != family)
+ continue;
+
+ r = sd_rtnl_message_addr_get_flags(m, &flags);
+ if (r < 0)
+ return r;
+ if (flags & IFA_F_DEPRECATED)
+ continue;
+
+ /* Reserve the next slot. It is only committed by the n_list++ at the end of this iteration, so
+ * any 'continue' below simply lets the next message reuse the same slot. */
+ if (!GREEDY_REALLOC0(list, n_list+1))
+ return -ENOMEM;
+
+ a = list + n_list;
+
+ r = sd_rtnl_message_addr_get_scope(m, &a->scope);
+ if (r < 0)
+ return r;
+
+ if (ifindex == 0 && IN_SET(a->scope, RT_SCOPE_HOST, RT_SCOPE_NOWHERE))
+ continue;
+
+ /* Prefer IFA_LOCAL and fall back to IFA_ADDRESS; messages carrying neither are skipped. */
+ switch (family) {
+
+ case AF_INET:
+ r = sd_netlink_message_read_in_addr(m, IFA_LOCAL, &a->address.in);
+ if (r < 0) {
+ r = sd_netlink_message_read_in_addr(m, IFA_ADDRESS, &a->address.in);
+ if (r < 0)
+ continue;
+ }
+ break;
+
+ case AF_INET6:
+ r = sd_netlink_message_read_in6_addr(m, IFA_LOCAL, &a->address.in6);
+ if (r < 0) {
+ r = sd_netlink_message_read_in6_addr(m, IFA_ADDRESS, &a->address.in6);
+ if (r < 0)
+ continue;
+ }
+ break;
+
+ default:
+ continue;
+ }
+
+ a->ifindex = ifi;
+ a->family = family;
+
+ n_list++;
+ };
+
+ /* Sorting and de-duplication only happen when the caller actually wants the array. */
+ if (ret) {
+ typesafe_qsort(list, n_list, address_compare);
+ suppress_duplicates(list, &n_list);
+ *ret = TAKE_PTR(list);
+ }
+
+ return (int) n_list;
+}
+
+/* Appends one gateway to the growing array *list (currently *n_list entries).
+ *
+ * Gateways whose family does not match 'af' are silently skipped, unless 'af' is AF_UNSPEC. Returns 0
+ * on success (including when skipped) and -ENOMEM if the array cannot be enlarged. */
+static int add_local_gateway(
+                struct local_address **list,
+                size_t *n_list,
+                int af,
+                int ifindex,
+                uint32_t metric,
+                const RouteVia *via) {
+
+        struct local_address *slot;
+
+        assert(list);
+        assert(n_list);
+        assert(via);
+
+        if (!(af == AF_UNSPEC || af == via->family))
+                return 0;
+
+        if (!GREEDY_REALLOC(*list, *n_list + 1))
+                return -ENOMEM;
+
+        /* The compound literal zero-initializes every field not named here (e.g. the scope), which matters
+         * because the memory obtained above is not cleared. */
+        slot = *list + *n_list;
+        *slot = (struct local_address) {
+                .ifindex = ifindex,
+                .metric = metric,
+                .family = via->family,
+                .address = via->address,
+        };
+        (*n_list)++;
+
+        return 0;
+}
+
+/* Collects the gateways of the default routes (destination and source prefix length 0) found in the
+ * main routing table, via an RTM_GETROUTE netlink dump.
+ *
+ * context: netlink connection to use; if NULL a new one is opened for the duration of the call.
+ * ifindex: if > 0 only gateways reachable through that interface are collected.
+ * af:      AF_UNSPEC to accept any family, otherwise only gateways of that family are collected.
+ * ret:     if non-NULL, receives the sorted and de-duplicated array (to be freed by the caller).
+ *
+ * Returns the number of entries on success, or a negative errno-style value on failure. If ret is NULL
+ * the entries are only counted and duplicates are not removed. */
+int local_gateways(
+                sd_netlink *context,
+                int ifindex,
+                int af,
+                struct local_address **ret) {
+
+        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *req = NULL, *reply = NULL;
+        _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
+        _cleanup_free_ struct local_address *list = NULL;
+        size_t n_list = 0;
+        int r;
+
+        if (context)
+                rtnl = sd_netlink_ref(context);
+        else {
+                r = sd_netlink_open(&rtnl);
+                if (r < 0)
+                        return r;
+        }
+
+        r = sd_rtnl_message_new_route(rtnl, &req, RTM_GETROUTE, af, RTPROT_UNSPEC);
+        if (r < 0)
+                return r;
+
+        r = sd_rtnl_message_route_set_type(req, RTN_UNICAST);
+        if (r < 0)
+                return r;
+
+        r = sd_rtnl_message_route_set_table(req, RT_TABLE_MAIN);
+        if (r < 0)
+                return r;
+
+        r = sd_netlink_message_set_request_dump(req, true);
+        if (r < 0)
+                return r;
+
+        r = sd_netlink_call(rtnl, req, 0, &reply);
+        if (r < 0)
+                return r;
+
+        for (sd_netlink_message *m = reply; m; m = sd_netlink_message_next(m)) {
+                _cleanup_ordered_set_free_free_ OrderedSet *multipath_routes = NULL;
+                _cleanup_free_ void *rta_multipath = NULL;
+                union in_addr_union gateway;
+                uint16_t type;
+                unsigned char dst_len, src_len, table;
+                uint32_t ifi = 0, metric = 0;
+                size_t rta_len;
+                int family;
+                RouteVia via;
+
+                r = sd_netlink_message_get_errno(m);
+                if (r < 0)
+                        return r;
+
+                r = sd_netlink_message_get_type(m, &type);
+                if (r < 0)
+                        return r;
+                if (type != RTM_NEWROUTE)
+                        continue;
+
+                /* We only care for default routes */
+                r = sd_rtnl_message_route_get_dst_prefixlen(m, &dst_len);
+                if (r < 0)
+                        return r;
+                if (dst_len != 0)
+                        continue;
+
+                r = sd_rtnl_message_route_get_src_prefixlen(m, &src_len);
+                if (r < 0)
+                        return r;
+                if (src_len != 0)
+                        continue;
+
+                r = sd_rtnl_message_route_get_table(m, &table);
+                if (r < 0)
+                        return r;
+                if (table != RT_TABLE_MAIN)
+                        continue;
+
+                /* The metric is optional; it stays 0 if the attribute is absent. */
+                r = sd_netlink_message_read_u32(m, RTA_PRIORITY, &metric);
+                if (r < 0 && r != -ENODATA)
+                        return r;
+
+                r = sd_rtnl_message_route_get_family(m, &family);
+                if (r < 0)
+                        return r;
+                if (!IN_SET(family, AF_INET, AF_INET6))
+                        continue;
+
+                /* Single-nexthop route: the outgoing interface is attached to the route itself. */
+                r = sd_netlink_message_read_u32(m, RTA_OIF, &ifi);
+                if (r < 0 && r != -ENODATA)
+                        return r;
+                if (r >= 0) {
+                        if (ifi == 0)
+                                return -EINVAL;
+                        if (ifindex > 0 && (int) ifi != ifindex)
+                                continue;
+
+                        r = netlink_message_read_in_addr_union(m, RTA_GATEWAY, family, &gateway);
+                        if (r < 0 && r != -ENODATA)
+                                return r;
+                        if (r >= 0) {
+                                via.family = family;
+                                via.address = gateway;
+                                r = add_local_gateway(&list, &n_list, af, ifi, metric, &via);
+                                if (r < 0)
+                                        return r;
+
+                                continue;
+                        }
+
+                        /* RTA_VIA (a gateway of a different family than the route) is only looked at for
+                         * IPv4 routes. */
+                        if (family != AF_INET)
+                                continue;
+
+                        r = sd_netlink_message_read(m, RTA_VIA, sizeof(via), &via);
+                        if (r < 0 && r != -ENODATA)
+                                return r;
+                        if (r >= 0) {
+                                r = add_local_gateway(&list, &n_list, af, ifi, metric, &via);
+                                if (r < 0)
+                                        return r;
+
+                                continue;
+                        }
+                }
+
+                /* Multipath route: every nexthop carries its own interface index and gateway. */
+                r = sd_netlink_message_read_data(m, RTA_MULTIPATH, &rta_len, &rta_multipath);
+                if (r < 0 && r != -ENODATA)
+                        return r;
+                if (r >= 0) {
+                        MultipathRoute *mr;
+
+                        r = rtattr_read_nexthop(rta_multipath, rta_len, family, &multipath_routes);
+                        if (r < 0)
+                                return r;
+
+                        ORDERED_SET_FOREACH(mr, multipath_routes) {
+                                if (ifindex > 0 && mr->ifindex != ifindex)
+                                        continue;
+
+                                /* Record the nexthop's own interface, not the route-level 'ifi', which is
+                                 * still 0 when the route carries no RTA_OIF. */
+                                r = add_local_gateway(&list, &n_list, af, mr->ifindex, metric, &mr->gateway);
+                                if (r < 0)
+                                        return r;
+                        }
+                }
+        }
+
+        /* Sorting and de-duplication only happen when the caller actually wants the array. */
+        if (ret) {
+                typesafe_qsort(list, n_list, address_compare);
+                suppress_duplicates(list, &n_list);
+                *ret = TAKE_PTR(list);
+        }
+
+        return (int) n_list;
+}
+
+/* Determines the local addresses used for traffic towards the default gateways.
+ *
+ * context, ifindex and af are passed on to local_gateways() unchanged. If ret is non-NULL it receives
+ * the sorted and de-duplicated array (to be freed by the caller), or NULL if there are no gateways.
+ *
+ * Returns the number of entries on success, or a negative errno-style value on failure. Gateways for
+ * which no local address could be determined are skipped rather than treated as an error. */
+int local_outbounds(
+ sd_netlink *context,
+ int ifindex,
+ int af,
+ struct local_address **ret) {
+
+ _cleanup_free_ struct local_address *list = NULL, *gateways = NULL;
+ size_t n_list = 0;
+ int r, n_gateways;
+
+ /* Determines our default outbound addresses, i.e. the "primary" local addresses we use to talk to IP
+ * addresses behind the default routes. This is still an address of the local host (i.e. this doesn't
+ * resolve NAT or so), but it's the set of addresses the local IP stack most likely uses to talk to
+ * other hosts.
+ *
+ * This works by connect()ing a SOCK_DGRAM socket to the local gateways, and then reading the IP
+ * address off the socket that was chosen for the routing decision. */
+
+ n_gateways = local_gateways(context, ifindex, af, &gateways);
+ if (n_gateways < 0)
+ return n_gateways;
+ if (n_gateways == 0) {
+ /* No gateways? Then we have no outbound addresses either. */
+ if (ret)
+ *ret = NULL;
+
+ return 0;
+ }
+
+ for (int i = 0; i < n_gateways; i++) {
+ _cleanup_close_ int fd = -EBADF;
+ union sockaddr_union sa;
+ socklen_t salen;
+
+ fd = socket(gateways[i].family, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
+ if (fd < 0)
+ return -errno;
+
+ switch (gateways[i].family) {
+
+ case AF_INET:
+ sa.in = (struct sockaddr_in) {
+ .sin_family = AF_INET,
+ .sin_addr = gateways[i].address.in,
+ .sin_port = htobe16(53), /* doesn't really matter which port we pick —
+ * we just care about the routing decision */
+ };
+
+ break;
+
+ case AF_INET6:
+ sa.in6 = (struct sockaddr_in6) {
+ .sin6_family = AF_INET6,
+ .sin6_addr = gateways[i].address.in6,
+ .sin6_port = htobe16(53),
+ .sin6_scope_id = gateways[i].ifindex,
+ };
+
+ break;
+
+ default:
+ assert_not_reached();
+ }
+
+ /* So ideally we'd just use IP_UNICAST_IF here to pass the ifindex info to the kernel before
+ * connect()ing, so that it influences the routing decision. However, on current kernels
+ * IP_UNICAST_IF doesn't actually influence the routing decision for UDP — which I think
+ * should probably just be considered a bug. Once that bug is fixed this is the best API to
+ * use, since it is the most lightweight. */
+ r = socket_set_unicast_if(fd, gateways[i].family, gateways[i].ifindex);
+ if (r < 0)
+ log_debug_errno(r, "Failed to set unicast interface index %i, ignoring: %m", gateways[i].ifindex);
+
+ /* We'll also use SO_BINDTOINDEX. This requires CAP_NET_RAW on old kernels, hence there's a
+ * good chance this fails. Since 5.7 this restriction was dropped and the first
+ * SO_BINDTOINDEX on a socket may be done without privileges. This one has the benefit of
+ * really influencing the routing decision, i.e. this one definitely works for us — as long
+ * as we have the privileges for it. */
+ r = socket_bind_to_ifindex(fd, gateways[i].ifindex);
+ if (r < 0)
+ log_debug_errno(r, "Failed to bind socket to interface %i, ignoring: %m", gateways[i].ifindex);
+
+ /* Let's now connect() to the UDP socket, forcing the kernel to make a routing decision and
+ * auto-bind the socket. We ignore failures on this, since that failure might happen for a
+ * multitude of reasons (policy/firewall issues, who knows?) and some of them might be
+ * *after* the routing decision and the auto-binding already took place. If so we can still
+ * make use of the binding and return it. Hence, let's not unnecessarily fail early here: we
+ * can still easily detect if the auto-binding worked or not, by comparing the bound IP
+ * address with zero — which we do below. */
+ if (connect(fd, &sa.sa, SOCKADDR_LEN(sa)) < 0)
+ log_debug_errno(errno, "Failed to connect SOCK_DGRAM socket to gateway, ignoring: %m");
+
+ /* Let's now read the socket address of the socket. A routing decision should have been
+ * made. Let's verify that and use the data. */
+ salen = SOCKADDR_LEN(sa);
+ if (getsockname(fd, &sa.sa, &salen) < 0)
+ return -errno;
+ assert(sa.sa.sa_family == gateways[i].family);
+ assert(salen == SOCKADDR_LEN(sa));
+
+ /* Record the address the socket was bound to, together with the gateway's family and interface.
+ * The 'continue' statements below skip to the next gateway. */
+ switch (gateways[i].family) {
+
+ case AF_INET:
+ if (in4_addr_is_null(&sa.in.sin_addr)) /* Auto-binding didn't work. :-( */
+ continue;
+
+ if (!GREEDY_REALLOC(list, n_list+1))
+ return -ENOMEM;
+
+ list[n_list++] = (struct local_address) {
+ .family = gateways[i].family,
+ .ifindex = gateways[i].ifindex,
+ .address.in = sa.in.sin_addr,
+ };
+
+ break;
+
+ case AF_INET6:
+ if (in6_addr_is_null(&sa.in6.sin6_addr))
+ continue;
+
+ if (!GREEDY_REALLOC(list, n_list+1))
+ return -ENOMEM;
+
+ list[n_list++] = (struct local_address) {
+ .family = gateways[i].family,
+ .ifindex = gateways[i].ifindex,
+ .address.in6 = sa.in6.sin6_addr,
+ };
+ break;
+
+ default:
+ assert_not_reached();
+ }
+ }
+
+ /* Sorting and de-duplication only happen when the caller actually wants the array. */
+ if (ret) {
+ typesafe_qsort(list, n_list, address_compare);
+ suppress_duplicates(list, &n_list);
+ *ret = TAKE_PTR(list);
+ }
+
+ return (int) n_list;
+}
diff --git a/src/shared/local-addresses.h b/src/shared/local-addresses.h
new file mode 100644
index 0000000..38a17d2
--- /dev/null
+++ b/src/shared/local-addresses.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-netlink.h"
+
+#include "in-addr-util.h"
+
+/* One entry of the arrays returned by local_addresses(), local_gateways() and local_outbounds(). */
+struct local_address {
+ int family, ifindex; /* AF_INET or AF_INET6, and the index of the interface the entry belongs to */
+ unsigned char scope; /* address scope as reported by netlink; only filled in by local_addresses() */
+ uint32_t metric; /* route priority; only filled in by local_gateways(), 0 otherwise */
+ union in_addr_union address; /* the address itself, to be interpreted according to 'family' */
+};
+
+/* All three functions return the number of entries found (or a negative errno-style value) and, if
+ * 'ret' is non-NULL, hand a sorted array to the caller, who must free it. 'rtnl' may be NULL, in which
+ * case a netlink connection is opened internally. */
+
+/* Addresses configured on the local interfaces. */
+int local_addresses(sd_netlink *rtnl, int ifindex, int af, struct local_address **ret);
+
+/* Gateways of the default routes. */
+int local_gateways(sd_netlink *rtnl, int ifindex, int af, struct local_address **ret);
+
+/* Local addresses used to reach the default gateways. */
+int local_outbounds(sd_netlink *rtnl, int ifindex, int af, struct local_address **ret);
diff --git a/src/shared/locale-setup.c b/src/shared/locale-setup.c
new file mode 100644
index 0000000..4e7f486
--- /dev/null
+++ b/src/shared/locale-setup.c
@@ -0,0 +1,294 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <sys/stat.h>
+
+#include "env-file-label.h"
+#include "env-file.h"
+#include "env-util.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "locale-setup.h"
+#include "proc-cmdline.h"
+#include "stat-util.h"
+#include "strv.h"
+
+/* Resets the context to its empty state: releases every stored locale value and forgets the remembered
+ * stat data of the configuration file. */
+void locale_context_clear(LocaleContext *c) {
+        assert(c);
+
+        for (LocaleVariable v = 0; v < _VARIABLE_LC_MAX; v++)
+                c->locale[v] = mfree(c->locale[v]);
+
+        c->st = (struct stat) {};
+}
+
+/* Loads the locale variables from the "locale.*" options of the kernel command line.
+ *
+ * Does nothing and returns 0 unless LOCALE_LOAD_PROC_CMDLINE is set in 'flag'. Otherwise the context is
+ * cleared first and the result of proc_cmdline_get_key_many() is returned, except that -ENOENT is
+ * mapped to 0 and other errors are logged at debug level. */
+static int locale_context_load_proc(LocaleContext *c, LocaleLoadFlag flag) {
+ int r;
+
+ assert(c);
+
+ if (!FLAGS_SET(flag, LOCALE_LOAD_PROC_CMDLINE))
+ return 0;
+
+ locale_context_clear(c);
+
+ r = proc_cmdline_get_key_many(PROC_CMDLINE_STRIP_RD_PREFIX,
+ "locale.LANG", &c->locale[VARIABLE_LANG],
+ "locale.LANGUAGE", &c->locale[VARIABLE_LANGUAGE],
+ "locale.LC_CTYPE", &c->locale[VARIABLE_LC_CTYPE],
+ "locale.LC_NUMERIC", &c->locale[VARIABLE_LC_NUMERIC],
+ "locale.LC_TIME", &c->locale[VARIABLE_LC_TIME],
+ "locale.LC_COLLATE", &c->locale[VARIABLE_LC_COLLATE],
+ "locale.LC_MONETARY", &c->locale[VARIABLE_LC_MONETARY],
+ "locale.LC_MESSAGES", &c->locale[VARIABLE_LC_MESSAGES],
+ "locale.LC_PAPER", &c->locale[VARIABLE_LC_PAPER],
+ "locale.LC_NAME", &c->locale[VARIABLE_LC_NAME],
+ "locale.LC_ADDRESS", &c->locale[VARIABLE_LC_ADDRESS],
+ "locale.LC_TELEPHONE", &c->locale[VARIABLE_LC_TELEPHONE],
+ "locale.LC_MEASUREMENT", &c->locale[VARIABLE_LC_MEASUREMENT],
+ "locale.LC_IDENTIFICATION", &c->locale[VARIABLE_LC_IDENTIFICATION]);
+ if (r == -ENOENT)
+ return 0;
+ if (r < 0)
+ return log_debug_errno(r, "Failed to read /proc/cmdline: %m");
+ /* NOTE(review): presumably > 0 only if at least one key was found, which is what the caller relies on
+ * to decide whether to fall back to the other sources — confirm against proc_cmdline_get_key_many(). */
+ return r;
+}
+
+/* Loads the locale variables from /etc/locale.conf into the context.
+ *
+ * Returns 1 if the context reflects the contents of the file (either freshly parsed, or still valid
+ * because the file is unchanged since it was last read or written), 0 if LOCALE_LOAD_LOCALE_CONF is not
+ * set in 'flag' or the file does not exist, and a negative errno-style value on failure (logged at
+ * debug level). */
+static int locale_context_load_conf(LocaleContext *c, LocaleLoadFlag flag) {
+        _cleanup_close_ int fd = -EBADF;
+        struct stat st;
+        int r;
+
+        assert(c);
+
+        if (!FLAGS_SET(flag, LOCALE_LOAD_LOCALE_CONF))
+                return 0;
+
+        fd = RET_NERRNO(open("/etc/locale.conf", O_CLOEXEC | O_PATH));
+        if (fd == -ENOENT)
+                return 0;
+        if (fd < 0)
+                return log_debug_errno(fd, "Failed to open /etc/locale.conf: %m");
+
+        if (fstat(fd, &st) < 0)
+                return log_debug_errno(errno, "Failed to stat /etc/locale.conf: %m");
+
+        /* If the file is not changed, then we do not need to re-read the file. The values held by the
+         * context are still current, hence report them as loaded: the caller treats 0 as "nothing
+         * loaded" and would clear the context. */
+        if (stat_inode_unmodified(&c->st, &st))
+                return 1; /* (already) loaded */
+
+        locale_context_clear(c);
+
+        r = parse_env_file_fd(fd, "/etc/locale.conf",
+                              "LANG",              &c->locale[VARIABLE_LANG],
+                              "LANGUAGE",          &c->locale[VARIABLE_LANGUAGE],
+                              "LC_CTYPE",          &c->locale[VARIABLE_LC_CTYPE],
+                              "LC_NUMERIC",        &c->locale[VARIABLE_LC_NUMERIC],
+                              "LC_TIME",           &c->locale[VARIABLE_LC_TIME],
+                              "LC_COLLATE",        &c->locale[VARIABLE_LC_COLLATE],
+                              "LC_MONETARY",       &c->locale[VARIABLE_LC_MONETARY],
+                              "LC_MESSAGES",       &c->locale[VARIABLE_LC_MESSAGES],
+                              "LC_PAPER",          &c->locale[VARIABLE_LC_PAPER],
+                              "LC_NAME",           &c->locale[VARIABLE_LC_NAME],
+                              "LC_ADDRESS",        &c->locale[VARIABLE_LC_ADDRESS],
+                              "LC_TELEPHONE",      &c->locale[VARIABLE_LC_TELEPHONE],
+                              "LC_MEASUREMENT",    &c->locale[VARIABLE_LC_MEASUREMENT],
+                              "LC_IDENTIFICATION", &c->locale[VARIABLE_LC_IDENTIFICATION]);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to read /etc/locale.conf: %m");
+
+        /* Remember the inode only now: locale_context_clear() above resets c->st, and a failed parse
+         * must not leave behind a cache entry that claims the context is up to date. */
+        c->st = st;
+
+        return 1; /* loaded */
+}
+
+/* Loads the locale variables from the process environment.
+ *
+ * Does nothing and returns 0 unless LOCALE_LOAD_ENVIRONMENT is set in 'flag'. Otherwise the context is
+ * cleared, every locale variable is copied from the environment (empty values count as unset), and 1
+ * is returned. Returns the result of log_oom_debug() if copying a value fails. */
+static int locale_context_load_env(LocaleContext *c, LocaleLoadFlag flag) {
+        assert(c);
+
+        if (!FLAGS_SET(flag, LOCALE_LOAD_ENVIRONMENT))
+                return 0;
+
+        locale_context_clear(c);
+
+        /* Fill in what we got passed from systemd. */
+        for (LocaleVariable v = 0; v < _VARIABLE_LC_MAX; v++) {
+                const char *name = ASSERT_PTR(locale_variable_to_string(v));
+                const char *value = empty_to_null(getenv(name));
+
+                if (free_and_strdup(&c->locale[v], value) < 0)
+                        return log_oom_debug();
+        }
+
+        return 1; /* loaded */
+}
+
+/* Populates the context from the first source that yields something, trying in this order: kernel
+ * command line, /etc/locale.conf, process environment. Which sources are consulted is selected by
+ * 'flag'.
+ *
+ * Returns 0 on success, including when no source provided anything (the context is then empty), and a
+ * negative errno-style value on failure (the context is then empty as well). With LOCALE_LOAD_SIMPLIFY
+ * the loaded variables are passed through locale_variables_simplify(). */
+int locale_context_load(LocaleContext *c, LocaleLoadFlag flag) {
+        int r;
+
+        assert(c);
+
+        r = locale_context_load_proc(c, flag);
+        if (r <= 0) {
+                /* Nothing on the kernel command line (a failure to read it is not fatal either): try the
+                 * configuration file, and only if that has nothing to offer, the environment. */
+                r = locale_context_load_conf(c, flag);
+                if (r == 0)
+                        r = locale_context_load_env(c, flag);
+        }
+
+        if (r <= 0) {
+                /* Nothing loaded, or error. */
+                locale_context_clear(c);
+                return r;
+        }
+
+        if (FLAGS_SET(flag, LOCALE_LOAD_SIMPLIFY))
+                locale_variables_simplify(c->locale);
+
+        return 0;
+}
+
+/* Converts the context into environment lists: *ret_set receives "NAME=value" assignments for every
+ * variable with a non-empty value, *ret_unset the names of all other variables. Either output may be
+ * NULL if the caller is not interested in it; if both are NULL nothing is done.
+ *
+ * Returns 0 on success, or a negative errno-style value on failure, in which case the outputs are left
+ * untouched. */
+int locale_context_build_env(const LocaleContext *c, char ***ret_set, char ***ret_unset) {
+        _cleanup_strv_free_ char **set = NULL, **unset = NULL;
+        int r;
+
+        assert(c);
+
+        if (!ret_set && !ret_unset)
+                return 0;
+
+        for (LocaleVariable p = 0; p < _VARIABLE_LC_MAX; p++) {
+                const char *name = ASSERT_PTR(locale_variable_to_string(p));
+                bool is_unset = isempty(c->locale[p]);
+
+                /* Skip variables that belong to a list the caller did not ask for. */
+                if (is_unset ? !ret_unset : !ret_set)
+                        continue;
+
+                r = is_unset ? strv_extend(&unset, name)
+                             : strv_env_assign(&set, name, c->locale[p]);
+                if (r < 0)
+                        return r;
+        }
+
+        if (ret_set)
+                *ret_set = TAKE_PTR(set);
+        if (ret_unset)
+                *ret_unset = TAKE_PTR(unset);
+        return 0;
+}
+
+/* Writes the context to /etc/locale.conf, or removes the file if no variable has a value.
+ *
+ * On success 0 is returned, c->st is updated to match the file now on disk (or reset if the file was
+ * removed or did not exist), and the optional outputs are always initialized: *ret_set receives the
+ * assignments that were written and *ret_unset the names of the variables without a value; both are
+ * set to NULL when the file was removed. On failure a negative errno-style value is returned and the
+ * outputs are left untouched. */
+int locale_context_save(LocaleContext *c, char ***ret_set, char ***ret_unset) {
+        _cleanup_strv_free_ char **set = NULL, **unset = NULL;
+        int r;
+
+        assert(c);
+
+        r = locale_context_build_env(c, &set, ret_unset ? &unset : NULL);
+        if (r < 0)
+                return r;
+
+        if (strv_isempty(set)) {
+                /* Nothing to store, hence remove the file. A file that is already gone is just as good,
+                 * and must take the same success path so that the outputs and c->st get initialized. */
+                if (unlink("/etc/locale.conf") < 0 && errno != ENOENT)
+                        return -errno;
+
+                c->st = (struct stat) {};
+
+                if (ret_set)
+                        *ret_set = NULL;
+                if (ret_unset)
+                        *ret_unset = NULL;
+                return 0;
+        }
+
+        r = write_env_file_label(AT_FDCWD, "/etc/locale.conf", NULL, set);
+        if (r < 0)
+                return r;
+
+        /* Remember what we just wrote, so that a later load can tell whether the file changed. */
+        if (stat("/etc/locale.conf", &c->st) < 0)
+                return -errno;
+
+        if (ret_set)
+                *ret_set = TAKE_PTR(set);
+        if (ret_unset)
+                *ret_unset = TAKE_PTR(unset);
+        return 0;
+}
+
+/* Fills the gaps in 'l' from the context: every entry of 'l' that is NULL or an empty string is replaced
+ * by a copy of the corresponding context value, provided the context has a non-empty one. Entries of
+ * 'l' that already carry a value are left alone.
+ *
+ * Returns 0 on success and a negative errno-style value if a copy cannot be allocated; entries
+ * processed before the failure stay merged. */
+int locale_context_merge(const LocaleContext *c, char *l[_VARIABLE_LC_MAX]) {
+        int r;
+
+        assert(c);
+        assert(l);
+
+        for (LocaleVariable p = 0; p < _VARIABLE_LC_MAX; p++) {
+                if (isempty(c->locale[p]) || !isempty(l[p]))
+                        continue;
+
+                /* l[p] may be an allocated empty string rather than NULL; free_and_strdup() releases it
+                 * instead of overwriting (and leaking) the pointer. */
+                r = free_and_strdup(&l[p], c->locale[p]);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+/* Moves every entry of 'l' into the context using free_and_replace(), replacing whatever the context
+ * held for that variable before. */
+void locale_context_take(LocaleContext *c, char *l[_VARIABLE_LC_MAX]) {
+        assert(c);
+        assert(l);
+
+        for (LocaleVariable v = 0; v < _VARIABLE_LC_MAX; v++) {
+                free_and_replace(c->locale[v], l[v]);
+        }
+}
+
+/* Returns true if every locale variable in the context matches the corresponding entry of 'l', as
+ * judged by streq_ptr(), and false as soon as one differs. */
+bool locale_context_equal(const LocaleContext *c, char *l[_VARIABLE_LC_MAX]) {
+        LocaleVariable p = 0;
+
+        assert(c);
+        assert(l);
+
+        /* Advance past the matching prefix; we reach the end only if nothing differed. */
+        while (p < _VARIABLE_LC_MAX && streq_ptr(c->locale[p], l[p]))
+                p++;
+
+        return p >= _VARIABLE_LC_MAX;
+}
+
+/* Merges the system locale settings into the environment block *environment.
+ *
+ * The settings are taken from the kernel command line or /etc/locale.conf; if neither provides any,
+ * "LANG=" followed by the compile-time default locale is used. Returns 0 on success and a negative
+ * errno-style value on failure, in which case *environment is left unchanged. */
+int locale_setup(char ***environment) {
+        _cleanup_(locale_context_clear) LocaleContext c = {};
+        _cleanup_strv_free_ char **add = NULL;
+        char **merged;
+        int r;
+
+        assert(environment);
+
+        r = locale_context_load(&c, LOCALE_LOAD_PROC_CMDLINE | LOCALE_LOAD_LOCALE_CONF);
+        if (r < 0)
+                return r;
+
+        r = locale_context_build_env(&c, &add, NULL);
+        if (r < 0)
+                return r;
+
+        if (strv_isempty(add)) {
+                /* If no locale is configured then default to compile-time default. */
+
+                add = strv_new("LANG=" SYSTEMD_DEFAULT_LOCALE);
+                if (!add)
+                        return -ENOMEM;
+        }
+
+        /* An empty environment block can simply be replaced by our list, no merging needed. */
+        if (strv_isempty(*environment)) {
+                strv_free_and_replace(*environment, add);
+                return 0;
+        }
+
+        merged = strv_env_merge(*environment, add);
+        if (!merged)
+                return -ENOMEM;
+
+        strv_free_and_replace(*environment, merged);
+        return 0;
+}
diff --git a/src/shared/locale-setup.h b/src/shared/locale-setup.h
new file mode 100644
index 0000000..537acc7
--- /dev/null
+++ b/src/shared/locale-setup.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <sys/stat.h>
+
+#include "locale-util.h"
+
+/* A set of locale variables, optionally tied to the state of /etc/locale.conf they correspond to. */
+typedef struct LocaleContext {
+ struct stat st; /* stat data of /etc/locale.conf, compared on load to detect changes; zeroed if unknown */
+ char *locale[_VARIABLE_LC_MAX]; /* value of each locale variable, indexed by LocaleVariable; NULL if unset */
+} LocaleContext;
+
+/* Flags for locale_context_load(): the first three select the sources to consult, the last one
+ * requests post-processing of whatever was loaded. */
+typedef enum LocaleLoadFlag {
+ LOCALE_LOAD_PROC_CMDLINE = 1 << 0, /* "locale.*" options on the kernel command line */
+ LOCALE_LOAD_LOCALE_CONF = 1 << 1, /* /etc/locale.conf */
+ LOCALE_LOAD_ENVIRONMENT = 1 << 2, /* environment variables of the calling process */
+ LOCALE_LOAD_SIMPLIFY = 1 << 3, /* run locale_variables_simplify() on the result */
+} LocaleLoadFlag;
+
+/* Frees all values and resets the context to its empty state. */
+void locale_context_clear(LocaleContext *c);
+/* Loads the context from the sources selected by 'flag'. */
+int locale_context_load(LocaleContext *c, LocaleLoadFlag flag);
+/* Splits the context into "NAME=value" assignments and a list of names without a value. */
+int locale_context_build_env(const LocaleContext *c, char ***ret_set, char ***ret_unset);
+/* Writes the context to /etc/locale.conf, or removes the file if no variable has a value. */
+int locale_context_save(LocaleContext *c, char ***ret_set, char ***ret_unset);
+
+/* Fills empty entries of 'l' with copies of the context's values. */
+int locale_context_merge(const LocaleContext *c, char *l[_VARIABLE_LC_MAX]);
+/* Moves the entries of 'l' into the context. */
+void locale_context_take(LocaleContext *c, char *l[_VARIABLE_LC_MAX]);
+/* Compares the context's values against 'l'. */
+bool locale_context_equal(const LocaleContext *c, char *l[_VARIABLE_LC_MAX]);
+
+/* Merges the configured system locale (or the compile-time default) into *environment. */
+int locale_setup(char ***environment);
diff --git a/src/shared/log-link.h b/src/shared/log-link.h
new file mode 100644
index 0000000..5f2b176
--- /dev/null
+++ b/src/shared/log-link.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "log.h"
+
+/* Logs a message for a network interface. If ifname is non-NULL the message goes through
+ * log_object_internal() with the name attached under "INTERFACE="; otherwise plain log_internal() is
+ * used. ifname is evaluated exactly once. */
+#define log_interface_full_errno_zerook(ifname, level, error, ...) \
+ ({ \
+ const char *_ifname = (ifname); \
+ _ifname ? log_object_internal(level, error, PROJECT_FILE, __LINE__, __func__, "INTERFACE=", _ifname, NULL, NULL, ##__VA_ARGS__) : \
+ log_internal(level, error, PROJECT_FILE, __LINE__, __func__, ##__VA_ARGS__); \
+ })
+
+/* Same as log_interface_full_errno_zerook(), but additionally passes the error value through
+ * ASSERT_NON_ZERO(). The error expression is evaluated exactly once. */
+#define log_interface_full_errno(ifname, level, error, ...) \
+ ({ \
+ int _error = (error); \
+ ASSERT_NON_ZERO(_error); \
+ log_interface_full_errno_zerook(ifname, level, _error, __VA_ARGS__); \
+ })
+
+/*
+ * The following macros append INTERFACE= to the message.
+ * The macros require a struct named 'Link' which contains 'char *ifname':
+ *
+ * typedef struct Link {
+ * char *ifname;
+ * } Link;
+ *
+ * See, network/networkd-link.h for example.
+ */
+
+/* Logs a message for a Link object by forwarding its ifname to log_interface_full_errno_zerook(). A
+ * NULL link is allowed and is logged without an interface name. The link expression is evaluated
+ * exactly once. */
+#define log_link_full_errno_zerook(link, level, error, ...) \
+ ({ \
+ const Link *_l = (link); \
+ log_interface_full_errno_zerook(_l ? _l->ifname : NULL, level, error, __VA_ARGS__); \
+ })
+
+/* Same as log_link_full_errno_zerook(), but additionally passes the error value through
+ * ASSERT_NON_ZERO(). The error expression is evaluated exactly once. */
+#define log_link_full_errno(link, level, error, ...) \
+ ({ \
+ int _error = (error); \
+ ASSERT_NON_ZERO(_error); \
+ log_link_full_errno_zerook(link, level, _error, __VA_ARGS__); \
+ })
+
+/* Logs without an error code (error is passed as 0); the result is discarded. */
+#define log_link_full(link, level, ...) (void) log_link_full_errno_zerook(link, level, 0, __VA_ARGS__)
+
+/* Per-level shortcuts for messages without an error code. */
+#define log_link_debug(link, ...) log_link_full(link, LOG_DEBUG, __VA_ARGS__)
+#define log_link_info(link, ...) log_link_full(link, LOG_INFO, __VA_ARGS__)
+#define log_link_notice(link, ...) log_link_full(link, LOG_NOTICE, __VA_ARGS__)
+#define log_link_warning(link, ...) log_link_full(link, LOG_WARNING, __VA_ARGS__)
+#define log_link_error(link, ...) log_link_full(link, LOG_ERR, __VA_ARGS__)
+
+/* Per-level shortcuts for messages with an error code; these go through log_link_full_errno() and
+ * hence through its ASSERT_NON_ZERO() check. */
+#define log_link_debug_errno(link, error, ...) log_link_full_errno(link, LOG_DEBUG, error, __VA_ARGS__)
+#define log_link_info_errno(link, error, ...) log_link_full_errno(link, LOG_INFO, error, __VA_ARGS__)
+#define log_link_notice_errno(link, error, ...) log_link_full_errno(link, LOG_NOTICE, error, __VA_ARGS__)
+#define log_link_warning_errno(link, error, ...) log_link_full_errno(link, LOG_WARNING, error, __VA_ARGS__)
+#define log_link_error_errno(link, error, ...) log_link_full_errno(link, LOG_ERR, error, __VA_ARGS__)
+
+/* Expand to a format string plus arguments for "MESSAGE=" and "INTERFACE=" fields. Unlike the
+ * log_link_*() macros above these dereference link unconditionally, so it must not be NULL. */
+#define LOG_LINK_MESSAGE(link, fmt, ...) "MESSAGE=%s: " fmt, (link)->ifname, ##__VA_ARGS__
+#define LOG_LINK_INTERFACE(link) "INTERFACE=%s", (link)->ifname
diff --git a/src/shared/logs-show.c b/src/shared/logs-show.c
new file mode 100644
index 0000000..a5d0400
--- /dev/null
+++ b/src/shared/logs-show.c
@@ -0,0 +1,2102 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <sys/socket.h>
+#include <syslog.h>
+#include <unistd.h>
+
+#include "sd-id128.h"
+#include "sd-journal.h"
+
+#include "alloc-util.h"
+#include "fd-util.h"
+#include "format-util.h"
+#include "glyph-util.h"
+#include "hashmap.h"
+#include "hostname-util.h"
+#include "id128-util.h"
+#include "io-util.h"
+#include "journal-internal.h"
+#include "journal-util.h"
+#include "json.h"
+#include "locale-util.h"
+#include "log.h"
+#include "logs-show.h"
+#include "macro.h"
+#include "namespace-util.h"
+#include "output-mode.h"
+#include "parse-util.h"
+#include "pretty-print.h"
+#include "process-util.h"
+#include "sparse-endian.h"
+#include "stdio-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "strv.h"
+#include "terminal-util.h"
+#include "time-util.h"
+#include "utf8.h"
+#include "web-util.h"
+
+/* up to three lines (each up to 100 characters) or 300 characters, whichever is less */
+#define PRINT_LINE_THRESHOLD 3
+#define PRINT_CHAR_THRESHOLD 300
+
+/* Size limit (in bytes) used by the JSON output paths in this file: unless OUTPUT_SHOW_ALL is set,
+ * data that reaches this size is emitted as JSON null instead of its value. */
+#define JSON_THRESHOLD 4096U
+
+/* Prints the catalog text of the current journal entry to f. Every line is introduced by a marker
+ * ("--", or two light-shade glyphs on UTF-8 locales) wrapped in the grey/normal ANSI sequences,
+ * followed by the text wrapped in the green/normal ANSI sequences.
+ *
+ * Returns 1 if a catalog text was printed, 0 if the entry has none (-ENOENT), and a negative
+ * error on failure. */
+static int print_catalog(FILE *f, sd_journal *j) {
+        _cleanup_free_ char *text = NULL, *formatted = NULL;
+        const char *marker = "--", *separator;
+        int r;
+
+        assert(j);
+
+        r = sd_journal_get_catalog(j, &text);
+        if (r < 0) {
+                if (r == -ENOENT)
+                        return 0;
+
+                return log_error_errno(r, "Failed to find catalog entry: %m");
+        }
+
+        if (is_locale_utf8())
+                marker = strjoina(special_glyph(SPECIAL_GLYPH_LIGHT_SHADE), special_glyph(SPECIAL_GLYPH_LIGHT_SHADE));
+
+        /* What each embedded newline is turned into: close the current line, then open the next
+         * one with the marker. */
+        separator = strjoina(ansi_normal(), "\n", ansi_grey(), marker, ansi_normal(), " ", ansi_green());
+
+        formatted = strreplace(strstrip(text), "\n", separator);
+        if (!formatted)
+                return log_oom();
+
+        fprintf(f, "%s%s %s%s%s%s\n",
+                ansi_grey(), marker, ansi_normal(), ansi_green(),
+                formatted,
+                ansi_normal());
+
+        return 1;
+}
+
+/* Looks for a "Documentation:" line in the catalog text of the current journal entry and extracts
+ * the first whitespace-delimited word following it.
+ *
+ * On success returns 1 and stores a newly allocated URL in *ret. Returns 0 with *ret set to NULL
+ * if there is no catalog entry, no such line, or the URL fails documentation_url_is_valid().
+ * Returns a negative error on failure; *ret is left untouched in that case. */
+static int url_from_catalog(sd_journal *j, char **ret) {
+        _cleanup_free_ char *catalog = NULL, *candidate = NULL;
+        const char *value = NULL;
+        int r;
+
+        assert(j);
+        assert(ret);
+
+        r = sd_journal_get_catalog(j, &catalog);
+        if (r < 0 && r != -ENOENT)
+                return log_error_errno(r, "Failed to find catalog entry: %m");
+
+        if (r >= 0)
+                value = find_line_startswith(catalog, "Documentation:");
+        if (!value) {
+                *ret = NULL;
+                return 0;
+        }
+
+        /* Advance over the blanks separating the key from its value */
+        value += strspn(value, " \t");
+
+        /* Keep everything up to the next whitespace/newline */
+        candidate = strdupcspn(value, WHITESPACE);
+        if (!candidate)
+                return log_oom();
+
+        if (!documentation_url_is_valid(candidate)) {
+                *ret = NULL;
+                return 0;
+        }
+
+        *ret = TAKE_PTR(candidate);
+        return 1;
+}
+
+/* Checks whether the raw journal datum 'data' (of 'length' bytes) begins with the prefix 'field'
+ * (of 'field_len' bytes, typically "NAME="). If so, the remainder is duplicated with a trailing
+ * NUL and stored in *target, replacing (and freeing) whatever was there; its length without the
+ * NUL goes to *target_len when that pointer is non-NULL.
+ *
+ * Returns 1 on a match, 0 if the datum does not carry this prefix, negative on allocation failure. */
+static int parse_field(
+                const void *data,
+                size_t length,
+                const char *field,
+                size_t field_len,
+                char **target,
+                size_t *target_len) {
+
+        size_t value_len;
+        char *copy;
+
+        assert(data);
+        assert(field);
+        assert(target);
+
+        if (length < field_len || memcmp(data, field, field_len) != 0)
+                return 0;
+
+        value_len = length - field_len;
+
+        copy = newdup_suffix0(char, (const char*) data + field_len, value_len);
+        if (!copy)
+                return log_oom();
+
+        free_and_replace(*target, copy);
+
+        if (target_len)
+                *target_len = value_len;
+
+        return 1;
+}
+
+/* One entry of a field lookup table for parse_fieldv(): the prefix to look for and where to store
+ * the value that follows it. */
+typedef struct ParseFieldVec {
+ const char *field; /* prefix to match at the start of a datum, e.g. "MESSAGE=" */
+ size_t field_len; /* length of 'field' in bytes */
+ char **target; /* receives a newly allocated copy of the value */
+ size_t *target_len; /* receives the value's length, passed through to parse_field() */
+} ParseFieldVec;
+
+/* Initializer for a ParseFieldVec entry; field_len is derived from _field with strlen(). */
+#define PARSE_FIELD_VEC_ENTRY(_field, _target, _target_len) { \
+ .field = _field, \
+ .field_len = strlen(_field), \
+ .target = _target, \
+ .target_len = _target_len \
+ }
+
+/* Runs parse_field() against each entry of 'fields' in order and stops at the first entry whose
+ * prefix matches the datum. Returns 0 whether or not any entry matched, and a negative error if
+ * parse_field() fails. */
+static int parse_fieldv(
+                const void *data,
+                size_t length,
+                const ParseFieldVec *fields,
+                size_t n_fields) {
+
+        size_t idx = 0;
+
+        while (idx < n_fields) {
+                const ParseFieldVec *entry = fields + idx;
+                int r;
+
+                r = parse_field(data, length, entry->field, entry->field_len, entry->target, entry->target_len);
+                if (r < 0)
+                        return r;
+                if (r > 0)
+                        return 0; /* a datum can only match one prefix, no need to look further */
+
+                idx++;
+        }
+
+        return 0;
+}
+
+/* Tells whether the field whose name is the first 'n' bytes of 'name' is selected by 'fields'.
+ * A NULL set selects everything (returns 1); otherwise the result of set_contains() is returned. */
+static int field_set_test(const Set *fields, const char *name, size_t n) {
+        char *key;
+
+        if (fields) {
+                /* 'name' is not NUL-terminated at n, hence make a terminated copy for the lookup */
+                key = strndupa_safe(name, n);
+                return set_contains(fields, key);
+        }
+
+        return 1;
+}
+
+/* Decides whether a field value of 'l' bytes at 'p' is shown as-is: always when OUTPUT_SHOW_ALL
+ * is set, otherwise only when it is shorter than PRINT_CHAR_THRESHOLD and passes
+ * utf8_is_printable(). */
+static bool shall_print(const char *p, size_t l, OutputFlags flags) {
+        assert(p);
+
+        if (flags & OUTPUT_SHOW_ALL)
+                return true;
+
+        return l < PRINT_CHAR_THRESHOLD && utf8_is_printable(p, l);
+}
+
+/* Prints 'message' (of 'message_len' bytes, not necessarily NUL-terminated) to f, one output line
+ * per '\n'-separated input line. The first line is printed without indentation, all further lines
+ * are indented by 'prefix' columns.
+ *
+ * Unless OUTPUT_FULL_WIDTH or OUTPUT_SHOW_ALL is set, lines that do not fit into 'n_columns' are
+ * ellipsized, and output stops after PRINT_LINE_THRESHOLD lines or once PRINT_CHAR_THRESHOLD
+ * characters of the message have been consumed.
+ *
+ * 'highlight', if non-NULL, holds the start and end byte offsets of a range to emphasize. The
+ * offsets are relative to the start of 'message'.
+ *
+ * Returns true if any part of the message was ellipsized or cut off. */
+static bool print_multiline(
+                FILE *f,
+                unsigned prefix,
+                unsigned n_columns,
+                OutputFlags flags,
+                int priority,
+                bool audit,
+                const char* message,
+                size_t message_len,
+                size_t highlight[2]) {
+
+        const char *color_on = "", *color_off = "", *highlight_on = "";
+        const char *pos, *end;
+        bool ellipsized = false;
+        int line = 0;
+
+        if (flags & OUTPUT_COLOR) {
+                get_log_colors(priority, &color_on, &color_off, &highlight_on);
+
+                /* NOTE(review): if strempty() maps NULL to "" (as its name suggests) this test only
+                 * depends on 'audit'; isempty() may have been intended. Left unchanged, please
+                 * confirm. */
+                if (audit && strempty(color_on)) {
+                        color_on = ANSI_BLUE;
+                        color_off = ANSI_NORMAL;
+                }
+        }
+
+        /* A special case: make sure that we print a newline when
+           the message is empty. */
+        if (message_len == 0)
+                fputs("\n", f);
+
+        for (pos = message;
+             pos < message + message_len;
+             pos = end + 1, line++) {
+                bool tail_line;
+                int len, indent = (line > 0) * prefix;
+                size_t line_start;
+
+                for (end = pos; end < message + message_len && *end != '\n'; end++)
+                        ;
+                len = end - pos;
+                assert(len >= 0);
+
+                /* Offset of this line within the message; needed to translate the highlight range. */
+                line_start = (size_t) (pos - message);
+
+                /* We need to figure out when we are showing not-last line, *and*
+                 * will skip subsequent lines. In that case, we will put the dots
+                 * at the end of the line, instead of putting dots in the middle
+                 * or not at all.
+                 */
+                tail_line =
+                        line + 1 == PRINT_LINE_THRESHOLD ||
+                        end + 1 >= message + PRINT_CHAR_THRESHOLD;
+
+                if (flags & (OUTPUT_FULL_WIDTH | OUTPUT_SHOW_ALL) ||
+                    (prefix + len + 1 < n_columns && !tail_line)) {
+                        if (highlight &&
+                            highlight[0] >= line_start &&
+                            highlight[0] - line_start < (size_t) len) {
+
+                                /* The highlight range is relative to the message, but we print
+                                 * relative to the line start: translate it, and clamp the end to
+                                 * the end of this line. */
+                                size_t hl_begin = highlight[0] - line_start;
+                                size_t hl_end = highlight[1] <= highlight[0] ?
+                                        hl_begin :
+                                        MIN((size_t) len, highlight[1] - line_start);
+
+                                fprintf(f, "%*s%s%.*s",
+                                        indent, "",
+                                        color_on, (int) hl_begin, pos);
+                                fprintf(f, "%s%.*s",
+                                        highlight_on,
+                                        (int) (hl_end - hl_begin),
+                                        pos + hl_begin);
+                                if ((size_t) len > hl_end)
+                                        fprintf(f, "%s%.*s",
+                                                color_on,
+                                                (int) ((size_t) len - hl_end),
+                                                pos + hl_end);
+                                fprintf(f, "%s\n", color_off);
+
+                        } else
+                                fprintf(f, "%*s%s%.*s%s\n",
+                                        indent, "",
+                                        color_on, len, pos, color_off);
+                        continue;
+                }
+
+                /* Beyond this point, ellipsization will happen. */
+                ellipsized = true;
+
+                if (prefix < n_columns && n_columns - prefix >= 3) {
+                        if (n_columns - prefix > (unsigned) len + 3)
+                                fprintf(f, "%*s%s%.*s...%s\n",
+                                        indent, "",
+                                        color_on, len, pos, color_off);
+                        else {
+                                _cleanup_free_ char *e = NULL;
+
+                                e = ellipsize_mem(pos, len, n_columns - prefix,
+                                                  tail_line ? 100 : 90);
+                                if (!e)
+                                        /* Out of memory: fall back to the unshortened line */
+                                        fprintf(f, "%*s%s%.*s%s\n",
+                                                indent, "",
+                                                color_on, len, pos, color_off);
+                                else
+                                        fprintf(f, "%*s%s%s%s\n",
+                                                indent, "",
+                                                color_on, e, color_off);
+                        }
+                } else
+                        fputs("...\n", f);
+
+                if (tail_line)
+                        break;
+        }
+
+        return ellipsized;
+}
+
+/* Writes the monotonic timestamp of an entry to f in the form "[SSSSS.UUUUUU]". In
+ * OUTPUT_SHORT_DELTA mode the time elapsed since the previous entry is added inside the brackets.
+ *
+ * Returns the number of characters written (the sum of the fprintf() results), or a negative
+ * error if display_ts carries no valid monotonic timestamp. */
+static int output_timestamp_monotonic(
+ FILE *f,
+ OutputMode mode,
+ const dual_timestamp *display_ts,
+ const sd_id128_t *boot_id,
+ const dual_timestamp *previous_display_ts,
+ const sd_id128_t *previous_boot_id) {
+
+ int written_chars = 0;
+
+ assert(f);
+ assert(display_ts);
+ assert(boot_id);
+ assert(previous_display_ts);
+ assert(previous_boot_id);
+
+ if (!VALID_MONOTONIC(display_ts->monotonic))
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "No valid monotonic timestamp available");
+
+ written_chars += fprintf(f, "[%5"PRI_USEC".%06"PRI_USEC, display_ts->monotonic / USEC_PER_SEC, display_ts->monotonic % USEC_PER_SEC);
+
+ if (mode == OUTPUT_SHORT_DELTA) {
+ uint64_t delta;
+ bool reliable_ts = true;
+
+ /* Prefer the monotonic clock, which is only comparable between entries of the same boot.
+ * Otherwise fall back to the realtime clock and flag the delta with "*". */
+ if (VALID_MONOTONIC(previous_display_ts->monotonic) && sd_id128_equal(*boot_id, *previous_boot_id))
+ delta = usec_sub_unsigned(display_ts->monotonic, previous_display_ts->monotonic);
+ else if (VALID_REALTIME(display_ts->realtime) && VALID_REALTIME(previous_display_ts->realtime)) {
+ delta = usec_sub_unsigned(display_ts->realtime, previous_display_ts->realtime);
+ reliable_ts = false;
+ } else {
+ /* No delta available: emit 16 blanks, the width of a delta field with up to five
+ * digits of seconds. */
+ written_chars += fprintf(f, "%16s", "");
+ goto finish;
+ }
+
+ written_chars += fprintf(f, " <%5"PRI_USEC".%06"PRI_USEC"%s>", delta / USEC_PER_SEC, delta % USEC_PER_SEC, reliable_ts ? " " : "*");
+ }
+
+finish:
+ written_chars += fprintf(f, "%s", "]");
+ return written_chars;
+}
+
+/* Writes the realtime (wallclock) timestamp of an entry to f, formatted according to 'mode':
+ *
+ *   OUTPUT_SHORT_FULL, OUTPUT_WITH_UNIT            format_timestamp() / format_timestamp_style()
+ *   OUTPUT_SHORT_UNIX                              seconds.microseconds since the epoch
+ *   OUTPUT_SHORT_ISO, OUTPUT_SHORT_ISO_PRECISE     YYYY-MM-DDTHH:MM:SS[.UUUUUU]+HH:MM
+ *   OUTPUT_SHORT, OUTPUT_SHORT_PRECISE             syslog style "Mon DD HH:MM:SS[.UUUUUU]"
+ *
+ * With OUTPUT_UTC set in 'flags' the time is broken down as UTC instead of local time.
+ *
+ * Returns the number of bytes written, or a negative error if display_ts carries no valid realtime
+ * timestamp or formatting fails. Any other mode hits assert_not_reached(). */
+static int output_timestamp_realtime(
+                FILE *f,
+                sd_journal *j,
+                OutputMode mode,
+                OutputFlags flags,
+                const dual_timestamp *display_ts) {
+
+        char buf[CONST_MAX(FORMAT_TIMESTAMP_MAX, 64U)];
+        int r;
+
+        assert(f);
+        assert(j);
+        assert(display_ts);
+
+        if (!VALID_REALTIME(display_ts->realtime))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "No valid realtime timestamp available");
+
+        if (IN_SET(mode, OUTPUT_SHORT_FULL, OUTPUT_WITH_UNIT)) {
+                const char *k;
+
+                if (flags & OUTPUT_UTC)
+                        k = format_timestamp_style(buf, sizeof(buf), display_ts->realtime, TIMESTAMP_UTC);
+                else
+                        k = format_timestamp(buf, sizeof(buf), display_ts->realtime);
+                if (!k)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "Failed to format timestamp: %" PRIu64, display_ts->realtime);
+
+        } else {
+                struct tm tm;
+                time_t t;
+
+                t = (time_t) (display_ts->realtime / USEC_PER_SEC);
+
+                switch (mode) {
+
+                case OUTPUT_SHORT_UNIX:
+                        xsprintf(buf, "%10"PRI_TIME".%06"PRIu64, t, display_ts->realtime % USEC_PER_SEC);
+                        break;
+
+                case OUTPUT_SHORT_ISO:
+                case OUTPUT_SHORT_ISO_PRECISE: {
+                        size_t tail = strftime(buf, sizeof(buf), "%Y-%m-%dT%H:%M:%S",
+                                               localtime_or_gmtime_r(&t, &tm, flags & OUTPUT_UTC));
+                        if (tail == 0)
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Failed to format ISO time");
+
+                        /* No usec in strftime, need to append */
+                        if (mode == OUTPUT_SHORT_ISO_PRECISE) {
+                                assert(ELEMENTSOF(buf) - tail >= 7);
+                                snprintf(buf + tail, ELEMENTSOF(buf) - tail, ".%06"PRI_USEC, display_ts->realtime % USEC_PER_SEC);
+                                tail += 7;
+                        }
+
+                        /* Derive the sign from the whole offset rather than from the hour part:
+                         * otherwise a negative offset of less than one hour (hour part 0) would be
+                         * shown as "+00:MM". */
+                        long offset_minutes = labs(tm.tm_gmtoff) / 60;
+                        snprintf(buf + tail, ELEMENTSOF(buf) - tail, "%c%02ld:%02ld",
+                                 tm.tm_gmtoff < 0 ? '-' : '+',
+                                 offset_minutes / 60,
+                                 offset_minutes % 60);
+                        break;
+                }
+
+                case OUTPUT_SHORT:
+                case OUTPUT_SHORT_PRECISE:
+
+                        if (strftime(buf, sizeof(buf), "%b %d %H:%M:%S",
+                                     localtime_or_gmtime_r(&t, &tm, flags & OUTPUT_UTC)) <= 0)
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Failed to format syslog time");
+
+                        if (mode == OUTPUT_SHORT_PRECISE) {
+                                size_t k;
+
+                                assert(sizeof(buf) > strlen(buf));
+                                k = sizeof(buf) - strlen(buf);
+
+                                r = snprintf(buf + strlen(buf), k, ".%06"PRIu64, display_ts->realtime % USEC_PER_SEC);
+                                if (r <= 0 || (size_t) r >= k) /* too long? */
+                                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                               "Failed to format precise time");
+                        }
+                        break;
+
+                default:
+                        assert_not_reached();
+                }
+        }
+
+        fputs(buf, f);
+        return (int) strlen(buf);
+}
+
+/* Formats the current journal entry in one of the "short" output modes:
+ *
+ *   TIMESTAMP [HOSTNAME] UNIT|IDENTIFIER|COMM|unknown[PID]: [DOC-LINK ]MESSAGE
+ *
+ * The fields are collected in a single pass over the entry's data. The width of everything printed
+ * in front of the message is accumulated in 'n' and handed to print_multiline() as the prefix
+ * width (plus 2 for the ": " separator), which uses it for indenting continuation lines and for
+ * deciding about ellipsization.
+ *
+ * Returns a positive value if the message was ellipsized, 0 if it was not or if the entry was
+ * skipped (unreadable, or no MESSAGE= field), and a negative error on failure. */
+static int output_short(
+                FILE *f,
+                sd_journal *j,
+                OutputMode mode,
+                unsigned n_columns,
+                OutputFlags flags,
+                const Set *output_fields,
+                const size_t highlight[2],
+                const dual_timestamp *display_ts,
+                const sd_id128_t *boot_id,
+                const dual_timestamp *previous_display_ts,
+                const sd_id128_t *previous_boot_id) {
+
+        int r;
+        const void *data;
+        size_t length, n = 0;
+        _cleanup_free_ char *hostname = NULL, *identifier = NULL, *comm = NULL, *pid = NULL, *fake_pid = NULL,
+                *message = NULL, *priority = NULL, *transport = NULL,
+                *config_file = NULL, *unit = NULL, *user_unit = NULL, *documentation_url = NULL;
+        size_t hostname_len = 0, identifier_len = 0, comm_len = 0, pid_len = 0, fake_pid_len = 0, message_len = 0,
+                priority_len = 0, transport_len = 0, config_file_len = 0,
+                unit_len = 0, user_unit_len = 0, documentation_url_len = 0;
+        int p = LOG_INFO;
+        bool ellipsized = false, audit;
+        const ParseFieldVec fields[] = {
+                PARSE_FIELD_VEC_ENTRY("_PID=",                &pid,               &pid_len              ),
+                PARSE_FIELD_VEC_ENTRY("_COMM=",               &comm,              &comm_len             ),
+                PARSE_FIELD_VEC_ENTRY("MESSAGE=",             &message,           &message_len          ),
+                PARSE_FIELD_VEC_ENTRY("PRIORITY=",            &priority,          &priority_len         ),
+                PARSE_FIELD_VEC_ENTRY("_TRANSPORT=",          &transport,         &transport_len        ),
+                PARSE_FIELD_VEC_ENTRY("_HOSTNAME=",           &hostname,          &hostname_len         ),
+                PARSE_FIELD_VEC_ENTRY("SYSLOG_PID=",          &fake_pid,          &fake_pid_len         ),
+                PARSE_FIELD_VEC_ENTRY("SYSLOG_IDENTIFIER=",   &identifier,        &identifier_len       ),
+                PARSE_FIELD_VEC_ENTRY("CONFIG_FILE=",         &config_file,       &config_file_len      ),
+                PARSE_FIELD_VEC_ENTRY("_SYSTEMD_UNIT=",       &unit,              &unit_len             ),
+                PARSE_FIELD_VEC_ENTRY("_SYSTEMD_USER_UNIT=",  &user_unit,         &user_unit_len        ),
+                PARSE_FIELD_VEC_ENTRY("DOCUMENTATION=",       &documentation_url, &documentation_url_len),
+        };
+        /* Private copy of the highlight range, since it has to be adjusted whenever the message is
+         * rewritten below. {0, 0} if the caller passed no range. */
+        size_t highlight_shifted[] = {highlight ? highlight[0] : 0, highlight ? highlight[1] : 0};
+
+        assert(f);
+        assert(j);
+        assert(display_ts);
+        assert(boot_id);
+        assert(previous_display_ts);
+        assert(previous_boot_id);
+
+        /* Set the threshold to one bigger than the actual print threshold, so that if the line is actually
+         * longer than what we're willing to print, ellipsization will occur. This way we won't output a
+         * misleading line without any indication of truncation.
+         */
+        (void) sd_journal_set_data_threshold(j, flags & (OUTPUT_SHOW_ALL|OUTPUT_FULL_WIDTH) ? 0 : PRINT_CHAR_THRESHOLD + 1);
+
+        JOURNAL_FOREACH_DATA_RETVAL(j, data, length, r) {
+                r = parse_fieldv(data, length, fields, ELEMENTSOF(fields));
+                if (r < 0)
+                        return r;
+        }
+        if (IN_SET(r, -EBADMSG, -EADDRNOTAVAIL)) {
+                log_debug_errno(r, "Skipping message we can't read: %m");
+                return 0;
+        }
+        if (r < 0)
+                return log_error_errno(r, "Failed to get journal fields: %m");
+
+        if (!message) {
+                log_debug("Skipping message without MESSAGE= field.");
+                return 0;
+        }
+
+        if (!(flags & OUTPUT_SHOW_ALL))
+                strip_tab_ansi(&message, &message_len, highlight_shifted);
+
+        if (flags & OUTPUT_TRUNCATE_NEWLINE)
+                truncate_nl_full(message, &message_len);
+
+        /* Only a single digit in the range 0…7 is accepted as priority, otherwise LOG_INFO is used */
+        if (priority_len == 1 && *priority >= '0' && *priority <= '7')
+                p = *priority - '0';
+
+        audit = streq_ptr(transport, "audit");
+
+        if (IN_SET(mode, OUTPUT_SHORT_MONOTONIC, OUTPUT_SHORT_DELTA))
+                r = output_timestamp_monotonic(f, mode, display_ts, boot_id, previous_display_ts, previous_boot_id);
+        else
+                r = output_timestamp_realtime(f, j, mode, flags, display_ts);
+        if (r < 0)
+                return r;
+        n += r;
+
+        if (flags & OUTPUT_NO_HOSTNAME) {
+                /* Suppress display of the hostname if this is requested. */
+                hostname = mfree(hostname);
+                hostname_len = 0;
+        }
+
+        if (hostname && shall_print(hostname, hostname_len, flags)) {
+                fprintf(f, " %.*s", (int) hostname_len, hostname);
+                n += hostname_len + 1;
+        }
+
+        if (mode == OUTPUT_WITH_UNIT && ((unit && shall_print(unit, unit_len, flags)) ||
+                                         (user_unit && shall_print(user_unit, user_unit_len, flags)))) {
+                if (unit) {
+                        fprintf(f, " %.*s", (int) unit_len, unit);
+                        n += unit_len + 1;
+                }
+                if (user_unit) {
+                        if (unit)
+                                fprintf(f, "/%.*s", (int) user_unit_len, user_unit);
+                        else
+                                fprintf(f, " %.*s", (int) user_unit_len, user_unit);
+                        /* One separator character plus the user unit name itself (not the system
+                         * unit name, whose width was already accounted for above). */
+                        n += user_unit_len + 1;
+                }
+        } else if (identifier && shall_print(identifier, identifier_len, flags)) {
+                fprintf(f, " %.*s", (int) identifier_len, identifier);
+                n += identifier_len + 1;
+        } else if (comm && shall_print(comm, comm_len, flags)) {
+                fprintf(f, " %.*s", (int) comm_len, comm);
+                n += comm_len + 1;
+        } else {
+                fputs(" unknown", f);
+                /* Account for the placeholder too, so that the prefix width stays accurate */
+                n += strlen(" unknown");
+        }
+
+        if (pid && shall_print(pid, pid_len, flags)) {
+                fprintf(f, "[%.*s]", (int) pid_len, pid);
+                n += pid_len + 2;
+        } else if (fake_pid && shall_print(fake_pid, fake_pid_len, flags)) {
+                fprintf(f, "[%.*s]", (int) fake_pid_len, fake_pid);
+                n += fake_pid_len + 2;
+        }
+
+        fputs(": ", f);
+
+        if (urlify_enabled()) {
+                _cleanup_free_ char *c = NULL;
+
+                /* Insert a hyperlink to a documentation URL before the message. Note that we don't make the
+                 * whole message a hyperlink, since otherwise the whole screen might end up being just
+                 * hyperlinks. Moreover, we want to be able to highlight parts of the message (such as the
+                 * config file, see below) hence let's keep the documentation URL link separate. */
+
+                if (documentation_url && shall_print(documentation_url, documentation_url_len, flags)) {
+                        c = strndup(documentation_url, documentation_url_len);
+                        if (!c)
+                                return log_oom();
+
+                        if (!documentation_url_is_valid(c)) /* Eat up invalid links */
+                                c = mfree(c);
+                }
+
+                if (!c)
+                        (void) url_from_catalog(j, &c); /* Acquire from catalog if not embedded in log message itself */
+
+                if (c) {
+                        _cleanup_free_ char *urlified = NULL;
+
+                        if (terminal_urlify(c, special_glyph(SPECIAL_GLYPH_EXTERNAL_LINK), &urlified) >= 0) {
+                                fputs(urlified, f);
+                                fputc(' ', f);
+                        }
+                }
+        }
+
+        if (!(flags & OUTPUT_SHOW_ALL) && !utf8_is_printable(message, message_len))
+                fprintf(f, "[%s blob data]\n", FORMAT_BYTES(message_len));
+        else {
+
+                /* URLify config_file string in message, if the message starts with it.
+                 * Skip URLification if the highlighted pattern overlaps. */
+                if (config_file &&
+                    message_len >= config_file_len &&
+                    memcmp(message, config_file, config_file_len) == 0 &&
+                    (message_len == config_file_len || IN_SET(message[config_file_len], ':', ' ')) &&
+                    (!highlight || highlight_shifted[0] == 0 || highlight_shifted[0] > config_file_len)) {
+
+                        _cleanup_free_ char *t = NULL, *urlified = NULL;
+
+                        t = strndup(config_file, config_file_len);
+                        if (t && terminal_urlify_path(t, NULL, &urlified) >= 0) {
+                                size_t urlified_len = strlen(urlified);
+                                size_t shift = urlified_len - config_file_len;
+                                char *joined;
+
+                                /* Grow the urlified path so that the rest of the message fits behind
+                                 * it. The result is tracked by length only, no NUL is appended. */
+                                joined = realloc(urlified, message_len + shift);
+                                if (joined) {
+                                        memcpy(joined + urlified_len, message + config_file_len, message_len - config_file_len);
+                                        free_and_replace(message, joined);
+                                        /* realloc() took over the old buffer, don't free it again */
+                                        TAKE_PTR(urlified);
+                                        message_len += shift;
+                                        if (highlight) {
+                                                highlight_shifted[0] += shift;
+                                                highlight_shifted[1] += shift;
+                                        }
+                                }
+                        }
+                }
+
+                ellipsized |=
+                        print_multiline(f, n + 2, n_columns, flags, p, audit,
+                                        message, message_len,
+                                        highlight_shifted);
+        }
+
+        if (flags & OUTPUT_CATALOG)
+                (void) print_catalog(f, j);
+
+        return ellipsized;
+}
+
+/* Formats the current journal entry in verbose mode: a header line with the realtime timestamp and
+ * the entry's cursor in brackets, followed by one "FIELD=value" line per selected field.
+ *
+ * Fields not selected by 'output_fields' are skipped (a NULL set selects all, see field_set_test()).
+ * Values that are too long or not printable are replaced by a "[… blob data]" placeholder unless
+ * OUTPUT_SHOW_ALL is set.
+ *
+ * Returns 0 on success, a negative error on failure. */
+static int output_verbose(
+ FILE *f,
+ sd_journal *j,
+ OutputMode mode,
+ unsigned n_columns,
+ OutputFlags flags,
+ const Set *output_fields,
+ const size_t highlight[2],
+ const dual_timestamp *display_ts,
+ const sd_id128_t *boot_id,
+ const dual_timestamp *previous_display_ts,
+ const sd_id128_t *previous_boot_id) {
+
+ const void *data;
+ size_t length;
+ _cleanup_free_ char *cursor = NULL;
+ char buf[FORMAT_TIMESTAMP_MAX + 7];
+ const char *timestamp;
+ int r;
+
+ assert(f);
+ assert(j);
+ assert(display_ts);
+ assert(boot_id);
+ assert(previous_display_ts);
+ assert(previous_boot_id);
+
+ /* Verbose mode requests the field data without a size threshold */
+ (void) sd_journal_set_data_threshold(j, 0);
+
+ if (!VALID_REALTIME(display_ts->realtime))
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "No valid realtime timestamp available");
+
+ r = sd_journal_get_cursor(j, &cursor);
+ if (r < 0)
+ return log_error_errno(r, "Failed to get cursor: %m");
+
+ /* Header line; if the timestamp cannot be formatted "(no timestamp)" is shown instead */
+ timestamp = format_timestamp_style(buf, sizeof buf, display_ts->realtime,
+ flags & OUTPUT_UTC ? TIMESTAMP_US_UTC : TIMESTAMP_US);
+ fprintf(f, "%s%s%s %s[%s]%s\n",
+ timestamp && (flags & OUTPUT_COLOR) ? ANSI_UNDERLINE : "",
+ timestamp ?: "(no timestamp)",
+ timestamp && (flags & OUTPUT_COLOR) ? ANSI_NORMAL : "",
+ (flags & OUTPUT_COLOR) ? ANSI_GREY : "",
+ cursor,
+ (flags & OUTPUT_COLOR) ? ANSI_NORMAL : "");
+
+ JOURNAL_FOREACH_DATA_RETVAL(j, data, length, r) {
+ _cleanup_free_ char *urlified = NULL;
+ const char *on = "", *off = "";
+ const char *c, *p = NULL;
+ size_t fieldlen, valuelen;
+
+ /* Each datum has the form NAME=VALUE; locate the separator */
+ c = memchr(data, '=', length);
+ if (!c)
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid field.");
+
+ fieldlen = c - (const char*) data;
+ if (!journal_field_valid(data, fieldlen, true))
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid field.");
+
+ r = field_set_test(output_fields, data, fieldlen);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ continue;
+
+ valuelen = length - 1 - fieldlen;
+ p = c + 1;
+
+ if (flags & OUTPUT_COLOR) {
+ if (startswith(data, "MESSAGE=")) {
+ on = ANSI_HIGHLIGHT;
+ off = ANSI_NORMAL;
+ } else if (startswith(data, "CONFIG_FILE=")) {
+ _cleanup_free_ char *u = NULL;
+
+ /* Show the path as a terminal hyperlink; needs a NUL-terminated copy */
+ u = memdup_suffix0(p, valuelen);
+ if (!u)
+ return log_oom();
+
+ if (terminal_urlify_path(u, NULL, &urlified) >= 0) {
+ p = urlified;
+ valuelen = strlen(urlified);
+ }
+
+ } else if (startswith(data, "_")) {
+ /* Highlight trusted data as such */
+ on = ANSI_GREEN;
+ off = ANSI_NORMAL;
+ }
+ }
+
+ /* Note that both checks below look at the raw datum (name, '=' and value), not at 'p' */
+ if ((flags & OUTPUT_SHOW_ALL) ||
+ (((length < PRINT_CHAR_THRESHOLD) || flags & OUTPUT_FULL_WIDTH)
+ && utf8_is_printable(data, length))) {
+ fprintf(f, " %s%.*s=", on, (int) fieldlen, (const char*)data);
+ print_multiline(f, 4 + fieldlen + 1, 0, OUTPUT_FULL_WIDTH, 0, false,
+ p, valuelen,
+ NULL);
+ fputs(off, f);
+ } else
+ fprintf(f, " %s%.*s=[%s blob data]%s\n",
+ on,
+ (int) (c - (const char*) data),
+ (const char*) data,
+ FORMAT_BYTES(length - (c - (const char *) data) - 1),
+ off);
+ }
+ /* r holds the final status of the field enumeration here */
+ if (r < 0)
+ return r;
+
+ if (flags & OUTPUT_CATALOG)
+ (void) print_catalog(f, j);
+
+ return 0;
+}
+
+/* Serializes the current journal entry to f in export format: a header of "__CURSOR",
+ * "__REALTIME_TIMESTAMP", "__MONOTONIC_TIMESTAMP", "__SEQNUM", "__SEQNUM_ID" and "_BOOT_ID" lines,
+ * then one record per selected field, then an empty line terminating the entry.
+ *
+ * Returns 0 on success, or if the entry's fields could not be read (-EADDRNOTAVAIL/-EBADMSG, only
+ * logged at debug level), and a negative error on other failures. */
+static int output_export(
+ FILE *f,
+ sd_journal *j,
+ OutputMode mode,
+ unsigned n_columns,
+ OutputFlags flags,
+ const Set *output_fields,
+ const size_t highlight[2],
+ const dual_timestamp *display_ts,
+ const sd_id128_t *boot_id,
+ const dual_timestamp *previous_display_ts,
+ const sd_id128_t *previous_boot_id) {
+
+ sd_id128_t journal_boot_id, seqnum_id;
+ _cleanup_free_ char *cursor = NULL;
+ usec_t monotonic, realtime;
+ const void *data;
+ uint64_t seqnum;
+ size_t length;
+ int r;
+
+ assert(j);
+ assert(display_ts);
+ assert(boot_id);
+ assert(previous_display_ts);
+ assert(previous_boot_id);
+
+ /* Request the field data without a size threshold */
+ (void) sd_journal_set_data_threshold(j, 0);
+
+ r = sd_journal_get_cursor(j, &cursor);
+ if (r < 0)
+ return log_error_errno(r, "Failed to get cursor: %m");
+
+ r = sd_journal_get_realtime_usec(j, &realtime);
+ if (r < 0)
+ return log_error_errno(r, "Failed to get realtime timestamp: %m");
+
+ r = sd_journal_get_monotonic_usec(j, &monotonic, &journal_boot_id);
+ if (r < 0)
+ return log_error_errno(r, "Failed to get monotonic timestamp: %m");
+
+ r = sd_journal_get_seqnum(j, &seqnum, &seqnum_id);
+ if (r < 0)
+ return log_error_errno(r, "Failed to get seqnum: %m");
+
+ fprintf(f,
+ "__CURSOR=%s\n"
+ "__REALTIME_TIMESTAMP=" USEC_FMT "\n"
+ "__MONOTONIC_TIMESTAMP=" USEC_FMT "\n"
+ "__SEQNUM=%" PRIu64 "\n"
+ "__SEQNUM_ID=%s\n"
+ "_BOOT_ID=%s\n",
+ cursor,
+ realtime,
+ monotonic,
+ seqnum,
+ SD_ID128_TO_STRING(seqnum_id),
+ SD_ID128_TO_STRING(journal_boot_id));
+
+ JOURNAL_FOREACH_DATA_RETVAL(j, data, length, r) {
+ size_t fieldlen;
+ const char *c;
+
+ /* We already printed the boot id from the data in the header, hence let's suppress it here */
+ if (memory_startswith(data, length, "_BOOT_ID="))
+ continue;
+
+ c = memchr(data, '=', length);
+ if (!c)
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid field.");
+
+ fieldlen = c - (const char*) data;
+ if (!journal_field_valid(data, fieldlen, true))
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid field.");
+
+ r = field_set_test(output_fields, data, fieldlen);
+ if (r < 0)
+ return r;
+ if (!r)
+ continue;
+
+ /* Data that passes the printability check is written verbatim as "NAME=VALUE". Anything
+ * else is written as the field name, a newline, the value size as 64-bit little-endian
+ * integer and then the raw value bytes. */
+ if (utf8_is_printable_newline(data, length, false))
+ fwrite(data, length, 1, f);
+ else {
+ uint64_t le64;
+
+ fwrite(data, fieldlen, 1, f);
+ fputc('\n', f);
+ le64 = htole64(length - fieldlen - 1);
+ fwrite(&le64, sizeof(le64), 1, f);
+ fwrite(c + 1, length - fieldlen - 1, 1, f);
+ }
+
+ fputc('\n', f);
+ }
+ if (IN_SET(r, -EADDRNOTAVAIL, -EBADMSG)) {
+ log_debug_errno(r, "Skipping message we can't read: %m");
+ return 0;
+ }
+
+ if (r < 0)
+ return r;
+
+ /* The empty line terminates the entry */
+ fputc('\n', f);
+
+ return 0;
+}
+
+/* Writes the 'l' bytes at 'p' to f as a JSON value. Unless OUTPUT_SHOW_ALL is set in 'flags':
+ *
+ *   - data of JSON_THRESHOLD bytes or more is written as null
+ *   - data that fails utf8_is_printable() is written as an array of unsigned byte values
+ *
+ * Everything else is written as a double-quoted string in which '"' and '\' are backslash-escaped,
+ * newline becomes "\n" and the remaining bytes below 0x20 become \u00XX escapes. */
+void json_escape(
+ FILE *f,
+ const char* p,
+ size_t l,
+ OutputFlags flags) {
+
+ assert(f);
+ assert(p);
+
+ if (!(flags & OUTPUT_SHOW_ALL) && l >= JSON_THRESHOLD)
+ fputs("null", f);
+
+ else if (!(flags & OUTPUT_SHOW_ALL) && !utf8_is_printable(p, l)) {
+ bool not_first = false;
+
+ fputs("[ ", f);
+
+ while (l > 0) {
+ /* Every element but the first is preceded by a separator */
+ if (not_first)
+ fprintf(f, ", %u", (uint8_t) *p);
+ else {
+ not_first = true;
+ fprintf(f, "%u", (uint8_t) *p);
+ }
+
+ p++;
+ l--;
+ }
+
+ fputs(" ]", f);
+ } else {
+ fputc('"', f);
+
+ while (l > 0) {
+ if (IN_SET(*p, '"', '\\')) {
+ fputc('\\', f);
+ fputc(*p, f);
+ } else if (*p == '\n')
+ fputs("\\n", f);
+ else if ((uint8_t) *p < ' ')
+ fprintf(f, "\\u%04x", (uint8_t) *p);
+ else
+ fputc(*p, f);
+
+ p++;
+ l--;
+ }
+
+ fputc('"', f);
+ }
+}
+
+/* Per-field accumulator used while building the JSON object of an entry: the field name and the
+ * array of all values seen for it so far. */
+typedef struct JsonData {
+ JsonVariant* name; /* field name as JSON string */
+ JsonVariant* values; /* JSON array, one element per occurrence of the field */
+} JsonData;
+
+/* Drops the references to both variants and frees the structure. Accepts NULL; always returns
+ * NULL. */
+static JsonData* json_data_free(JsonData *d) {
+ if (!d)
+ return NULL;
+
+ json_variant_unref(d->name);
+ json_variant_unref(d->values);
+
+ return mfree(d);
+}
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(JsonData*, json_data_free);
+
+/* Hash operations for a hashmap with string keys and JsonData values, with json_data_free() as
+ * value destructor. */
+DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(json_data_hash_ops_free,
+ char, string_hash_func, string_compare_func,
+ JsonData, json_data_free);
+
+/* Records one value for the field 'name' in the hashmap 'h', creating the JsonData entry for the
+ * field if it does not exist yet and appending to its value array otherwise.
+ *
+ * 'size' is the length of 'value' in bytes; SIZE_MAX means 'value' is a NUL-terminated string.
+ * The value is stored as JSON null if name, separator and value together reach JSON_THRESHOLD
+ * (unless OUTPUT_SHOW_ALL is set), as string if it passes utf8_is_printable(), and as byte array
+ * otherwise.
+ *
+ * Returns 0 on success, a negative error on failure. */
+static int update_json_data(
+ Hashmap *h,
+ OutputFlags flags,
+ const char *name,
+ const void *value,
+ size_t size) {
+
+ _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+ JsonData *d;
+ int r;
+
+ assert(name);
+ assert(value);
+
+ if (size == SIZE_MAX)
+ size = strlen(value);
+
+ if (!(flags & OUTPUT_SHOW_ALL) && strlen(name) + 1 + size >= JSON_THRESHOLD)
+ r = json_variant_new_null(&v);
+ else if (utf8_is_printable(value, size))
+ r = json_variant_new_stringn(&v, value, size);
+ else
+ r = json_variant_new_array_bytes(&v, value, size);
+ if (r < 0)
+ return log_error_errno(r, "Failed to allocate JSON data: %m");
+
+ d = hashmap_get(h, name);
+ if (d) {
+ /* Field seen before: add another value to it */
+ r = json_variant_append_array(&d->values, v);
+ if (r < 0)
+ return log_error_errno(r, "Failed to append JSON value into array: %m");
+ } else {
+ _cleanup_(json_data_freep) JsonData *e = NULL;
+
+ e = new0(JsonData, 1);
+ if (!e)
+ return log_oom();
+
+ r = json_variant_new_string(&e->name, name);
+ if (r < 0)
+ return log_error_errno(r, "Failed to allocate JSON name variant: %m");
+
+ r = json_variant_append_array(&e->values, v);
+ if (r < 0)
+ return log_error_errno(r, "Failed to create JSON value array: %m");
+
+ /* The key is the string held by e->name rather than the caller's 'name' */
+ r = hashmap_put(h, json_variant_string(e->name), e);
+ if (r < 0)
+ return log_error_errno(r, "Failed to insert JSON data into hashmap: %m");
+
+ /* The hashmap references the entry now, disarm the cleanup handler */
+ TAKE_PTR(e);
+ }
+
+ return 0;
+}
+
+/* Splits a raw "NAME=VALUE" journal datum at the first '=' and records it via update_json_data().
+ *
+ * Silently ignored (return 0) are: the "_BOOT_ID=" field, data without a '=' within the first
+ * JSON_THRESHOLD bytes, and fields not contained in a non-NULL 'output_fields' set. An invalid
+ * field name yields a negative error. */
+static int update_json_data_split(
+ Hashmap *h,
+ OutputFlags flags,
+ const Set *output_fields,
+ const void *data,
+ size_t size) {
+
+ size_t fieldlen;
+ const char *eq;
+ char *name;
+
+ assert(h);
+ assert(data || size == 0);
+
+ if (memory_startswith(data, size, "_BOOT_ID="))
+ return 0;
+
+ /* Only the first JSON_THRESHOLD bytes are searched for the separator */
+ eq = memchr(data, '=', MIN(size, JSON_THRESHOLD));
+ if (!eq)
+ return 0;
+
+ fieldlen = eq - (const char*) data;
+ if (!journal_field_valid(data, fieldlen, true))
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid field.");
+
+ /* NUL-terminated copy of the field name, for the set and hashmap lookups */
+ name = strndupa_safe(data, fieldlen);
+ if (output_fields && !set_contains(output_fields, name))
+ return 0;
+
+ return update_json_data(h, flags, name, eq + 1, size - fieldlen - 1);
+}
+
+/* Formats the current journal entry as one JSON object. The object carries the entries
+ * "__CURSOR", "__REALTIME_TIMESTAMP", "__MONOTONIC_TIMESTAMP", "_BOOT_ID", "__SEQNUM" and
+ * "__SEQNUM_ID", plus one entry per selected journal field. A field that occurs once is emitted
+ * as a single value, a field that occurs several times as an array of its values.
+ *
+ * Returns the result of json_variant_dump() on success, 0 if the entry's fields could not be read
+ * (-EBADMSG/-EADDRNOTAVAIL, only logged at debug level), and a negative error on other failures. */
+static int output_json(
+ FILE *f,
+ sd_journal *j,
+ OutputMode mode,
+ unsigned n_columns,
+ OutputFlags flags,
+ const Set *output_fields,
+ const size_t highlight[2],
+ const dual_timestamp *display_ts,
+ const sd_id128_t *boot_id,
+ const dual_timestamp *previous_display_ts,
+ const sd_id128_t *previous_boot_id) {
+
+ char usecbuf[CONST_MAX(DECIMAL_STR_MAX(usec_t), DECIMAL_STR_MAX(uint64_t))];
+ _cleanup_(json_variant_unrefp) JsonVariant *object = NULL;
+ _cleanup_hashmap_free_ Hashmap *h = NULL;
+ sd_id128_t journal_boot_id, seqnum_id;
+ _cleanup_free_ char *cursor = NULL;
+ usec_t realtime, monotonic;
+ JsonVariant **array = NULL;
+ JsonData *d;
+ uint64_t seqnum;
+ size_t n = 0;
+ int r;
+
+ assert(j);
+ assert(display_ts);
+ assert(boot_id);
+ assert(previous_display_ts);
+ assert(previous_boot_id);
+
+ (void) sd_journal_set_data_threshold(j, flags & OUTPUT_SHOW_ALL ? 0 : JSON_THRESHOLD);
+
+ r = sd_journal_get_cursor(j, &cursor);
+ if (r < 0)
+ return log_error_errno(r, "Failed to get cursor: %m");
+
+ r = sd_journal_get_realtime_usec(j, &realtime);
+ if (r < 0)
+ return log_error_errno(r, "Failed to get realtime timestamp: %m");
+
+ r = sd_journal_get_monotonic_usec(j, &monotonic, &journal_boot_id);
+ if (r < 0)
+ return log_error_errno(r, "Failed to get monotonic timestamp: %m");
+
+ r = sd_journal_get_seqnum(j, &seqnum, &seqnum_id);
+ if (r < 0)
+ return log_error_errno(r, "Failed to get seqnum: %m");
+
+ /* Collect all fields in a hashmap keyed by field name first */
+ h = hashmap_new(&json_data_hash_ops_free);
+ if (!h)
+ return log_oom();
+
+ r = update_json_data(h, flags, "__CURSOR", cursor, SIZE_MAX);
+ if (r < 0)
+ return r;
+
+ /* The numeric metadata is stored in its decimal string form */
+ xsprintf(usecbuf, USEC_FMT, realtime);
+ r = update_json_data(h, flags, "__REALTIME_TIMESTAMP", usecbuf, SIZE_MAX);
+ if (r < 0)
+ return r;
+
+ xsprintf(usecbuf, USEC_FMT, monotonic);
+ r = update_json_data(h, flags, "__MONOTONIC_TIMESTAMP", usecbuf, SIZE_MAX);
+ if (r < 0)
+ return r;
+
+ r = update_json_data(h, flags, "_BOOT_ID", SD_ID128_TO_STRING(journal_boot_id), SIZE_MAX);
+ if (r < 0)
+ return r;
+
+ xsprintf(usecbuf, USEC_FMT, seqnum);
+ r = update_json_data(h, flags, "__SEQNUM", usecbuf, SIZE_MAX);
+ if (r < 0)
+ return r;
+
+ r = update_json_data(h, flags, "__SEQNUM_ID", SD_ID128_TO_STRING(seqnum_id), SIZE_MAX);
+ if (r < 0)
+ return r;
+
+ for (;;) {
+ const void *data;
+ size_t size;
+
+ r = sd_journal_enumerate_data(j, &data, &size);
+ if (IN_SET(r, -EBADMSG, -EADDRNOTAVAIL)) {
+ log_debug_errno(r, "Skipping message we can't read: %m");
+ return 0;
+ }
+ if (r < 0)
+ return log_error_errno(r, "Failed to read journal: %m");
+ if (r == 0)
+ break;
+
+ r = update_json_data_split(h, flags, output_fields, data, size);
+ if (r < 0)
+ return r;
+ }
+
+ /* Two slots per field: one for the name, one for the value(s) */
+ array = new(JsonVariant*, hashmap_size(h)*2);
+ if (!array)
+ return log_oom();
+
+ CLEANUP_ARRAY(array, n, json_variant_unref_many);
+
+ HASHMAP_FOREACH(d, h) {
+ assert(json_variant_elements(d->values) > 0);
+
+ array[n++] = json_variant_ref(d->name);
+
+ /* A field with a single value is emitted as that value, not as one-element array */
+ if (json_variant_elements(d->values) == 1)
+ array[n++] = json_variant_ref(json_variant_by_index(d->values, 0));
+ else
+ array[n++] = json_variant_ref(d->values);
+ }
+
+ r = json_variant_new_object(&object, array, n);
+ if (r < 0)
+ return log_error_errno(r, "Failed to allocate JSON object: %m");
+
+ return json_variant_dump(object,
+ output_mode_to_json_format_flags(mode) |
+ (FLAGS_SET(flags, OUTPUT_COLOR) ? JSON_FORMAT_COLOR : 0),
+ f, NULL);
+}
+
+/* Writes the raw value of the field 'field' of the current journal entry to f, followed by a
+ * newline. With OUTPUT_COLOR set the value is wrapped in the color sequences for priority 'prio',
+ * and the byte range highlight[0]…highlight[1] of the value (if 'highlight' is non-NULL) is
+ * emphasized.
+ *
+ * Returns 0 on success, and also if the entry lacks the field or cannot be read; a negative error
+ * on other failures. */
+static int output_cat_field(
+ FILE *f,
+ sd_journal *j,
+ OutputFlags flags,
+ int prio,
+ const char *field,
+ const size_t highlight[2]) {
+
+ const char *color_on = "", *color_off = "", *highlight_on = "";
+ const void *data;
+ size_t l, fl;
+ int r;
+
+ if (FLAGS_SET(flags, OUTPUT_COLOR))
+ get_log_colors(prio, &color_on, &color_off, &highlight_on);
+
+ r = sd_journal_get_data(j, field, &data, &l);
+ if (IN_SET(r, -EBADMSG, -EADDRNOTAVAIL)) {
+ log_debug_errno(r, "Skipping message we can't read: %m");
+ return 0;
+ }
+ if (r == -ENOENT) /* An entry without the requested field */
+ return 0;
+ if (r < 0)
+ return log_error_errno(r, "Failed to get data: %m");
+
+ /* The returned datum has the form "FIELD=value": skip over the name and the separator */
+ fl = strlen(field);
+ assert(l >= fl + 1);
+ assert(((char*) data)[fl] == '=');
+
+ data = (const uint8_t*) data + fl + 1;
+ l -= fl + 1;
+
+ if (FLAGS_SET(flags, OUTPUT_COLOR)) {
+ if (highlight) {
+ assert(highlight[0] <= highlight[1]);
+ assert(highlight[1] <= l);
+
+ /* Three segments: before the range, the range itself, after the range */
+ fputs(color_on, f);
+ fwrite((const char*) data, 1, highlight[0], f);
+ fputs(highlight_on, f);
+ fwrite((const char*) data + highlight[0], 1, highlight[1] - highlight[0], f);
+ fputs(color_on, f);
+ fwrite((const char*) data + highlight[1], 1, l - highlight[1], f);
+ fputs(color_off, f);
+ } else {
+ fputs(color_on, f);
+ fwrite((const char*) data, 1, l, f);
+ fputs(color_off, f);
+ }
+ } else
+ fwrite((const char*) data, 1, l, f);
+
+ fputc('\n', f);
+ return 0;
+}
+
+/* Formats the current journal entry in "cat" mode: only raw field values are printed, one per
+ * line. If 'output_fields' is empty the MESSAGE field is printed, otherwise every field of the
+ * set. The highlight range is only applied to the MESSAGE field.
+ *
+ * Returns 0 on success (and if the entry cannot be read), a negative error on failure. */
+static int output_cat(
+ FILE *f,
+ sd_journal *j,
+ OutputMode mode,
+ unsigned n_columns,
+ OutputFlags flags,
+ const Set *output_fields,
+ const size_t highlight[2],
+ const dual_timestamp *display_ts,
+ const sd_id128_t *boot_id,
+ const dual_timestamp *previous_display_ts,
+ const sd_id128_t *previous_boot_id) {
+
+ int r, prio = LOG_INFO;
+ const char *field;
+
+ assert(j);
+ assert(f);
+ assert(display_ts);
+ assert(boot_id);
+ assert(previous_display_ts);
+ assert(previous_boot_id);
+
+ /* Request the field data without a size threshold */
+ (void) sd_journal_set_data_threshold(j, 0);
+
+ if (FLAGS_SET(flags, OUTPUT_COLOR)) {
+ const void *data;
+ size_t l;
+
+ /* Determine priority of this entry, so that we can color it nicely */
+
+ r = sd_journal_get_data(j, "PRIORITY", &data, &l);
+ if (IN_SET(r, -EBADMSG, -EADDRNOTAVAIL)) {
+ log_debug_errno(r, "Skipping message we can't read: %m");
+ return 0;
+ }
+ if (r < 0) {
+ if (r != -ENOENT)
+ return log_error_errno(r, "Failed to get data: %m");
+
+ /* An entry without PRIORITY */
+ } else if (l == 10 && memcmp(data, "PRIORITY=", 9) == 0) {
+ /* Exactly "PRIORITY=" plus one character; only the digits 0…7 are accepted,
+ * anything else leaves prio at LOG_INFO */
+ char c = ((char*) data)[9];
+
+ if (c >= '0' && c <= '7')
+ prio = c - '0';
+ }
+ }
+
+ if (set_isempty(output_fields))
+ return output_cat_field(f, j, flags, prio, "MESSAGE", highlight);
+
+ SET_FOREACH(field, output_fields) {
+ r = output_cat_field(f, j, flags, prio, field, streq(field, "MESSAGE") ? highlight : NULL);
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+
+/* Determines the timestamps and boot ID to display for the current entry of 'j'.
+ *
+ * The _SOURCE_REALTIME_TIMESTAMP= and _SOURCE_MONOTONIC_TIMESTAMP= fields of the entry are preferred if they
+ * are present, parse as unsigned 64-bit integers and pass VALID_REALTIME()/VALID_MONOTONIC(). Otherwise the
+ * values reported by sd_journal_get_realtime_usec()/sd_journal_get_monotonic_usec() are used. Anything that
+ * cannot be determined is set to USEC_INFINITY (timestamps) or SD_ID128_NULL (boot ID).
+ *
+ * Returns 0 on success, or the negative error reported while enumerating/parsing the entry's fields. */
+static int get_display_timestamp(
+ sd_journal *j,
+ dual_timestamp *ret_display_ts,
+ sd_id128_t *ret_boot_id) {
+
+ const void *data;
+ _cleanup_free_ char *realtime = NULL, *monotonic = NULL;
+ size_t length = 0, realtime_len = 0, monotonic_len = 0;
+ /* Fields we want extracted as strings while enumerating the entry's data */
+ const ParseFieldVec message_fields[] = {
+ PARSE_FIELD_VEC_ENTRY("_SOURCE_REALTIME_TIMESTAMP=", &realtime, &realtime_len),
+ PARSE_FIELD_VEC_ENTRY("_SOURCE_MONOTONIC_TIMESTAMP=", &monotonic, &monotonic_len),
+ };
+ int r;
+ bool realtime_good = false, monotonic_good = false, boot_id_good = false;
+
+ assert(j);
+ assert(ret_display_ts);
+ assert(ret_boot_id);
+
+ JOURNAL_FOREACH_DATA_RETVAL(j, data, length, r) {
+ r = parse_fieldv(data, length, message_fields, ELEMENTSOF(message_fields));
+ if (r < 0)
+ return r;
+
+ /* Stop enumerating as soon as both source timestamps have been seen */
+ if (realtime && monotonic)
+ break;
+ }
+ if (r < 0)
+ return r;
+
+ /* Realtime: source timestamp first, then the journal's own timestamp, then "unknown" */
+ if (realtime)
+ realtime_good = safe_atou64(realtime, &ret_display_ts->realtime) >= 0;
+ if (!realtime_good || !VALID_REALTIME(ret_display_ts->realtime))
+ realtime_good = sd_journal_get_realtime_usec(j, &ret_display_ts->realtime) >= 0;
+ if (!realtime_good)
+ ret_display_ts->realtime = USEC_INFINITY;
+
+ /* Monotonic: same scheme. The fallback call also yields the boot ID, so remember if we got it here. */
+ if (monotonic)
+ monotonic_good = safe_atou64(monotonic, &ret_display_ts->monotonic) >= 0;
+ if (!monotonic_good || !VALID_MONOTONIC(ret_display_ts->monotonic))
+ monotonic_good = boot_id_good = sd_journal_get_monotonic_usec(j, &ret_display_ts->monotonic, ret_boot_id) >= 0;
+ if (!monotonic_good)
+ ret_display_ts->monotonic = USEC_INFINITY;
+
+ /* If the source monotonic timestamp was used, the boot ID has not been queried yet */
+ if (!boot_id_good)
+ boot_id_good = sd_journal_get_monotonic_usec(j, NULL, ret_boot_id) >= 0;
+ if (!boot_id_good)
+ *ret_boot_id = SD_ID128_NULL;
+
+ /* We enumerated the entry's data above; restart the data, unique and fields enumerations again. */
+ sd_journal_restart_data(j);
+ sd_journal_restart_unique(j);
+ sd_journal_restart_fields(j);
+
+ return 0;
+}
+
+/* Signature shared by all per-mode entry formatters stored in output_funcs[]. A formatter prints the
+ * current entry of 'j' to 'f'. It receives the display timestamp and boot ID of the current entry as well
+ * as those of the previously shown entry. The int return value is passed through by show_journal_entry(),
+ * which treats negative values as errors and positive values as "output was ellipsized". */
+typedef int (*output_func_t)(
+ FILE *f,
+ sd_journal *j,
+ OutputMode mode,
+ unsigned n_columns,
+ OutputFlags flags,
+ const Set *output_fields,
+ const size_t highlight[2],
+ const dual_timestamp *display_ts,
+ const sd_id128_t *boot_id,
+ const dual_timestamp *previous_display_ts,
+ const sd_id128_t *previous_boot_id);
+
+
+/* Dispatch table mapping each OutputMode to its formatter; indexed by mode in show_journal_entry(). All
+ * "short" variants (and OUTPUT_WITH_UNIT) share output_short(), all JSON variants share output_json(); the
+ * formatter receives the mode as a parameter.
+ * NOTE(review): modes not listed here would be NULL entries — confirm every mode below _OUTPUT_MODE_MAX is
+ * covered when adding new ones. */
+static output_func_t output_funcs[_OUTPUT_MODE_MAX] = {
+ [OUTPUT_SHORT] = output_short,
+ [OUTPUT_SHORT_ISO] = output_short,
+ [OUTPUT_SHORT_ISO_PRECISE] = output_short,
+ [OUTPUT_SHORT_PRECISE] = output_short,
+ [OUTPUT_SHORT_MONOTONIC] = output_short,
+ [OUTPUT_SHORT_DELTA] = output_short,
+ [OUTPUT_SHORT_UNIX] = output_short,
+ [OUTPUT_SHORT_FULL] = output_short,
+ [OUTPUT_VERBOSE] = output_verbose,
+ [OUTPUT_EXPORT] = output_export,
+ [OUTPUT_JSON] = output_json,
+ [OUTPUT_JSON_PRETTY] = output_json,
+ [OUTPUT_JSON_SSE] = output_json,
+ [OUTPUT_JSON_SEQ] = output_json,
+ [OUTPUT_CAT] = output_cat,
+ [OUTPUT_WITH_UNIT] = output_short,
+};
+
+/* Prints the current entry of 'j' to 'f' using the formatter registered for 'mode'.
+ *
+ * If 'n_columns' is 0 the width reported by columns() is used. Entries whose fields cannot be read
+ * (-EBADMSG, -EADDRNOTAVAIL) are skipped and 0 is returned. After the formatter ran, the display timestamp
+ * and boot ID of this entry are stored in *previous_display_ts and *previous_boot_id for the next call.
+ * If 'ellipsized' is non-NULL it is set to true when the formatter returned a positive value.
+ *
+ * Returns the formatter's return value, or a negative errno-style value on failure. */
+int show_journal_entry(
+                FILE *f,
+                sd_journal *j,
+                OutputMode mode,
+                unsigned n_columns,
+                OutputFlags flags,
+                Set *output_fields,
+                const size_t highlight[2],
+                bool *ellipsized,
+                dual_timestamp *previous_display_ts,
+                sd_id128_t *previous_boot_id) {
+
+        dual_timestamp ts = DUAL_TIMESTAMP_NULL;
+        sd_id128_t id = SD_ID128_NULL;
+        output_func_t func;
+        int r;
+
+        assert(mode >= 0);
+        assert(mode < _OUTPUT_MODE_MAX);
+        assert(previous_display_ts);
+        assert(previous_boot_id);
+
+        if (n_columns == 0)
+                n_columns = columns();
+
+        r = get_display_timestamp(j, &ts, &id);
+        if (r == -EBADMSG || r == -EADDRNOTAVAIL) {
+                log_debug_errno(r, "Skipping message we can't read: %m");
+                return 0;
+        }
+        if (r < 0)
+                return log_error_errno(r, "Failed to get journal fields: %m");
+
+        func = output_funcs[mode];
+        r = func(f, j, mode, n_columns, flags, output_fields, highlight,
+                 &ts, &id, previous_display_ts, previous_boot_id);
+
+        /* Remember timestamp and boot ID of this entry for the next invocation */
+        *previous_display_ts = ts;
+        *previous_boot_id = id;
+
+        if (r > 0 && ellipsized)
+                *ellipsized = true;
+
+        return r;
+}
+
+/* Prints a leading newline to 'f' if OUTPUT_BEGIN_NEWLINE is set in *flags, and clears the flag so that
+ * this happens only once, before the first line we print. Always returns 0. */
+static int maybe_print_begin_newline(FILE *f, OutputFlags *flags) {
+        assert(f);
+        assert(flags);
+
+        if (*flags & OUTPUT_BEGIN_NEWLINE) {
+                /* A beginning newline was requested and has not been printed yet */
+                fputc('\n', f);
+                *flags &= ~OUTPUT_BEGIN_NEWLINE;
+        }
+
+        return 0;
+}
+
+/* Prints entries of 'j' to 'f' using the formatter for 'mode'.
+ *
+ * If 'how_many' is UINT_MAX, iteration starts at the journal's current position via sd_journal_next().
+ * Otherwise we seek to the tail and step back 'how_many' entries first, so that at most the last
+ * 'how_many' entries are shown. If 'not_before' is non-zero, entries whose monotonic timestamp is earlier,
+ * or that stem from a different boot (-ESTALE), are skipped.
+ *
+ * If OUTPUT_WARN_CUTOFF is set in 'flags', fewer than 'how_many' lines were shown and 'not_before' lies
+ * before the journal's monotonic cutoff for the current boot, a notice/warning is printed.
+ *
+ * Returns 0 on success, a negative errno-style value on failure. */
+int show_journal(
+ FILE *f,
+ sd_journal *j,
+ OutputMode mode,
+ unsigned n_columns,
+ usec_t not_before,
+ unsigned how_many,
+ OutputFlags flags,
+ bool *ellipsized) {
+
+ int r;
+ unsigned line = 0;
+ bool need_seek = false;
+ int warn_cutoff = flags & OUTPUT_WARN_CUTOFF;
+ dual_timestamp previous_display_ts = DUAL_TIMESTAMP_NULL;
+ sd_id128_t previous_boot_id = SD_ID128_NULL;
+
+ assert(j);
+ assert(mode >= 0);
+ assert(mode < _OUTPUT_MODE_MAX);
+
+ if (how_many == UINT_MAX)
+ need_seek = true;
+ else {
+ /* Seek to end */
+ r = sd_journal_seek_tail(j);
+ if (r < 0)
+ return log_error_errno(r, "Failed to seek to tail: %m");
+
+ /* The result is kept in 'r' and evaluated by the "r == 0" check of the first loop iteration. */
+ r = sd_journal_previous_skip(j, how_many);
+ if (r < 0)
+ return log_error_errno(r, "Failed to skip previous: %m");
+ }
+
+ for (;;) {
+ usec_t usec;
+
+ /* On the first iteration after sd_journal_previous_skip() we are already positioned on an
+ * entry; in all other cases advance by one. */
+ if (need_seek) {
+ r = sd_journal_next(j);
+ if (r < 0)
+ return log_error_errno(r, "Failed to iterate through journal: %m");
+ }
+
+ /* No (further) entry */
+ if (r == 0)
+ break;
+
+ need_seek = true;
+
+ if (not_before > 0) {
+ r = sd_journal_get_monotonic_usec(j, &usec, NULL);
+
+ /* -ESTALE is returned if the timestamp is not from this boot */
+ if (r == -ESTALE)
+ continue;
+ if (r < 0)
+ return log_error_errno(r, "Failed to get journal time: %m");
+
+ if (usec < not_before)
+ continue;
+ }
+
+ line++;
+ maybe_print_begin_newline(f, &flags);
+
+ r = show_journal_entry(
+ f,
+ j,
+ mode,
+ n_columns,
+ flags,
+ /* output_fields= */ NULL,
+ /* highlight= */ NULL,
+ ellipsized,
+ &previous_display_ts,
+ &previous_boot_id);
+ if (r < 0)
+ return r;
+ }
+
+ if (warn_cutoff && line < how_many && not_before > 0) {
+ sd_id128_t boot_id;
+ usec_t cutoff = 0;
+
+ /* Check whether the cutoff line is too early */
+
+ r = sd_id128_get_boot(&boot_id);
+ if (r < 0)
+ return log_error_errno(r, "Failed to get boot id: %m");
+
+ r = sd_journal_get_cutoff_monotonic_usec(j, boot_id, &cutoff, NULL);
+ if (r < 0)
+ return log_error_errno(r, "Failed to get journal cutoff time: %m");
+
+ if (r > 0 && not_before < cutoff) {
+ maybe_print_begin_newline(f, &flags);
+
+ /* If we logged *something* and no permission error happened, then we can reliably
+ * emit the warning about rotation. If we didn't log anything and access errors
+ * happened, emit hint about permissions. Otherwise, give a generic message, since we
+ * can't diagnose the issue. */
+
+ bool noaccess = journal_access_blocked(j);
+
+ if (line == 0 && noaccess)
+ fprintf(f, "Warning: some journal files were not opened due to insufficient permissions.\n");
+ else if (!noaccess)
+ fprintf(f, "Notice: journal has been rotated since unit was started, output may be incomplete.\n");
+ else
+ fprintf(f, "Warning: journal has been rotated since unit was started and some journal "
+ "files were not opened due to insufficient permissions, output may be incomplete.\n");
+ }
+
+ /* NOTE(review): 'warn_cutoff' is not read again after this assignment. */
+ warn_cutoff = false;
+ }
+
+ return 0;
+}
+
+/* Installs the journal matches that select everything related to system unit 'unit': messages from the
+ * unit itself, coredumps of it, messages from PID 1 about it, and messages from root-owned daemons about
+ * it. For *.slice units, messages of everything inside the slice are selected too. Returns the result of
+ * the first sd_journal_add_match()/sd_journal_add_disjunction() call that did not return 0, or 0 if all
+ * of them succeeded. */
+int add_matches_for_unit(sd_journal *j, const char *unit) {
+        const char *own, *coredump, *pid1, *object, *slice;
+        int r;
+
+        assert(j);
+        assert(unit);
+
+        own = strjoina("_SYSTEMD_UNIT=", unit);
+        coredump = strjoina("COREDUMP_UNIT=", unit);
+        pid1 = strjoina("UNIT=", unit);
+        object = strjoina("OBJECT_SYSTEMD_UNIT=", unit);
+
+        /* The terms to install, in order. A NULL entry starts a new alternative (disjunction). */
+        const char *terms[] = {
+                /* Look for messages from the service itself */
+                own,
+
+                /* Look for coredumps of the service */
+                NULL,
+                "MESSAGE_ID=fc2e22bc6ee647b6b90729ab34a250b1",
+                "_UID=0",
+                coredump,
+
+                /* Look for messages from PID 1 about this service */
+                NULL,
+                "_PID=1",
+                pid1,
+
+                /* Look for messages from authorized daemons about this service */
+                NULL,
+                "_UID=0",
+                object,
+        };
+
+        for (size_t i = 0; i < ELEMENTSOF(terms); i++) {
+                if (terms[i])
+                        r = sd_journal_add_match(j, terms[i], 0);
+                else
+                        r = sd_journal_add_disjunction(j);
+                if (r != 0)
+                        return r;
+        }
+
+        if (!endswith(unit, ".slice"))
+                return 0;
+
+        /* Show all messages belonging to a slice */
+        slice = strjoina("_SYSTEMD_SLICE=", unit);
+
+        r = sd_journal_add_disjunction(j);
+        if (r != 0)
+                return r;
+
+        return sd_journal_add_match(j, slice, 0);
+}
+
+/* Installs the journal matches that select everything related to user unit 'unit' of user 'uid': messages
+ * from the unit itself, messages from the user's service manager about it, coredumps of it, and messages
+ * from authorized daemons about it. For *.slice units, messages of everything inside the slice are
+ * selected too. Returns the result of the first sd_journal_add_match()/sd_journal_add_disjunction() call
+ * that did not return 0, or 0 if all of them succeeded. */
+int add_matches_for_user_unit(sd_journal *j, const char *unit, uid_t uid) {
+        char muid[sizeof("_UID=") + DECIMAL_STR_MAX(uid_t)];
+        const char *own, *manager, *coredump, *object, *slice;
+        int r;
+
+        assert(j);
+        assert(unit);
+
+        own = strjoina("_SYSTEMD_USER_UNIT=", unit);
+        manager = strjoina("USER_UNIT=", unit);
+        coredump = strjoina("COREDUMP_USER_UNIT=", unit);
+        object = strjoina("OBJECT_SYSTEMD_USER_UNIT=", unit);
+        sprintf(muid, "_UID="UID_FMT, uid);
+
+        /* The terms to install, in order. A NULL entry starts a new alternative (disjunction). */
+        const char *terms[] = {
+                /* Look for messages from the user service itself */
+                own,
+                muid,
+
+                /* Look for messages from systemd about this service */
+                NULL,
+                manager,
+                muid,
+
+                /* Look for coredumps of the service */
+                NULL,
+                coredump,
+                muid,
+                "_UID=0",
+
+                /* Look for messages from authorized daemons about this service */
+                NULL,
+                object,
+                muid,
+                "_UID=0",
+        };
+
+        for (size_t i = 0; i < ELEMENTSOF(terms); i++) {
+                if (terms[i])
+                        r = sd_journal_add_match(j, terms[i], 0);
+                else
+                        r = sd_journal_add_disjunction(j);
+                if (r != 0)
+                        return r;
+        }
+
+        if (!endswith(unit, ".slice"))
+                return 0;
+
+        /* Show all messages belonging to a slice */
+        slice = strjoina("_SYSTEMD_USER_SLICE=", unit);
+
+        r = sd_journal_add_disjunction(j);
+        if (r != 0)
+                return r;
+
+        r = sd_journal_add_match(j, slice, 0);
+        if (r != 0)
+                return r;
+
+        return sd_journal_add_match(j, muid, 0);
+}
+
+/* Determines the boot ID of the container 'machine' and stores it in *boot_id.
+ *
+ * A child process is forked into the PID and mount namespaces and the root directory of the container's
+ * leader process. There it reads /proc/sys/kernel/random/boot_id and sends the 36 character UUID string
+ * back to us over a socket pair. Returns 0 on success, a negative errno-style value on failure (-EIO if
+ * the child failed or did not deliver the expected amount of data). */
+static int get_boot_id_for_machine(const char *machine, sd_id128_t *boot_id) {
+ _cleanup_close_pair_ int pair[2] = EBADF_PAIR;
+ _cleanup_close_ int pidnsfd = -EBADF, mntnsfd = -EBADF, rootfd = -EBADF;
+ char buf[SD_ID128_UUID_STRING_MAX];
+ pid_t pid, child;
+ ssize_t k;
+ int r;
+
+ assert(machine);
+ assert(boot_id);
+
+ r = container_get_leader(machine, &pid);
+ if (r < 0)
+ return r;
+
+ r = namespace_open(pid, &pidnsfd, &mntnsfd, NULL, NULL, &rootfd);
+ if (r < 0)
+ return r;
+
+ /* pair[1] is the child's sending end, pair[0] our receiving end */
+ if (socketpair(AF_UNIX, SOCK_DGRAM, 0, pair) < 0)
+ return -errno;
+
+ r = namespace_fork("(sd-bootidns)", "(sd-bootid)", NULL, 0, FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL,
+ pidnsfd, mntnsfd, -1, -1, rootfd, &child);
+ if (r < 0)
+ return r;
+ if (r == 0) {
+ /* Child: read the boot ID as seen from here and send it to the parent */
+ int fd;
+
+ pair[0] = safe_close(pair[0]);
+
+ fd = open("/proc/sys/kernel/random/boot_id", O_RDONLY|O_CLOEXEC|O_NOCTTY);
+ if (fd < 0)
+ _exit(EXIT_FAILURE);
+
+ /* 36 is the length of the UUID string without the trailing NUL */
+ r = loop_read_exact(fd, buf, 36, false);
+ safe_close(fd);
+ if (r < 0)
+ _exit(EXIT_FAILURE);
+
+ k = send(pair[1], buf, 36, MSG_NOSIGNAL);
+ if (k != 36)
+ _exit(EXIT_FAILURE);
+
+ _exit(EXIT_SUCCESS);
+ }
+
+ /* Parent: close the sending end, so that only the child holds it */
+ pair[1] = safe_close(pair[1]);
+
+ r = wait_for_terminate_and_check("(sd-bootidns)", child, 0);
+ if (r < 0)
+ return r;
+ if (r != EXIT_SUCCESS)
+ return -EIO;
+
+ /* The child has exited by now; pick up the datagram it sent before exiting. */
+ k = recv(pair[0], buf, 36, 0);
+ if (k != 36)
+ return -EIO;
+
+ /* NUL-terminate the received string before parsing it */
+ buf[36] = 0;
+ r = sd_id128_from_string(buf, boot_id);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+/* Adds a "_BOOT_ID=<id>" match for the specified (non-null) boot ID to 'j'. Returns whatever
+ * sd_journal_add_match() returns. */
+int add_match_boot_id(sd_journal *j, sd_id128_t id) {
+        char match[STRLEN("_BOOT_ID=") + SD_ID128_STRING_MAX] = "_BOOT_ID=";
+
+        assert(j);
+        assert(!sd_id128_is_null(id));
+
+        /* Format the ID right after the field name prefix */
+        sd_id128_to_string(id, match + STRLEN("_BOOT_ID="));
+
+        return sd_journal_add_match(j, match, strlen(match));
+}
+
+/* Restricts 'j' to entries of the current boot: of the container 'machine' if that is non-NULL, of the
+ * local system otherwise. The boot ID match is followed by a conjunction, so that matches added later are
+ * ANDed with it. Errors are logged. Returns 0 on success, a negative errno-style value on failure. */
+int add_match_this_boot(sd_journal *j, const char *machine) {
+        sd_id128_t boot_id;
+        int r;
+
+        assert(j);
+
+        if (machine)
+                r = get_boot_id_for_machine(machine, &boot_id);
+        else
+                r = sd_id128_get_boot(&boot_id);
+        if (r < 0) {
+                if (machine)
+                        return log_error_errno(r, "Failed to get boot id of container %s: %m", machine);
+
+                return log_error_errno(r, "Failed to get boot id: %m");
+        }
+
+        r = add_match_boot_id(j, boot_id);
+        if (r < 0)
+                return log_error_errno(r, "Failed to add match: %m");
+
+        r = sd_journal_add_conjunction(j);
+        if (r < 0)
+                return log_error_errno(r, "Failed to add conjunction: %m");
+
+        return 0;
+}
+
+/* Opens the journal (of namespace 'log_namespace', plus the default namespace) and prints up to the last
+ * 'how_many' entries of the current boot that relate to 'unit'. 'system_unit' selects whether the system
+ * unit or the user unit matches (for 'uid') are used. The remaining parameters are passed on to
+ * show_journal(). Returns 0 immediately if 'how_many' is 0; otherwise returns the result of show_journal(),
+ * or a negative errno-style value if setting up the journal failed. */
+int show_journal_by_unit(
+                FILE *f,
+                const char *unit,
+                const char *log_namespace,
+                OutputMode mode,
+                unsigned n_columns,
+                usec_t not_before,
+                unsigned how_many,
+                uid_t uid,
+                OutputFlags flags,
+                int journal_open_flags,
+                bool system_unit,
+                bool *ellipsized) {
+
+        _cleanup_(sd_journal_closep) sd_journal *j = NULL;
+        int r;
+
+        assert(mode >= 0);
+        assert(mode < _OUTPUT_MODE_MAX);
+        assert(unit);
+
+        /* Nothing requested, nothing to do */
+        if (how_many == 0)
+                return 0;
+
+        r = sd_journal_open_namespace(&j, log_namespace, journal_open_flags | SD_JOURNAL_INCLUDE_DEFAULT_NAMESPACE);
+        if (r < 0)
+                return log_error_errno(r, "Failed to open journal: %m");
+
+        r = system_unit ? add_matches_for_unit(j, unit)
+                        : add_matches_for_user_unit(j, unit, uid);
+        if (r < 0)
+                return log_error_errno(r, "Failed to add unit matches: %m");
+
+        /* AND the unit matches with the boot ID match added next */
+        r = sd_journal_add_conjunction(j);
+        if (r < 0)
+                return log_error_errno(r, "Failed to add conjunction: %m");
+
+        r = add_match_this_boot(j, NULL);
+        if (r < 0)
+                return r;
+
+        if (DEBUG_LOGGING) {
+                _cleanup_free_ char *filter = journal_make_match_string(j);
+
+                if (!filter)
+                        return log_oom();
+
+                log_debug("Journal filter: %s", filter);
+        }
+
+        return show_journal(f, j, mode, n_columns, not_before, how_many, flags, ellipsized);
+}
+
+/* Steps from the current journal position to the next boot whose ID differs from 'previous_boot_id', in
+ * the direction selected by 'advance_older', and determines that boot's first and last realtime timestamp.
+ *
+ * Returns 1 and fills in *ret if a boot was found, 0 (with *ret zeroed) if the end of the journal was
+ * reached, or a negative errno-style value on failure. Boot IDs for which no entry can be located via a
+ * _BOOT_ID= match are remembered in a local set and ignored. All matches on 'j' are flushed by this call. */
+static int discover_next_boot(
+ sd_journal *j,
+ sd_id128_t previous_boot_id,
+ bool advance_older,
+ BootId *ret) {
+
+ /* Boot IDs we found in an entry, but could not locate via a _BOOT_ID= match */
+ _cleanup_set_free_ Set *broken_ids = NULL;
+ int r;
+
+ assert(j);
+ assert(ret);
+
+ /* We expect the journal to be on the last position of a boot
+ * (in relation to the direction we are going), so that the next
+ * invocation of sd_journal_next/previous will be from a different
+ * boot. We then collect any information we desire and then jump
+ * to the last location of the new boot by using a _BOOT_ID match
+ * coming from the other journal direction. */
+
+ /* Make sure we aren't restricted by any _BOOT_ID matches, so that
+ * we can actually advance to a *different* boot. */
+ sd_journal_flush_matches(j);
+
+ for (;;) {
+ sd_id128_t *id_dup;
+ BootId boot;
+
+ /* Move by one entry; the second argument selects the stepping direction. */
+ r = sd_journal_step_one(j, !advance_older);
+ if (r < 0)
+ return r;
+ if (r == 0) {
+ *ret = (BootId) {};
+ return 0; /* End of journal, yay. */
+ }
+
+ r = sd_journal_get_monotonic_usec(j, NULL, &boot.id);
+ if (r < 0)
+ return r;
+
+ /* We iterate through this in a loop, until the boot ID differs from the previous one. Note that
+ * normally, this will only require a single iteration, as we moved to the last entry of the previous
+ * boot entry already. However, it might happen that the per-journal-field entry arrays are less
+ * complete than the main entry array, and hence might reference an entry that's not actually the last
+ * one of the boot ID as last one. Let's hence use the per-field array as initial seek position to
+ * speed things up, but let's not trust that it is complete, and hence, manually advance as
+ * necessary. */
+
+ if (!sd_id128_is_null(previous_boot_id) && sd_id128_equal(boot.id, previous_boot_id))
+ continue;
+
+ /* Skip boot IDs we already determined to be unusable */
+ if (set_contains(broken_ids, &boot.id))
+ continue;
+
+ /* Yay, we found a new boot ID from the entry object. Let's check there exist corresponding
+ * entries matching with the _BOOT_ID= data. */
+
+ r = add_match_boot_id(j, boot.id);
+ if (r < 0)
+ return r;
+
+ /* First, seek to the first (or the last when we are going upwards) occurrence of this boot ID.
+ * You may think this is redundant. Yes, that's redundant unless the journal is corrupted.
+ * But when the journal is corrupted, especially, badly 'truncated', then the below may fail.
+ * See https://github.com/systemd/systemd/pull/29334#issuecomment-1736567951. */
+ if (advance_older)
+ r = sd_journal_seek_tail(j);
+ else
+ r = sd_journal_seek_head(j);
+ if (r < 0)
+ return r;
+
+ r = sd_journal_step_one(j, 0);
+ if (r < 0)
+ return r;
+ if (r == 0) {
+ log_debug("Whoopsie! We found a boot ID %s but can't read its first entry. "
+ "The journal seems to be corrupted. Ignoring the boot ID.",
+ SD_ID128_TO_STRING(boot.id));
+ goto try_again;
+ }
+
+ r = sd_journal_get_realtime_usec(j, &boot.first_usec);
+ if (r < 0)
+ return r;
+
+ /* Next, seek to the last occurrence of this boot ID. */
+ if (advance_older)
+ r = sd_journal_seek_head(j);
+ else
+ r = sd_journal_seek_tail(j);
+ if (r < 0)
+ return r;
+
+ r = sd_journal_step_one(j, 0);
+ if (r < 0)
+ return r;
+ if (r == 0) {
+ log_debug("Whoopsie! We found a boot ID %s but can't read its last entry. "
+ "The journal seems to be corrupted. Ignoring the boot ID.",
+ SD_ID128_TO_STRING(boot.id));
+ goto try_again;
+ }
+
+ r = sd_journal_get_realtime_usec(j, &boot.last_usec);
+ if (r < 0)
+ return r;
+
+ /* Success: drop the temporary _BOOT_ID= match again and report the boot */
+ sd_journal_flush_matches(j);
+ *ret = boot;
+ return 1;
+
+ try_again:
+ /* Save the bad boot ID. */
+ id_dup = newdup(sd_id128_t, &boot.id, 1);
+ if (!id_dup)
+ return -ENOMEM;
+
+ r = set_ensure_consume(&broken_ids, &id128_hash_ops_free, id_dup);
+ if (r < 0)
+ return r;
+
+ /* Move to the previous position again. */
+ sd_journal_flush_matches(j);
+
+ /* Without a previous boot ID no match is installed, and the seek below goes to the very
+ * head/tail of the journal instead. */
+ if (!sd_id128_is_null(previous_boot_id)) {
+ r = add_match_boot_id(j, previous_boot_id);
+ if (r < 0)
+ return r;
+ }
+
+ if (advance_older)
+ r = sd_journal_seek_head(j);
+ else
+ r = sd_journal_seek_tail(j);
+ if (r < 0)
+ return r;
+
+ r = sd_journal_step_one(j, 0);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENODATA),
+ "Whoopsie! Cannot seek to the last entry of boot %s.",
+ SD_ID128_TO_STRING(previous_boot_id));
+
+ sd_journal_flush_matches(j);
+ }
+}
+
+/* Positions 'j' on the oldest entry that carries the specified boot ID. Any matches installed on 'j' are
+ * flushed, both before and after the lookup. Returns > 0 if such an entry exists, 0 if the journal has no
+ * entry for this boot ID, and a negative errno-style value on failure. */
+int journal_find_boot_by_id(sd_journal *j, sd_id128_t boot_id) {
+        int found;
+
+        assert(j);
+        assert(!sd_id128_is_null(boot_id));
+
+        sd_journal_flush_matches(j);
+
+        found = add_match_boot_id(j, boot_id);
+        if (found < 0)
+                return found;
+
+        /* Seek to the oldest matching entry ... */
+        found = sd_journal_seek_head(j);
+        if (found < 0)
+                return found;
+
+        /* ... and read it */
+        found = sd_journal_next(j);
+        if (found < 0)
+                return found;
+
+        /* At this point the read pointer is positioned at the oldest occurrence of the reference boot ID.
+         * After flushing the matches, one more invocation of _previous() will hence place us at the
+         * following entry, which must then have an older boot ID */
+
+        sd_journal_flush_matches(j);
+        return found != 0;
+}
+
+/* Looks up a boot by its relative offset: 0 is the newest boot, -1 the one before it, and so on, while 1
+ * is the oldest boot in the journal, 2 the one after it, and so on. Returns true and stores the boot ID
+ * in *ret if found, false (with *ret set to SD_ID128_NULL) if the journal does not reach that far, or a
+ * negative errno-style value on failure. */
+int journal_find_boot_by_offset(sd_journal *j, int offset, sd_id128_t *ret) {
+        sd_id128_t boot_id = SD_ID128_NULL;
+        bool advance_older;
+        int off, step, r;
+
+        assert(j);
+        assert(ret);
+
+        /* Adjust for the asymmetry that offset 0 is the last (and current) boot, while 1 is considered the
+         * (chronological) first boot in the journal. */
+        advance_older = offset <= 0;
+        off = advance_older ? 0 : 1;
+        step = advance_older ? -1 : 1;
+
+        if (advance_older)
+                r = sd_journal_seek_tail(j); /* seek to newest */
+        else
+                r = sd_journal_seek_head(j); /* seek to oldest */
+        if (r < 0)
+                return r;
+
+        /* No sd_journal_next()/_previous() here.
+         *
+         * At this point the read pointer is positioned after the newest/before the oldest entry in the whole
+         * journal. The next invocation of _previous()/_next() will hence position us at the newest/oldest
+         * entry we have. */
+
+        for (;;) {
+                BootId boot;
+
+                r = discover_next_boot(j, boot_id, advance_older, &boot);
+                if (r < 0)
+                        return r;
+                if (r == 0) {
+                        /* Ran out of boots before reaching the requested offset */
+                        *ret = SD_ID128_NULL;
+                        return false;
+                }
+
+                boot_id = boot.id;
+                log_debug("Found boot ID %s by offset %i", SD_ID128_TO_STRING(boot_id), off);
+
+                if (off == offset) {
+                        *ret = boot_id;
+                        return true;
+                }
+
+                off += step;
+        }
+}
+
+/* Enumerates all boots found in 'j', from oldest to newest.
+ *
+ * On success *ret_boots receives a newly allocated array (ownership passes to the caller, who has to free
+ * it; it is NULL if no boot was found) and *ret_n_boots the number of elements in it. If a boot ID shows
+ * up a second time the journal files are inconsistent; enumeration stops there and the boots collected
+ * so far are returned.
+ *
+ * Returns > 0 if at least one boot was found, 0 if none, and a negative errno-style value on failure. */
+int journal_get_boots(sd_journal *j, BootId **ret_boots, size_t *ret_n_boots) {
+        _cleanup_free_ BootId *boots = NULL;
+        size_t n_boots = 0;
+        int r;
+
+        assert(j);
+        assert(ret_boots);
+        assert(ret_n_boots);
+
+        r = sd_journal_seek_head(j); /* seek to oldest */
+        if (r < 0)
+                return r;
+
+        /* No sd_journal_next()/_previous() here.
+         *
+         * At this point the read pointer is positioned before the oldest entry in the whole journal. The
+         * next invocation of _next() will hence position us at the oldest entry we have. */
+
+        sd_id128_t previous_boot_id = SD_ID128_NULL;
+        for (;;) {
+                BootId boot;
+
+                r = discover_next_boot(j, previous_boot_id, /* advance_older = */ false, &boot);
+                if (r < 0)
+                        return r;
+                if (r == 0)
+                        break;
+
+                previous_boot_id = boot.id;
+
+                FOREACH_ARRAY(i, boots, n_boots)
+                        if (sd_id128_equal(i->id, boot.id))
+                                /* The boot id is already stored, something wrong with the journal files.
+                                 * Exiting as otherwise this problem would cause an infinite loop. Note
+                                 * that a plain 'break' would only leave FOREACH_ARRAY(), hence jump out
+                                 * of the enclosing loop explicitly. */
+                                goto finish;
+
+                if (!GREEDY_REALLOC(boots, n_boots + 1))
+                        return -ENOMEM;
+
+                boots[n_boots++] = boot;
+        }
+
+finish:
+        *ret_boots = TAKE_PTR(boots);
+        *ret_n_boots = n_boots;
+        return n_boots > 0;
+}
diff --git a/src/shared/logs-show.h b/src/shared/logs-show.h
new file mode 100644
index 0000000..3a8ce8b
--- /dev/null
+++ b/src/shared/logs-show.h
@@ -0,0 +1,77 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <sys/types.h>
+
+#include "sd-id128.h"
+#include "sd-journal.h"
+
+#include "macro.h"
+#include "output-mode.h"
+#include "time-util.h"
+
+/* Describes one boot found in the journal, as returned by journal_get_boots(). */
+typedef struct BootId {
+ sd_id128_t id; /* the boot ID */
+ usec_t first_usec; /* realtime timestamp of the first entry of this boot */
+ usec_t last_usec; /* realtime timestamp of the last entry of this boot */
+} BootId;
+
+/* Prints the current entry of 'j' to 'f' in the given output mode. The previous_display_ts and
+ * previous_boot_id arguments must be non-NULL; they are updated with the values of the entry shown. */
+int show_journal_entry(
+ FILE *f,
+ sd_journal *j,
+ OutputMode mode,
+ unsigned n_columns,
+ OutputFlags flags,
+ Set *output_fields,
+ const size_t highlight[2],
+ bool *ellipsized,
+ dual_timestamp *previous_display_ts,
+ sd_id128_t *previous_boot_id);
+/* Prints entries of 'j' to 'f': at most the last 'how_many' ones (or everything from the current position
+ * on if how_many is UINT_MAX), skipping entries older than 'not_before'. */
+int show_journal(
+ FILE *f,
+ sd_journal *j,
+ OutputMode mode,
+ unsigned n_columns,
+ usec_t not_before,
+ unsigned how_many,
+ OutputFlags flags,
+ bool *ellipsized);
+
+/* Add a _BOOT_ID= match for the given boot ID, or for the current boot of the local system (machine ==
+ * NULL) or of the named container. */
+int add_match_boot_id(sd_journal *j, sd_id128_t id);
+int add_match_this_boot(sd_journal *j, const char *machine);
+
+/* Add the matches selecting all entries related to the specified system unit. */
+int add_matches_for_unit(
+ sd_journal *j,
+ const char *unit);
+
+/* Add the matches selecting all entries related to the specified user unit of user 'uid'. */
+int add_matches_for_user_unit(
+ sd_journal *j,
+ const char *unit,
+ uid_t uid);
+
+/* Opens the journal for 'log_namespace' (plus the default namespace) and prints up to the last 'how_many'
+ * entries of the current boot that relate to 'unit'. 'system_unit' selects between the system unit and
+ * the user unit (of 'uid') matches. */
+int show_journal_by_unit(
+                FILE *f,
+                const char *unit,
+                const char *log_namespace,
+                OutputMode mode,
+                unsigned n_columns,
+                usec_t not_before,
+                unsigned how_many,
+                uid_t uid,
+                OutputFlags flags,
+                int journal_open_flags,
+                bool system_unit,
+                bool *ellipsized);
+
+/* NOTE(review): presumably writes the 'l' bytes at 'p' to 'f' escaped as a JSON value — the definition is
+ * not visible from this header, confirm against the implementation. */
+void json_escape(
+ FILE *f,
+ const char* p,
+ size_t l,
+ OutputFlags flags);
+
+/* Boot lookup helpers: position on the oldest entry of a boot ID, resolve a relative boot offset (0 =
+ * newest, 1 = oldest) to a boot ID, and enumerate all boots in the journal. */
+int journal_find_boot_by_id(sd_journal *j, sd_id128_t boot_id);
+int journal_find_boot_by_offset(sd_journal *j, int offset, sd_id128_t *ret);
+int journal_get_boots(sd_journal *j, BootId **ret_boots, size_t *ret_n_boots);
diff --git a/src/shared/loop-util.c b/src/shared/loop-util.c
new file mode 100644
index 0000000..5860303
--- /dev/null
+++ b/src/shared/loop-util.c
@@ -0,0 +1,1209 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#if HAVE_VALGRIND_MEMCHECK_H
+#include <valgrind/memcheck.h>
+#endif
+
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/blkpg.h>
+#include <linux/fs.h>
+#include <linux/loop.h>
+#include <sys/file.h>
+#include <sys/ioctl.h>
+#include <unistd.h>
+
+#include "sd-device.h"
+
+#include "alloc-util.h"
+#include "blockdev-util.h"
+#include "data-fd-util.h"
+#include "device-util.h"
+#include "devnum-util.h"
+#include "dissect-image.h"
+#include "env-util.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "fs-util.h"
+#include "fileio.h"
+#include "loop-util.h"
+#include "missing_loop.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "random-util.h"
+#include "stat-util.h"
+#include "stdio-util.h"
+#include "string-util.h"
+#include "tmpfile-util.h"
+
+/* Cleanup handler for a loop device fd: if *fd is valid, issues LOOP_CLR_FD on it and closes it. Errors
+ * of both operations are ignored. */
+static void cleanup_clear_loop_close(int *fd) {
+        if (*fd >= 0) {
+                (void) ioctl(*fd, LOOP_CLR_FD);
+                (void) safe_close(*fd);
+        }
+}
+
+/* Checks via LOOP_GET_STATUS64 whether the loop device referenced by 'fd' is bound. Returns true if
+ * it is, false if the ioctl fails with ENXIO, and -errno for any other ioctl failure. */
+static int loop_is_bound(int fd) {
+        struct loop_info64 info;
+
+        if (ioctl(ASSERT_FD(fd), LOOP_GET_STATUS64, &info) >= 0)
+                return true; /* bound! */
+
+        if (errno != ENXIO)
+                return -errno;
+
+        return false; /* not bound! */
+}
+
+/* Reads /sys/kernel/uevent_seqnum and parses it as unsigned 64-bit integer into *ret. Returns 0 on
+ * success, a negative errno-style value (logged at debug level) on failure. */
+static int get_current_uevent_seqnum(uint64_t *ret) {
+        _cleanup_free_ char *p = NULL;
+        char *text;
+        int r;
+
+        r = read_full_virtual_file("/sys/kernel/uevent_seqnum", &p, NULL);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to read current uevent sequence number: %m");
+
+        /* Strip surrounding whitespace before parsing */
+        text = strstrip(p);
+
+        r = safe_atou64(text, ret);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to parse current uevent sequence number: %s", p);
+
+        return 0;
+}
+
+/* Reopens 'primary_fd' as a separate read-only fd and takes a BSD file lock on it. 'operation' must be
+ * LOCK_SH or LOCK_EX, optionally combined with LOCK_NB. Returns the new, locked fd on success (ownership
+ * passes to the caller), or a negative errno-style value on failure. */
+static int open_lock_fd(int primary_fd, int operation) {
+        _cleanup_close_ int reopened = -EBADF;
+
+        assert(IN_SET(operation & ~LOCK_NB, LOCK_SH, LOCK_EX));
+
+        reopened = fd_reopen(ASSERT_FD(primary_fd), O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
+        if (reopened < 0)
+                return reopened;
+
+        if (flock(reopened, operation) >= 0)
+                return TAKE_FD(reopened);
+
+        return -errno;
+}
+
+/* If direct IO was requested in 'c', checks via LOOP_GET_STATUS64 whether the loop device 'fd' actually
+ * has it enabled. Returns 0 if direct IO was not requested or is in effect, -ENOANO if it was requested
+ * but is not in effect, and -errno if the status could not be queried. */
+static int loop_configure_verify_direct_io(int fd, const struct loop_config *c) {
+        struct loop_info64 info;
+
+        assert(fd >= 0);
+        assert(c);
+
+        /* Nothing to verify if direct IO was not requested */
+        if (!FLAGS_SET(c->info.lo_flags, LO_FLAGS_DIRECT_IO))
+                return 0;
+
+        if (ioctl(fd, LOOP_GET_STATUS64, &info) < 0)
+                return log_debug_errno(errno, "Failed to issue LOOP_GET_STATUS64: %m");
+
+#if HAVE_VALGRIND_MEMCHECK_H
+        VALGRIND_MAKE_MEM_DEFINED(&info, sizeof(info));
+#endif
+
+        /* On older kernels (<= 5.3) it was necessary to set the block size of the loopback block
+         * device to the logical block size of the underlying file system. Since there was no nice
+         * way to query the value, we are not bothering to do this however. On newer kernels the
+         * block size is propagated automatically and does not require intervention from us. We'll
+         * check here if enabling direct IO worked, to make this easily debuggable however.
+         *
+         * (Should anyone really care and actually wants direct IO on old kernels: it might be worth
+         * enabling direct IO with iteratively larger block sizes until it eventually works.)
+         *
+         * On older kernels (e.g.: 5.10) when this is attempted on a file stored on a dm-crypt
+         * backed partition the kernel will start returning I/O errors when accessing the mounted
+         * loop device, so return a recognizable error that causes the operation to be started
+         * from scratch without the LO_FLAGS_DIRECT_IO flag. */
+        if (FLAGS_SET(info.lo_flags, LO_FLAGS_DIRECT_IO))
+                return 0;
+
+        return log_debug_errno(
+                        SYNTHETIC_ERRNO(ENOANO),
+                        "Could not enable direct IO mode, retrying in buffered IO mode.");
+}
+
+/* Verifies that the settings requested in 'c' (block size, size limit, partition scanning, direct IO) are
+ * actually in effect on the loop device 'fd' after LOOP_CONFIGURE. Returns > 0 if everything checked out,
+ * 0 if block size, size limit or partition scanning were not honoured, and a negative errno-style value
+ * if a check could not be carried out or direct IO could not be enabled. */
+static int loop_configure_verify(int fd, const struct loop_config *c) {
+        bool honoured = true;
+        int r;
+
+        assert(fd >= 0);
+        assert(c);
+
+        if (c->block_size != 0) {
+                uint32_t ssz;
+
+                r = blockdev_get_sector_size(fd, &ssz);
+                if (r < 0)
+                        return r;
+
+                if (ssz != c->block_size) {
+                        log_debug("LOOP_CONFIGURE didn't honour requested block size %" PRIu32 ", got %" PRIu32 " instead. Ignoring.", c->block_size, ssz);
+                        honoured = false;
+                }
+        }
+
+        if (c->info.lo_sizelimit != 0) {
+                /* Kernel 5.8 vanilla doesn't properly propagate the size limit into the
+                 * block device. If it's used, let's immediately check if it had the desired
+                 * effect hence. And if not use classic LOOP_SET_STATUS64. */
+                uint64_t z;
+
+                if (ioctl(fd, BLKGETSIZE64, &z) < 0)
+                        return -errno;
+
+                if (z != c->info.lo_sizelimit) {
+                        log_debug("LOOP_CONFIGURE is broken, doesn't honour .info.lo_sizelimit. Falling back to LOOP_SET_STATUS64.");
+                        honoured = false;
+                }
+        }
+
+        if (FLAGS_SET(c->info.lo_flags, LO_FLAGS_PARTSCAN)) {
+                /* Kernel 5.8 vanilla doesn't properly propagate the partition scanning flag
+                 * into the block device. Let's hence verify if things work correctly here
+                 * before returning. */
+
+                r = blockdev_partscan_enabled(fd);
+                if (r < 0)
+                        return r;
+                if (r == 0) {
+                        log_debug("LOOP_CONFIGURE is broken, doesn't honour LO_FLAGS_PARTSCAN. Falling back to LOOP_SET_STATUS64.");
+                        honoured = false;
+                }
+        }
+
+        r = loop_configure_verify_direct_io(fd, c);
+        if (r < 0)
+                return r;
+
+        return honoured;
+}
+
+/* Configures the loop device 'fd' according to 'c' via the legacy LOOP_SET_STATUS64 ioctl, followed by
+ * separate ioctls for the block size and for direct IO. Returns 0 on success, or a negative errno-style
+ * value if the status could not be set, the requested block size is not in effect, or direct IO was
+ * requested but is not in effect (-ENOANO, see loop_configure_verify_direct_io()). */
+static int loop_configure_fallback(int fd, const struct loop_config *c) {
+ struct loop_info64 info_copy;
+ int r;
+
+ assert(fd >= 0);
+ assert(c);
+
+ /* Only some of the flags LOOP_CONFIGURE can set are also settable via LOOP_SET_STATUS64, hence mask
+ * them out. */
+ info_copy = c->info;
+ info_copy.lo_flags &= LOOP_SET_STATUS_SETTABLE_FLAGS;
+
+ /* Since kernel commit 5db470e229e22b7eda6e23b5566e532c96fb5bc3 (kernel v5.0) the LOOP_SET_STATUS64
+ * ioctl can return EAGAIN in case we change the info.lo_offset field, if someone else is accessing the
+ * block device while we try to reconfigure it. This is a pretty common case, since udev might
+ * instantly start probing the device as soon as we attach an fd to it. Hence handle it in two ways:
+ * first, let's take the BSD lock to ensure that udev will not step in between the point in
+ * time where we attach the fd and where we reconfigure the device. Secondly, let's wait 50ms on
+ * EAGAIN and retry. The former should be an efficient mechanism to avoid we have to wait 50ms
+ * needlessly if we are just racing against udev. The latter is protection against all other cases,
+ * i.e. peers that do not take the BSD lock. */
+
+ /* Retry on EAGAIN only, and give up after 64 failed attempts */
+ for (unsigned n_attempts = 0;;) {
+ if (ioctl(fd, LOOP_SET_STATUS64, &info_copy) >= 0)
+ break;
+
+ if (errno != EAGAIN || ++n_attempts >= 64)
+ return log_debug_errno(errno, "Failed to configure loopback block device: %m");
+
+ /* Sleep some random time, but at least 10ms, at most 250ms. Increase the delay the more
+ * failed attempts we see */
+ (void) usleep_safe(UINT64_C(10) * USEC_PER_MSEC +
+ random_u64_range(UINT64_C(240) * USEC_PER_MSEC * n_attempts/64));
+ }
+
+ /* Work around a kernel bug, where changing offset/size of the loopback device doesn't correctly
+ * invalidate the buffer cache. For details see:
+ *
+ * https://android.googlesource.com/platform/system/apex/+/bef74542fbbb4cd629793f4efee8e0053b360570
+ *
+ * This was fixed in kernel 5.0, see:
+ *
+ * https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5db470e229e22b7eda6e23b5566e532c96fb5bc3
+ *
+ * We'll run the work-around here in the legacy LOOP_SET_STATUS64 codepath. In the LOOP_CONFIGURE
+ * codepath above it should not be necessary. */
+ if (c->info.lo_offset != 0 || c->info.lo_sizelimit != 0)
+ if (ioctl(fd, BLKFLSBUF, 0) < 0)
+ log_debug_errno(errno, "Failed to issue BLKFLSBUF ioctl, ignoring: %m");
+
+ /* If a block size is requested then try to configure it. If that doesn't work, ignore errors, but
+ * afterwards, let's validate what is in effect, and if it doesn't match what we want, fail */
+ if (c->block_size != 0) {
+ uint32_t ssz;
+
+ if (ioctl(fd, LOOP_SET_BLOCK_SIZE, (unsigned long) c->block_size) < 0)
+ log_debug_errno(errno, "Failed to set sector size, ignoring: %m");
+
+ r = blockdev_get_sector_size(fd, &ssz);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to read sector size: %m");
+ if (ssz != c->block_size)
+ return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Sector size of loopback device doesn't match what we requested, refusing.");
+ }
+
+ /* LO_FLAGS_DIRECT_IO is a flag we need to configure via explicit ioctls. */
+ if (FLAGS_SET(c->info.lo_flags, LO_FLAGS_DIRECT_IO))
+ if (ioctl(fd, LOOP_SET_DIRECT_IO, 1UL) < 0)
+ log_debug_errno(errno, "Failed to enable direct IO mode, ignoring: %m");
+
+ /* The ioctl failure above is only logged; this check decides whether direct IO is really on. */
+ return loop_configure_verify_direct_io(fd, c);
+}
+
+static int loop_configure(
+ int nr,
+ int open_flags,
+ int lock_op,
+ const struct loop_config *c,
+ LoopDevice **ret) {
+ /* Attaches the backing fd described by 'c' to the loopback block device /dev/loop<nr> and, on
+ * success, returns a newly allocated LoopDevice object (n_ref = 1) in *ret.
+ *
+ * 'open_flags' is or'ed into the flags used to open the device node. 'lock_op' selects the BSD lock
+ * state the returned object is left in: LOCK_EX (kept as acquired here), LOCK_SH (downgraded) or
+ * LOCK_UN (the lock fd is closed); LOCK_NB may be or'ed in.
+ *
+ * Returns 0 on success, a negative errno-style value on failure. In particular: -EBUSY if the device
+ * already has a backing file bound, or if LOOP_CONFIGURE succeeded but loop_configure_verify()
+ * reported that the configuration is not in effect; -EUCLEAN if left-over partitions had to be
+ * removed first. */
+ static bool loop_configure_broken = false; /* Set once LOOP_CONFIGURE turned out to be unsupported or not working; from then on only the LOOP_SET_FD path below is used */
+
+ _cleanup_(sd_device_unrefp) sd_device *dev = NULL;
+ _cleanup_(cleanup_clear_loop_close) int loop_with_fd = -EBADF; /* This must be declared before lock_fd. */
+ _cleanup_close_ int fd = -EBADF, lock_fd = -EBADF;
+ _cleanup_free_ char *node = NULL;
+ uint64_t diskseq = 0, seqnum = UINT64_MAX;
+ usec_t timestamp = USEC_INFINITY;
+ dev_t devno;
+ int r;
+
+ assert(nr >= 0);
+ assert(c);
+ assert(ret);
+
+ if (asprintf(&node, "/dev/loop%i", nr) < 0)
+ return log_oom_debug();
+
+ r = sd_device_new_from_devname(&dev, node);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to create sd_device object for \"%s\": %m", node);
+
+ r = sd_device_get_devnum(dev, &devno);
+ if (r < 0)
+ return log_device_debug_errno(dev, r, "Failed to get devnum: %m");
+
+ fd = sd_device_open(dev, O_CLOEXEC|O_NONBLOCK|O_NOCTTY|open_flags);
+ if (fd < 0)
+ return log_device_debug_errno(dev, fd, "Failed to open device: %m");
+
+ /* Let's lock the device before we do anything. We take the BSD lock on a second, separately opened
+ * fd for the device. udev after all watches for close() events (specifically IN_CLOSE_WRITE) on
+ * block devices to reprobe them, hence by having a separate fd we will later close() we can ensure
+ * we trigger udev after everything is done. If we'd lock our own fd instead and keep it open for a
+ * long time udev would possibly never run on it again, even though the fd is unlocked, simply
+ * because we never close() it. It also has the nice benefit we can use the _cleanup_close_ logic to
+ * automatically release the lock, after we are done. */
+ lock_fd = open_lock_fd(fd, LOCK_EX);
+ if (lock_fd < 0)
+ return log_device_debug_errno(dev, lock_fd, "Failed to acquire lock: %m");
+
+ log_device_debug(dev, "Acquired exclusive lock.");
+
+ /* Let's see if backing file is really unattached. Someone may already attach a backing file without
+ * taking BSD lock. */
+ r = loop_is_bound(fd);
+ if (r < 0)
+ return log_device_debug_errno(dev, r, "Failed to check if the loopback block device is bound: %m");
+ if (r > 0)
+ return log_device_debug_errno(dev, SYNTHETIC_ERRNO(EBUSY),
+ "The loopback block device is already bound, ignoring.");
+
+ /* Let's see if the device is really detached, i.e. currently has no associated partition block
+ * devices. On various kernels (such as 5.8) it is possible to have a loopback block device that
+ * superficially is detached but still has partition block devices associated for it. Let's then
+ * manually remove the partitions via BLKPG, and tell the caller we did that via EUCLEAN, so they try
+ * again. */
+ r = block_device_remove_all_partitions(dev, fd);
+ if (r < 0)
+ return log_device_debug_errno(dev, r, "Failed to remove partitions on the loopback block device: %m");
+ if (r > 0)
+ /* Removed all partitions. Let's report this to the caller, to try again, and count this as
+ * an attempt. */
+ return log_device_debug_errno(dev, SYNTHETIC_ERRNO(EUCLEAN),
+ "Removed partitions on the loopback block device.");
+
+ if (!loop_configure_broken) {
+ /* Acquire uevent seqnum immediately before attaching the loopback device. This allows
+ * callers to ignore all uevents with a seqnum before this one, if they need to associate
+ * uevent with this attachment. Doing so isn't race-free though, as uevents that happen in
+ * the window between this reading of the seqnum, and the LOOP_CONFIGURE call might still be
+ * mistaken as originating from our attachment, even though might be caused by an earlier
+ * use. But doing this at least shortens the race window a bit. */
+ r = get_current_uevent_seqnum(&seqnum);
+ if (r < 0)
+ return log_device_debug_errno(dev, r, "Failed to get the current uevent seqnum: %m");
+
+ timestamp = now(CLOCK_MONOTONIC);
+
+ if (ioctl(fd, LOOP_CONFIGURE, c) < 0) {
+ /* Do fallback only if LOOP_CONFIGURE is not supported, propagate all other
+ * errors. Note that the kernel is weird: non-existing ioctls currently return EINVAL
+ * rather than ENOTTY on loopback block devices. They should fix that in the kernel,
+ * but in the meantime we accept both here. */
+ if (!ERRNO_IS_NOT_SUPPORTED(errno) && errno != EINVAL)
+ return log_device_debug_errno(dev, errno, "ioctl(LOOP_CONFIGURE) failed: %m");
+
+ loop_configure_broken = true;
+ } else {
+ loop_with_fd = TAKE_FD(fd); /* The device is attached now; from here on the fd is tracked in loop_with_fd, whose cleanup handler is cleanup_clear_loop_close() */
+
+ r = loop_configure_verify(loop_with_fd, c);
+ if (r < 0)
+ return log_device_debug_errno(dev, r, "Failed to verify if loopback block device is correctly configured: %m");
+ if (r == 0) {
+ /* LOOP_CONFIGURE doesn't work. Remember that. */
+ loop_configure_broken = true;
+
+ /* We return EBUSY here instead of retrying immediately with LOOP_SET_FD,
+ * because LOOP_CLR_FD is async: if the operation cannot be executed right
+ * away it just sets the autoclear flag on the device. This means there's a
+ * good chance we cannot actually reuse the loopback device right-away. Hence
+ * let's assume it's busy, avoid the trouble and let the calling loop call us
+ * again with a new, likely unused device. */
+ return -EBUSY;
+ }
+ }
+ }
+
+ if (loop_configure_broken) {
+ /* Let's read the seqnum again, to shorten the window. */
+ r = get_current_uevent_seqnum(&seqnum);
+ if (r < 0)
+ return log_device_debug_errno(dev, r, "Failed to get the current uevent seqnum: %m");
+
+ timestamp = now(CLOCK_MONOTONIC);
+
+ if (ioctl(fd, LOOP_SET_FD, c->fd) < 0)
+ return log_device_debug_errno(dev, errno, "ioctl(LOOP_SET_FD) failed: %m");
+
+ loop_with_fd = TAKE_FD(fd);
+
+ r = loop_configure_fallback(loop_with_fd, c);
+ if (r < 0)
+ return r;
+ }
+
+ r = fd_get_diskseq(loop_with_fd, &diskseq);
+ if (r < 0 && r != -EOPNOTSUPP) /* -EOPNOTSUPP is tolerated: diskseq then remains at its initial value of 0 */
+ return log_device_debug_errno(dev, r, "Failed to get diskseq: %m");
+ /* So far we hold LOCK_EX on lock_fd; now bring the lock into the state the caller asked for. */
+ switch (lock_op & ~LOCK_NB) {
+ case LOCK_EX: /* Already in effect */
+ break;
+ case LOCK_SH: /* Downgrade */
+ if (flock(lock_fd, lock_op) < 0)
+ return log_device_debug_errno(dev, errno, "Failed to downgrade lock level: %m");
+ break;
+ case LOCK_UN: /* Release */
+ lock_fd = safe_close(lock_fd);
+ break;
+ default:
+ assert_not_reached();
+ }
+
+ LoopDevice *d = new(LoopDevice, 1);
+ if (!d)
+ return log_oom_debug();
+
+ *d = (LoopDevice) {
+ .n_ref = 1,
+ .fd = TAKE_FD(loop_with_fd),
+ .lock_fd = TAKE_FD(lock_fd),
+ .node = TAKE_PTR(node),
+ .nr = nr,
+ .devno = devno,
+ .dev = TAKE_PTR(dev),
+ .diskseq = diskseq,
+ .uevent_seqnum_not_before = seqnum,
+ .timestamp_not_before = timestamp,
+ .sector_size = c->block_size,
+ };
+
+ *ret = TAKE_PTR(d);
+ return 0;
+}
+
+static int loop_device_make_internal(
+ const char *path,
+ int fd,
+ int open_flags,
+ uint64_t offset,
+ uint64_t size,
+ uint32_t sector_size,
+ uint32_t loop_flags,
+ int lock_op,
+ LoopDevice **ret) {
+ /* Sets up a loopback block device for the regular file or block device referenced by 'fd' and
+ * returns it in *ret.
+ *
+ * 'path' is optional: if set it is made absolute and simplified and recorded as backing file path,
+ * otherwise the path is derived from the fd via fd_get_path(). 'open_flags' must be O_RDWR or
+ * O_RDONLY. 'size' may be 0 or UINT64_MAX for "no size limit". 'sector_size' may be 0 (use 512) or
+ * UINT32_MAX (take it from the underlying block device, or probe it from the image contents).
+ *
+ * If 'fd' refers to a block device and all of it shall be covered (offset == 0, no size limit) no
+ * loopback device is allocated; the block device itself is wrapped via loop_device_open_from_fd().
+ *
+ * Allocation is attempted up to 64 times, see below. Returns 0 on success, a negative errno-style
+ * value on failure (-EBUSY once the attempts are exhausted). */
+ _cleanup_(loop_device_unrefp) LoopDevice *d = NULL;
+ _cleanup_close_ int reopened_fd = -EBADF, control = -EBADF;
+ _cleanup_free_ char *backing_file = NULL;
+ struct loop_config config;
+ int r, f_flags;
+ struct stat st;
+
+ assert(ret);
+ assert(IN_SET(open_flags, O_RDWR, O_RDONLY));
+
+ if (fstat(ASSERT_FD(fd), &st) < 0)
+ return -errno;
+
+ if (S_ISBLK(st.st_mode)) {
+ if (offset == 0 && IN_SET(size, 0, UINT64_MAX))
+ /* If this is already a block device and we are supposed to cover the whole of it
+ * then store an fd to the original open device node — and do not actually create an
+ * unnecessary loopback device for it. */
+ return loop_device_open_from_fd(fd, open_flags, lock_op, ret);
+ } else {
+ r = stat_verify_regular(&st);
+ if (r < 0)
+ return r;
+ }
+
+ if (path) {
+ r = path_make_absolute_cwd(path, &backing_file);
+ if (r < 0)
+ return r;
+
+ path_simplify(backing_file);
+ } else {
+ r = fd_get_path(fd, &backing_file);
+ if (r < 0)
+ return r;
+ }
+
+ f_flags = fcntl(fd, F_GETFL);
+ if (f_flags < 0)
+ return -errno;
+
+ if (FLAGS_SET(loop_flags, LO_FLAGS_DIRECT_IO) != FLAGS_SET(f_flags, O_DIRECT)) {
+ /* If LO_FLAGS_DIRECT_IO is requested, then make sure we have the fd open with O_DIRECT, as
+ * that's required. Conversely, if it's off require that O_DIRECT is off too (that's because
+ * new kernels will implicitly enable LO_FLAGS_DIRECT_IO if O_DIRECT is set).
+ *
+ * Our intention here is that LO_FLAGS_DIRECT_IO is the primary knob, and O_DIRECT derived
+ * from that automatically. */
+
+ reopened_fd = fd_reopen(fd, (FLAGS_SET(loop_flags, LO_FLAGS_DIRECT_IO) ? O_DIRECT : 0)|O_CLOEXEC|O_NONBLOCK|open_flags);
+ if (reopened_fd < 0) {
+ if (!FLAGS_SET(loop_flags, LO_FLAGS_DIRECT_IO))
+ return log_debug_errno(reopened_fd, "Failed to reopen file descriptor without O_DIRECT: %m");
+
+ /* Some file systems might not support O_DIRECT, let's gracefully continue without it then. */
+ log_debug_errno(reopened_fd, "Failed to enable O_DIRECT for backing file descriptor for loopback device. Continuing without.");
+ loop_flags &= ~LO_FLAGS_DIRECT_IO;
+ } else
+ fd = reopened_fd; /* From now on, operate on our new O_DIRECT fd */
+ }
+
+ control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
+ if (control < 0)
+ return -errno;
+
+ if (sector_size == 0)
+ /* If no sector size is specified, default to the classic default */
+ sector_size = 512;
+ else if (sector_size == UINT32_MAX) {
+
+ if (S_ISBLK(st.st_mode))
+ /* If the sector size is specified as UINT32_MAX we'll propagate the sector size of
+ * the underlying block device. */
+ r = blockdev_get_sector_size(fd, &sector_size);
+ else {
+ _cleanup_close_ int non_direct_io_fd = -EBADF;
+ int probe_fd;
+
+ assert(S_ISREG(st.st_mode));
+
+ /* If sector size is specified as UINT32_MAX, we'll try to probe the right sector
+ * size of the image in question by looking for the GPT partition header at various
+ * offsets. This of course only works if the image already has a disk label.
+ *
+ * So here we actually want to read the file contents ourselves. This is quite likely
+ * not going to work if we managed to enable O_DIRECT, because in such a case there
+ * are some pretty strict alignment requirements to offset, size and target, but
+ * there's no way to query what alignment specifically is actually required. Hence,
+ * let's avoid the mess, and temporarily open an fd without O_DIRECT for the probing
+ * logic. */
+
+ if (FLAGS_SET(loop_flags, LO_FLAGS_DIRECT_IO)) {
+ non_direct_io_fd = fd_reopen(fd, O_RDONLY|O_CLOEXEC|O_NONBLOCK);
+ if (non_direct_io_fd < 0)
+ return non_direct_io_fd;
+
+ probe_fd = non_direct_io_fd;
+ } else
+ probe_fd = fd;
+
+ r = probe_sector_size(probe_fd, &sector_size);
+ }
+ if (r < 0)
+ return r;
+ }
+
+ config = (struct loop_config) {
+ .fd = fd,
+ .block_size = sector_size,
+ .info = {
+ /* Use the specified flags, but configure the read-only flag from the open flags, and force autoclear */
+ .lo_flags = (loop_flags & ~LO_FLAGS_READ_ONLY) | ((open_flags & O_ACCMODE) == O_RDONLY ? LO_FLAGS_READ_ONLY : 0) | LO_FLAGS_AUTOCLEAR,
+ .lo_offset = offset,
+ .lo_sizelimit = size == UINT64_MAX ? 0 : size,
+ },
+ };
+
+ /* Loop around LOOP_CTL_GET_FREE, since at the moment we attempt to open the returned device it might
+ * be gone already, taken by somebody else racing against us. */
+ for (unsigned n_attempts = 0;;) {
+ usec_t usec;
+ int nr;
+
+ /* Let's take a lock on the control device first. On a busy system, where many programs
+ * attempt to allocate a loopback device at the same time, we might otherwise keep looping
+ * around relatively heavy operations: asking for a free loopback device, then opening it,
+ * validating it, attaching something to it. Let's serialize this whole operation, to make
+ * unnecessary busywork less likely. Note that this is just something we do to optimize our
+ * own code (and whoever else decides to use LOCK_EX locks for this), taking this lock is not
+ * necessary, it just means it's less likely we have to iterate through this loop again and
+ * again if our own code races against our own code.
+ *
+ * Note: our lock protocol is to take the /dev/loop-control lock first, and the block device
+ * lock second, if both are taken, and always in this order, to avoid ABBA locking issues. */
+ if (flock(control, LOCK_EX) < 0)
+ return -errno;
+
+ nr = ioctl(control, LOOP_CTL_GET_FREE);
+ if (nr < 0)
+ return -errno;
+
+ r = loop_configure(nr, open_flags, lock_op, &config, &d);
+ if (r >= 0)
+ break;
+
+ /* -ENODEV or friends: Somebody might've gotten the same number from the kernel, used the
+ * device, and called LOOP_CTL_REMOVE on it. Let's retry with a new number.
+ * -EBUSY: a file descriptor is already bound to the loopback block device.
+ * -EUCLEAN: some left-over partition devices that were cleaned up.
+ * -ENOANO: we tried to use LO_FLAGS_DIRECT_IO but the kernel rejected it. */
+ if (!ERRNO_IS_DEVICE_ABSENT(r) && !IN_SET(r, -EBUSY, -EUCLEAN, -ENOANO))
+ return r;
+
+ /* OK, this didn't work, let's try again a bit later, but first release the lock on the
+ * control device */
+ if (flock(control, LOCK_UN) < 0)
+ return -errno;
+
+ if (++n_attempts >= 64) /* Give up eventually */
+ return -EBUSY;
+
+ /* If we failed to enable direct IO mode, let's retry without it. We restart the process as
+ * on some combination of kernel version and storage filesystem, the kernel is very unhappy
+ * about a failed DIRECT_IO enablement and throws I/O errors. */
+ if (r == -ENOANO && FLAGS_SET(config.info.lo_flags, LO_FLAGS_DIRECT_IO)) {
+ config.info.lo_flags &= ~LO_FLAGS_DIRECT_IO;
+ open_flags &= ~O_DIRECT;
+
+ int non_direct_io_fd = fd_reopen(config.fd, O_CLOEXEC|O_NONBLOCK|open_flags);
+ if (non_direct_io_fd < 0)
+ return log_debug_errno(
+ non_direct_io_fd,
+ "Failed to reopen file descriptor without O_DIRECT: %m");
+
+ safe_close(reopened_fd); /* Drop the O_DIRECT fd we may have opened earlier, it is replaced just below */
+ fd = config.fd = /* For cleanups */ reopened_fd = non_direct_io_fd;
+ }
+
+ /* Wait some random time, to make collision less likely. Let's pick a random time in the
+ * range 0ms…250ms, linearly scaled by the number of failed attempts. */
+ usec = random_u64_range(UINT64_C(10) * USEC_PER_MSEC +
+ UINT64_C(240) * USEC_PER_MSEC * n_attempts/64);
+ log_debug("Trying again after %s.", FORMAT_TIMESPAN(usec, USEC_PER_MSEC));
+ (void) usleep_safe(usec);
+ }
+ /* loop_configure() does not fill in the backing file fields, do so here, from what we determined above */
+ d->backing_file = TAKE_PTR(backing_file);
+ d->backing_inode = st.st_ino;
+ d->backing_devno = st.st_dev;
+
+ log_debug("Successfully acquired %s, devno=%u:%u, nr=%i, diskseq=%" PRIu64,
+ d->node,
+ major(d->devno), minor(d->devno),
+ d->nr,
+ d->diskseq);
+
+ *ret = TAKE_PTR(d);
+ return 0;
+}
+
+static uint32_t loop_flags_mangle(uint32_t loop_flags) {
+        int b;
+
+        /* Applies the $SYSTEMD_LOOP_DIRECT_IO environment variable to the specified loop flags and
+         * returns the result: LO_FLAGS_DIRECT_IO is cleared only if the variable explicitly parses as
+         * false, in all other cases (true, or any error from getenv_bool()) it is set. */
+
+        b = getenv_bool("SYSTEMD_LOOP_DIRECT_IO");
+        if (b < 0 && b != -ENXIO)
+                log_debug_errno(b, "Failed to parse $SYSTEMD_LOOP_DIRECT_IO, ignoring: %m");
+
+        if (b == 0) /* Explicitly configured to off */
+                return loop_flags & ~LO_FLAGS_DIRECT_IO;
+
+        return loop_flags | LO_FLAGS_DIRECT_IO; /* On by default */
+}
+
+int loop_device_make(
+                int fd,
+                int open_flags,
+                uint64_t offset,
+                uint64_t size,
+                uint32_t sector_size,
+                uint32_t loop_flags,
+                int lock_op,
+                LoopDevice **ret) {
+
+        uint32_t mangled_flags;
+
+        /* Creates a loopback device for an already opened fd. No path is passed along, and the loop
+         * flags are run through loop_flags_mangle() first. */
+
+        assert(fd >= 0);
+        assert(ret);
+
+        mangled_flags = loop_flags_mangle(loop_flags);
+
+        return loop_device_make_internal(
+                        /* path = */ NULL,
+                        fd,
+                        open_flags,
+                        offset,
+                        size,
+                        sector_size,
+                        mangled_flags,
+                        lock_op,
+                        ret);
+}
+
+/* Opens 'path' relative to 'dir_fd' with 'flags' plus 'direct_flags'. If that fails and 'direct_flags'
+ * is non-zero, the open is immediately retried once without it, and '*direct' is left untouched. If no
+ * retry takes place '*direct' is set to whether 'direct_flags' is non-zero. Returns whatever xopenat()
+ * returned for the last attempt. */
+static int loop_open_maybe_direct(
+                int dir_fd,
+                const char *path,
+                int flags,
+                int direct_flags,
+                bool *direct) {
+
+        int fd;
+
+        fd = xopenat(dir_fd, path, flags|direct_flags, /* xopen_flags = */ 0, /* mode = */ 0);
+        if (fd < 0 && direct_flags != 0)
+                return xopenat(dir_fd, path, flags, /* xopen_flags = */ 0, /* mode = */ 0);
+
+        *direct = direct_flags != 0;
+        return fd;
+}
+
+int loop_device_make_by_path_at(
+                int dir_fd,
+                const char *path,
+                int open_flags,
+                uint32_t sector_size,
+                uint32_t loop_flags,
+                int lock_op,
+                LoopDevice **ret) {
+
+        _cleanup_close_ int fd = -EBADF;
+        int basic_flags, direct_flags, rdwr_flags;
+        bool direct = false;
+
+        assert(dir_fd >= 0 || dir_fd == AT_FDCWD);
+        assert(path);
+        assert(ret);
+        assert(open_flags < 0 || IN_SET(open_flags, O_RDWR, O_RDONLY));
+
+        /* A negative open_flags value means: try to open the backing file writable first, and retry
+         * read-only if that is not permitted. */
+
+        loop_flags = loop_flags_mangle(loop_flags);
+
+        /* O_DIRECT is used where possible, but since not all file systems support it every open attempt
+         * automatically falls back to non-O_DIRECT mode. */
+
+        basic_flags = O_CLOEXEC|O_NONBLOCK|O_NOCTTY;
+        direct_flags = FLAGS_SET(loop_flags, LO_FLAGS_DIRECT_IO) ? O_DIRECT : 0;
+        rdwr_flags = open_flags >= 0 ? open_flags : O_RDWR;
+
+        fd = loop_open_maybe_direct(dir_fd, path, basic_flags|rdwr_flags, direct_flags, &direct);
+        if (fd >= 0) {
+                if (open_flags < 0)
+                        open_flags = O_RDWR;
+        } else {
+                int r = fd;
+
+                /* Retry read-only? Only if the caller left the access mode to us, and the failure is a
+                 * permission problem or a read-only file system. */
+                if (open_flags >= 0)
+                        return r;
+                if (!ERRNO_IS_PRIVILEGE(r) && r != -EROFS)
+                        return r;
+
+                fd = loop_open_maybe_direct(dir_fd, path, basic_flags|O_RDONLY, direct_flags, &direct);
+                if (fd < 0)
+                        return r; /* Propagate original error */
+
+                open_flags = O_RDONLY;
+        }
+
+        log_debug("Opened '%s' in %s access mode%s, with O_DIRECT %s%s.",
+                  path,
+                  open_flags == O_RDWR ? "O_RDWR" : "O_RDONLY",
+                  open_flags != rdwr_flags ? " (O_RDWR was requested but not allowed)" : "",
+                  direct ? "enabled" : "disabled",
+                  direct != (direct_flags != 0) ? " (O_DIRECT was requested but not supported)" : "");
+
+        /* The path is only passed on if it is relative to the current working directory. */
+        return loop_device_make_internal(
+                        dir_fd == AT_FDCWD ? path : NULL,
+                        fd,
+                        open_flags,
+                        /* offset = */ 0,
+                        /* size = */ 0,
+                        sector_size,
+                        loop_flags,
+                        lock_op,
+                        ret);
+}
+
+int loop_device_make_by_path_memory(
+                const char *path,
+                int open_flags,
+                uint32_t sector_size,
+                uint32_t loop_flags,
+                int lock_op,
+                LoopDevice **ret) {
+
+        _cleanup_close_ int source_fd = -EBADF, memory_fd = -EBADF;
+        _cleanup_free_ char *filename = NULL;
+        struct stat source_st;
+        int r;
+
+        /* Clones the regular file or block device at 'path' via memfd_clone_fd() and sets up a loopback
+         * device on top of the clone instead of the original. */
+
+        assert(path);
+        assert(IN_SET(open_flags, O_RDWR, O_RDONLY));
+        assert(ret);
+
+        /* memfds don't support O_DIRECT, hence LO_FLAGS_DIRECT_IO can't be used either */
+        loop_flags &= ~LO_FLAGS_DIRECT_IO;
+
+        source_fd = open(path, O_CLOEXEC|O_NONBLOCK|O_NOCTTY|O_RDONLY);
+        if (source_fd < 0)
+                return -errno;
+
+        if (fstat(source_fd, &source_st) < 0)
+                return -errno;
+
+        if (!(S_ISREG(source_st.st_mode) || S_ISBLK(source_st.st_mode)))
+                return -EBADF;
+
+        r = path_extract_filename(path, &filename);
+        if (r < 0)
+                return r;
+
+        memory_fd = memfd_clone_fd(source_fd, filename, open_flags|O_CLOEXEC);
+        if (memory_fd < 0)
+                return memory_fd;
+
+        /* The original is not needed anymore, let's close it early */
+        source_fd = safe_close(source_fd);
+
+        return loop_device_make_internal(
+                        /* path = */ NULL,
+                        memory_fd,
+                        open_flags,
+                        /* offset = */ 0,
+                        /* size = */ 0,
+                        sector_size,
+                        loop_flags,
+                        lock_op,
+                        ret);
+}
+
+static LoopDevice* loop_device_free(LoopDevice *d) {
+ _cleanup_close_ int control = -EBADF;
+ int r;
+ /* Releases all resources of the LoopDevice object. Unless the object wraps a foreign (non-loopback)
+ * block device or has been relinquished, the loopback device is also detached (LOOP_CLR_FD) and then
+ * removed (LOOP_CTL_REMOVE). All failures on the way are logged at debug level and otherwise ignored.
+ * Accepts NULL, in which case NULL is returned; otherwise returns the result of mfree(). */
+ if (!d)
+ return NULL;
+
+ /* Release any lock we might have on the device first. We want to open+lock the /dev/loop-control
+ * device below, but our lock protocol says that if both control and block device locks are taken,
+ * the control lock needs to be taken first, the block device lock second — in order to avoid ABBA
+ * locking issues. Moreover, we want to issue LOOP_CLR_FD on the block device further down, and that
+ * would fail if we had another fd open to the device. */
+ d->lock_fd = safe_close(d->lock_fd);
+
+ /* Let's open the control device early, and lock it, so that we can release our block device and
+ * delete it in a synchronized fashion, and allocators won't needlessly see the block device as free
+ * while we are about to delete it. */
+ if (!LOOP_DEVICE_IS_FOREIGN(d) && !d->relinquished) {
+ control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
+ if (control < 0)
+ log_debug_errno(errno, "Failed to open loop control device, cannot remove loop device '%s', ignoring: %m", strna(d->node));
+ else if (flock(control, LOCK_EX) < 0)
+ log_debug_errno(errno, "Failed to lock loop control device, ignoring: %m");
+ }
+
+ /* Then let's release the loopback block device */
+ if (d->fd >= 0) {
+ /* Implicitly sync the device, since otherwise in-flight blocks might not get written */
+ if (fsync(d->fd) < 0)
+ log_debug_errno(errno, "Failed to sync loop block device, ignoring: %m");
+
+ if (!LOOP_DEVICE_IS_FOREIGN(d) && !d->relinquished) {
+ /* We are supposed to clear the loopback device. Let's do this synchronously: lock
+ * the device, manually remove all partitions and then clear it. This should ensure
+ * udev doesn't concurrently access the devices, and we can be reasonably sure that
+ * once we are done here the device is cleared and all its partition children
+ * removed. Note that we lock our primary device fd here (and not a separate locking
+ * fd, as we do during allocation, since we want to keep the lock all the way through
+ * the LOOP_CLR_FD, but that call would fail if we had more than one fd open.) */
+
+ if (flock(d->fd, LOCK_EX) < 0)
+ log_debug_errno(errno, "Failed to lock loop block device, ignoring: %m");
+
+ r = block_device_remove_all_partitions(d->dev, d->fd);
+ if (r < 0)
+ log_debug_errno(r, "Failed to remove partitions of loopback block device, ignoring: %m");
+
+ if (ioctl(d->fd, LOOP_CLR_FD) < 0)
+ log_debug_errno(errno, "Failed to clear loop device, ignoring: %m");
+ }
+
+ safe_close(d->fd);
+ }
+
+ /* Now that the block device is released, let's also try to remove it */
+ if (control >= 0) {
+ useconds_t delay = 5 * USEC_PER_MSEC; /* A total delay of 5730 ms between 39 attempts,
+ * (4*5 + 5*10 + 5*20 + … + 5*320 + 4*640) = 5730: we sleep after each of
+ * the first 38 failed attempts, and the delay doubles on every 5th attempt. */
+
+ for (unsigned attempt = 1;; attempt++) {
+ if (ioctl(control, LOOP_CTL_REMOVE, d->nr) >= 0)
+ break;
+ if (errno != EBUSY || attempt > 38) { /* Only EBUSY is retried, and only until the 39th attempt failed */
+ log_debug_errno(errno, "Failed to remove device %s: %m", strna(d->node));
+ break;
+ }
+ if (attempt % 5 == 0) {
+ log_debug("Device is still busy after %u attempts…", attempt);
+ delay *= 2;
+ }
+
+ (void) usleep_safe(delay);
+ }
+ }
+
+ free(d->node);
+ sd_device_unref(d->dev);
+ free(d->backing_file);
+ return mfree(d);
+}
+
+DEFINE_TRIVIAL_REF_UNREF_FUNC(LoopDevice, loop_device, loop_device_free); /* NOTE(review): presumably expands to loop_device_ref()/loop_device_unref() operating on n_ref, with loop_device_free() as destructor — confirm against the macro definition */
+
+void loop_device_relinquish(LoopDevice *d) {
+        /* Marks the object so that releasing it no longer clears or removes the loop device. Cleaning
+         * up is then left to the kernel itself, via the loop device "auto-clear" logic that was turned
+         * on when the device was created. */
+
+        assert(d);
+
+        d->relinquished = true;
+}
+
+void loop_device_unrelinquish(LoopDevice *d) {
+        /* Reverses loop_device_relinquish(): the object is again responsible for clearing and removing
+         * the loop device when it is released. */
+
+        assert(d);
+
+        d->relinquished = false;
+}
+
+int loop_device_open(
+ sd_device *dev,
+ int open_flags,
+ int lock_op,
+ LoopDevice **ret) {
+ /* Wraps the already existing block device 'dev' in a new LoopDevice object, returned in *ret. The
+ * object is created with .relinquished = true, i.e. the device is not cleared or removed when the
+ * object is freed. If LOOP_GET_STATUS64 fails on the device, .nr remains -1 and the backing file
+ * fields remain unset. 'open_flags' must be O_RDWR or O_RDONLY. Unless 'lock_op' is LOCK_UN a BSD
+ * lock of the requested kind is taken on a separate fd. Returns 0 on success, a negative errno-style
+ * value on failure. */
+ _cleanup_close_ int fd = -EBADF, lock_fd = -EBADF;
+ _cleanup_free_ char *node = NULL, *backing_file = NULL;
+ dev_t devnum, backing_devno = 0;
+ struct loop_info64 info;
+ ino_t backing_inode = 0;
+ uint64_t diskseq = 0;
+ LoopDevice *d;
+ const char *s;
+ int r, nr = -1;
+
+ assert(dev);
+ assert(IN_SET(open_flags, O_RDWR, O_RDONLY));
+ assert(ret);
+
+ /* Even if fd is provided through the argument in loop_device_open_from_fd(), we reopen the inode
+ * here, instead of keeping just a dup() clone of it around, since we want to ensure that the
+ * O_DIRECT flag of the handle we keep is off, we have our own file index, and have the right
+ * read/write mode in effect. */
+ fd = sd_device_open(dev, O_CLOEXEC|O_NONBLOCK|O_NOCTTY|open_flags);
+ if (fd < 0)
+ return fd;
+
+ if ((lock_op & ~LOCK_NB) != LOCK_UN) {
+ lock_fd = open_lock_fd(fd, lock_op);
+ if (lock_fd < 0)
+ return lock_fd;
+ }
+
+ if (ioctl(fd, LOOP_GET_STATUS64, &info) >= 0) { /* A failure here is not treated as an error, see above */
+#if HAVE_VALGRIND_MEMCHECK_H
+ /* Valgrind currently doesn't know LOOP_GET_STATUS64. Remove this once it does */
+ VALGRIND_MAKE_MEM_DEFINED(&info, sizeof(info));
+#endif
+ nr = info.lo_number;
+
+ if (sd_device_get_sysattr_value(dev, "loop/backing_file", &s) >= 0) {
+ backing_file = strdup(s);
+ if (!backing_file)
+ return -ENOMEM;
+ }
+
+ backing_devno = info.lo_device;
+ backing_inode = info.lo_inode;
+ }
+
+ r = fd_get_diskseq(fd, &diskseq);
+ if (r < 0 && r != -EOPNOTSUPP) /* -EOPNOTSUPP is tolerated: diskseq then remains at its initial value of 0 */
+ return r;
+
+ uint32_t sector_size;
+ r = blockdev_get_sector_size(fd, &sector_size);
+ if (r < 0)
+ return r;
+
+ r = sd_device_get_devnum(dev, &devnum);
+ if (r < 0)
+ return r;
+
+ r = sd_device_get_devname(dev, &s);
+ if (r < 0)
+ return r;
+
+ node = strdup(s);
+ if (!node)
+ return -ENOMEM;
+
+ d = new(LoopDevice, 1);
+ if (!d)
+ return -ENOMEM;
+
+ *d = (LoopDevice) {
+ .n_ref = 1,
+ .fd = TAKE_FD(fd),
+ .lock_fd = TAKE_FD(lock_fd),
+ .nr = nr,
+ .node = TAKE_PTR(node),
+ .dev = sd_device_ref(dev),
+ .backing_file = TAKE_PTR(backing_file),
+ .backing_inode = backing_inode,
+ .backing_devno = backing_devno,
+ .relinquished = true, /* It's not ours, don't try to destroy it when this object is freed */
+ .devno = devnum,
+ .diskseq = diskseq,
+ .uevent_seqnum_not_before = UINT64_MAX,
+ .timestamp_not_before = USEC_INFINITY,
+ .sector_size = sector_size,
+ };
+
+ *ret = d;
+ return 0;
+}
+
+int loop_device_open_from_fd(
+                int fd,
+                int open_flags,
+                int lock_op,
+                LoopDevice **ret) {
+
+        _cleanup_(sd_device_unrefp) sd_device *device = NULL;
+        int r;
+
+        /* Acquires an sd_device object for the block device the fd refers to, and hands it to
+         * loop_device_open(). */
+
+        r = block_device_new_from_fd(ASSERT_FD(fd), 0, &device);
+
+        return r < 0 ? r : loop_device_open(device, open_flags, lock_op, ret);
+}
+
+int loop_device_open_from_path(
+                const char *path,
+                int open_flags,
+                int lock_op,
+                LoopDevice **ret) {
+
+        _cleanup_(sd_device_unrefp) sd_device *device = NULL;
+        int r;
+
+        /* Acquires an sd_device object for the block device at the specified path, and hands it to
+         * loop_device_open(). */
+
+        assert(path);
+
+        r = block_device_new_from_path(path, 0, &device);
+
+        return r < 0 ? r : loop_device_open(device, open_flags, lock_op, ret);
+}
+
+static int resize_partition(int partition_fd, uint64_t offset, uint64_t size) {
+ char sysfs[STRLEN("/sys/dev/block/:/partition") + 2*DECIMAL_STR_MAX(dev_t) + 1];
+ _cleanup_free_ char *buffer = NULL;
+ uint64_t current_offset, current_size, partno;
+ _cleanup_close_ int whole_fd = -EBADF;
+ struct stat st;
+ dev_t devno;
+ int r;
+
+ /* Resizes the partition the loopback device refer to (assuming it refers to one instead of an actual
+ * loopback device), and changes the offset, if needed. This is a fancy wrapper around
+ * BLKPG_RESIZE_PARTITION.
+ *
+ * 'offset' and 'size' are in bytes; if either is UINT64_MAX the current value of that parameter is
+ * kept. Returns 0 if there is nothing to change, -ENOTTY if the device has no "partition" attribute
+ * in sysfs, and otherwise whatever block_device_resize_partition() returns, or a negative
+ * errno-style value if one of the preparatory steps fails. */
+
+ if (fstat(ASSERT_FD(partition_fd), &st) < 0)
+ return -errno;
+
+ assert(S_ISBLK(st.st_mode));
+
+ xsprintf(sysfs, "/sys/dev/block/" DEVNUM_FORMAT_STR "/partition", DEVNUM_FORMAT_VAL(st.st_rdev));
+ r = read_one_line_file(sysfs, &buffer);
+ if (r == -ENOENT) /* not a partition, cannot resize */
+ return -ENOTTY;
+ if (r < 0)
+ return r;
+ r = safe_atou64(buffer, &partno);
+ if (r < 0)
+ return r;
+
+ xsprintf(sysfs, "/sys/dev/block/" DEVNUM_FORMAT_STR "/start", DEVNUM_FORMAT_VAL(st.st_rdev));
+
+ buffer = mfree(buffer);
+ r = read_one_line_file(sysfs, &buffer);
+ if (r < 0)
+ return r;
+ r = safe_atou64(buffer, &current_offset);
+ if (r < 0)
+ return r;
+ if (current_offset > UINT64_MAX/512U) /* Refuse values that would overflow in the multiplication below */
+ return -EINVAL;
+ current_offset *= 512U; /* Convert to bytes; presumably the "start" attribute counts 512-byte sectors — confirm against the kernel's sysfs documentation */
+
+ if (ioctl(partition_fd, BLKGETSIZE64, &current_size) < 0)
+ return -EINVAL; /* NOTE(review): the actual errno is dropped here in favour of -EINVAL — confirm this is intentional */
+
+ if (size == UINT64_MAX && offset == UINT64_MAX)
+ return 0;
+ if (current_size == size && current_offset == offset)
+ return 0;
+
+ xsprintf(sysfs, "/sys/dev/block/" DEVNUM_FORMAT_STR "/../dev", DEVNUM_FORMAT_VAL(st.st_rdev)); /* The parent directory in sysfs is taken to be the whole block device the partition belongs to */
+
+ buffer = mfree(buffer);
+ r = read_one_line_file(sysfs, &buffer);
+ if (r < 0)
+ return r;
+ r = parse_devnum(buffer, &devno);
+ if (r < 0)
+ return r;
+
+ whole_fd = r = device_open_from_devnum(S_IFBLK, devno, O_RDWR|O_CLOEXEC|O_NONBLOCK|O_NOCTTY, NULL);
+ if (r < 0)
+ return r;
+
+ return block_device_resize_partition(
+ whole_fd,
+ partno,
+ offset == UINT64_MAX ? current_offset : offset,
+ size == UINT64_MAX ? current_size : size);
+}
+
+int loop_device_refresh_size(LoopDevice *d, uint64_t offset, uint64_t size) {
+        struct loop_info64 info;
+        bool keep_offset, keep_size;
+
+        /* Adjusts offset and size limit of the loop device, both relative to the beginning of the
+         * underlying file or block device. If the object does not refer to a loopback device (nr < 0)
+         * the request is passed to resize_partition() instead.
+         *
+         * UINT64_MAX for either offset or size means that the parameter is left as it is. */
+
+        assert(d);
+        assert(d->fd >= 0);
+
+        if (d->nr < 0) /* not a loopback device */
+                return resize_partition(d->fd, offset, size);
+
+        if (ioctl(d->fd, LOOP_GET_STATUS64, &info) < 0)
+                return -errno;
+
+#if HAVE_VALGRIND_MEMCHECK_H
+        /* Valgrind currently doesn't know LOOP_GET_STATUS64. Remove this once it does */
+        VALGRIND_MAKE_MEM_DEFINED(&info, sizeof(info));
+#endif
+
+        keep_offset = offset == UINT64_MAX;
+        keep_size = size == UINT64_MAX;
+
+        /* Nothing requested at all? */
+        if (keep_offset && keep_size)
+                return 0;
+
+        /* Requested values already in effect? */
+        if (info.lo_offset == offset && info.lo_sizelimit == size)
+                return 0;
+
+        if (!keep_offset)
+                info.lo_offset = offset;
+        if (!keep_size)
+                info.lo_sizelimit = size;
+
+        return RET_NERRNO(ioctl(d->fd, LOOP_SET_STATUS64, &info));
+}
+
+int loop_device_flock(LoopDevice *d, int operation) {
+        /* Changes the BSD lock state of the device: unlocking closes the lock fd, locking either
+         * creates a lock fd with the requested lock, or changes the lock mode of the existing one. */
+
+        assert(IN_SET(operation & ~LOCK_NB, LOCK_UN, LOCK_SH, LOCK_EX));
+        assert(d);
+
+        if ((operation & ~LOCK_NB) == LOCK_UN) {
+                /* Unlocking simply means closing the lock fd */
+                d->lock_fd = safe_close(d->lock_fd);
+                return 0;
+        }
+
+        if (d->lock_fd >= 0)
+                /* There is a lock fd already, just change the lock mode on it */
+                return RET_NERRNO(flock(d->lock_fd, operation));
+
+        /* No lock fd so far, hence create one and lock it right-away */
+        d->lock_fd = open_lock_fd(ASSERT_FD(d->fd), operation);
+
+        return d->lock_fd < 0 ? d->lock_fd : 0;
+}
+
+int loop_device_sync(LoopDevice *d) {
+        int fd;
+
+        /* Syncs the block device. The same is done implicitly when the object is freed, but there any
+         * failure is only logged, while here the result is reported back to the caller. */
+
+        assert(d);
+
+        fd = ASSERT_FD(d->fd);
+
+        return RET_NERRNO(fsync(fd));
+}
+
+int loop_device_set_autoclear(LoopDevice *d, bool autoclear) {
+        struct loop_info64 info;
+        bool enabled;
+
+        /* Turns the LO_FLAGS_AUTOCLEAR flag of the loop device on or off. Returns 0 if the flag already
+         * had the requested state, 1 if it was changed, and a negative errno-style value on failure. */
+
+        assert(d);
+
+        if (ioctl(ASSERT_FD(d->fd), LOOP_GET_STATUS64, &info) < 0)
+                return -errno;
+
+        enabled = FLAGS_SET(info.lo_flags, LO_FLAGS_AUTOCLEAR);
+        if (enabled == autoclear)
+                return 0;
+
+        if (autoclear)
+                info.lo_flags |= LO_FLAGS_AUTOCLEAR;
+        else
+                info.lo_flags &= ~LO_FLAGS_AUTOCLEAR;
+
+        if (ioctl(d->fd, LOOP_SET_STATUS64, &info) < 0)
+                return -errno;
+
+        return 1;
+}
+
+int loop_device_set_filename(LoopDevice *d, const char *name) {
+        struct loop_info64 info;
+        size_t name_len;
+
+        assert(d);
+
+        /* Updates the .lo_file_name field of the loopback device. While it is meant to carry the path
+         * of the backing file, the kernel treats it as a free-form string supplied by userspace. Tools
+         * interested in the real backing file path generally read the kernel generated sysfs attribute
+         * loop/backing_file instead, which is subject to file system namespaces and such.
+         *
+         * Since userspace may pick .lo_file_name freely when creating a loopback block device it is
+         * useful for /dev/disk/by-loop-ref/ symlinks and similar, so that apps can recognize their own
+         * loopback files.
+         *
+         * Returns 0 if the name already matches, 1 if it was changed, -ENOBUFS if the name does not
+         * fit into the field, or another negative errno-style value if an ioctl fails. */
+
+        name_len = name ? strlen(name) : 0;
+        if (name_len >= sizeof(info.lo_file_name))
+                return -ENOBUFS;
+
+        if (ioctl(ASSERT_FD(d->fd), LOOP_GET_STATUS64, &info) < 0)
+                return -errno;
+
+        if (strneq((char*) info.lo_file_name, strempty(name), sizeof(info.lo_file_name)))
+                return 0;
+
+        /* Zero out the whole field first, so that nothing of the old name remains behind the new one
+         * and the string is always NUL terminated. */
+        memzero(info.lo_file_name, sizeof(info.lo_file_name));
+        if (name)
+                memcpy(info.lo_file_name, name, name_len);
+
+        if (ioctl(d->fd, LOOP_SET_STATUS64, &info) < 0)
+                return -errno;
+
+        return 1;
+}
diff --git a/src/shared/loop-util.h b/src/shared/loop-util.h
new file mode 100644
index 0000000..d77c314
--- /dev/null
+++ b/src/shared/loop-util.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <fcntl.h>
+
+#include "sd-device.h"
+
+#include "macro.h"
+#include "time-util.h"
+
+typedef struct LoopDevice LoopDevice;
+
+/* Some helpers for setting up loopback block devices */
+
+struct LoopDevice { /* State kept for one block device we wrap: either a real loopback device, or a foreign block device (nr < 0) */
+ unsigned n_ref; /* NOTE(review): presumably the reference counter driven by loop_device_ref()/loop_device_unref() - confirm in loop-util.c */
+ int fd; /* File descriptor the LOOP_GET_STATUS64/LOOP_SET_STATUS64 ioctls are issued on */
+ int lock_fd;
+ int nr; /* The loopback device index (i.e. 4 for /dev/loop4); if this object encapsulates a non-loopback block device, set to -1 */
+ dev_t devno; /* The loopback device's own dev_t */
+ char *node;
+ sd_device *dev;
+ char *backing_file;
+ bool relinquished;
+ dev_t backing_devno; /* The backing file's dev_t */
+ ino_t backing_inode; /* The backing file's ino_t */
+ uint64_t diskseq; /* Block device sequence number, monotonically incremented by the kernel on create/attach, or 0 if we don't know */
+ uint64_t uevent_seqnum_not_before; /* uevent seqnum right before we attached the loopback device, or UINT64_MAX if we don't know */
+ usec_t timestamp_not_before; /* CLOCK_MONOTONIC timestamp taken immediately before attaching the loopback device, or USEC_INFINITY if we don't know */
+ uint32_t sector_size;
+};
+
+/* Returns true if LoopDevice object is not actually a loopback device but some other block device we just wrap */
+#define LOOP_DEVICE_IS_FOREIGN(d) ((d)->nr < 0)
+
+int loop_device_make(int fd, int open_flags, uint64_t offset, uint64_t size, uint32_t sector_size, uint32_t loop_flags, int lock_op, LoopDevice **ret);
+int loop_device_make_by_path_at(int dir_fd, const char *path, int open_flags, uint32_t sector_size, uint32_t loop_flags, int lock_op, LoopDevice **ret);
+static inline int loop_device_make_by_path(const char *path, int open_flags, uint32_t sector_size, uint32_t loop_flags, int lock_op, LoopDevice **ret) { /* Shortcut for loop_device_make_by_path_at() with AT_FDCWD as directory fd; all other arguments and the return value are passed through unchanged */
+ return loop_device_make_by_path_at(AT_FDCWD, path, open_flags, sector_size, loop_flags, lock_op, ret);
+}
+int loop_device_make_by_path_memory(const char *path, int open_flags, uint32_t sector_size, uint32_t loop_flags, int lock_op, LoopDevice **ret);
+int loop_device_open(sd_device *dev, int open_flags, int lock_op, LoopDevice **ret);
+int loop_device_open_from_fd(int fd, int open_flags, int lock_op, LoopDevice **ret);
+int loop_device_open_from_path(const char *path, int open_flags, int lock_op, LoopDevice **ret);
+
+LoopDevice* loop_device_ref(LoopDevice *d);
+LoopDevice* loop_device_unref(LoopDevice *d);
+DEFINE_TRIVIAL_CLEANUP_FUNC(LoopDevice*, loop_device_unref);
+
+void loop_device_relinquish(LoopDevice *d);
+void loop_device_unrelinquish(LoopDevice *d);
+
+int loop_device_refresh_size(LoopDevice *d, uint64_t offset, uint64_t size);
+
+int loop_device_flock(LoopDevice *d, int operation);
+int loop_device_sync(LoopDevice *d);
+
+int loop_device_set_autoclear(LoopDevice *d, bool autoclear);
+int loop_device_set_filename(LoopDevice *d, const char *name);
diff --git a/src/shared/loopback-setup.c b/src/shared/loopback-setup.c
new file mode 100644
index 0000000..a02baf8
--- /dev/null
+++ b/src/shared/loopback-setup.c
@@ -0,0 +1,232 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <net/if.h>
+#include <stdlib.h>
+
+#include "sd-netlink.h"
+
+#include "loopback-setup.h"
+#include "missing_network.h"
+#include "netlink-util.h"
+#include "time-util.h"
+
+#define LOOPBACK_SETUP_TIMEOUT_USEC (5 * USEC_PER_SEC)
+
+struct state { /* Bookkeeping for one asynchronous rtnetlink request issued by loopback_setup() */
+ unsigned n_messages; /* Requests enqueued but not answered yet: incremented after a successful enqueue, decremented by generic_handler() */
+ int rcode; /* Result of the reply as returned by sd_netlink_message_get_errno(); stays 0 until a reply arrives */
+ const char *error_message; /* Logged at debug level, with the error appended, when the reply carries a failure */
+ const char *success_message; /* Logged at debug level when the reply carries no error */
+ const char *eexist_message; /* Optional: if set, logged instead of error_message when the reply is -EEXIST */
+};
+
+static int generic_handler(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
+        struct state *st = ASSERT_PTR(userdata);
+        int rcode;
+
+        /* Reply callback shared by all requests we enqueue: accounts for the reply in the request's
+         * state object, remembers the result code and logs a suitable message at debug level. Always
+         * returns 0. */
+
+        assert(st->n_messages > 0);
+        st->n_messages--;
+
+        errno = 0;
+
+        rcode = sd_netlink_message_get_errno(m);
+        st->rcode = rcode;
+
+        if (rcode >= 0) {
+                log_debug("%s", st->success_message);
+                return 0;
+        }
+
+        /* A request may opt in to a friendlier message for the "already there" case */
+        if (rcode == -EEXIST && st->eexist_message)
+                log_debug_errno(rcode, "%s", st->eexist_message);
+        else
+                log_debug_errno(rcode, "%s: %m", st->error_message);
+
+        return 0;
+}
+
+static int start_loopback(sd_netlink *rtnl, struct state *s) {
+        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *msg = NULL;
+        int r;
+
+        /* Enqueues an asynchronous RTM_SETLINK request that sets IFF_UP on the loopback interface. The
+         * reply is delivered to generic_handler() with 's' as userdata. Returns 0 once the request is
+         * enqueued, or a negative value if building or enqueuing it failed. */
+
+        assert(rtnl);
+        assert(s);
+
+        r = sd_rtnl_message_new_link(rtnl, &msg, RTM_SETLINK, LOOPBACK_IFINDEX);
+        if (r >= 0)
+                r = sd_rtnl_message_link_set_flags(msg, IFF_UP, IFF_UP);
+        if (r >= 0)
+                r = sd_netlink_call_async(rtnl, NULL, msg, generic_handler, NULL, s, LOOPBACK_SETUP_TIMEOUT_USEC, "systemd-start-loopback");
+        if (r < 0)
+                return r;
+
+        /* One more reply to wait for */
+        s->n_messages++;
+        return 0;
+}
+
+static int add_ipv4_address(sd_netlink *rtnl, struct state *s) {
+        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *msg = NULL;
+        const struct in_addr localhost = { .s_addr = htobe32(INADDR_LOOPBACK) };
+        int r;
+
+        /* Enqueues an asynchronous RTM_NEWADDR request that adds INADDR_LOOPBACK with prefix length 8,
+         * host scope and the permanent flag to the loopback interface. The reply is delivered to
+         * generic_handler() with 's' as userdata. Returns 0 once the request is enqueued, or a negative
+         * value if building or enqueuing it failed. */
+
+        assert(rtnl);
+        assert(s);
+
+        r = sd_rtnl_message_new_addr(rtnl, &msg, RTM_NEWADDR, LOOPBACK_IFINDEX, AF_INET);
+        if (r >= 0)
+                r = sd_rtnl_message_addr_set_prefixlen(msg, 8);
+        if (r >= 0)
+                r = sd_rtnl_message_addr_set_flags(msg, IFA_F_PERMANENT);
+        if (r >= 0)
+                r = sd_rtnl_message_addr_set_scope(msg, RT_SCOPE_HOST);
+        if (r >= 0)
+                r = sd_netlink_message_append_in_addr(msg, IFA_LOCAL, &localhost);
+        if (r >= 0)
+                r = sd_netlink_call_async(rtnl, NULL, msg, generic_handler, NULL, s, USEC_INFINITY, "systemd-loopback-ipv4");
+        if (r < 0)
+                return r;
+
+        /* One more reply to wait for */
+        s->n_messages++;
+        return 0;
+}
+
+static int add_ipv6_address(sd_netlink *rtnl, struct state *s) {
+        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *msg = NULL;
+        const uint32_t flags = IFA_F_PERMANENT|IFA_F_NOPREFIXROUTE;
+        int r;
+
+        /* Enqueues an asynchronous RTM_NEWADDR request that adds in6addr_loopback with prefix length 128
+         * and host scope to the loopback interface. The reply is delivered to generic_handler() with 's'
+         * as userdata. Returns 0 once the request is enqueued, or a negative value if building or
+         * enqueuing it failed. */
+
+        assert(rtnl);
+        assert(s);
+
+        r = sd_rtnl_message_new_addr(rtnl, &msg, RTM_NEWADDR, LOOPBACK_IFINDEX, AF_INET6);
+        if (r >= 0)
+                r = sd_rtnl_message_addr_set_prefixlen(msg, 128);
+
+        /* rtnetlink wants low 8 bit of flags via regular flags field… */
+        if (r >= 0)
+                r = sd_rtnl_message_addr_set_flags(msg, flags & 0xffu);
+
+        /* …and the rest of the flags via IFA_FLAGS */
+        if (r >= 0 && (flags & ~0xffu) != 0)
+                r = sd_netlink_message_append_u32(msg, IFA_FLAGS, flags);
+
+        if (r >= 0)
+                r = sd_rtnl_message_addr_set_scope(msg, RT_SCOPE_HOST);
+        if (r >= 0)
+                r = sd_netlink_message_append_in6_addr(msg, IFA_LOCAL, &in6addr_loopback);
+        if (r >= 0)
+                r = sd_netlink_call_async(rtnl, NULL, msg, generic_handler, NULL, s, USEC_INFINITY, "systemd-loopback-ipv6");
+        if (r < 0)
+                return r;
+
+        /* One more reply to wait for */
+        s->n_messages++;
+        return 0;
+}
+
+static int check_loopback(sd_netlink *rtnl) {
+        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *query = NULL, *answer = NULL;
+        unsigned link_flags;
+        int r;
+
+        /* Synchronously queries the link flags of the loopback interface. Returns the IFF_UP bit of the
+         * flags (i.e. non-zero if the interface is up, zero if not), or a negative value on failure. */
+
+        r = sd_rtnl_message_new_link(rtnl, &query, RTM_GETLINK, LOOPBACK_IFINDEX);
+        if (r >= 0)
+                r = sd_netlink_call(rtnl, query, USEC_INFINITY, &answer);
+        if (r >= 0)
+                r = sd_rtnl_message_link_get_flags(answer, &link_flags);
+        if (r < 0)
+                return r;
+
+        return link_flags & IFF_UP;
+}
+
+int loopback_setup(void) {
+ _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
+ struct state state_4 = {
+ .error_message = "Failed to add address 127.0.0.1 to loopback interface",
+ .success_message = "Successfully added address 127.0.0.1 to loopback interface",
+ .eexist_message = "127.0.0.1 has already been added to loopback interface",
+ }, state_6 = {
+ .error_message = "Failed to add address ::1 to loopback interface",
+ .success_message = "Successfully added address ::1 to loopback interface",
+ .eexist_message = "::1 has already been added to loopback interface",
+ }, state_up = {
+ .error_message = "Failed to bring loopback interface up",
+ .success_message = "Successfully brought loopback interface up",
+ };
+ int r;
+
+ /* Brings the loopback interface up and adds the IPv4 and IPv6 loopback addresses to it: three rtnetlink
+ * requests are enqueued asynchronously, then we wait until all of them have been answered. Returns 0 on
+ * success (or if the interface turns out to be up already despite a permission error), negative on
+ * failure.
+ *
+ * Note that we generally assume callers ignore the return code here (except test cases), hence we only
+ * log at warning level. */
+
+ r = sd_netlink_open(&rtnl);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to open netlink, ignoring: %m");
+
+ /* Note that we add the IP addresses here explicitly even though the kernel does that too implicitly when
+ * setting up the loopback device. The reason we do this here a second time (and possibly race against the
+ * kernel) is that we want to synchronously wait until the IP addresses are set up correctly, see
+ *
+ * https://github.com/systemd/systemd/issues/5641 */
+
+ r = add_ipv4_address(rtnl, &state_4);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to enqueue IPv4 loopback address add request, ignoring: %m");
+
+ r = add_ipv6_address(rtnl, &state_6);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to enqueue IPv6 loopback address add request, ignoring: %m");
+
+ r = start_loopback(rtnl, &state_up);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to enqueue loopback interface start request, ignoring: %m");
+
+ while (state_4.n_messages + state_6.n_messages + state_up.n_messages > 0) { /* generic_handler() decrements the matching counter for each reply it sees */
+ r = sd_netlink_wait(rtnl, LOOPBACK_SETUP_TIMEOUT_USEC);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to wait for netlink event, ignoring: %m");
+
+ r = sd_netlink_process(rtnl, NULL);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to process netlink event, ignoring: %m");
+ }
+
+ /* Note that we don't really care whether the addresses could be added or not, only the result of
+ * bringing the interface up decides about success. */
+ if (state_up.rcode != 0) {
+
+ /* If we lack the permissions to configure the loopback device, but we find it to be already
+ * configured, let's exit cleanly, in order to support unprivileged containers. */
+ if (ERRNO_IS_PRIVILEGE(state_up.rcode)) {
+ r = check_loopback(rtnl);
+ if (r < 0)
+ log_debug_errno(r, "Failed to check if loopback device might already be up, ignoring: %m");
+ else if (r > 0) {
+ log_debug("Configuring loopback failed, but device is already up, suppressing failure.");
+ return 0;
+ }
+ }
+
+ return log_warning_errno(state_up.rcode, "Failed to configure loopback network device, ignoring: %m");
+ }
+
+ return 0;
+}
diff --git a/src/shared/loopback-setup.h b/src/shared/loopback-setup.h
new file mode 100644
index 0000000..a7ee2da
--- /dev/null
+++ b/src/shared/loopback-setup.h
@@ -0,0 +1,4 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+int loopback_setup(void);
diff --git a/src/shared/lsm-util.c b/src/shared/lsm-util.c
new file mode 100644
index 0000000..7b6d419
--- /dev/null
+++ b/src/shared/lsm-util.c
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "extract-word.h"
+#include "fileio.h"
+#include "lsm-util.h"
+#include "string-util.h"
+
+int lsm_supported(const char *name) {
+        _cleanup_free_ char *line = NULL;
+        const char *cursor;
+        int r;
+
+        /* Checks whether 'name' is one of the comma-separated entries listed in
+         * /sys/kernel/security/lsm. Returns true if so, false if it is not listed or the file does not
+         * exist, and a negative value if the file cannot be read or parsed. */
+
+        assert(name);
+
+        r = read_one_line_file("/sys/kernel/security/lsm", &line);
+        if (r == -ENOENT) /* LSM support not available at all? */
+                return false;
+        if (r < 0)
+                return log_debug_errno(r, "Failed to read /sys/kernel/security/lsm: %m");
+
+        cursor = line;
+        for (;;) {
+                _cleanup_free_ char *entry = NULL;
+
+                r = extract_first_word(&cursor, &entry, ",", 0);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to parse /sys/kernel/security/lsm: %m");
+                if (r == 0) /* End of list reached without a match */
+                        return false;
+
+                if (streq(entry, name))
+                        return true;
+        }
+}
diff --git a/src/shared/lsm-util.h b/src/shared/lsm-util.h
new file mode 100644
index 0000000..c4d9027
--- /dev/null
+++ b/src/shared/lsm-util.h
@@ -0,0 +1,4 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+int lsm_supported(const char *name);
diff --git a/src/shared/machine-credential.c b/src/shared/machine-credential.c
new file mode 100644
index 0000000..17f7afc
--- /dev/null
+++ b/src/shared/machine-credential.c
@@ -0,0 +1,127 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "creds-util.h"
+#include "escape.h"
+#include "extract-word.h"
+#include "fileio.h"
+#include "macro.h"
+#include "memory-util.h"
+#include "machine-credential.h"
+#include "path-util.h"
+#include "string-util-fundamental.h"
+
+static void machine_credential_done(MachineCredential *cred) {
+        /* Releases everything a credential entry owns and resets its fields. The payload is erased
+         * before it is freed. The entry itself is not freed. */
+
+        assert(cred);
+
+        cred->data = erase_and_free(cred->data);
+        cred->size = 0;
+        cred->id = mfree(cred->id);
+}
+
+void machine_credential_free_all(MachineCredential *creds, size_t n) {
+        /* Releases the contents of all 'n' entries of the array, then the array itself. A NULL array is
+         * fine as long as 'n' is zero. */
+
+        assert(creds || n == 0);
+
+        for (size_t i = 0; i < n; i++)
+                machine_credential_done(creds + i);
+
+        free(creds);
+}
+
+int machine_credential_set(MachineCredential **credentials, size_t *n_credentials, const char *cred_string) {
+        /* The unescaped payload is secret material: make sure it is erased, not just freed, if we bail
+         * out before ownership moves into the array (same as machine_credential_load() does). */
+        _cleanup_(erase_and_freep) char *data = NULL;
+        _cleanup_free_ char *word = NULL;
+        ssize_t l;
+        int r;
+        const char *p = ASSERT_PTR(cred_string);
+
+        /* Parses a "NAME:VALUE" string as passed to --set-credential=, C-unescapes VALUE (embedded NUL
+         * bytes are accepted) and appends the result to the array. The array and its counter are updated
+         * in place.
+         *
+         * Returns 0 on success, -EINVAL if the value is missing or the name is not a valid credential
+         * name, -EEXIST if a credential of that name is already in the array, or another negative value
+         * on parse, unescape or allocation failure. All failures are logged at error level. */
+
+        assert(credentials && n_credentials);
+        assert(*credentials || *n_credentials == 0);
+
+        r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
+        if (r < 0)
+                return log_error_errno(r, "Failed to parse --set-credential= parameter: %m");
+        if (r == 0 || !p)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --set-credential=: %s", cred_string);
+
+        if (!credential_name_valid(word))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word);
+
+        FOREACH_ARRAY(cred, *credentials, *n_credentials)
+                if (streq(cred->id, word))
+                        return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word);
+
+        l = cunescape(p, UNESCAPE_ACCEPT_NUL, &data);
+        if (l < 0)
+                return log_error_errno(l, "Failed to unescape credential data: %s", p);
+
+        if (!GREEDY_REALLOC(*credentials, *n_credentials + 1))
+                return log_oom();
+
+        /* From here on the array owns both strings */
+        (*credentials)[(*n_credentials)++] = (MachineCredential) {
+                .id = TAKE_PTR(word),
+                .data = TAKE_PTR(data),
+                .size = l,
+        };
+
+        return 0;
+}
+
+int machine_credential_load(MachineCredential **credentials, size_t *n_credentials, const char *cred_path) {
+        _cleanup_(erase_and_freep) char *data = NULL;
+        _cleanup_free_ char *name = NULL, *joined = NULL;
+        ReadFullFileFlags flags = READ_FULL_FILE_SECURE;
+        const char *source = ASSERT_PTR(cred_path);
+        bool by_path;
+        size_t size;
+        int r;
+
+        /* Parses a "NAME:SOURCE" string as passed to --load-credential=, reads the credential payload
+         * from SOURCE and appends the result to the array. SOURCE is either a file system path, or the
+         * name of a credential that is looked up in our own credentials directory. The array and its
+         * counter are updated in place. Returns 0 on success, negative on failure; all failures are
+         * logged at error level. */
+
+        assert(credentials && n_credentials);
+        assert(*credentials || *n_credentials == 0);
+
+        r = extract_first_word(&source, &name, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
+        if (r < 0)
+                return log_error_errno(r, "Failed to parse --load-credential= parameter: %m");
+        if (r == 0 || !source)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --load-credential=: %s", cred_path);
+
+        if (!credential_name_valid(name))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", name);
+
+        for (size_t i = 0; i < *n_credentials; i++)
+                if (streq((*credentials)[i].id, name))
+                        return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", name);
+
+        by_path = is_path(source) && path_is_valid(source);
+        if (!by_path && !credential_name_valid(source))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential source appears to be neither a valid path nor a credential name: %s", source);
+
+        if (by_path)
+                flags |= READ_FULL_FILE_CONNECT_SOCKET;
+        else {
+                const char *dir;
+
+                /* A plain credential name: look it up in the credentials directory we got passed */
+                r = get_credentials_dir(&dir);
+                if (r < 0)
+                        return log_error_errno(r, "Credential not available (no credentials passed at all): %s", name);
+
+                joined = path_join(dir, source);
+                if (!joined)
+                        return log_oom();
+
+                source = joined;
+        }
+
+        r = read_full_file_full(AT_FDCWD, source, UINT64_MAX, SIZE_MAX,
+                                flags,
+                                NULL,
+                                &data, &size);
+        if (r < 0)
+                return log_error_errno(r, "Failed to read credential '%s': %m", source);
+
+        if (!GREEDY_REALLOC(*credentials, *n_credentials + 1))
+                return log_oom();
+
+        /* From here on the array owns both strings */
+        (*credentials)[(*n_credentials)++] = (MachineCredential) {
+                .id = TAKE_PTR(name),
+                .data = TAKE_PTR(data),
+                .size = size,
+        };
+
+        return 0;
+}
diff --git a/src/shared/machine-credential.h b/src/shared/machine-credential.h
new file mode 100644
index 0000000..c9044a2
--- /dev/null
+++ b/src/shared/machine-credential.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <sys/types.h>
+
+typedef struct MachineCredential { /* One credential collected via machine_credential_set() or machine_credential_load() */
+ char *id; /* Credential name; checked with credential_name_valid() before an entry is added */
+ void *data; /* Credential payload; erased before it is freed, see machine_credential_free_all() */
+ size_t size; /* Size of the payload in bytes */
+} MachineCredential;
+
+void machine_credential_free_all(MachineCredential *creds, size_t n);
+int machine_credential_set(MachineCredential **credentials, size_t *n_credentials, const char *cred_string);
+int machine_credential_load(MachineCredential **credentials, size_t *n_credentials, const char *cred_path);
diff --git a/src/shared/machine-id-setup.c b/src/shared/machine-id-setup.c
new file mode 100644
index 0000000..3efba03
--- /dev/null
+++ b/src/shared/machine-id-setup.c
@@ -0,0 +1,295 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <fcntl.h>
+#include <sched.h>
+#include <sys/mount.h>
+#include <unistd.h>
+
+#include "sd-id128.h"
+
+#include "alloc-util.h"
+#include "chase.h"
+#include "creds-util.h"
+#include "fd-util.h"
+#include "id128-util.h"
+#include "io-util.h"
+#include "log.h"
+#include "machine-id-setup.h"
+#include "macro.h"
+#include "mkdir.h"
+#include "mount-util.h"
+#include "mountpoint-util.h"
+#include "namespace-util.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "stat-util.h"
+#include "string-util.h"
+#include "sync-util.h"
+#include "umask-util.h"
+#include "virt.h"
+
+static int acquire_machine_id_from_credential(sd_id128_t *ret) {
+        _cleanup_free_ char *text = NULL;
+        int r;
+
+        /* Tries to initialize the machine ID from the "system.machine_id" credential. Returns 0 and
+         * fills in *ret on success, -ENXIO if there is no such credential, and another negative value
+         * if the credential cannot be read or parsed (which is logged at warning level). */
+
+        r = read_credential_with_decryption("system.machine_id", (void**) &text, /* ret_size= */ NULL);
+        if (r < 0)
+                return log_warning_errno(r, "Failed to read system.machine_id credential, ignoring: %m");
+        if (r == 0) /* not found */
+                return -ENXIO;
+
+        r = sd_id128_from_string(text, ret);
+        if (r < 0)
+                return log_warning_errno(r, "Failed to parse system.machine_id credential, ignoring: %m");
+
+        log_info("Initializing machine ID from credential.");
+        return 0;
+}
+
+static int generate_machine_id(const char *root, sd_id128_t *ret) {
+        _cleanup_close_ int dbus_fd = -EBADF;
+        int r;
+
+        /* Picks a new machine ID from the first source that delivers one, in this order: the D-Bus
+         * machine ID file, the system.machine_id credential, the $container_uuid variable of PID 1
+         * (when running in a container), the product UUID of the VM (for a few VM types), and finally
+         * the random generator. Everything but the first and the last source is only consulted if no
+         * root directory is specified and running_in_chroot() does not report a chroot. */
+
+        assert(ret);
+
+        /* First, try reading the D-Bus machine id, unless it is a symlink */
+        dbus_fd = chase_and_open("/var/lib/dbus/machine-id", root, CHASE_PREFIX_ROOT | CHASE_NOFOLLOW, O_RDONLY|O_CLOEXEC|O_NOCTTY, NULL);
+        if (dbus_fd >= 0) {
+                r = id128_read_fd(dbus_fd, ID128_FORMAT_PLAIN | ID128_REFUSE_NULL, ret);
+                if (r >= 0) {
+                        log_info("Initializing machine ID from D-Bus machine ID.");
+                        return 0;
+                }
+        }
+
+        if (isempty(root) && running_in_chroot() <= 0) {
+                /* Let's use a system credential for the machine ID if we can */
+                r = acquire_machine_id_from_credential(ret);
+                if (r >= 0)
+                        return r;
+
+                if (detect_container() > 0) {
+                        _cleanup_free_ char *uuid = NULL;
+
+                        /* We are running in a container: see if a machine ID was passed in via
+                         * $container_uuid the way libvirt/LXC does it */
+                        r = getenv_for_pid(1, "container_uuid", &uuid);
+                        if (r > 0 && sd_id128_from_string(uuid, ret) >= 0) {
+                                log_info("Initializing machine ID from container UUID.");
+                                return 0;
+                        }
+                } else {
+                        /* Not a container: see if we are running in a VM that provides a system UUID
+                         * via the SMBIOS/DMI interfaces. Such environments include QEMU/KVM with the
+                         * -uuid on the qemu command line or the Amazon EC2 Nitro hypervisor. */
+                        switch (detect_vm()) {
+
+                        case VIRTUALIZATION_KVM:
+                        case VIRTUALIZATION_AMAZON:
+                        case VIRTUALIZATION_QEMU:
+                        case VIRTUALIZATION_XEN:
+                                if (id128_get_product(ret) >= 0) {
+                                        log_info("Initializing machine ID from VM UUID.");
+                                        return 0;
+                                }
+                                break;
+
+                        default:
+                                break;
+                        }
+                }
+        }
+
+        /* If that didn't work, generate a random machine id */
+        r = sd_id128_randomize(ret);
+        if (r < 0)
+                return log_error_errno(r, "Failed to generate randomized machine ID: %m");
+
+        log_info("Initializing machine ID from random generator.");
+        return 0;
+}
+
+int machine_id_setup(const char *root, bool force_transient, sd_id128_t machine_id, sd_id128_t *ret) { /* Makes sure <root>/etc/machine-id carries a machine ID.
+ * The ID used is the passed machine_id if it is non-null, otherwise the one already stored in the file,
+ * otherwise one obtained from generate_machine_id(). If the file cannot be opened for writing, or if
+ * force_transient is set, the ID is written to <root>/run/machine-id instead and bind mounted read-only
+ * over <root>/etc/machine-id. Returns 0 on success, negative on failure. */
+ const char *etc_machine_id, *run_machine_id;
+ _cleanup_close_ int fd = -EBADF;
+ bool writable; /* Whether the read-write open of etc_machine_id succeeded */
+ int r;
+
+ etc_machine_id = prefix_roota(root, "/etc/machine-id");
+
+ WITH_UMASK(0000) {
+ /* We create this 0444, to indicate that this isn't really
+ * something you should ever modify. Of course, since the file
+ * will be owned by root it doesn't matter much, but maybe
+ * people look. */
+
+ (void) mkdir_parents(etc_machine_id, 0755);
+ fd = open(etc_machine_id, O_RDWR|O_CREAT|O_CLOEXEC|O_NOCTTY, 0444);
+ if (fd < 0) {
+ int old_errno = errno; /* Keep the error of the read-write attempt, the retry below overwrites errno */
+
+ fd = open(etc_machine_id, O_RDONLY|O_CLOEXEC|O_NOCTTY);
+ if (fd < 0) {
+ if (old_errno == EROFS && errno == ENOENT)
+ return log_error_errno(errno,
+ "System cannot boot: Missing /etc/machine-id and /etc is mounted read-only.\n"
+ "Booting up is supported only when:\n"
+ "1) /etc/machine-id exists and is populated.\n"
+ "2) /etc/machine-id exists and is empty.\n"
+ "3) /etc/machine-id is missing and /etc is writable.\n");
+ else
+ return log_error_errno(errno, "Cannot open %s: %m", etc_machine_id);
+ }
+
+ writable = false;
+ } else
+ writable = true;
+ }
+
+ /* If we got a valid machine ID argument, that's what counts. Only if we didn't get one we look at the
+ * file, and generate a new ID if that doesn't help either. */
+ if (sd_id128_is_null(machine_id)) {
+
+ /* Try to read any existing machine ID */
+ if (id128_read_fd(fd, ID128_FORMAT_PLAIN, ret) >= 0)
+ return 0;
+
+ /* Hmm, so, the id currently stored is not useful, then let's generate one */
+ r = generate_machine_id(root, &machine_id);
+ if (r < 0)
+ return r;
+ }
+
+ if (writable) {
+ if (lseek(fd, 0, SEEK_SET) < 0)
+ return log_error_errno(errno, "Failed to seek %s: %m", etc_machine_id);
+
+ if (ftruncate(fd, 0) < 0)
+ return log_error_errno(errno, "Failed to truncate %s: %m", etc_machine_id);
+
+ /* If the caller requested a transient machine-id, write the string "uninitialized\n" to
+ * disk and overmount it with a transient file.
+ *
+ * Otherwise write the machine-id directly to disk. */
+ if (force_transient) {
+ r = loop_write(fd, "uninitialized\n", SIZE_MAX);
+ if (r < 0)
+ return log_error_errno(r, "Failed to write uninitialized %s: %m", etc_machine_id);
+
+ r = fsync_full(fd);
+ if (r < 0)
+ return log_error_errno(r, "Failed to sync %s: %m", etc_machine_id);
+ } else {
+ r = id128_write_fd(fd, ID128_FORMAT_PLAIN | ID128_SYNC_ON_WRITE, machine_id);
+ if (r < 0)
+ return log_error_errno(r, "Failed to write %s: %m", etc_machine_id);
+ else
+ goto finish; /* Persistent ID written, no transient file needed */
+ }
+ }
+
+ fd = safe_close(fd);
+
+ /* Hmm, we couldn't or shouldn't write the machine-id to /etc?
+ * So let's write it to /run/machine-id as a replacement */
+
+ run_machine_id = prefix_roota(root, "/run/machine-id");
+
+ WITH_UMASK(0022)
+ r = id128_write(run_machine_id, ID128_FORMAT_PLAIN, machine_id);
+ if (r < 0) {
+ (void) unlink(run_machine_id);
+ return log_error_errno(r, "Cannot write %s: %m", run_machine_id);
+ }
+
+ /* And now, let's mount it over */
+ r = mount_follow_verbose(LOG_ERR, run_machine_id, etc_machine_id, NULL, MS_BIND, NULL);
+ if (r < 0) {
+ (void) unlink(run_machine_id);
+ return r;
+ }
+
+ log_full(force_transient ? LOG_DEBUG : LOG_INFO, "Installed transient %s file.", etc_machine_id);
+
+ /* Mark the mount read-only */
+ r = mount_follow_verbose(LOG_WARNING, NULL, etc_machine_id, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, NULL);
+ if (r < 0)
+ return r;
+
+finish:
+ if (ret)
+ *ret = machine_id;
+
+ return 0;
+}
+
+int machine_id_commit(const char *root) {
+ _cleanup_close_ int fd = -EBADF, initial_mntns_fd = -EBADF;
+ const char *etc_machine_id;
+ sd_id128_t id;
+ int r;
+
+ /* Turns a transient machine ID, i.e. one that is mounted over <root>/etc/machine-id, into a persistent
+ * file. Returns 0 on success or if <root>/etc/machine-id is not a mount point, negative on failure.
+ *
+ * Before doing anything, sync everything to ensure any changes by first-boot units are persisted.
+ *
+ * First, explicitly sync the file systems we care about and check if it worked. */
+ FOREACH_STRING(sync_path, "/etc/", "/var/") {
+ r = syncfs_path(AT_FDCWD, sync_path);
+ if (r < 0)
+ return log_error_errno(r, "Cannot sync %s: %m", sync_path);
+ }
+
+ /* Afterwards, sync() the rest too, but we can't check the return value for these. */
+ sync();
+
+ /* Replaces a tmpfs bind mount of /etc/machine-id by a proper file, atomically. For this, the mount is
+ * removed in a new mount namespace, and a new file is created at the right place. Afterwards the mount is
+ * also removed in the original mount namespace, thus revealing the file that was just created. */
+
+ etc_machine_id = prefix_roota(root, "/etc/machine-id");
+
+ r = path_is_mount_point(etc_machine_id, NULL, 0);
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine whether %s is a mount point: %m", etc_machine_id);
+ if (r == 0) {
+ log_debug("%s is not a mount point. Nothing to do.", etc_machine_id);
+ return 0;
+ }
+
+ /* Read existing machine-id */
+ fd = open(etc_machine_id, O_RDONLY|O_CLOEXEC|O_NOCTTY);
+ if (fd < 0)
+ return log_error_errno(errno, "Cannot open %s: %m", etc_machine_id);
+
+ r = fd_is_temporary_fs(fd);
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine whether %s is on a temporary file system: %m", etc_machine_id);
+ if (r == 0)
+ return log_error_errno(SYNTHETIC_ERRNO(EROFS),
+ "%s is not on a temporary file system.",
+ etc_machine_id);
+
+ r = id128_read_fd(fd, ID128_FORMAT_PLAIN, &id);
+ if (r < 0)
+ return log_error_errno(r, "We didn't find a valid machine ID in %s: %m", etc_machine_id);
+
+ fd = safe_close(fd);
+
+ /* Store current mount namespace */
+ r = namespace_open(0, NULL, &initial_mntns_fd, NULL, NULL, NULL);
+ if (r < 0)
+ return log_error_errno(r, "Can't fetch current mount namespace: %m");
+
+ /* Switch to a new mount namespace, isolate ourself and unmount etc_machine_id in our new namespace */
+ r = detach_mount_namespace();
+ if (r < 0)
+ return log_error_errno(r, "Failed to set up new mount namespace: %m");
+
+ r = umount_verbose(LOG_ERR, etc_machine_id, 0);
+ if (r < 0)
+ return r;
+
+ /* Update a persistent version of etc_machine_id */
+ r = id128_write(etc_machine_id, ID128_FORMAT_PLAIN | ID128_SYNC_ON_WRITE, id);
+ if (r < 0)
+ return log_error_errno(r, "Cannot write %s. This is mandatory to get a persistent machine ID: %m", etc_machine_id);
+
+ /* Return to the initial namespace and proceed with a lazy unmount of the transient file there. Failures
+ * from here on are only warnings: the persistent file has been written already. */
+ r = namespace_enter(-1, initial_mntns_fd, -1, -1, -1);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to switch back to initial mount namespace: %m.\nWe'll keep transient %s file until next reboot.", etc_machine_id);
+
+ if (umount2(etc_machine_id, MNT_DETACH) < 0)
+ return log_warning_errno(errno, "Failed to unmount transient %s file: %m.\nWe keep that mount until next reboot.", etc_machine_id);
+
+ return 0;
+}
diff --git a/src/shared/machine-id-setup.h b/src/shared/machine-id-setup.h
new file mode 100644
index 0000000..cce5819
--- /dev/null
+++ b/src/shared/machine-id-setup.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+/* Needed for sd_id128_t below, so that this header can be included on its own */
+#include "sd-id128.h"
+
+/* Replaces a transient machine ID mounted over <root>/etc/machine-id by a persistent file. Returns 0 on
+ * success or if there is nothing to do, negative on failure. */
+int machine_id_commit(const char *root);
+
+/* Initializes <root>/etc/machine-id, either with the requested ID (if non-null), with the ID already
+ * stored there, or with a newly generated one. Returns 0 on success, negative on failure. */
+int machine_id_setup(const char *root, bool force_transient, sd_id128_t requested, sd_id128_t *ret);
diff --git a/src/shared/machine-pool.c b/src/shared/machine-pool.c
new file mode 100644
index 0000000..b372de4
--- /dev/null
+++ b/src/shared/machine-pool.c
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+
+#include "btrfs-util.h"
+#include "label-util.h"
+#include "machine-pool.h"
+#include "missing_magic.h"
+#include "stat-util.h"
+
+static int check_btrfs(void) {
+        struct statfs fs_info;
+
+        /* Determines whether /var/lib/machines is located on btrfs; if that directory does not exist,
+         * /var/lib is checked in its place. Returns the result of the file system type comparison, or
+         * a negative errno-style value if statfs() fails. */
+
+        if (statfs("/var/lib/machines", &fs_info) >= 0)
+                return F_TYPE_EQUAL(fs_info.f_type, BTRFS_SUPER_MAGIC);
+
+        if (errno != ENOENT)
+                return -errno;
+
+        /* The directory is missing, look at the parent instead */
+        if (statfs("/var/lib", &fs_info) < 0)
+                return -errno;
+
+        return F_TYPE_EQUAL(fs_info.f_type, BTRFS_SUPER_MAGIC);
+}
+
+int setup_machine_directory(sd_bus_error *error, bool use_btrfs_subvol, bool use_btrfs_quota) {
+        int r;
+
+        /* If /var/lib/machines is (or would be) located on btrfs and the caller asks for it, tries to
+         * create it as a subvolume and, optionally, to enable quota and set up the default quota
+         * hierarchy for it. Failure to determine the file system type is returned via the bus error;
+         * all later failures are ignored, the quota ones after logging a warning. */
+
+        r = check_btrfs();
+        if (r < 0)
+                return sd_bus_error_set_errnof(error, r, "Failed to determine whether /var/lib/machines is located on btrfs: %m");
+        if (r == 0 || !use_btrfs_subvol)
+                return 0;
+
+        (void) btrfs_subvol_make_label("/var/lib/machines");
+
+        if (use_btrfs_quota) {
+                r = btrfs_quota_enable("/var/lib/machines", true);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to enable quota for /var/lib/machines, ignoring: %m");
+
+                r = btrfs_subvol_auto_qgroup("/var/lib/machines", 0, true);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to set up default quota hierarchy for /var/lib/machines, ignoring: %m");
+        }
+
+        return 0;
+}
diff --git a/src/shared/machine-pool.h b/src/shared/machine-pool.h
new file mode 100644
index 0000000..c57e478
--- /dev/null
+++ b/src/shared/machine-pool.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdint.h>
+
+#include "sd-bus.h"
+
+int setup_machine_directory(sd_bus_error *error, bool use_btrfs_subvol, bool use_btrfs_quota);
diff --git a/src/shared/macvlan-util.c b/src/shared/macvlan-util.c
new file mode 100644
index 0000000..11dffe9
--- /dev/null
+++ b/src/shared/macvlan-util.c
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "conf-parser.h"
+#include "macvlan-util.h"
+#include "string-table.h"
+
+static const char* const macvlan_mode_table[_NETDEV_MACVLAN_MODE_MAX] = { /* String name of each MacVlanMode value, indexed by the enum value */
+ [NETDEV_MACVLAN_MODE_PRIVATE] = "private",
+ [NETDEV_MACVLAN_MODE_VEPA] = "vepa",
+ [NETDEV_MACVLAN_MODE_BRIDGE] = "bridge",
+ [NETDEV_MACVLAN_MODE_PASSTHRU] = "passthru",
+ [NETDEV_MACVLAN_MODE_SOURCE] = "source",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(macvlan_mode, MacVlanMode); /* NOTE(review): presumably emits macvlan_mode_to_string()/macvlan_mode_from_string() as declared in macvlan-util.h - confirm in string-table.h */
diff --git a/src/shared/macvlan-util.h b/src/shared/macvlan-util.h
new file mode 100644
index 0000000..0705ecb
--- /dev/null
+++ b/src/shared/macvlan-util.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <errno.h>
+#include <linux/if_link.h>
+
+/* Needed for _const_ and _pure_ below, so that this header can be included on its own */
+#include "macro.h"
+
+/* The MAC VLAN modes we know. The values are identical to the kernel's MACVLAN_MODE_* constants. */
+typedef enum MacVlanMode {
+        NETDEV_MACVLAN_MODE_PRIVATE = MACVLAN_MODE_PRIVATE,
+        NETDEV_MACVLAN_MODE_VEPA = MACVLAN_MODE_VEPA,
+        NETDEV_MACVLAN_MODE_BRIDGE = MACVLAN_MODE_BRIDGE,
+        NETDEV_MACVLAN_MODE_PASSTHRU = MACVLAN_MODE_PASSTHRU,
+        NETDEV_MACVLAN_MODE_SOURCE = MACVLAN_MODE_SOURCE,
+        _NETDEV_MACVLAN_MODE_MAX,
+        _NETDEV_MACVLAN_MODE_INVALID = -EINVAL, /* This is why <errno.h> is included above */
+} MacVlanMode;
+
+const char *macvlan_mode_to_string(MacVlanMode d) _const_;
+MacVlanMode macvlan_mode_from_string(const char *d) _pure_;
diff --git a/src/shared/main-func.h b/src/shared/main-func.h
new file mode 100644
index 0000000..3f6b6a8
--- /dev/null
+++ b/src/shared/main-func.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdlib.h>
+
+#include "sd-daemon.h"
+
+#include "argv-util.h"
+#include "pager.h"
+#include "selinux-util.h"
+#include "spawn-ask-password-agent.h"
+#include "spawn-polkit-agent.h"
+#include "static-destruct.h"
+
+#define _DEFINE_MAIN_FUNCTION(intro, impl, ret) /* Expands to main(): runs 'intro', stores the value of 'impl' in r, reports the outcome via sd_notifyf(), runs the shared teardown calls and returns 'ret' */ \
+ int main(int argc, char *argv[]) { \
+ int r; \
+ assert_se(argc > 0 && !isempty(argv[0])); \
+ save_argc_argv(argc, argv); \
+ intro; \
+ r = impl; \
+ if (r < 0) /* only failures get an ERRNO= notification */ \
+ (void) sd_notifyf(0, "ERRNO=%i", -r); \
+ (void) sd_notifyf(0, "EXIT_STATUS=%i", ret); \
+ ask_password_agent_close(); \
+ polkit_agent_close(); \
+ pager_close(); \
+ mac_selinux_finish(); \
+ static_destruct(); \
+ return ret; \
+ }
+
+/* Negative return values from impl are mapped to EXIT_FAILURE, and
+ * everything else means success! The 'ret' expression is evaluated
+ * inside the generated main() and hence may refer to its local r. */
+#define DEFINE_MAIN_FUNCTION(impl) \
+ _DEFINE_MAIN_FUNCTION(,impl(argc, argv), r < 0 ? EXIT_FAILURE : EXIT_SUCCESS)
+
+/* Zero is mapped to EXIT_SUCCESS, negative values are mapped to EXIT_FAILURE,
+ * and positive values are propagated as the exit status.
+ * Note: "true" means failure! */
+#define DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(impl) \
+ _DEFINE_MAIN_FUNCTION(,impl(argc, argv), r < 0 ? EXIT_FAILURE : r)
diff --git a/src/shared/meson.build b/src/shared/meson.build
new file mode 100644
index 0000000..b24a541
--- /dev/null
+++ b/src/shared/meson.build
@@ -0,0 +1,375 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+shared_sources = files(
+ 'acl-util.c',
+ 'acpi-fpdt.c',
+ 'apparmor-util.c',
+ 'ask-password-api.c',
+ 'async.c',
+ 'barrier.c',
+ 'base-filesystem.c',
+ 'battery-util.c',
+ 'binfmt-util.c',
+ 'bitmap.c',
+ 'blockdev-util.c',
+ 'bond-util.c',
+ 'boot-entry.c',
+ 'boot-timestamps.c',
+ 'bootspec.c',
+ 'bpf-dlopen.c',
+ 'bpf-program.c',
+ 'bridge-util.c',
+ 'btrfs-util.c',
+ 'bus-get-properties.c',
+ 'bus-locator.c',
+ 'bus-log-control-api.c',
+ 'bus-map-properties.c',
+ 'bus-message-util.c',
+ 'bus-object.c',
+ 'bus-polkit.c',
+ 'bus-print-properties.c',
+ 'bus-unit-procs.c',
+ 'bus-unit-util.c',
+ 'bus-util.c',
+ 'bus-wait-for-jobs.c',
+ 'bus-wait-for-units.c',
+ 'calendarspec.c',
+ 'cgroup-setup.c',
+ 'cgroup-show.c',
+ 'chown-recursive.c',
+ 'clean-ipc.c',
+ 'clock-util.c',
+ 'common-signal.c',
+ 'compare-operator.c',
+ 'condition.c',
+ 'conf-parser.c',
+ 'copy.c',
+ 'coredump-util.c',
+ 'cpu-set-util.c',
+ 'creds-util.c',
+ 'cryptsetup-util.c',
+ 'daemon-util.c',
+ 'data-fd-util.c',
+ 'dev-setup.c',
+ 'device-nodes.c',
+ 'discover-image.c',
+ 'dissect-image.c',
+ 'dlfcn-util.c',
+ 'dm-util.c',
+ 'dns-domain.c',
+ 'dropin.c',
+ 'edit-util.c',
+ 'efi-api.c',
+ 'efi-loader.c',
+ 'elf-util.c',
+ 'enable-mempool.c',
+ 'env-file-label.c',
+ 'ethtool-util.c',
+ 'exec-util.c',
+ 'exit-status.c',
+ 'extension-util.c',
+ 'fdset.c',
+ 'fileio-label.c',
+ 'find-esp.c',
+ 'firewall-util-nft.c',
+ 'firewall-util.c',
+ 'format-table.c',
+ 'fstab-util.c',
+ 'generator.c',
+ 'geneve-util.c',
+ 'gpt.c',
+ 'group-record.c',
+ 'hibernate-util.c',
+ 'hostname-setup.c',
+ 'hwdb-util.c',
+ 'id128-print.c',
+ 'idn-util.c',
+ 'ima-util.c',
+ 'image-policy.c',
+ 'import-util.c',
+ 'in-addr-prefix-util.c',
+ 'install-file.c',
+ 'install-printf.c',
+ 'install.c',
+ 'ip-protocol-list.c',
+ 'ipvlan-util.c',
+ 'journal-file-util.c',
+ 'journal-importer.c',
+ 'journal-util.c',
+ 'json.c',
+ 'kbd-util.c',
+ 'kernel-image.c',
+ 'keyring-util.c',
+ 'killall.c',
+ 'label-util.c',
+ 'libcrypt-util.c',
+ 'libfido2-util.c',
+ 'libmount-util.c',
+ 'local-addresses.c',
+ 'locale-setup.c',
+ 'logs-show.c',
+ 'loop-util.c',
+ 'loopback-setup.c',
+ 'lsm-util.c',
+ 'machine-credential.c',
+ 'machine-id-setup.c',
+ 'machine-pool.c',
+ 'macvlan-util.c',
+ 'mkdir-label.c',
+ 'mkfs-util.c',
+ 'mount-setup.c',
+ 'mount-util.c',
+ 'net-condition.c',
+ 'netif-naming-scheme.c',
+ 'netif-sriov.c',
+ 'netif-util.c',
+ 'nsflags.c',
+ 'numa-util.c',
+ 'open-file.c',
+ 'openssl-util.c',
+ 'output-mode.c',
+ 'pager.c',
+ 'parse-argument.c',
+ 'parse-helpers.c',
+ 'password-quality-util-passwdqc.c',
+ 'password-quality-util-pwquality.c',
+ 'pcre2-util.c',
+ 'pcrextend-util.c',
+ 'pe-binary.c',
+ 'pkcs11-util.c',
+ 'plymouth-util.c',
+ 'pretty-print.c',
+ 'ptyfwd.c',
+ 'qrcode-util.c',
+ 'quota-util.c',
+ 'reboot-util.c',
+ 'recovery-key.c',
+ 'resize-fs.c',
+ 'resolve-util.c',
+ 'rm-rf.c',
+ 'securebits-util.c',
+ 'selinux-util.c',
+ 'serialize.c',
+ 'service-util.c',
+ 'sleep-config.c',
+ 'smack-util.c',
+ 'socket-label.c',
+ 'socket-netlink.c',
+ 'spawn-ask-password-agent.c',
+ 'spawn-polkit-agent.c',
+ 'specifier.c',
+ 'switch-root.c',
+ 'tmpfile-util-label.c',
+ 'tomoyo-util.c',
+ 'tpm2-util.c',
+ 'tpm2-event-log.c',
+ 'udev-util.c',
+ 'user-record-nss.c',
+ 'user-record-show.c',
+ 'user-record.c',
+ 'userdb-dropin.c',
+ 'userdb.c',
+ 'varlink.c',
+ 'varlink-idl.c',
+ 'varlink-io.systemd.c',
+ 'varlink-io.systemd.Journal.c',
+ 'varlink-io.systemd.ManagedOOM.c',
+ 'varlink-io.systemd.PCRExtend.c',
+ 'varlink-io.systemd.Resolve.Monitor.c',
+ 'varlink-io.systemd.Resolve.c',
+ 'varlink-io.systemd.UserDatabase.c',
+ 'varlink-io.systemd.oom.c',
+ 'varlink-io.systemd.service.c',
+ 'varlink-io.systemd.sysext.c',
+ 'varlink-org.varlink.service.c',
+ 'verb-log-control.c',
+ 'verbs.c',
+ 'vlan-util.c',
+ 'volatile-util.c',
+ 'wall.c',
+ 'watchdog.c',
+ 'web-util.c',
+ 'wifi-util.c',
+ 'xml.c',
+)
+
+if get_option('tests') != 'false'
+ shared_sources += files(
+ 'tests.c',
+ )
+endif
+
+generate_syscall_list = find_program('generate-syscall-list.py')
+fname = 'syscall-list.h'
+syscall_list_h = custom_target(
+ fname,
+ input : syscall_list_txt,
+ output : fname,
+ command : [generate_syscall_list,
+ '@INPUT@'],
+ capture : true)
+
+if conf.get('HAVE_ACL') == 1
+ shared_sources += files(
+ 'devnode-acl.c',
+ )
+endif
+
+if conf.get('ENABLE_UTMP') == 1
+ shared_sources += files('utmp-wtmp.c')
+endif
+
+if conf.get('HAVE_SECCOMP') == 1
+ shared_sources += files('seccomp-util.c')
+ shared_sources += syscall_list_h
+endif
+
+if conf.get('HAVE_LIBIPTC') == 1
+ shared_sources += files('firewall-util-iptables.c')
+endif
+
+if conf.get('HAVE_LIBBPF') == 1
+ shared_sources += files(
+ 'bpf-link.c',
+ )
+endif
+
+if conf.get('HAVE_KMOD') == 1
+ shared_sources += files('module-util.c')
+endif
+
+if conf.get('HAVE_PAM') == 1
+ shared_sources += files(
+ 'pam-util.c',
+ )
+endif
+
+if conf.get('ENABLE_NSCD') == 1
+ shared_sources += files('nscd-flush.c')
+endif
+
+if conf.get('HAVE_LIBFIDO2') == 1 and conf.get('HAVE_LIBCRYPTSETUP') == 1
+ shared_sources += files('cryptsetup-fido2.c')
+endif
+
+generate_ip_protocol_list = find_program('generate-ip-protocol-list.sh')
+ip_protocol_list_txt = custom_target(
+ 'ip-protocol-list.txt',
+ output : 'ip-protocol-list.txt',
+ command : [generate_ip_protocol_list, cpp],
+ capture : true)
+
+fname = 'ip-protocol-from-name.gperf'
+gperf_file = custom_target(
+ fname,
+ input : ip_protocol_list_txt,
+ output : fname,
+ command : [generate_gperfs, 'ip_protocol', 'IPPROTO_', '@INPUT@'],
+ capture : true)
+
+fname = 'ip-protocol-from-name.h'
+target1 = custom_target(
+ fname,
+ input : gperf_file,
+ output : fname,
+ command : [gperf,
+ '-L', 'ANSI-C', '-t', '--ignore-case',
+ '-N', 'lookup_ip_protocol',
+ '-H', 'hash_ip_protocol_name',
+ '-p', '-C',
+ '@INPUT@'],
+ capture : true)
+
+fname = 'ip-protocol-to-name.h'
+awkscript = 'ip-protocol-to-name.awk'
+target2 = custom_target(
+ fname,
+ input : [awkscript, ip_protocol_list_txt],
+ output : fname,
+ command : [awk, '-f', '@INPUT0@', '@INPUT1@'],
+ capture : true)
+
+shared_generated_gperf_headers = [target1, target2]
+shared_sources += shared_generated_gperf_headers
+
+fname = 'ethtool-link-mode.h'
+ethtool_link_mode_h = custom_target(
+ fname,
+ input : ['ethtool-link-mode.py', 'linux/ethtool.h'],
+ output : fname,
+ command : [python, '@INPUT0@', '--header', cpp, '@INPUT1@'],
+ capture : true)
+shared_sources += ethtool_link_mode_h
+
+fname = 'ethtool-link-mode.xml'
+ethtool_link_mode_xml = custom_target(
+ fname,
+ input : ['ethtool-link-mode.py', 'linux/ethtool.h'],
+ output : fname,
+ command : [python, '@INPUT0@', '--xml', cpp, '@INPUT1@'],
+ capture : true)
+man_page_depends += ethtool_link_mode_xml
+
+libshared_name = 'systemd-shared-@0@'.format(shared_lib_tag)
+
+libshared_deps = [threads,
+ libacl,
+ libblkid,
+ libcap,
+ libcrypt,
+ libdl,
+ libgcrypt,
+ libiptc_cflags,
+ libkmod,
+ liblz4,
+ libmount,
+ libopenssl,
+ libp11kit_cflags,
+ libpam,
+ librt,
+ libseccomp,
+ libselinux,
+ libxenctrl_cflags,
+ libxz,
+ libzstd]
+
+libshared_sym_path = meson.current_source_dir() / 'libshared.sym'
+libshared_build_dir = meson.current_build_dir()
+
+libshared_static = static_library(
+ libshared_name,
+ shared_sources,
+ include_directories : includes,
+ dependencies : [libshared_deps,
+ userspace],
+ c_args : ['-fvisibility=default'],
+ build_by_default : false)
+
+libshared = shared_library(
+ libshared_name,
+ include_directories : includes,
+ c_args : ['-fvisibility=default'],
+ link_args : ['-shared',
+ '-Wl,--version-script=' + libshared_sym_path],
+ link_depends : libshared_sym_path,
+ link_whole : [libshared_static,
+ libbasic,
+ libbasic_gcrypt,
+ libsystemd_static],
+ dependencies : [libshared_deps,
+ userspace],
+ install : true,
+ install_dir : pkglibdir)
+
+shared_fdisk_sources = files(
+ 'fdisk-util.c',
+)
+
+libshared_fdisk = static_library(
+ 'shared-fdisk',
+ shared_fdisk_sources,
+ include_directories : includes,
+ dependencies : [libfdisk,
+ userspace],
+ c_args : ['-fvisibility=default'],
+ build_by_default : false)
diff --git a/src/shared/mkdir-label.c b/src/shared/mkdir-label.c
new file mode 100644
index 0000000..e3afc2b
--- /dev/null
+++ b/src/shared/mkdir-label.c
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/stat.h>
+
+#include "errno-util.h"
+#include "mkdir-label.h"
+#include "selinux-util.h"
+#include "smack-util.h"
+#include "user-util.h"
+
+/* Creates the directory 'path' (relative to 'dirfd') with the given mode, bracketing the mkdirat() call
+ * with mac_selinux_create_file_prepare_at()/mac_selinux_create_file_clear() and finishing with
+ * mac_smack_fix_full(). Returns a negative value if the prepare step or mkdirat() fails, otherwise
+ * whatever mac_smack_fix_full() returns. */
+int mkdirat_label(int dirfd, const char *path, mode_t mode) {
+        int ret;
+
+        assert(path);
+
+        ret = mac_selinux_create_file_prepare_at(dirfd, path, S_IFDIR);
+        if (ret < 0)
+                return ret;
+
+        ret = RET_NERRNO(mkdirat(dirfd, path, mode));
+
+        /* The clear call is issued unconditionally, i.e. also when mkdirat() failed. */
+        mac_selinux_create_file_clear();
+
+        if (ret >= 0)
+                return mac_smack_fix_full(dirfd, path, NULL, 0);
+
+        return ret;
+}
+
+/* Thin wrapper: forwards all arguments unchanged to mkdirat_safe_internal() and passes mkdirat_label()
+ * as the trailing function argument (presumably the routine used to create the directory — confirm
+ * against mkdir.h). Returns whatever mkdirat_safe_internal() returns. */
+int mkdirat_safe_label(int dir_fd, const char *path, mode_t mode, uid_t uid, gid_t gid, MkdirFlags flags) {
+ return mkdirat_safe_internal(dir_fd, path, mode, uid, gid, flags, mkdirat_label);
+}
+
+/* Thin wrapper around mkdirat_parents_internal(): passes the invalid-ID sentinels for both the uid and
+ * the gid argument, zero flags, and mkdirat_label() as the trailing function argument. Returns whatever
+ * mkdirat_parents_internal() returns. */
+int mkdirat_parents_label(int dir_fd, const char *path, mode_t mode) {
+        /* Use GID_INVALID (not UID_INVALID) in the gid slot, so that each sentinel matches its type. */
+        return mkdirat_parents_internal(dir_fd, path, mode, UID_INVALID, GID_INVALID, 0, mkdirat_label);
+}
+
+/* Thin wrapper: forwards all arguments unchanged to mkdir_parents_internal() and passes mkdirat_label()
+ * as the trailing function argument. Returns whatever mkdir_parents_internal() returns. */
+int mkdir_parents_safe_label(const char *prefix, const char *path, mode_t mode, uid_t uid, gid_t gid, MkdirFlags flags) {
+ return mkdir_parents_internal(prefix, path, mode, uid, gid, flags, mkdirat_label);
+}
+
+/* Thin wrapper around mkdir_p_internal(): passes a NULL prefix, the invalid-ID sentinels for both the
+ * uid and the gid argument, zero flags, and mkdirat_label() as the trailing function argument. Returns
+ * whatever mkdir_p_internal() returns. */
+int mkdir_p_label(const char *path, mode_t mode) {
+        /* Use GID_INVALID (not UID_INVALID) in the gid slot, so that each sentinel matches its type. */
+        return mkdir_p_internal(NULL, path, mode, UID_INVALID, GID_INVALID, 0, mkdirat_label);
+}
diff --git a/src/shared/mkdir-label.h b/src/shared/mkdir-label.h
new file mode 100644
index 0000000..a9a8ce3
--- /dev/null
+++ b/src/shared/mkdir-label.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <fcntl.h>
+#include <sys/types.h>
+
+#include "mkdir.h"
+
+int mkdirat_label(int dirfd, const char *path, mode_t mode);
+
+/* Convenience variant of mkdirat_label() that passes AT_FDCWD as the directory file descriptor. */
+static inline int mkdir_label(const char *path, mode_t mode) {
+ return mkdirat_label(AT_FDCWD, path, mode);
+}
+
+int mkdirat_safe_label(int dir_fd, const char *path, mode_t mode, uid_t uid, gid_t gid, MkdirFlags flags);
+/* Convenience variant of mkdirat_safe_label() that passes AT_FDCWD as the directory file descriptor. */
+static inline int mkdir_safe_label(const char *path, mode_t mode, uid_t uid, gid_t gid, MkdirFlags flags) {
+ return mkdirat_safe_label(AT_FDCWD, path, mode, uid, gid, flags);
+}
+int mkdirat_parents_label(int dir_fd, const char *path, mode_t mode);
+/* Convenience variant of mkdirat_parents_label() that passes AT_FDCWD as the directory file
+ * descriptor. The parameter is called 'mode' like in all sibling declarations of this header. */
+static inline int mkdir_parents_label(const char *path, mode_t mode) {
+        return mkdirat_parents_label(AT_FDCWD, path, mode);
+}
+
+int mkdir_parents_safe_label(const char *prefix, const char *path, mode_t mode, uid_t uid, gid_t gid, MkdirFlags flags);
+
+int mkdir_p_label(const char *path, mode_t mode);
diff --git a/src/shared/mkfs-util.c b/src/shared/mkfs-util.c
new file mode 100644
index 0000000..4e58b6e
--- /dev/null
+++ b/src/shared/mkfs-util.c
@@ -0,0 +1,684 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/mount.h>
+#include <unistd.h>
+
+#include "dirent-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "fs-util.h"
+#include "id128-util.h"
+#include "mkfs-util.h"
+#include "mount-util.h"
+#include "mountpoint-util.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "recurse-dir.h"
+#include "rm-rf.h"
+#include "stat-util.h"
+#include "stdio-util.h"
+#include "string-util.h"
+#include "tmpfile-util.h"
+#include "utf8.h"
+
+/* Checks whether a "mkfs.<fstype>" executable can be found via find_executable().
+ *
+ * Returns true if it is found, false if find_executable() reports -ENOENT, and a negative value on any
+ * other failure. -EINVAL is returned for "auto" and "swap", and for names that do not form a valid
+ * filename. */
+int mkfs_exists(const char *fstype) {
+        const char *binary;
+        int r;
+
+        assert(fstype);
+
+        /* These aren't real file system types, refuse early */
+        if (STR_IN_SET(fstype, "auto", "swap"))
+                return -EINVAL;
+
+        binary = strjoina("mkfs.", fstype);
+
+        /* Refuse file system types with slashes and similar */
+        if (!filename_is_valid(binary))
+                return -EINVAL;
+
+        r = find_executable(binary, NULL);
+        if (r >= 0)
+                return true;
+
+        return r == -ENOENT ? false : r;
+}
+
+/* Returns non-zero if a file system of the given type can be populated from a source tree at creation
+ * time: that is the case for every type fstype_is_ro() accepts, plus the explicitly listed ones. */
+int mkfs_supports_root_option(const char *fstype) {
+        if (fstype_is_ro(fstype))
+                return true;
+
+        return STR_IN_SET(fstype, "ext2", "ext3", "ext4", "btrfs", "vfat", "xfs");
+}
+
+/* Truncates 's' to at most 'max_len' bytes (12 or 16) without cutting a UTF-8 character in half, and
+ * returns the result as a newly allocated string in *ret.
+ *
+ * Returns 0 on success, the negative value reported by utf8_encoded_valid_unichar() if the string
+ * contains an invalid sequence, or -ENOMEM if the copy cannot be allocated. */
+static int mangle_linux_fs_label(const char *s, size_t max_len, char **ret) {
+        size_t n = 0;
+        char *truncated;
+
+        assert(s);
+        assert(max_len > 0);
+        assert(ret);
+
+        /* Advance one whole character at a time, and stop before the first character that would take
+         * us past the byte limit. */
+        while (s[n] != '\0') {
+                int l = utf8_encoded_valid_unichar(s + n, SIZE_MAX);
+                if (l < 0)
+                        return l;
+
+                if (n + (size_t) l > max_len)
+                        break;
+
+                n += (size_t) l;
+        }
+
+        truncated = memdup_suffix0(s, n);
+        if (!truncated)
+                return -ENOMEM;
+
+        *ret = truncated;
+        return 0;
+}
+
+/* Turns 's' into a label that is acceptable for mkfs.vfat and returns it as a newly allocated string in
+ * *ret: the string is converted with utf8_to_ascii() (using '_' as replacement character), shortened
+ * to 11 characters, uppercased, and every character mkfs.vfat rejects as well as every control
+ * character is replaced by '_'.
+ *
+ * Returns 0 on success, or the negative value reported by utf8_to_ascii() on failure. */
+static int mangle_fat_label(const char *s, char **ret) {
+        assert(s);
+        assert(ret); /* Same contract as mangle_linux_fs_label(): the output pointer is mandatory. */
+
+        _cleanup_free_ char *q = NULL;
+        int r;
+
+        r = utf8_to_ascii(s, '_', &q);
+        if (r < 0)
+                return r;
+
+        /* Classic FAT only allows 11 character uppercase labels */
+        strshorten(q, 11);
+        ascii_strupper(q);
+
+        /* mkfs.vfat: Labels with characters *?.,;:/\|+=<>[]" are not allowed.
+         * Let's also replace any control chars. */
+        for (char *p = q; *p; p++)
+                if (strchr("*?.,;:/\\|+=<>[]\"", *p) || char_is_cc(*p))
+                        *p = '_';
+
+        *ret = TAKE_PTR(q);
+        return 0;
+}
+
+/* Copies the contents of the directory 'root' into the FAT file system image/device 'node' by invoking
+ * the external "mcopy" tool in a forked child.
+ *
+ * Only regular files and directories directly below 'root' are handed to mcopy (which is invoked with
+ * "-s"); other entry types are skipped with a debug message. If 'root' is empty nothing is done.
+ *
+ * Returns 0 on success and a negative errno-style value on failure; -EPROTONOSUPPORT is returned if the
+ * mcopy binary cannot be found. */
+static int do_mcopy(const char *node, const char *root) {
+        _cleanup_free_ char *mcopy = NULL;
+        _cleanup_strv_free_ char **argv = NULL;
+        _cleanup_close_ int rfd = -EBADF;
+        _cleanup_free_ DirectoryEntries *de = NULL;
+        int r;
+
+        assert(node);
+        assert(root);
+
+        /* Return early if there's nothing to copy. A negative return value means the check itself
+         * failed, which must not be mistaken for "directory is empty". */
+        r = dir_is_empty(root, /*ignore_hidden_or_backup=*/ false);
+        if (r < 0)
+                return log_error_errno(r, "Failed to check whether '%s' is empty: %m", root);
+        if (r > 0)
+                return 0;
+
+        r = find_executable("mcopy", &mcopy);
+        if (r == -ENOENT)
+                return log_error_errno(SYNTHETIC_ERRNO(EPROTONOSUPPORT), "Could not find mcopy binary.");
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine whether mcopy binary exists: %m");
+
+        argv = strv_new(mcopy, "-s", "-p", "-Q", "-m", "-i", node);
+        if (!argv)
+                return log_oom();
+
+        /* mcopy copies the top level directory instead of everything in it so we have to pass all
+         * the subdirectories to mcopy instead to end up with the correct directory structure. */
+
+        rfd = open(root, O_RDONLY|O_DIRECTORY|O_CLOEXEC);
+        if (rfd < 0)
+                return log_error_errno(errno, "Failed to open directory '%s': %m", root);
+
+        r = readdir_all(rfd, RECURSE_DIR_SORT|RECURSE_DIR_ENSURE_TYPE, &de);
+        if (r < 0)
+                return log_error_errno(r, "Failed to read '%s' contents: %m", root);
+
+        for (size_t i = 0; i < de->n_entries; i++) {
+                _cleanup_free_ char *p = NULL;
+
+                p = path_join(root, de->entries[i]->d_name);
+                if (!p)
+                        return log_oom();
+
+                if (!IN_SET(de->entries[i]->d_type, DT_REG, DT_DIR)) {
+                        log_debug("%s is not a file/directory which are the only file types supported by vfat, ignoring", p);
+                        continue;
+                }
+
+                if (strv_consume(&argv, TAKE_PTR(p)) < 0)
+                        return log_oom();
+        }
+
+        /* The final argument is the copy destination inside the image. */
+        if (strv_extend(&argv, "::") < 0)
+                return log_oom();
+
+        r = safe_fork("(mcopy)", FORK_RESET_SIGNALS|FORK_RLIMIT_NOFILE_SAFE|FORK_DEATHSIG_SIGTERM|FORK_LOG|FORK_WAIT|FORK_STDOUT_TO_STDERR|FORK_CLOSE_ALL_FDS, NULL);
+        if (r < 0)
+                return r;
+        if (r == 0) {
+                /* Avoid failures caused by mismatch in expectations between mkfs.vfat and mcopy by disabling
+                 * the stricter mcopy checks using MTOOLS_SKIP_CHECK. */
+                execve(mcopy, argv, STRV_MAKE("MTOOLS_SKIP_CHECK=1", "TZ=UTC", strv_find_prefix(environ, "SOURCE_DATE_EPOCH=")));
+
+                log_error_errno(errno, "Failed to execute mcopy: %m");
+
+                _exit(EXIT_FAILURE);
+        }
+
+        return 0;
+}
+
+/* State shared between make_protofile() and its per-entry callback protofile_print_item(). */
+typedef struct ProtofileData {
+ FILE *file; /* stream the protofile lines are written to */
+ bool has_filename_with_spaces; /* set by the callback once it wrote a name in which spaces were replaced by '/' */
+ const char *tmpdir; /* directory in which the callback creates symlinks for source paths containing spaces */
+} ProtofileData;
+
+/* Directory recursion callback that writes one mkfs.xfs protofile line per visited inode to data->file
+ * ('userdata' must point to a ProtofileData).
+ *
+ * On RECURSE_DIR_LEAVE a "$" line is written. Events other than RECURSE_DIR_ENTER/RECURSE_DIR_ENTRY and
+ * inode types other than directory, regular file, symlink, FIFO, block and character device are
+ * skipped. For each remaining inode the name, type/mode and owner are written, followed by the source
+ * path (regular files), the link target (symlinks) or the device numbers (device nodes).
+ *
+ * Returns RECURSE_DIR_CONTINUE (or 0) to carry on, or a negative errno-style value on failure. */
+static int protofile_print_item(
+                RecurseDirEvent event,
+                const char *path,
+                int dir_fd,
+                int inode_fd,
+                const struct dirent *de,
+                const struct statx *sx,
+                void *userdata) {
+
+        ProtofileData *data = ASSERT_PTR(userdata);
+        _cleanup_free_ char *copy = NULL;
+        int r;
+
+        if (event == RECURSE_DIR_LEAVE) {
+                fputs("$\n", data->file);
+                return 0;
+        }
+
+        if (!IN_SET(event, RECURSE_DIR_ENTER, RECURSE_DIR_ENTRY))
+                return RECURSE_DIR_CONTINUE;
+
+        char type = S_ISDIR(sx->stx_mode)  ? 'd' :
+                    S_ISREG(sx->stx_mode)  ? '-' :
+                    S_ISLNK(sx->stx_mode)  ? 'l' :
+                    S_ISFIFO(sx->stx_mode) ? 'p' :
+                    S_ISBLK(sx->stx_mode)  ? 'b' :
+                    S_ISCHR(sx->stx_mode)  ? 'c' : 0;
+        if (type == 0)
+                return RECURSE_DIR_CONTINUE;
+
+        /* The protofile format does not support spaces in filenames as whitespace is used as a token
+         * delimiter. To work around this limitation, mkfs.xfs allows escaping whitespace by using the /
+         * character (which isn't allowed in filenames and as such can be used to escape whitespace). See
+         * https://lore.kernel.org/linux-xfs/20230222090303.h6tujm7y32gjhgal@andromeda/T/#m8066b3e7d62a080ee7434faac4861d944e64493b
+         * for more information. */
+
+        if (strchr(de->d_name, ' ')) {
+                copy = strdup(de->d_name);
+                if (!copy)
+                        return log_oom();
+
+                string_replace_char(copy, ' ', '/');
+                data->has_filename_with_spaces = true;
+        }
+
+        fprintf(data->file, "%s %c%c%c%03o "UID_FMT" "GID_FMT" ",
+                copy ?: de->d_name,
+                type,
+                sx->stx_mode & S_ISUID ? 'u' : '-',
+                sx->stx_mode & S_ISGID ? 'g' : '-',
+                (unsigned) (sx->stx_mode & 0777),
+                sx->stx_uid, sx->stx_gid);
+
+        if (S_ISREG(sx->stx_mode)) {
+                _cleanup_free_ char *p = NULL;
+
+                /* While we can escape whitespace in the filename, we cannot escape whitespace in the source
+                 * path, so hack around that by creating a symlink to the path in a temporary directory and
+                 * using the symlink as the source path instead. */
+
+                if (strchr(path, ' ')) {
+                        r = tempfn_random_child(data->tmpdir, "mkfs-xfs", &p);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to generate random child name in %s: %m", data->tmpdir);
+
+                        if (symlink(path, p) < 0)
+                                return log_error_errno(errno, "Failed to symlink %s to %s: %m", p, path);
+                }
+
+                fputs(p ?: path, data->file);
+        } else if (S_ISLNK(sx->stx_mode)) {
+                _cleanup_free_ char *p = NULL;
+
+                r = readlinkat_malloc(dir_fd, de->d_name, &p);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to read symlink %s: %m", path);
+
+                /* If we have a symlink to a path with whitespace in it, we're out of luck, as there's no way
+                 * to encode that in the mkfs.xfs protofile format. At this point 'r' holds the (non-negative)
+                 * result of the successful readlinkat_malloc() call, hence it must not be used as error
+                 * code: use a synthetic errno, so that the failure is actually propagated to the caller. */
+
+                if (strchr(p, ' '))
+                        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                               "Symlinks to paths containing whitespace are not supported by mkfs.xfs.");
+
+                fputs(p, data->file);
+        } else if (S_ISBLK(sx->stx_mode) || S_ISCHR(sx->stx_mode))
+                fprintf(data->file, "%" PRIu32 " %" PRIu32, sx->stx_rdev_major, sx->stx_rdev_minor);
+
+        fputc('\n', data->file);
+
+        return RECURSE_DIR_CONTINUE;
+}
+
+/* Generates a mkfs.xfs protofile describing the directory tree below 'root'.
+ *
+ * On success 0 is returned and three values are handed to the caller: the path of the generated
+ * protofile (*ret_path), whether any filename with spaces was encountered
+ * (*ret_has_filename_with_spaces), and the path of the temporary directory created for helper symlinks
+ * (*ret_tmpdir). On failure a negative errno-style value is returned, after logging about it. */
+static int make_protofile(const char *root, char **ret_path, bool *ret_has_filename_with_spaces, char **ret_tmpdir) {
+        _cleanup_(rm_rf_physical_and_freep) char *symlink_dir = NULL;
+        _cleanup_fclose_ FILE *stream = NULL;
+        _cleanup_(unlink_and_freep) char *protofile_path = NULL;
+        const char *var_tmp;
+        int r;
+
+        assert(ret_path);
+        assert(ret_has_filename_with_spaces);
+        assert(ret_tmpdir);
+
+        r = var_tmp_dir(&var_tmp);
+        if (r < 0)
+                return log_error_errno(r, "Failed to get persistent temporary directory: %m");
+
+        r = fopen_temporary_child(var_tmp, &stream, &protofile_path);
+        if (r < 0)
+                return log_error_errno(r, "Failed to open temporary file: %m");
+
+        /* Explicitly use /tmp here because this directory cannot have spaces in its path. */
+        r = mkdtemp_malloc("/tmp/systemd-mkfs-XXXXXX", &symlink_dir);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create temporary directory: %m");
+
+        ProtofileData data = {
+                .file = stream,
+                .tmpdir = symlink_dir,
+        };
+
+        /* Fixed preamble, followed by one line per inode emitted by the callback. */
+        fputs("/\n"
+              "0 0\n"
+              "d--755 0 0\n", stream);
+
+        r = recurse_dir_at(AT_FDCWD, root, STATX_TYPE|STATX_MODE|STATX_UID|STATX_GID, UINT_MAX,
+                           RECURSE_DIR_SORT, protofile_print_item, &data);
+        if (r < 0)
+                return log_error_errno(r, "Failed to recurse through %s: %m", root);
+
+        fputs("$\n", stream);
+
+        r = fflush_and_check(stream);
+        if (r < 0)
+                return log_error_errno(r, "Failed to flush %s: %m", protofile_path);
+
+        /* Success: pass ownership of the file and the directory to the caller. */
+        *ret_path = TAKE_PTR(protofile_path);
+        *ret_has_filename_with_spaces = data.has_filename_with_spaces;
+        *ret_tmpdir = TAKE_PTR(symlink_dir);
+
+        return 0;
+}
+
+/* Formats 'node' as a file system of type 'fstype' by running the matching external tool (mkswap,
+ * mksquashfs, mkfs.erofs, or mkfs.<fstype>) in a forked child and waiting for it.
+ *
+ * node:            target passed to the mkfs tool.
+ * fstype:          file system type; selects both the tool and the command line that is generated.
+ * label:           volume label (mandatory); mangled to fit the limits of ext2/3/4, xfs, swap and vfat.
+ * root:            optional source tree to populate the file system from. Mandatory for read-only file
+ *                  systems, refused for swap and for types mkfs_supports_root_option() rejects.
+ * uuid:            UUID of the file system; for vfat only its first 32 bits are used as volume ID.
+ * discard:         selects the discard/no-discard options for ext2/3/4, btrfs, f2fs and xfs.
+ * quiet:           passes the tool's quiet switch, or redirects its stdout to /dev/null if there is none.
+ * sector_size:     if non-zero, passed on to the tools for ext2/3/4, f2fs, xfs and vfat.
+ * extra_mkfs_args: optional extra arguments that are appended to the generated command line.
+ *
+ * Returns 0 on success and a negative errno-style value on failure. */
+int make_filesystem(
+                const char *node,
+                const char *fstype,
+                const char *label,
+                const char *root,
+                sd_id128_t uuid,
+                bool discard,
+                bool quiet,
+                uint64_t sector_size,
+                char * const *extra_mkfs_args) {
+
+        _cleanup_free_ char *mkfs = NULL, *mangled_label = NULL;
+        _cleanup_strv_free_ char **argv = NULL, **env = NULL;
+        _cleanup_(rm_rf_physical_and_freep) char *protofile_tmpdir = NULL;
+        _cleanup_(unlink_and_freep) char *protofile = NULL;
+        char vol_id[CONST_MAX(SD_ID128_UUID_STRING_MAX, 8U + 1U)] = {};
+        int stdio_fds[3] = { -EBADF, STDERR_FILENO, STDERR_FILENO};
+        ForkFlags flags = FORK_RESET_SIGNALS|FORK_RLIMIT_NOFILE_SAFE|FORK_DEATHSIG_SIGTERM|FORK_LOG|FORK_WAIT|
+                        FORK_CLOSE_ALL_FDS|FORK_REARRANGE_STDIO|FORK_REOPEN_LOG;
+        int r;
+
+        assert(node);
+        assert(fstype);
+        assert(label);
+
+        if (fstype_is_ro(fstype) && !root)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Cannot generate read-only filesystem %s without a source tree.",
+                                       fstype);
+
+        /* Step 1: figure out which binary to invoke */
+        if (streq(fstype, "swap")) {
+                if (root)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "A swap filesystem can't be populated, refusing");
+                r = find_executable("mkswap", &mkfs);
+                if (r == -ENOENT)
+                        return log_error_errno(SYNTHETIC_ERRNO(EPROTONOSUPPORT), "mkswap binary not available.");
+                if (r < 0)
+                        return log_error_errno(r, "Failed to determine whether mkswap binary exists: %m");
+        } else if (streq(fstype, "squashfs")) {
+                r = find_executable("mksquashfs", &mkfs);
+                if (r == -ENOENT)
+                        return log_error_errno(SYNTHETIC_ERRNO(EPROTONOSUPPORT), "mksquashfs binary not available.");
+                if (r < 0)
+                        return log_error_errno(r, "Failed to determine whether mksquashfs binary exists: %m");
+
+        } else if (streq(fstype, "erofs")) {
+                r = find_executable("mkfs.erofs", &mkfs);
+                if (r == -ENOENT)
+                        return log_error_errno(SYNTHETIC_ERRNO(EPROTONOSUPPORT), "mkfs.erofs binary not available.");
+                if (r < 0)
+                        return log_error_errno(r, "Failed to determine whether mkfs.erofs binary exists: %m");
+
+        } else if (fstype_is_ro(fstype)) {
+                return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                       "Don't know how to create read-only file system '%s', refusing.",
+                                       fstype);
+        } else {
+                if (root && !mkfs_supports_root_option(fstype))
+                        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                               "Populating with source tree is not supported for %s", fstype);
+                r = mkfs_exists(fstype);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to determine whether mkfs binary for %s exists: %m", fstype);
+                if (r == 0)
+                        return log_error_errno(SYNTHETIC_ERRNO(EPROTONOSUPPORT), "mkfs binary for %s is not available.", fstype);
+
+                mkfs = strjoin("mkfs.", fstype);
+                if (!mkfs)
+                        return log_oom();
+        }
+
+        /* Step 2: bring label and volume ID into a form the file system accepts */
+        if (STR_IN_SET(fstype, "ext2", "ext3", "ext4", "xfs", "swap")) {
+                size_t max_len =
+                        streq(fstype, "xfs") ? 12 :
+                        streq(fstype, "swap") ? 15 :
+                        16;
+
+                r = mangle_linux_fs_label(label, max_len, &mangled_label);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to determine volume label from string \"%s\": %m", label);
+                label = mangled_label;
+
+        } else if (streq(fstype, "vfat")) {
+                r = mangle_fat_label(label, &mangled_label);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to determine FAT label from string \"%s\": %m", label);
+                label = mangled_label;
+
+                xsprintf(vol_id, "%08" PRIx32,
+                         ((uint32_t) uuid.bytes[0] << 24) |
+                         ((uint32_t) uuid.bytes[1] << 16) |
+                         ((uint32_t) uuid.bytes[2] << 8) |
+                         ((uint32_t) uuid.bytes[3])); /* Take first 32 bits of UUID */
+        }
+
+        if (isempty(vol_id))
+                assert_se(sd_id128_to_uuid_string(uuid, vol_id));
+
+        /* Step 3: generate the command line. Note that every strv_new() result is checked before the
+         * vector is extended: extending a NULL vector would succeed and leave us with a command line
+         * that lacks argv[0] and all the mandatory options.
+         *
+         * When changing this conditional, also adjust the log statement below. */
+        if (STR_IN_SET(fstype, "ext2", "ext3", "ext4")) {
+                argv = strv_new(mkfs,
+                                "-L", label,
+                                "-U", vol_id,
+                                "-I", "256",
+                                "-m", "0",
+                                "-E", discard ? "discard,lazy_itable_init=1" : "nodiscard,lazy_itable_init=1",
+                                "-b", "4096",
+                                "-T", "default",
+                                node);
+                if (!argv)
+                        return log_oom();
+
+                if (root && strv_extend_strv(&argv, STRV_MAKE("-d", root), false) < 0)
+                        return log_oom();
+
+                if (quiet && strv_extend(&argv, "-q") < 0)
+                        return log_oom();
+
+                if (sector_size > 0) {
+                        /* 'env' is a list of name/value pairs that are set in the child below. */
+                        if (strv_extend(&env, "MKE2FS_DEVICE_SECTSIZE") < 0)
+                                return log_oom();
+
+                        if (strv_extendf(&env, "%"PRIu64, sector_size) < 0)
+                                return log_oom();
+                }
+
+        } else if (streq(fstype, "btrfs")) {
+                argv = strv_new(mkfs,
+                                "-L", label,
+                                "-U", vol_id,
+                                node);
+                if (!argv)
+                        return log_oom();
+
+                if (!discard && strv_extend(&argv, "--nodiscard") < 0)
+                        return log_oom();
+
+                if (root && strv_extend_strv(&argv, STRV_MAKE("-r", root), false) < 0)
+                        return log_oom();
+
+                if (quiet && strv_extend(&argv, "-q") < 0)
+                        return log_oom();
+
+                /* mkfs.btrfs unconditionally warns about several settings changing from v5.15 onwards which
+                 * isn't silenced by "-q", so let's redirect stdout to /dev/null as well. */
+                if (quiet)
+                        stdio_fds[1] = -EBADF;
+
+        } else if (streq(fstype, "f2fs")) {
+                argv = strv_new(mkfs,
+                                "-g",  /* "default options" */
+                                "-f",  /* force override, without this it doesn't seem to want to write to an empty partition */
+                                "-l", label,
+                                "-U", vol_id,
+                                "-t", one_zero(discard),
+                                node);
+                if (!argv)
+                        return log_oom();
+
+                if (quiet && strv_extend(&argv, "-q") < 0)
+                        return log_oom();
+
+                if (sector_size > 0) {
+                        if (strv_extend(&argv, "-w") < 0)
+                                return log_oom();
+
+                        if (strv_extendf(&argv, "%"PRIu64, sector_size) < 0)
+                                return log_oom();
+                }
+
+        } else if (streq(fstype, "xfs")) {
+                const char *j;
+
+                j = strjoina("uuid=", vol_id);
+
+                argv = strv_new(mkfs,
+                                "-L", label,
+                                "-m", j,
+                                "-m", "reflink=1",
+                                node);
+                if (!argv)
+                        return log_oom();
+
+                if (!discard && strv_extend(&argv, "-K") < 0)
+                        return log_oom();
+
+                if (root) {
+                        bool has_filename_with_spaces = false;
+                        _cleanup_free_ char *protofile_with_opt = NULL;
+
+                        r = make_protofile(root, &protofile, &has_filename_with_spaces, &protofile_tmpdir);
+                        if (r < 0)
+                                return r;
+
+                        /* Gross hack to make mkfs.xfs interpret slashes as spaces so we can encode filenames
+                         * with spaces in the protofile format. */
+                        if (has_filename_with_spaces)
+                                protofile_with_opt = strjoin("slashes_are_spaces=1,", protofile);
+                        else
+                                protofile_with_opt = strdup(protofile);
+                        if (!protofile_with_opt)
+                                return log_oom();
+
+                        if (strv_extend_strv(&argv, STRV_MAKE("-p", protofile_with_opt), false) < 0)
+                                return log_oom();
+                }
+
+                if (sector_size > 0) {
+                        if (strv_extend(&argv, "-s") < 0)
+                                return log_oom();
+
+                        if (strv_extendf(&argv, "size=%"PRIu64, sector_size) < 0)
+                                return log_oom();
+                }
+
+                if (quiet && strv_extend(&argv, "-q") < 0)
+                        return log_oom();
+
+        } else if (streq(fstype, "vfat")) {
+
+                argv = strv_new(mkfs,
+                                "-i", vol_id,
+                                "-n", label,
+                                "-F", "32",  /* yes, we force FAT32 here */
+                                node);
+                if (!argv)
+                        return log_oom();
+
+                if (sector_size > 0) {
+                        if (strv_extend(&argv, "-S") < 0)
+                                return log_oom();
+
+                        if (strv_extendf(&argv, "%"PRIu64, sector_size) < 0)
+                                return log_oom();
+                }
+
+                /* mkfs.vfat does not have a --quiet option so let's redirect stdout to /dev/null instead. */
+                if (quiet)
+                        stdio_fds[1] = -EBADF;
+
+        } else if (streq(fstype, "swap")) {
+                /* TODO: add --quiet once util-linux v2.38 is available everywhere. */
+
+                argv = strv_new(mkfs,
+                                "-L", label,
+                                "-U", vol_id,
+                                node);
+
+                if (quiet)
+                        stdio_fds[1] = -EBADF;
+
+        } else if (streq(fstype, "squashfs")) {
+
+                argv = strv_new(mkfs,
+                                root, node,
+                                "-noappend");
+
+                /* mksquashfs -quiet option is pretty new so let's redirect stdout to /dev/null instead. */
+                if (quiet)
+                        stdio_fds[1] = -EBADF;
+
+        } else if (streq(fstype, "erofs")) {
+
+                argv = strv_new(mkfs,
+                                "-U", vol_id,
+                                node, root);
+                if (!argv)
+                        return log_oom();
+
+                if (quiet && strv_extend(&argv, "--quiet") < 0)
+                        return log_oom();
+
+        } else
+                /* Generic fallback for all other file systems */
+                argv = strv_new(mkfs, node);
+
+        /* Catches allocation failures of the branches above that do not extend the vector. */
+        if (!argv)
+                return log_oom();
+
+        if (extra_mkfs_args && strv_extend_strv(&argv, extra_mkfs_args, false) < 0)
+                return log_oom();
+
+        if (streq(fstype, "btrfs")) {
+                struct stat st;
+
+                /* stat() reports its error via errno, 'r' still holds an unrelated earlier result. */
+                if (stat(node, &st) < 0)
+                        return log_error_errno(errno, "Failed to stat '%s': %m", node);
+
+                if (S_ISBLK(st.st_mode))
+                        flags |= FORK_NEW_MOUNTNS;
+        }
+
+        if (DEBUG_LOGGING) {
+                _cleanup_free_ char *j = NULL;
+
+                j = strv_join(argv, " ");
+                log_debug("Executing mkfs command: %s", strna(j));
+        }
+
+        /* Step 4: run the tool */
+        r = safe_fork_full(
+                        "(mkfs)",
+                        stdio_fds,
+                        /*except_fds=*/ NULL,
+                        /*n_except_fds=*/ 0,
+                        flags,
+                        /*ret_pid=*/ NULL);
+        if (r < 0)
+                return r;
+        if (r == 0) {
+                /* Child */
+
+                STRV_FOREACH_PAIR(k, v, env)
+                        if (setenv(*k, *v, /* replace = */ true) < 0) {
+                                /* 'r' is zero in the child, the actual error is in errno. */
+                                log_error_errno(errno, "Failed to set %s=%s environment variable: %m", *k, *v);
+                                _exit(EXIT_FAILURE);
+                        }
+
+                /* mkfs.btrfs refuses to operate on block devices with mounted partitions, even if operating
+                 * on unformatted free space, so let's trick it and other mkfs tools into thinking no
+                 * partitions are mounted. See https://github.com/kdave/btrfs-progs/issues/640 for more
+                 * information. */
+                if (flags & FORK_NEW_MOUNTNS)
+                        (void) mount_nofollow_verbose(LOG_DEBUG, "/dev/null", "/proc/self/mounts", NULL, MS_BIND, NULL);
+
+                execvp(mkfs, argv);
+
+                log_error_errno(errno, "Failed to execute %s: %m", mkfs);
+
+                _exit(EXIT_FAILURE);
+        }
+
+        /* mkfs.vfat cannot populate the file system itself, hence copy the files in afterwards. */
+        if (root && streq(fstype, "vfat")) {
+                r = do_mcopy(node, root);
+                if (r < 0)
+                        return r;
+        }
+
+        if (STR_IN_SET(fstype, "ext2", "ext3", "ext4", "btrfs", "f2fs", "xfs", "vfat", "swap"))
+                log_info("%s successfully formatted as %s (label \"%s\", uuid %s)",
+                         node, fstype, label, vol_id);
+        else if (streq(fstype, "erofs"))
+                log_info("%s successfully formatted as %s (uuid %s, no label)",
+                         node, fstype, vol_id);
+        else
+                log_info("%s successfully formatted as %s (no label or uuid specified)",
+                         node, fstype);
+
+        return 0;
+}
+
+/* Looks up extra mkfs options in the environment variable named
+ * "SYSTEMD_<component>_MKFS_OPTIONS_<fstype>" (the whole name is passed through ascii_strupper()
+ * first) and returns them, split with strv_split(), as a newly allocated string list in *ret.
+ *
+ * If the variable is unset, *ret is set to NULL. Returns 0 on success and -ENOMEM if the list cannot be
+ * allocated. */
+int mkfs_options_from_env(const char *component, const char *fstype, char ***ret) {
+        _cleanup_strv_free_ char **options = NULL;
+        const char *value;
+        char *name;
+
+        assert(component);
+        assert(fstype);
+        assert(ret);
+
+        name = strjoina("SYSTEMD_", component, "_MKFS_OPTIONS_", fstype);
+
+        value = getenv(ascii_strupper(name));
+        if (value) {
+                options = strv_split(value, NULL);
+                if (!options)
+                        return -ENOMEM;
+        }
+
+        *ret = TAKE_PTR(options);
+        return 0;
+}
diff --git a/src/shared/mkfs-util.h b/src/shared/mkfs-util.h
new file mode 100644
index 0000000..9a1cb58
--- /dev/null
+++ b/src/shared/mkfs-util.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "sd-id128.h"
+
+#include "strv.h"
+
+int mkfs_exists(const char *fstype);
+
+int mkfs_supports_root_option(const char *fstype);
+
+int make_filesystem(
+ const char *node,
+ const char *fstype,
+ const char *label,
+ const char *root,
+ sd_id128_t uuid,
+ bool discard,
+ bool quiet,
+ uint64_t sector_size,
+ char * const *extra_mkfs_args);
+
+int mkfs_options_from_env(const char *component, const char *fstype, char ***ret);
diff --git a/src/shared/module-util.c b/src/shared/module-util.c
new file mode 100644
index 0000000..951701d
--- /dev/null
+++ b/src/shared/module-util.c
@@ -0,0 +1,124 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+
+#include "module-util.h"
+#include "proc-cmdline.h"
+#include "strv.h"
+
+static int denylist_modules(const char *p, char ***denylist) {
+        /* Splits the comma-separated string 'p' and appends the resulting entries to *denylist via
+         * strv_extend_strv(). Returns 0 on success, -ENOMEM if splitting or extending fails. */
+        _cleanup_strv_free_ char **names = NULL;
+        int r;
+
+        assert(p);
+        assert(denylist);
+
+        names = strv_split(p, ",");
+        if (!names)
+                return -ENOMEM;
+
+        r = strv_extend_strv(denylist, names, true);
+        return r < 0 ? -ENOMEM : 0;
+}
+
+static int parse_proc_cmdline_item(const char *key, const char *value, void *data) {
+        /* Callback handed to proc_cmdline_parse(): for the "module_blacklist" key, adds the listed
+         * modules to the deny list passed in 'data' (a char*** as expected by denylist_modules()). All
+         * other keys, and a "module_blacklist" key without a value, are skipped with a return of 0. */
+        int r;
+
+        if (!proc_cmdline_key_streq(key, "module_blacklist"))
+                return 0;
+
+        if (proc_cmdline_value_missing(key, value))
+                return 0;
+
+        r = denylist_modules(value, data);
+        return r < 0 ? r : 0;
+}
+
+int module_load_and_warn(struct kmod_ctx *ctx, const char *module, bool verbose) {
+        /* Looks up 'module' (a module name or alias) through libkmod and tries to insert every matching
+         * module that is neither built in nor already live.
+         *
+         * verbose==true means we should log at non-debug level if we fail to find or load the module.
+         *
+         * Returns a negative error if the lookup fails, if the lookup yields no module (ENOENT), or if
+         * inserting a module failed with anything but -ENODEV/-ENOENT (the last such failure wins).
+         * Modules that are deny-listed, either by kmod itself or via module_blacklist= on the kernel
+         * command line, are not treated as failures. Returns 0 otherwise. */
+        const int probe_flags = KMOD_PROBE_APPLY_BLACKLIST;
+        struct kmod_list *itr;
+        _cleanup_(kmod_module_unref_listp) struct kmod_list *modlist = NULL;
+        _cleanup_strv_free_ char **denylist = NULL;
+        bool denylist_parsed = false;
+        int r;
+
+        log_debug("Loading module: %s", module);
+
+        r = kmod_module_new_from_lookup(ctx, module, &modlist);
+        if (r < 0)
+                return log_full_errno(verbose ? LOG_ERR : LOG_DEBUG, r,
+                                      "Failed to look up module alias '%s': %m", module);
+
+        if (!modlist)
+                return log_full_errno(verbose ? LOG_ERR : LOG_DEBUG,
+                                      SYNTHETIC_ERRNO(ENOENT),
+                                      "Failed to find module '%s'", module);
+
+        kmod_list_foreach(itr, modlist) {
+                _cleanup_(kmod_module_unrefp) struct kmod_module *mod = NULL;
+                int state, err;
+
+                mod = kmod_module_get_module(itr);
+                state = kmod_module_get_initstate(mod);
+
+                switch (state) {
+                case KMOD_MODULE_BUILTIN:
+                        log_full(verbose ? LOG_INFO : LOG_DEBUG,
+                                 "Module '%s' is built in", kmod_module_get_name(mod));
+                        break;
+
+                case KMOD_MODULE_LIVE:
+                        log_debug("Module '%s' is already loaded", kmod_module_get_name(mod));
+                        break;
+
+                default:
+                        err = kmod_module_probe_insert_module(mod, probe_flags,
+                                                              NULL, NULL, NULL, NULL);
+                        if (err == 0)
+                                log_full(verbose ? LOG_INFO : LOG_DEBUG,
+                                         "Inserted module '%s'", kmod_module_get_name(mod));
+                        else if (err == KMOD_PROBE_APPLY_BLACKLIST)
+                                log_full(verbose ? LOG_INFO : LOG_DEBUG,
+                                         "Module '%s' is deny-listed (by kmod)", kmod_module_get_name(mod));
+                        else {
+                                assert(err < 0);
+
+                                if (err == -EPERM) {
+                                        /* -EPERM may mean the kernel refused the module because of
+                                         * module_blacklist=. Parse the kernel command line lazily, and
+                                         * only once per call. */
+                                        if (!denylist_parsed) {
+                                                int k;
+
+                                                /* Use a separate variable here: 'r' carries the function's
+                                                 * return value, and the outcome of this best-effort parse
+                                                 * must neither be returned to the caller nor reset an error
+                                                 * recorded for an earlier module. */
+                                                k = proc_cmdline_parse(parse_proc_cmdline_item, &denylist, 0);
+                                                if (k < 0)
+                                                        log_full_errno(!verbose ? LOG_DEBUG : LOG_WARNING,
+                                                                       k,
+                                                                       "Failed to parse kernel command line, ignoring: %m");
+
+                                                denylist_parsed = true;
+                                        }
+                                        if (strv_contains(denylist, kmod_module_get_name(mod))) {
+                                                log_full(verbose ? LOG_INFO : LOG_DEBUG,
+                                                         "Module '%s' is deny-listed (by kernel)", kmod_module_get_name(mod));
+                                                continue;
+                                        }
+                                }
+
+                                log_full_errno(!verbose ? LOG_DEBUG :
+                                               err == -ENODEV ? LOG_NOTICE :
+                                               err == -ENOENT ? LOG_WARNING :
+                                                                LOG_ERR,
+                                               err,
+                                               "Failed to insert module '%s': %m",
+                                               kmod_module_get_name(mod));
+
+                                /* Missing hardware or a missing module file is not reported as failure */
+                                if (!IN_SET(err, -ENODEV, -ENOENT))
+                                        r = err;
+                        }
+                }
+        }
+
+        return r;
+}
diff --git a/src/shared/module-util.h b/src/shared/module-util.h
new file mode 100644
index 0000000..8ca6a06
--- /dev/null
+++ b/src/shared/module-util.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <libkmod.h>
+
+#include "macro.h"
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(struct kmod_ctx*, kmod_unref);
+DEFINE_TRIVIAL_CLEANUP_FUNC(struct kmod_module*, kmod_module_unref);
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(struct kmod_list*, kmod_module_unref_list, NULL);
+
+/* Looks up 'module' via libkmod and tries to insert all matching modules that are not built in or
+ * already loaded. With verbose=true problems are logged above debug level. Returns a negative error
+ * if the lookup fails, finds nothing, or an insertion fails with something other than
+ * -ENODEV/-ENOENT. */
+int module_load_and_warn(struct kmod_ctx *ctx, const char *module, bool verbose);
diff --git a/src/shared/mount-setup.c b/src/shared/mount-setup.c
new file mode 100644
index 0000000..1226ca1
--- /dev/null
+++ b/src/shared/mount-setup.c
@@ -0,0 +1,591 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <stdlib.h>
+#include <sys/mount.h>
+#include <sys/statvfs.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "bus-util.h"
+#include "cgroup-setup.h"
+#include "cgroup-util.h"
+#include "conf-files.h"
+#include "dev-setup.h"
+#include "dirent-util.h"
+#include "efi-loader.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "fs-util.h"
+#include "label-util.h"
+#include "log.h"
+#include "macro.h"
+#include "mkdir-label.h"
+#include "mount-setup.h"
+#include "mount-util.h"
+#include "mountpoint-util.h"
+#include "nulstr-util.h"
+#include "path-util.h"
+#include "recurse-dir.h"
+#include "set.h"
+#include "smack-util.h"
+#include "strv.h"
+#include "user-util.h"
+#include "virt.h"
+
+/* Per-entry behaviour flags, evaluated by mount_one(). */
+typedef enum MountMode {
+ MNT_NONE = 0,
+ /* Failures for this entry are logged at LOG_ERR and returned to the caller; without it they are
+  * logged at LOG_DEBUG and swallowed. */
+ MNT_FATAL = 1 << 0,
+ /* Mount this entry also when detect_container() reports a container; entries without this flag
+  * are skipped there. */
+ MNT_IN_CONTAINER = 1 << 1,
+ /* After mounting, verify the mount point is writable (access(W_OK)); if not, undo the mount. */
+ MNT_CHECK_WRITABLE = 1 << 2,
+ /* Mount with mount_follow_verbose() rather than mount_nofollow_verbose(). */
+ MNT_FOLLOW_SYMLINK = 1 << 3,
+} MountMode;
+
+/* One file system to mount; the first five fields are handed to the mount helper as-is. */
+typedef struct MountPoint {
+ const char *what;
+ const char *where;
+ const char *type;
+ const char *options;
+ unsigned long flags;
+ /* Optional predicate; if set and it returns false, the entry is skipped. */
+ bool (*condition_fn)(void);
+ MountMode mode;
+} MountPoint;
+
+/* The first three entries we might need before SELinux is up. The
+ * fourth (securityfs) is needed by IMA to load a custom policy. The
+ * other ones we can delay until SELinux and IMA are loaded. When
+ * SMACK is enabled we need smackfs, too, so it's a fifth one. */
+#if ENABLE_SMACK
+#define N_EARLY_MOUNT 5
+#else
+#define N_EARLY_MOUNT 4
+#endif
+
+static bool check_recursiveprot_supported(void) {
+        /* Condition callback for the mount table: returns true only if the unified cgroup hierarchy is
+         * wanted and mount_option_supported() reports that cgroup2 accepts the 'memory_recursiveprot'
+         * option. A failing probe is logged at debug level and treated like "not supported". */
+        int r;
+
+        if (!cg_is_unified_wanted())
+                return false;
+
+        r = mount_option_supported("cgroup2", "memory_recursiveprot", NULL);
+        if (r < 0)
+                log_debug_errno(r, "Failed to determine whether the 'memory_recursiveprot' mount option is supported, assuming not: %m");
+        else if (r == 0)
+                log_debug("This kernel version does not support 'memory_recursiveprot', not using mount option.");
+
+        return r > 0;
+}
+
+/* API file systems to mount, processed in order by mount_points_setup(). The first N_EARLY_MOUNT
+ * entries are the ones handled by mount_setup_early(), so their position matters.
+ *
+ * Several entries share the same 'where'. mount_one() skips an entry if its target already is a
+ * mount point, so a later entry for the same path only takes effect if the earlier ones were skipped
+ * by their condition or failed non-fatally, i.e. it acts as a fallback (typically with fewer mount
+ * options). */
+static const MountPoint mount_table[] = {
+ { "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
+ NULL, MNT_FATAL|MNT_IN_CONTAINER|MNT_FOLLOW_SYMLINK },
+ { "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
+ NULL, MNT_FATAL|MNT_IN_CONTAINER },
+ { "devtmpfs", "/dev", "devtmpfs", "mode=0755" TMPFS_LIMITS_DEV, MS_NOSUID|MS_STRICTATIME,
+ NULL, MNT_FATAL|MNT_IN_CONTAINER },
+ { "securityfs", "/sys/kernel/security", "securityfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
+ NULL, MNT_NONE },
+#if ENABLE_SMACK
+ { "smackfs", "/sys/fs/smackfs", "smackfs", "smackfsdef=*", MS_NOSUID|MS_NOEXEC|MS_NODEV,
+ mac_smack_use, MNT_FATAL },
+ { "tmpfs", "/dev/shm", "tmpfs", "mode=01777,smackfsroot=*", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
+ mac_smack_use, MNT_FATAL },
+#endif
+ { "tmpfs", "/dev/shm", "tmpfs", "mode=01777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
+ NULL, MNT_FATAL|MNT_IN_CONTAINER },
+ { "devpts", "/dev/pts", "devpts", "mode=0620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC,
+ NULL, MNT_IN_CONTAINER },
+#if ENABLE_SMACK
+ { "tmpfs", "/run", "tmpfs", "mode=0755,smackfsroot=*" TMPFS_LIMITS_RUN, MS_NOSUID|MS_NODEV|MS_STRICTATIME,
+ mac_smack_use, MNT_FATAL },
+#endif
+ { "tmpfs", "/run", "tmpfs", "mode=0755" TMPFS_LIMITS_RUN, MS_NOSUID|MS_NODEV|MS_STRICTATIME,
+ NULL, MNT_FATAL|MNT_IN_CONTAINER },
+ { "cgroup2", "/sys/fs/cgroup", "cgroup2", "nsdelegate,memory_recursiveprot", MS_NOSUID|MS_NOEXEC|MS_NODEV,
+ check_recursiveprot_supported, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
+ { "cgroup2", "/sys/fs/cgroup", "cgroup2", "nsdelegate", MS_NOSUID|MS_NOEXEC|MS_NODEV,
+ cg_is_unified_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
+ { "cgroup2", "/sys/fs/cgroup", "cgroup2", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
+ cg_is_unified_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
+ { "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=0755" TMPFS_LIMITS_SYS_FS_CGROUP, MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
+ cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER },
+ { "cgroup2", "/sys/fs/cgroup/unified", "cgroup2", "nsdelegate", MS_NOSUID|MS_NOEXEC|MS_NODEV,
+ cg_is_hybrid_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
+ { "cgroup2", "/sys/fs/cgroup/unified", "cgroup2", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
+ cg_is_hybrid_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
+ { "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd,xattr", MS_NOSUID|MS_NOEXEC|MS_NODEV,
+ cg_is_legacy_wanted, MNT_IN_CONTAINER },
+ { "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd", MS_NOSUID|MS_NOEXEC|MS_NODEV,
+ cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER },
+#if ENABLE_PSTORE
+ { "pstore", "/sys/fs/pstore", "pstore", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
+ NULL, MNT_NONE },
+#endif
+#if ENABLE_EFI
+ { "efivarfs", "/sys/firmware/efi/efivars", "efivarfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
+ is_efi_boot, MNT_NONE },
+#endif
+ { "bpf", "/sys/fs/bpf", "bpf", "mode=0700", MS_NOSUID|MS_NOEXEC|MS_NODEV,
+ NULL, MNT_NONE, },
+};
+
+/* The early subset must fit into the table */
+assert_cc(N_EARLY_MOUNT <= ELEMENTSOF(mount_table));
+
+bool mount_point_is_api(const char *path) {
+        /* Tells whether 'path' is one of our "API" mount points (and hence should be ignored): either
+         * it equals the 'where' of some mount table entry, or it lies below /sys/fs/cgroup/. */
+
+        FOREACH_ARRAY(entry, mount_table, ELEMENTSOF(mount_table))
+                if (path_equal(path, entry->where))
+                        return true;
+
+        return !!path_startswith(path, "/sys/fs/cgroup/");
+}
+
+bool mount_point_ignore(const char *path) {
+        /* API file systems and bind mounts that other software might set up. They are only listed here
+         * so that we know to ignore them. */
+        static const char* const ignored[] = {
+                /* SELinux file systems */
+                "/sys/fs/selinux",
+                /* Container bind mounts */
+                "/dev/console",
+                "/proc/kmsg",
+                "/proc/sys",
+                "/proc/sys/kernel/random/boot_id",
+                NULL
+        };
+
+        STRV_FOREACH(entry, ignored)
+                if (path_equal(path, *entry))
+                        return true;
+
+        /* Everything the container manager passed in below /run/host is better left alone, too. */
+        return !!path_startswith(path, "/run/host");
+}
+
+/* Mounts a single MountPoint entry, unless its condition says no, the target already is a mount
+ * point, or the entry is not meant for containers and we run in one. If 'relabel' is true the mount
+ * point is labelled before and after mounting and created with mkdir_p_label().
+ *
+ * Returns 1 if the file system was mounted, 0 if the entry was skipped or a failure was ignored
+ * (entry without MNT_FATAL), and a negative error if an entry with MNT_FATAL failed. */
+static int mount_one(const MountPoint *p, bool relabel) {
+ int r, priority;
+
+ assert(p);
+
+ /* Failures of fatal entries are logged loudly, all others only at debug level */
+ priority = (p->mode & MNT_FATAL) ? LOG_ERR : LOG_DEBUG;
+
+ if (p->condition_fn && !p->condition_fn())
+ return 0;
+
+ /* Relabel first, just in case */
+ if (relabel)
+ (void) label_fix(p->where, LABEL_IGNORE_ENOENT|LABEL_IGNORE_EROFS);
+
+ /* A missing mount point (-ENOENT) is fine, it is created further down */
+ r = path_is_mount_point(p->where, NULL, AT_SYMLINK_FOLLOW);
+ if (r < 0 && r != -ENOENT) {
+ log_full_errno(priority, r, "Failed to determine whether %s is a mount point: %m", p->where);
+ return (p->mode & MNT_FATAL) ? r : 0;
+ }
+ /* Already mounted, nothing to do */
+ if (r > 0)
+ return 0;
+
+ /* Skip securityfs in a container */
+ if (!(p->mode & MNT_IN_CONTAINER) && detect_container() > 0)
+ return 0;
+
+ /* The access mode here doesn't really matter too much, since
+ * the mounted file system will take precedence anyway. */
+ if (relabel)
+ (void) mkdir_p_label(p->where, 0755);
+ else
+ (void) mkdir_p(p->where, 0755);
+
+ log_debug("Mounting %s to %s of type %s with options %s.",
+ p->what,
+ p->where,
+ p->type,
+ strna(p->options));
+
+ if (FLAGS_SET(p->mode, MNT_FOLLOW_SYMLINK))
+ r = mount_follow_verbose(priority, p->what, p->where, p->type, p->flags, p->options);
+ else
+ r = mount_nofollow_verbose(priority, p->what, p->where, p->type, p->flags, p->options);
+ if (r < 0)
+ return (p->mode & MNT_FATAL) ? r : 0;
+
+ /* Relabel again, since we now mounted something fresh here */
+ if (relabel)
+ (void) label_fix(p->where, 0);
+
+ if (p->mode & MNT_CHECK_WRITABLE) {
+ if (access(p->where, W_OK) < 0) {
+ /* Save errno before the cleanup calls below can overwrite it */
+ r = -errno;
+
+ (void) umount2(p->where, UMOUNT_NOFOLLOW);
+ (void) rmdir(p->where);
+
+ log_full_errno(priority, r, "Mount point %s not writable after mounting, undoing: %m", p->where);
+ return (p->mode & MNT_FATAL) ? r : 0;
+ }
+ }
+
+ return 1;
+}
+
+static int mount_points_setup(size_t n, bool loaded_policy) {
+        /* Runs mount_one() on the first 'n' entries of the mount table, passing 'loaded_policy' as its
+         * relabel flag. All entries are attempted even if one fails. The return value is the most
+         * recent non-zero result seen up to and including the first negative one, which then sticks;
+         * 0 if every entry returned 0. */
+        int result = 0;
+
+        assert(n <= ELEMENTSOF(mount_table));
+
+        for (size_t i = 0; i < n; i++) {
+                int k = mount_one(&mount_table[i], loaded_policy);
+
+                if (result >= 0 && k != 0)
+                        result = k;
+        }
+
+        return result;
+}
+
+int mount_setup_early(void) {
+        /* Minimal early setup: mount only the first N_EARLY_MOUNT entries of the mount table (/proc and
+         * friends), enough for the most basic stuff such as SELinux. Relabelling is not requested. */
+        const bool loaded_policy = false;
+
+        return mount_points_setup(N_EARLY_MOUNT, loaded_policy);
+}
+
+static const char *join_with(const char *controller) {
+        /* Returns the controller that 'controller' shall be mounted together with, or NULL if it is
+         * mounted on its own. The lookup is symmetric: either member of a pair yields the other one. */
+        static const char* const partners[][2] = {
+                { "cpu",     "cpuacct"  },
+                { "net_cls", "net_prio" },
+        };
+
+        assert(controller);
+
+        for (size_t i = 0; i < ELEMENTSOF(partners); i++) {
+                if (streq(controller, partners[i][0]))
+                        return partners[i][1];
+                if (streq(controller, partners[i][1]))
+                        return partners[i][0];
+        }
+
+        return NULL;
+}
+
+static int symlink_controller(const char *target, const char *alias) {
+        /* Creates the symlink /sys/fs/cgroup/<alias> pointing to 'target' and, if built with
+         * HAVE_SMACK_RUN_LABEL, copies the SMACK label between the link and /sys/fs/cgroup/<target>.
+         * Returns 0 on success, a negative error (already logged) otherwise. */
+        const char *link_path;
+        int r;
+
+        assert(target);
+        assert(alias);
+
+        link_path = strjoina("/sys/fs/cgroup/", alias);
+
+        r = symlink_idempotent(target, link_path, false);
+        if (r < 0)
+                return log_error_errno(r, "Failed to create symlink %s: %m", link_path);
+
+#if HAVE_SMACK_RUN_LABEL
+        const char *target_path;
+
+        target_path = strjoina("/sys/fs/cgroup/", target);
+
+        /* A "not supported" result is tolerated, anything else is a failure */
+        r = mac_smack_copy(link_path, target_path);
+        if (r < 0 && !ERRNO_IS_NOT_SUPPORTED(r))
+                return log_error_errno(r, "Failed to copy smack label from %s to %s: %m", target_path, link_path);
+#endif
+
+        return 0;
+}
+
+/* Mounts one "cgroup" file system per controller reported by cg_kernel_controllers() below
+ * /sys/fs/cgroup/, joining the pairs known to join_with() into a single mount, and finally remounts
+ * the /sys/fs/cgroup tmpfs read-only. Does nothing unless cg_is_legacy_wanted() is true.
+ *
+ * Returns 0 on success (or if there was nothing to do), a negative error if enumerating the
+ * controllers or an allocation fails. */
+int mount_cgroup_controllers(void) {
+ _cleanup_set_free_ Set *controllers = NULL;
+ int r;
+
+ if (!cg_is_legacy_wanted())
+ return 0;
+
+ /* Mount all available cgroup controllers that are built into the kernel. */
+ r = cg_kernel_controllers(&controllers);
+ if (r < 0)
+ return log_error_errno(r, "Failed to enumerate cgroup controllers: %m");
+
+ /* Consume the set one controller at a time until it is empty */
+ for (;;) {
+ _cleanup_free_ char *options = NULL, *controller = NULL, *where = NULL;
+ const char *other_controller;
+ MountPoint p = {
+ .what = "cgroup",
+ .type = "cgroup",
+ .flags = MS_NOSUID|MS_NOEXEC|MS_NODEV,
+ .mode = MNT_IN_CONTAINER,
+ };
+
+ controller = set_steal_first(controllers);
+ if (!controller)
+ break;
+
+ /* Check if we shall mount this together with another controller */
+ other_controller = join_with(controller);
+ if (other_controller) {
+ _cleanup_free_ char *c = NULL;
+
+ /* Check if the other controller is actually available in the kernel too */
+ c = set_remove(controllers, other_controller);
+ if (c) {
+
+ /* Join the two controllers into one string, and maintain a stable ordering */
+ if (strcmp(controller, other_controller) < 0)
+ options = strjoin(controller, ",", other_controller);
+ else
+ options = strjoin(other_controller, ",", controller);
+ if (!options)
+ return log_oom();
+ }
+ }
+
+ /* The simple case, where there's only one controller to mount together. Ownership of the
+ * string moves to 'options', leaving 'controller' NULL. */
+ if (!options)
+ options = TAKE_PTR(controller);
+
+ where = path_join("/sys/fs/cgroup", options);
+ if (!where)
+ return log_oom();
+
+ p.where = where;
+ p.options = options;
+
+ /* NOTE(review): p.mode lacks MNT_FATAL, and every negative return in mount_one() is gated on
+ * MNT_FATAL, so this check looks unreachable as the code stands -- confirm. */
+ r = mount_one(&p, true);
+ if (r < 0)
+ return r;
+
+ /* Create symlinks from the individual controller names, in case we have a joined mount.
+ * 'controller' is only still set in the joined case (see TAKE_PTR() above). */
+ if (controller)
+ (void) symlink_controller(options, controller);
+ if (other_controller)
+ (void) symlink_controller(options, other_controller);
+ }
+
+ /* Now that we mounted everything, let's make the tmpfs the cgroup file systems are mounted into read-only. */
+ (void) mount_nofollow("tmpfs", "/sys/fs/cgroup", "tmpfs",
+ MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY,
+ "mode=0755" TMPFS_LIMITS_SYS_FS_CGROUP);
+
+ return 0;
+}
+
+#if HAVE_SELINUX || ENABLE_SMACK
+static int relabel_cb(
+                RecurseDirEvent event,
+                const char *path,
+                int dir_fd,
+                int inode_fd,
+                const struct dirent *de,
+                const struct statx *sx,
+                void *userdata) {
+
+        /* recurse_dir_at() callback that relabels every entry it is called for, with two exceptions
+         * described below. */
+
+        /* Directories were already handled when we entered them, and entries on a different mount are
+         * none of our business: don't relabel either. */
+        if (event == RECURSE_DIR_LEAVE || event == RECURSE_DIR_SKIP_MOUNT)
+                return RECURSE_DIR_CONTINUE;
+
+        /* /run/initramfs/ + /run/nextroot/ are static data and big, no need to dynamically relabel
+         * their contents at boot... */
+        if (event == RECURSE_DIR_ENTER &&
+            PATH_STARTSWITH_SET(path, "/run/initramfs", "/run/nextroot"))
+                return RECURSE_DIR_SKIP_ENTRY;
+
+        /* Everything else gets labelled, even if we had trouble stat()ing it and similar. SELinux can
+         * figure this out. */
+        (void) label_fix(path, 0);
+        return RECURSE_DIR_CONTINUE;
+}
+
+static int relabel_tree(const char *path) {
+        /* Walks the tree below 'path' with recurse_dir_at(), staying on the same mount, and relabels
+         * what it finds via relabel_cb(). Returns the result of recurse_dir_at(); failures are logged
+         * at debug level. */
+        int r;
+
+        r = recurse_dir_at(
+                        AT_FDCWD,
+                        path,
+                        0,
+                        UINT_MAX,
+                        RECURSE_DIR_ENSURE_TYPE|RECURSE_DIR_SAME_MOUNT,
+                        relabel_cb,
+                        NULL);
+        if (r >= 0)
+                return r;
+
+        log_debug_errno(r, "Failed to recursively relabel '%s': %m", path);
+        return r;
+}
+
+static int relabel_cgroup_filesystems(void) {
+        /* Relabels /sys/fs/cgroup and everything below it, but only if we are not in "all unified"
+         * mode. Returns 0 on success or if nothing had to be done, a negative error (already logged)
+         * otherwise. */
+        struct statfs fs_info;
+        bool was_read_only;
+        int r;
+
+        r = cg_all_unified();
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine whether we are in all unified mode: %m");
+        if (r > 0)
+                return 0;
+
+        /* Temporarily remount the root cgroup filesystem to give it a proper label. Do this only when
+         * the filesystem has already been populated (and made read-only) by a previous instance of
+         * systemd running from the initrd. Otherwise don't remount anything and leave the filesystem
+         * read-write for the cgroup filesystems to be mounted inside. */
+        if (statfs("/sys/fs/cgroup", &fs_info) < 0)
+                return log_error_errno(errno, "Failed to determine mount flags for /sys/fs/cgroup: %m");
+
+        was_read_only = (fs_info.f_flags & ST_RDONLY) != 0;
+
+        if (was_read_only)
+                (void) mount_nofollow(NULL, "/sys/fs/cgroup", NULL, MS_REMOUNT, NULL);
+
+        (void) label_fix("/sys/fs/cgroup", 0);
+        (void) relabel_tree("/sys/fs/cgroup");
+
+        /* Restore the read-only state we found */
+        if (was_read_only)
+                (void) mount_nofollow(NULL, "/sys/fs/cgroup", NULL, MS_REMOUNT|MS_RDONLY, NULL);
+
+        return 0;
+}
+
+/* Relabels the additional paths listed in /run/systemd/relabel-extra.d/ drop-ins (see below).
+ * Returns the number of paths processed, or a negative error if the drop-in directory could not be
+ * enumerated. */
+static int relabel_extra(void) {
+ _cleanup_strv_free_ char **files = NULL;
+ int r, c = 0;
+
+ /* Support for relabelling additional files or directories after loading the policy. For this, code in the
+ * initrd simply has to drop in *.relabel files into /run/systemd/relabel-extra.d/. We'll read all such files
+ * expecting one absolute path by line and will relabel each (and everyone below that in case the path refers
+ * to a directory). These drop-in files are supposed to be absolutely minimal, and do not understand comments
+ * and such. After the operation succeeded the files are removed, and the drop-in directory as well, if
+ * possible.
+ */
+
+ r = conf_files_list(&files, ".relabel", NULL,
+ CONF_FILES_FILTER_MASKED | CONF_FILES_REGULAR,
+ "/run/systemd/relabel-extra.d/");
+ /* NOTE(review): the message says "ignoring" but the error is returned; the caller visible in this
+ * file only tests the result for > 0. */
+ if (r < 0)
+ return log_error_errno(r, "Failed to enumerate /run/systemd/relabel-extra.d/, ignoring: %m");
+
+ STRV_FOREACH(file, files) {
+ _cleanup_fclose_ FILE *f = NULL;
+
+ f = fopen(*file, "re");
+ if (!f) {
+ log_warning_errno(errno, "Failed to open %s, ignoring: %m", *file);
+ continue;
+ }
+
+ /* One path per line */
+ for (;;) {
+ _cleanup_free_ char *line = NULL;
+
+ r = read_line(f, LONG_LINE_MAX, &line);
+ if (r < 0) {
+ log_warning_errno(r, "Failed to read %s, ignoring: %m", *file);
+ break;
+ }
+ if (r == 0) /* EOF */
+ break;
+
+ path_simplify(line);
+
+ if (!path_is_normalized(line)) {
+ log_warning("Path to relabel is not normalized, ignoring: %s", line);
+ continue;
+ }
+
+ if (!path_is_absolute(line)) {
+ log_warning("Path to relabel is not absolute, ignoring: %s", line);
+ continue;
+ }
+
+ log_debug("Relabelling additional file/directory '%s'.", line);
+ (void) label_fix(line, 0);
+ (void) relabel_tree(line);
+ c++;
+ }
+
+ /* The drop-in is removed even if reading it stopped early */
+ if (unlink(*file) < 0)
+ log_warning_errno(errno, "Failed to remove %s, ignoring: %m", *file);
+ }
+
+ /* Remove when we complete things. */
+ if (rmdir("/run/systemd/relabel-extra.d") < 0 &&
+ errno != ENOENT)
+ log_warning_errno(errno, "Failed to remove /run/systemd/relabel-extra.d/ directory: %m");
+
+ return c;
+}
+#endif
+
+/* Full API file system setup: mounts every entry of the mount table, relabels /dev, /dev/shm, /run,
+ * the cgroup tree and any extra paths if a policy was loaded (and SELinux or SMACK support is
+ * compiled in), sets up root mount propagation, and creates a few directories and nodes below /run.
+ *
+ * loaded_policy: whether a security policy was loaded, i.e. whether relabelling is needed.
+ * leave_propagation: if true, the propagation mode of "/" is left untouched.
+ *
+ * Returns a negative error if mounting the table entries fails, 0 otherwise; everything after the
+ * mounting step is best-effort. */
+int mount_setup(bool loaded_policy, bool leave_propagation) {
+ int r;
+
+ r = mount_points_setup(ELEMENTSOF(mount_table), loaded_policy);
+ if (r < 0)
+ return r;
+
+#if HAVE_SELINUX || ENABLE_SMACK
+ /* Nodes in devtmpfs and /run need to be manually updated for
+ * the appropriate labels, after mounting. The other virtual
+ * API file systems like /sys and /proc do not need that, they
+ * use the same label for all their files. */
+ if (loaded_policy) {
+ usec_t before_relabel, after_relabel;
+ int n_extra;
+
+ before_relabel = now(CLOCK_MONOTONIC);
+
+ FOREACH_STRING(i, "/dev", "/dev/shm", "/run")
+ (void) relabel_tree(i);
+
+ (void) relabel_cgroup_filesystems();
+
+ /* Only used to adjust the log message below; errors are not propagated */
+ n_extra = relabel_extra();
+
+ after_relabel = now(CLOCK_MONOTONIC);
+
+ log_info("Relabeled /dev, /dev/shm, /run, /sys/fs/cgroup%s in %s.",
+ n_extra > 0 ? ", additional files" : "",
+ FORMAT_TIMESPAN(after_relabel - before_relabel, 0));
+ }
+#endif
+
+ /* Create a few default symlinks, which are normally created
+ * by udevd, but some scripts might need them before we start
+ * udevd. The return value is not checked. */
+ dev_setup(NULL, UID_INVALID, GID_INVALID);
+
+ /* Mark the root directory as shared in regards to mount propagation. The kernel defaults to "private", but we
+ * think it makes more sense to have a default of "shared" so that nspawn and the container tools work out of
+ * the box. If specific setups need other settings they can reset the propagation mode to private if
+ * needed. Note that we set this only when we are invoked directly by the kernel. If we are invoked by a
+ * container manager we assume the container manager knows what it is doing (for example, because it set up
+ * some directories with different propagation modes). */
+ if (detect_container() <= 0 && !leave_propagation)
+ if (mount(NULL, "/", NULL, MS_REC|MS_SHARED, NULL) < 0)
+ log_warning_errno(errno, "Failed to set up the root directory for shared mount propagation: %m");
+
+ /* Create a few directories we always want around. Note that sd_booted() checks for /run/systemd/system, so
+ * this mkdir really needs to stay for good, otherwise software that copied sd-daemon.c into their sources will
+ * misdetect systemd. */
+ (void) mkdir_label("/run/systemd", 0755);
+ (void) mkdir_label("/run/systemd/system", 0755);
+
+ /* Make sure there's always a place where sandboxed environments can mount root file systems they are
+ * about to move into, even when unprivileged, without having to create a temporary one in /tmp/
+ * (which they then have to keep track of and clean) */
+ (void) mkdir_label("/run/systemd/mount-rootfs", 0555);
+
+ /* Make sure we have a mount point to hide in sandboxes */
+ (void) mkdir_label("/run/credentials", 0755);
+
+ /* Also create /run/systemd/inaccessible nodes, so that we always have something to mount
+ * inaccessible nodes from. If we run in a container the host might have created these for us already
+ * in /run/host/inaccessible/. Use those if we can, since that way we likely get access to block/char
+ * device nodes that are inaccessible, and if userns is used to nodes that are on mounts owned by a
+ * userns outside the container and thus nicely read-only and not remountable. */
+ if (access("/run/host/inaccessible/", F_OK) < 0) {
+ if (errno != ENOENT)
+ log_debug_errno(errno, "Failed to check if /run/host/inaccessible exists, ignoring: %m");
+
+ (void) make_inaccessible_nodes("/run/systemd", UID_INVALID, GID_INVALID);
+ } else
+ (void) symlink("../host/inaccessible", "/run/systemd/inaccessible");
+
+ return 0;
+}
diff --git a/src/shared/mount-setup.h b/src/shared/mount-setup.h
new file mode 100644
index 0000000..29bd62f
--- /dev/null
+++ b/src/shared/mount-setup.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+/* Mounts only the first few API file systems (/proc and friends) needed for the most basic stuff. */
+int mount_setup_early(void);
+/* Mounts all API file systems; with loaded_policy=true also relabels them. Unless leave_propagation
+ * is true (or we run in a container) the root directory is marked for shared mount propagation. */
+int mount_setup(bool loaded_policy, bool leave_propagation);
+
+/* Mounts the per-controller "cgroup" hierarchies below /sys/fs/cgroup, if the legacy setup is wanted. */
+int mount_cgroup_controllers(void);
+
+/* True if 'path' is one of the API mount points managed here or lies below /sys/fs/cgroup/. */
+bool mount_point_is_api(const char *path);
+/* True if 'path' is a mount point set up by other software that should be ignored. */
+bool mount_point_ignore(const char *path);
diff --git a/src/shared/mount-util.c b/src/shared/mount-util.c
new file mode 100644
index 0000000..4f2acce
--- /dev/null
+++ b/src/shared/mount-util.c
@@ -0,0 +1,1785 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <stdlib.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/statvfs.h>
+#include <unistd.h>
+#include <linux/loop.h>
+#if WANT_LINUX_FS_H
+#include <linux/fs.h>
+#endif
+
+#include "alloc-util.h"
+#include "chase.h"
+#include "dissect-image.h"
+#include "exec-util.h"
+#include "extract-word.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "fs-util.h"
+#include "glyph-util.h"
+#include "hashmap.h"
+#include "initrd-util.h"
+#include "label-util.h"
+#include "libmount-util.h"
+#include "missing_mount.h"
+#include "missing_syscall.h"
+#include "mkdir-label.h"
+#include "mount-util.h"
+#include "mountpoint-util.h"
+#include "namespace-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "set.h"
+#include "sort-util.h"
+#include "stat-util.h"
+#include "stdio-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "strv.h"
+#include "tmpfile-util.h"
+#include "user-util.h"
+
+/* Unmounts all mounts listed in /proc/self/mountinfo whose target lies at or below 'prefix' (all
+ * mounts if 'prefix' is NULL), except those related to an entry of 'keep'. 'flags' are passed to
+ * umount2() with UMOUNT_NOFOLLOW added.
+ *
+ * Returns the number of successful unmounts, or a negative error if the mount table cannot be opened
+ * or parsed. Individual umount2() failures are logged at debug level and skipped. */
+int umount_recursive_full(const char *prefix, int flags, char **keep) {
+ _cleanup_fclose_ FILE *f = NULL;
+ int n = 0, r;
+
+ /* Try to umount everything recursively below a directory. Also, take care of stacked mounts, and
+ * keep unmounting them until they are gone. */
+
+ f = fopen("/proc/self/mountinfo", "re"); /* Pin the file, in case we unmount /proc/ as part of the logic here */
+ if (!f)
+ return log_debug_errno(errno, "Failed to open /proc/self/mountinfo: %m");
+
+ /* Each pass re-reads the mount table and unmounts at most one entry; we stop once a full pass
+ * unmounted nothing. */
+ for (;;) {
+ _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL;
+ _cleanup_(mnt_free_iterp) struct libmnt_iter *iter = NULL;
+ bool again = false;
+
+ r = libmount_parse("/proc/self/mountinfo", f, &table, &iter);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to parse /proc/self/mountinfo: %m");
+
+ for (;;) {
+ bool shall_keep = false;
+ struct libmnt_fs *fs;
+ const char *path;
+
+ r = mnt_table_next_fs(table, iter, &fs);
+ if (r == 1) /* End of table */
+ break;
+ if (r < 0)
+ return log_debug_errno(r, "Failed to get next entry from /proc/self/mountinfo: %m");
+
+ path = mnt_fs_get_target(fs);
+ if (!path)
+ continue;
+
+ if (prefix && !path_startswith(path, prefix)) {
+ log_trace("Not unmounting %s, outside of prefix: %s", path, prefix);
+ continue;
+ }
+
+ STRV_FOREACH(k, keep)
+ /* Match against anything in the path to the dirs to keep, or below the dirs to keep */
+ if (path_startswith(path, *k) || path_startswith(*k, path)) {
+ shall_keep = true;
+ break;
+ }
+ if (shall_keep) {
+ log_debug("Not unmounting %s, referenced by keep list.", path);
+ continue;
+ }
+
+ if (umount2(path, flags | UMOUNT_NOFOLLOW) < 0) {
+ log_debug_errno(errno, "Failed to umount %s, ignoring: %m", path);
+ continue;
+ }
+
+ log_trace("Successfully unmounted %s", path);
+
+ /* The mount table changed, start over with a fresh parse */
+ again = true;
+ n++;
+
+ break;
+ }
+
+ if (!again)
+ break;
+
+ rewind(f);
+ }
+
+ return n;
+}
+
+#define MS_CONVERTIBLE_FLAGS (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_NOSYMFOLLOW)
+
+static uint64_t ms_flags_to_mount_attr(unsigned long a) {
+        /* Translates the classic MS_* mount flags listed in MS_CONVERTIBLE_FLAGS into the matching
+         * MOUNT_ATTR_* bits. Any other bits set in 'a' are ignored. */
+        return (FLAGS_SET(a, MS_RDONLY)      ? (uint64_t) MOUNT_ATTR_RDONLY      : 0) |
+               (FLAGS_SET(a, MS_NOSUID)      ? (uint64_t) MOUNT_ATTR_NOSUID      : 0) |
+               (FLAGS_SET(a, MS_NODEV)       ? (uint64_t) MOUNT_ATTR_NODEV       : 0) |
+               (FLAGS_SET(a, MS_NOEXEC)      ? (uint64_t) MOUNT_ATTR_NOEXEC      : 0) |
+               (FLAGS_SET(a, MS_NOSYMFOLLOW) ? (uint64_t) MOUNT_ATTR_NOSYMFOLLOW : 0);
+}
+
+/* Set once mount_setattr() failed with a "not supported" error, so that
+ * bind_remount_recursive_with_mountinfo() does not try that shortcut again. */
+static bool skip_mount_set_attr = false;
+
+/* Use this function only if you do not have direct access to /proc/self/mountinfo but the caller can open it
+ * for you. This is the case when /proc is masked or not mounted. Otherwise, use bind_remount_recursive. */
+int bind_remount_recursive_with_mountinfo(
+ const char *prefix,
+ unsigned long new_flags,
+ unsigned long flags_mask,
+ char **deny_list,
+ FILE *proc_self_mountinfo) {
+
+ _cleanup_fclose_ FILE *proc_self_mountinfo_opened = NULL;
+ _cleanup_set_free_ Set *done = NULL;
+ unsigned n_tries = 0;
+ int r;
+
+ assert(prefix);
+
+ if ((flags_mask & ~MS_CONVERTIBLE_FLAGS) == 0 && strv_isempty(deny_list) && !skip_mount_set_attr) {
+ /* Let's take a shortcut for all the flags we know how to convert into mount_setattr() flags */
+
+ if (mount_setattr(AT_FDCWD, prefix, AT_SYMLINK_NOFOLLOW|AT_RECURSIVE,
+ &(struct mount_attr) {
+ .attr_set = ms_flags_to_mount_attr(new_flags & flags_mask),
+ .attr_clr = ms_flags_to_mount_attr(~new_flags & flags_mask),
+ }, MOUNT_ATTR_SIZE_VER0) < 0) {
+
+ log_debug_errno(errno, "mount_setattr() failed, falling back to classic remounting: %m");
+
+ /* We fall through to classic behaviour if not supported (i.e. kernel < 5.12). We
+ * also do this for all other kinds of errors since they are so many different, and
+ * mount_setattr() has no graceful mode where it continues despite seeing errors one
+ * some mounts, but we want that. Moreover mount_setattr() only works on the mount
+ * point inode itself, not a non-mount point inode, and we want to support arbitrary
+ * prefixes here. */
+
+ if (ERRNO_IS_NOT_SUPPORTED(errno)) /* if not supported, then don't bother at all anymore */
+ skip_mount_set_attr = true;
+ } else
+ return 0; /* Nice, this worked! */
+ }
+
+ if (!proc_self_mountinfo) {
+ r = fopen_unlocked("/proc/self/mountinfo", "re", &proc_self_mountinfo_opened);
+ if (r < 0)
+ return r;
+
+ proc_self_mountinfo = proc_self_mountinfo_opened;
+ }
+
+ /* Recursively remount a directory (and all its submounts) with desired flags (MS_READONLY,
+ * MS_NOSUID, MS_NOEXEC). If the directory is already mounted, we reuse the mount and simply mark it
+ * MS_BIND|MS_RDONLY (or remove the MS_RDONLY for read-write operation), ditto for other flags. If it
+ * isn't we first make it one. Afterwards we apply (or remove) the flags to all submounts we can
+ * access, too. When mounts are stacked on the same mount point we only care for each individual
+ * "top-level" mount on each point, as we cannot influence/access the underlying mounts anyway. We do
+ * not have any effect on future submounts that might get propagated, they might be writable
+ * etc. This includes future submounts that have been triggered via autofs. Also note that we can't
+ * operate atomically here. Mounts established while we process the tree might or might not get
+ * noticed and thus might or might not be covered.
+ *
+ * If the "deny_list" parameter is specified it may contain a list of subtrees to exclude from the
+ * remount operation. Note that we'll ignore the deny list for the top-level path. */
+
+ for (;;) {
+ _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL;
+ _cleanup_(mnt_free_iterp) struct libmnt_iter *iter = NULL;
+ _cleanup_hashmap_free_ Hashmap *todo = NULL;
+ bool top_autofs = false;
+
+ if (n_tries++ >= 32) /* Let's not retry this loop forever */
+ return -EBUSY;
+
+ rewind(proc_self_mountinfo);
+
+ r = libmount_parse("/proc/self/mountinfo", proc_self_mountinfo, &table, &iter);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to parse /proc/self/mountinfo: %m");
+
+ for (;;) {
+ _cleanup_free_ char *d = NULL;
+ const char *path, *type, *opts;
+ unsigned long flags = 0;
+ struct libmnt_fs *fs;
+
+ r = mnt_table_next_fs(table, iter, &fs);
+ if (r == 1) /* EOF */
+ break;
+ if (r < 0)
+ return log_debug_errno(r, "Failed to get next entry from /proc/self/mountinfo: %m");
+
+ path = mnt_fs_get_target(fs);
+ if (!path)
+ continue;
+
+ if (!path_startswith(path, prefix))
+ continue;
+
+ type = mnt_fs_get_fstype(fs);
+ if (!type)
+ continue;
+
+ /* Let's ignore autofs mounts. If they aren't triggered yet, we want to avoid
+ * triggering them, as we don't make any guarantees for future submounts anyway. If
+ * they are already triggered, then we will find another entry for this. */
+ if (streq(type, "autofs")) {
+ top_autofs = top_autofs || path_equal(path, prefix);
+ continue;
+ }
+
+ if (set_contains(done, path))
+ continue;
+
+ /* Ignore this mount if it is deny-listed, but only if it isn't the top-level mount
+ * we shall operate on. */
+ if (!path_equal(path, prefix)) {
+ bool deny_listed = false;
+
+ STRV_FOREACH(i, deny_list) {
+ if (path_equal(*i, prefix))
+ continue;
+
+ if (!path_startswith(*i, prefix))
+ continue;
+
+ if (path_startswith(path, *i)) {
+ deny_listed = true;
+ log_trace("Not remounting %s deny-listed by %s, called for %s", path, *i, prefix);
+ break;
+ }
+ }
+
+ if (deny_listed)
+ continue;
+ }
+
+ opts = mnt_fs_get_vfs_options(fs);
+ if (opts) {
+ r = mnt_optstr_get_flags(opts, &flags, mnt_get_builtin_optmap(MNT_LINUX_MAP));
+ if (r < 0)
+ log_debug_errno(r, "Could not get flags for '%s', ignoring: %m", path);
+ }
+
+ d = strdup(path);
+ if (!d)
+ return -ENOMEM;
+
+ r = hashmap_ensure_put(&todo, &path_hash_ops_free, d, ULONG_TO_PTR(flags));
+ if (r == -EEXIST)
+ /* If the same path was recorded, but with different mount flags, update it:
+ * it means a mount point is overmounted, and libmount returns the "bottom" (or
+ * older one) first, but we want to reapply the flags from the "top" (or newer
+ * one). See: https://github.com/systemd/systemd/issues/20032
+ * Note that this shouldn't really fail, as we were just told that the key
+ * exists, and it's an update so we want 'd' to be freed immediately. */
+ r = hashmap_update(todo, d, ULONG_TO_PTR(flags));
+ if (r < 0)
+ return r;
+ if (r > 0)
+ TAKE_PTR(d);
+ }
+
+ /* Check if the top-level directory was among what we have seen so far. For that check both
+ * 'done' and 'todo'. Also check 'top_autofs' because if the top-level dir is an autofs we'll
+ * not include it in either set but will set this bool. */
+ if (!set_contains(done, prefix) &&
+ !(top_autofs || hashmap_contains(todo, prefix))) {
+
+ /* The prefix directory itself is not yet a mount, make it one. */
+ r = mount_nofollow(prefix, prefix, NULL, MS_BIND|MS_REC, NULL);
+ if (r < 0)
+ return r;
+
+ /* Immediately rescan, so that we pick up the new mount's flags */
+ continue;
+ }
+
+ /* If we have no submounts to process anymore, we are done */
+ if (hashmap_isempty(todo))
+ return 0;
+
+ for (;;) {
+ unsigned long flags;
+ char *x = NULL;
+
+ /* Take the first mount from our list of mounts to still process */
+ flags = PTR_TO_ULONG(hashmap_steal_first_key_and_value(todo, (void**) &x));
+ if (!x)
+ break;
+
+ r = set_ensure_consume(&done, &path_hash_ops_free, x);
+ if (IN_SET(r, 0, -EEXIST))
+ continue; /* Already done */
+ if (r < 0)
+ return r;
+
+ /* Now, remount this with the new flags set, but exclude MS_RELATIME from it. (It's
+ * the default anyway, thus redundant, and in userns we'll get an error if we try to
+ * explicitly enable it) */
+ r = mount_nofollow(NULL, x, NULL, ((flags & ~flags_mask)|MS_BIND|MS_REMOUNT|new_flags) & ~MS_RELATIME, NULL);
+ if (r < 0) {
+ int q;
+
+ /* OK, so the remount of this entry failed. We'll ultimately ignore this in
+ * almost all cases (there are simply so many reasons why this can fail,
+ * think autofs, NFS, FUSE, …), but let's generate useful debug messages at
+ * the very least. */
+
+ q = path_is_mount_point(x, NULL, 0);
+ if (IN_SET(q, 0, -ENOENT)) {
+ /* Hmm, whaaaa? The mount point is not actually a mount point? Then
+ * it is either obstructed by a later mount or somebody has been
+ * racing against us and removed it. Either way the mount point
+ * doesn't matter to us, let's ignore it hence. */
+ log_debug_errno(r, "Mount point '%s' to remount is not a mount point anymore, ignoring remount failure: %m", x);
+ continue;
+ }
+ if (q < 0) /* Any other error on this? Just log and continue */
+ log_debug_errno(q, "Failed to determine whether '%s' is a mount point or not, ignoring: %m", x);
+
+ if (((flags ^ new_flags) & flags_mask & ~MS_RELATIME) == 0) { /* ignore MS_RELATIME while comparing */
+ log_debug_errno(r, "Couldn't remount '%s', but the flags already match what we want, hence ignoring: %m", x);
+ continue;
+ }
+
+ /* Make this fatal if this is the top-level mount */
+ if (path_equal(x, prefix))
+ return r;
+
+ /* If this is not the top-level mount, then handle this gracefully: log but
+ * otherwise ignore. With NFS, FUSE, autofs there are just too many reasons
+ * this might fail without a chance for us to do anything about it, let's
+ * hence be strict on the top-level mount and lenient on the inner ones. */
+ log_debug_errno(r, "Couldn't remount submount '%s' for unexpected reason, ignoring: %m", x);
+ continue;
+ }
+
+ log_trace("Remounted %s.", x);
+ }
+ }
+}
+
+/* Changes the mount flags of the single mount 'path' (submounts are not touched). The bits selected by
+ * 'flags_mask' are set to the values given in 'new_flags'; all other flags currently in effect are kept.
+ *
+ * If every bit of 'flags_mask' is covered by MS_CONVERTIBLE_FLAGS, mount_setattr() is tried first. If that
+ * fails (or was found unsupported by an earlier call) we fall back to a classic MS_BIND|MS_REMOUNT, for
+ * which the current flags are looked up in 'proc_self_mountinfo'. The stream is rewound before parsing.
+ *
+ * Returns 0 on success, -EINVAL if 'path' exists but is not listed as mount point, or another negative
+ * errno-style error. A failing remount is ignored if the flags already match what was requested. */
+int bind_remount_one_with_mountinfo(
+                const char *path,
+                unsigned long new_flags,
+                unsigned long flags_mask,
+                FILE *proc_self_mountinfo) {
+
+        _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL;
+        unsigned long flags = 0;
+        struct libmnt_fs *fs;
+        const char *opts;
+        int r;
+
+        assert(path);
+        assert(proc_self_mountinfo);
+
+        if ((flags_mask & ~MS_CONVERTIBLE_FLAGS) == 0 && !skip_mount_set_attr) {
+                /* Let's take a shortcut for all the flags we know how to convert into mount_setattr() flags */
+
+                if (mount_setattr(AT_FDCWD, path, AT_SYMLINK_NOFOLLOW,
+                                  &(struct mount_attr) {
+                                          .attr_set = ms_flags_to_mount_attr(new_flags & flags_mask),
+                                          .attr_clr = ms_flags_to_mount_attr(~new_flags & flags_mask),
+                                  }, MOUNT_ATTR_SIZE_VER0) < 0) {
+
+                        log_debug_errno(errno, "mount_setattr() didn't work, falling back to classic remounting: %m");
+
+                        if (ERRNO_IS_NOT_SUPPORTED(errno)) /* if not supported, then don't bother at all anymore */
+                                skip_mount_set_attr = true;
+                } else
+                        return 0; /* Nice, this worked! */
+        }
+
+        rewind(proc_self_mountinfo);
+
+        table = mnt_new_table();
+        if (!table)
+                return -ENOMEM;
+
+        r = mnt_table_parse_stream(table, proc_self_mountinfo, "/proc/self/mountinfo");
+        if (r < 0)
+                return r;
+
+        fs = mnt_table_find_target(table, path, MNT_ITER_FORWARD);
+        if (!fs) {
+                /* Hmm, it's not in the mount table, but does it exist at all? Note that laccess() reports
+                 * failures as negative errno return value, hence propagate that instead of peeking at
+                 * errno. */
+                r = laccess(path, F_OK);
+                if (r < 0)
+                        return r;
+
+                return -EINVAL; /* Not a mount point we recognize */
+        }
+
+        opts = mnt_fs_get_vfs_options(fs);
+        if (opts) {
+                r = mnt_optstr_get_flags(opts, &flags, mnt_get_builtin_optmap(MNT_LINUX_MAP));
+                if (r < 0)
+                        log_debug_errno(r, "Could not get flags for '%s', ignoring: %m", path);
+        }
+
+        /* Keep the flags outside of the mask, apply the new ones, and never pass MS_RELATIME explicitly. */
+        r = mount_nofollow(NULL, path, NULL, ((flags & ~flags_mask)|MS_BIND|MS_REMOUNT|new_flags) & ~MS_RELATIME, NULL);
+        if (r < 0) {
+                if (((flags ^ new_flags) & flags_mask & ~MS_RELATIME) != 0) /* Ignore MS_RELATIME again,
+                                                                             * since kernel adds it in
+                                                                             * everywhere, because it's the
+                                                                             * default. */
+                        return r;
+
+                /* Let's handle redundant remounts gracefully */
+                log_debug_errno(r, "Failed to remount '%s' but flags already match what we want, ignoring: %m", path);
+        }
+
+        return 0;
+}
+
+/* Switches the root file system with pivot_root(): changes into the directory referenced by 'fd_newroot',
+ * pivots onto it and detaches the old root afterwards. 'path' is only used for log messages. Returns 0 on
+ * success, negative errno-style error otherwise. */
+static int mount_switch_root_pivot(int fd_newroot, const char *path) {
+        int r;
+
+        assert(fd_newroot >= 0);
+        assert(path);
+
+        /* Step 1: make the new rootfs our current working directory. */
+        r = RET_NERRNO(fchdir(fd_newroot));
+        if (r < 0)
+                return log_debug_errno(r, "Failed to chdir into new rootfs '%s': %m", path);
+
+        /* Step 2: have the kernel stack the new root on top of the old one. */
+        r = RET_NERRNO(pivot_root(".", "."));
+        if (r < 0)
+                return log_debug_errno(r, "Failed to pivot root to new rootfs '%s': %m", path);
+
+        /* Step 3: drop the old root, which uncovers the new one. (This always operates on the top-most
+         * mount on our cwd, regardless what our current directory actually points to.) */
+        r = RET_NERRNO(umount2(".", MNT_DETACH));
+        if (r < 0)
+                return log_debug_errno(r, "Failed to unmount old rootfs: %m");
+
+        return 0;
+}
+
+/* Switches the root file system with MS_MOVE + chroot(): changes into the directory referenced by
+ * 'fd_newroot', moves that mount onto "/" and then makes it the root directory. 'path' is only used for
+ * log messages. Returns 0 on success, negative errno-style error otherwise. */
+static int mount_switch_root_move(int fd_newroot, const char *path) {
+        int r;
+
+        assert(fd_newroot >= 0);
+        assert(path);
+
+        /* Step 1: make the new rootfs our current working directory. */
+        r = RET_NERRNO(fchdir(fd_newroot));
+        if (r < 0)
+                return log_debug_errno(r, "Failed to chdir into new rootfs '%s': %m", path);
+
+        /* Step 2: move the new root fs over "/". */
+        r = RET_NERRNO(mount(".", "/", NULL, MS_MOVE, NULL));
+        if (r < 0)
+                return log_debug_errno(r, "Failed to move new rootfs '%s': %m", path);
+
+        /* Step 3: and update our root directory to match. */
+        r = RET_NERRNO(chroot("."));
+        if (r < 0)
+                return log_debug_errno(r, "Failed to chroot to new rootfs '%s': %m", path);
+
+        return 0;
+}
+
+/* Makes 'path' the new root file system of the calling process.
+ *
+ * Unless 'force_ms_move' is set, pivot_root() is tried first; if that fails (or if 'force_ms_move' is set)
+ * the switch is done with MS_MOVE + chroot() instead. Afterwards, if 'mount_propagation_flag' is non-zero,
+ * it is applied recursively to the new root.
+ *
+ * Returns 0 on success, negative errno-style error otherwise. */
+int mount_switch_root_full(const char *path, unsigned long mount_propagation_flag, bool force_ms_move) {
+        _cleanup_close_ int fd_newroot = -EBADF;
+        int r;
+
+        assert(path);
+        assert(mount_propagation_flag_is_valid(mount_propagation_flag));
+
+        fd_newroot = open(path, O_PATH|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW);
+        if (fd_newroot < 0)
+                return log_debug_errno(errno, "Failed to open new rootfs '%s': %m", path);
+
+        if (!force_ms_move) {
+                r = mount_switch_root_pivot(fd_newroot, path);
+                if (r < 0) {
+                        log_debug_errno(r, "Failed to pivot into new rootfs '%s', will try to use MS_MOVE instead: %m", path);
+                        force_ms_move = true;
+                }
+        }
+        if (force_ms_move) {
+                /* Failed to pivot_root() fallback to MS_MOVE. For example, this may happen if the rootfs is
+                 * an initramfs in which case pivot_root() isn't supported. */
+                r = mount_switch_root_move(fd_newroot, path);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to switch to new rootfs '%s' with MS_MOVE: %m", path);
+        }
+
+        /* Finally, let's establish the requested propagation flags. */
+        if (mount_propagation_flag == 0)
+                return 0;
+
+        /* Note the argument order: first the path (for '%s'), then the propagation mode name. */
+        if (mount(NULL, ".", NULL, mount_propagation_flag | MS_REC, NULL) < 0)
+                return log_debug_errno(errno, "Failed to turn new rootfs '%s' into %s mount: %m",
+                                       path, mount_propagation_flag_to_string(mount_propagation_flag));
+
+        return 0;
+}
+
+/* Unmounts 'path' over and over until nothing is mounted there anymore, i.e. removes a whole stack of
+ * mounts from one mount point. umount2() failing with EINVAL is taken as "no more mounts here".
+ *
+ * Returns > 0 if at least one mount was removed, 0 if there was none, and a negative errno-style error if
+ * umount2() failed for any reason other than EINVAL. */
+int repeat_unmount(const char *path, int flags) {
+        bool unmounted_any = false;
+
+        assert(path);
+
+        /* Keep peeling off mounts for as long as the kernel lets us */
+        while (umount2(path, flags) >= 0)
+                unmounted_any = true;
+
+        if (errno != EINVAL)
+                return -errno;
+
+        return unmounted_any;
+}
+
+int mode_to_inaccessible_node(
+                const char *runtime_dir,
+                mode_t mode,
+                char **ret) {
+
+        /* Maps an inode type (as encoded in 'mode') to the path of the matching "inaccessible" file node
+         * below <runtime_dir>/systemd/inaccessible/. These nodes are created during early boot by PID 1.
+         * In some cases we lacked the privs to create the character and block devices (maybe because we
+         * run in an userns environment, or miss CAP_SYS_MKNOD, or run with a devices policy that excludes
+         * device nodes with major and minor of 0), but that's fine, in that case we use an AF_UNIX file
+         * node instead, which is not the same, but close enough for most uses. And most importantly, the
+         * kernel allows bind mounts from socket nodes to any non-directory file nodes, and that's the
+         * most important thing that matters.
+         *
+         * 'runtime_dir' shall be the top-level runtime directory, i.e. /run/ if we operate in system
+         * context and $XDG_RUNTIME_DIR if we operate in user context. If NULL, "/run" is used.
+         *
+         * On success returns 0 and stores a newly allocated path in *ret. Returns -EINVAL for symlinks
+         * and unknown inode types, -ENOMEM on allocation failure. */
+
+        _cleanup_free_ char *candidate = NULL;
+        const char *type_name;
+        bool is_blk, is_dev;
+
+        assert(ret);
+
+        if (S_ISLNK(mode))
+                return -EINVAL;
+
+        type_name = inode_type_to_string(mode);
+        if (!type_name)
+                return -EINVAL;
+
+        if (!runtime_dir)
+                runtime_dir = "/run";
+
+        candidate = path_join(runtime_dir, "systemd/inaccessible", type_name);
+        if (!candidate)
+                return -ENOMEM;
+
+        /* On new kernels unprivileged users are permitted to create 0:0 char device nodes (because they
+         * also act as whiteout inode for overlayfs), but no other char or block device nodes. On old
+         * kernels no device node whatsoever may be created by unprivileged processes. Hence the fallback
+         * chain is: block device → character device → socket. In the best case we get the right device
+         * node type — but if not we'll hopefully at least get a device node at all. */
+
+        is_blk = S_ISBLK(mode);
+        is_dev = is_blk || S_ISCHR(mode);
+
+        /* Block device node missing? Then try the character device node. */
+        if (is_blk && access(candidate, F_OK) < 0 && errno == ENOENT) {
+                free(candidate);
+                candidate = path_join(runtime_dir, "/systemd/inaccessible/chr");
+                if (!candidate)
+                        return -ENOMEM;
+        }
+
+        /* Device node (still) missing? Then settle for the socket node. */
+        if (is_dev && access(candidate, F_OK) < 0 && errno == ENOENT) {
+                free(candidate);
+                candidate = path_join(runtime_dir, "/systemd/inaccessible/sock");
+                if (!candidate)
+                        return -ENOMEM;
+        }
+
+        *ret = TAKE_PTR(candidate);
+        return 0;
+}
+
+/* Formats the MS_* bits in 'flags' as a "|"-separated list of their symbolic names. Bits without a known
+ * name are appended as one hexadecimal number; if no known bit is set at all the result is just that
+ * hexadecimal number. On success returns 0 and stores a newly allocated string in *ret, on allocation
+ * failure returns -ENOMEM. */
+int mount_flags_to_string(unsigned long flags, char **ret) {
+        static const struct {
+                unsigned long flag;
+                const char *name;
+        } flag_names[] = {
+                { .flag = MS_RDONLY,      .name = "MS_RDONLY",      },
+                { .flag = MS_NOSUID,      .name = "MS_NOSUID",      },
+                { .flag = MS_NODEV,       .name = "MS_NODEV",       },
+                { .flag = MS_NOEXEC,      .name = "MS_NOEXEC",      },
+                { .flag = MS_SYNCHRONOUS, .name = "MS_SYNCHRONOUS", },
+                { .flag = MS_REMOUNT,     .name = "MS_REMOUNT",     },
+                { .flag = MS_MANDLOCK,    .name = "MS_MANDLOCK",    },
+                { .flag = MS_DIRSYNC,     .name = "MS_DIRSYNC",     },
+                { .flag = MS_NOSYMFOLLOW, .name = "MS_NOSYMFOLLOW", },
+                { .flag = MS_NOATIME,     .name = "MS_NOATIME",     },
+                { .flag = MS_NODIRATIME,  .name = "MS_NODIRATIME",  },
+                { .flag = MS_BIND,        .name = "MS_BIND",        },
+                { .flag = MS_MOVE,        .name = "MS_MOVE",        },
+                { .flag = MS_REC,         .name = "MS_REC",         },
+                { .flag = MS_SILENT,      .name = "MS_SILENT",      },
+                { .flag = MS_POSIXACL,    .name = "MS_POSIXACL",    },
+                { .flag = MS_UNBINDABLE,  .name = "MS_UNBINDABLE",  },
+                { .flag = MS_PRIVATE,     .name = "MS_PRIVATE",     },
+                { .flag = MS_SLAVE,       .name = "MS_SLAVE",       },
+                { .flag = MS_SHARED,      .name = "MS_SHARED",      },
+                { .flag = MS_RELATIME,    .name = "MS_RELATIME",    },
+                { .flag = MS_KERNMOUNT,   .name = "MS_KERNMOUNT",   },
+                { .flag = MS_I_VERSION,   .name = "MS_I_VERSION",   },
+                { .flag = MS_STRICTATIME, .name = "MS_STRICTATIME", },
+                { .flag = MS_LAZYTIME,    .name = "MS_LAZYTIME",    },
+        };
+        _cleanup_free_ char *joined = NULL;
+
+        assert(ret);
+
+        for (size_t k = 0; k < ELEMENTSOF(flag_names); k++) {
+                if ((flags & flag_names[k].flag) == 0)
+                        continue;
+
+                if (!strextend_with_separator(&joined, "|", flag_names[k].name))
+                        return -ENOMEM;
+
+                /* Consumed, so that only the unnamed bits remain at the end */
+                flags &= ~flag_names[k].flag;
+        }
+
+        /* Append what is left over in hex. Also do so if nothing was named, so the result is never empty. */
+        if (flags != 0 || !joined) {
+                if (strextendf_with_separator(&joined, "|", "%lx", flags) < 0)
+                        return -ENOMEM;
+        }
+
+        *ret = TAKE_PTR(joined);
+        return 0;
+}
+
+/* Wrapper around mount() that logs what it is about to do at debug level, and logs failures at
+ * 'error_log_level'. 'options' is first passed through mount_option_mangle() together with 'flags', and the
+ * resulting flags and remaining option string are what is handed to the kernel. If 'follow_symlink' is
+ * true plain mount() is used, otherwise mount_nofollow(). Returns 0 on success, negative errno-style error
+ * otherwise. */
+int mount_verbose_full(
+                int error_log_level,
+                const char *what,
+                const char *where,
+                const char *type,
+                unsigned long flags,
+                const char *options,
+                bool follow_symlink) {
+
+        _cleanup_free_ char *flags_str = NULL, *remaining = NULL;
+        unsigned long f;
+        int r;
+
+        r = mount_option_mangle(options, flags, &f, &remaining);
+        if (r < 0)
+                return log_full_errno(error_log_level, r,
+                                      "Failed to mangle mount options %s: %m",
+                                      strempty(options));
+
+        /* Only needed for logging, hence failure is not fatal */
+        (void) mount_flags_to_string(f, &flags_str);
+
+        /* Pick the log message that best describes the operation. The order of the checks matters, since
+         * several of the flags may be combined. */
+        if (FLAGS_SET(f, MS_REMOUNT|MS_BIND))
+                log_debug("Changing mount flags %s (%s \"%s\")...",
+                          where, strnull(flags_str), strempty(remaining));
+        else if (FLAGS_SET(f, MS_REMOUNT))
+                log_debug("Remounting superblock %s (%s \"%s\")...",
+                          where, strnull(flags_str), strempty(remaining));
+        else if ((f & (MS_SHARED|MS_PRIVATE|MS_SLAVE|MS_UNBINDABLE)) != 0)
+                log_debug("Changing mount propagation %s (%s \"%s\")",
+                          where, strnull(flags_str), strempty(remaining));
+        else if (FLAGS_SET(f, MS_BIND))
+                log_debug("Bind-mounting %s on %s (%s \"%s\")...",
+                          what, where, strnull(flags_str), strempty(remaining));
+        else if (FLAGS_SET(f, MS_MOVE))
+                log_debug("Moving mount %s %s %s (%s \"%s\")...",
+                          what, special_glyph(SPECIAL_GLYPH_ARROW_RIGHT), where, strnull(flags_str), strempty(remaining));
+        else
+                log_debug("Mounting %s (%s) on %s (%s \"%s\")...",
+                          strna(what), strna(type), where, strnull(flags_str), strempty(remaining));
+
+        r = follow_symlink ?
+                RET_NERRNO(mount(what, where, type, f, remaining)) :
+                mount_nofollow(what, where, type, f, remaining);
+        if (r < 0)
+                return log_full_errno(error_log_level, r,
+                                      "Failed to mount %s (type %s) on %s (%s \"%s\"): %m",
+                                      strna(what), strna(type), where, strnull(flags_str), strempty(remaining));
+
+        return 0;
+}
+
+/* Wrapper around umount2() that announces the operation at debug level and logs a failure at
+ * 'error_log_level'. Returns 0 on success, negative errno-style error otherwise. */
+int umount_verbose(
+                int error_log_level,
+                const char *what,
+                int flags) {
+
+        int r;
+
+        assert(what);
+
+        log_debug("Umounting %s...", what);
+
+        r = RET_NERRNO(umount2(what, flags));
+        if (r < 0)
+                return log_full_errno(error_log_level, r,
+                                      "Failed to unmount %s: %m", what);
+
+        return 0;
+}
+
+/* Attaches the detached mount referenced by 'fsmount_fd' at 'dest'. If 'mount_beneath' is true the mount
+ * is first attempted beneath whatever is mounted there, followed by detaching the old top mount; if the
+ * kernel answers with -EINVAL a plain overmount is done instead. Returns >= 0 on success, negative
+ * errno-style error otherwise. */
+int mount_exchange_graceful(int fsmount_fd, const char *dest, bool mount_beneath) {
+        unsigned move_flags = MOVE_MOUNT_F_EMPTY_PATH;
+        int r;
+
+        assert(fsmount_fd >= 0);
+        assert(dest);
+
+        /* First, try to mount beneath an existing mount point, and if that works, umount the old mount,
+         * which is now at the top. This will ensure we can atomically replace a mount. Note that this works
+         * also in the case where there are submounts down the tree. Mount propagation is allowed but
+         * restricted to layouts that don't end up propagating the new mount on top of the mount stack. If
+         * this is not supported (minimum kernel v6.5), or if there is no mount on the mountpoint, we get
+         * -EINVAL and then we fallback to normal mounting. */
+
+        if (mount_beneath)
+                move_flags |= MOVE_MOUNT_BENEATH;
+
+        r = RET_NERRNO(move_mount(
+                                       fsmount_fd,
+                                       /* from_path= */ "",
+                                       /* to_fd= */ -EBADF,
+                                       dest,
+                                       move_flags));
+
+        /* Plain overmount requested, or a failure we have no fallback for: report as is. */
+        if (!mount_beneath || (r < 0 && r != -EINVAL))
+                return r;
+
+        if (r == -EINVAL) { /* Fallback if mount_beneath is not supported */
+                log_debug_errno(r,
+                                "Failed to mount beneath '%s', falling back to overmount",
+                                dest);
+                return RET_NERRNO(move_mount(
+                                                  fsmount_fd,
+                                                  /* from_path= */ "",
+                                                  /* to_fd= */ -EBADF,
+                                                  dest,
+                                                  MOVE_MOUNT_F_EMPTY_PATH));
+        }
+
+        /* Mounting beneath worked, now remove the old mount that sits on top */
+        return umount_verbose(LOG_DEBUG, dest, UMOUNT_NOFOLLOW|MNT_DETACH);
+}
+
+int mount_option_mangle(
+                const char *options,
+                unsigned long mount_flags,
+                unsigned long *ret_mount_flags,
+                char **ret_remaining_options) {
+
+        const struct libmnt_optmap *map;
+        _cleanup_free_ char *remaining = NULL;
+        const char *cursor = options;
+        int r;
+
+        /* Splits a comma-separated mount option string into the part that maps to MS_* mount flags and
+         * the part that doesn't. The flags are merged into 'mount_flags' and returned in
+         * '*ret_mount_flags', everything else is returned in '*ret_remaining_options'.
+         * E.g.,
+         *      "rw,nosuid,nodev,relatime,size=1630748k,mode=0700,uid=1000,gid=1000"
+         * is split to MS_NOSUID|MS_NODEV|MS_RELATIME and
+         *      "size=1630748k,mode=0700,uid=1000,gid=1000".
+         * See more examples in test-mount-util.c.
+         *
+         * Options starting with "x-" (case-insensitively) are dropped altogether.
+         *
+         * If 'options' does not contain any non-mount-flag options,
+         * then '*ret_remaining_options' is set to NULL instead of empty string.
+         * The validity of options stored in '*ret_remaining_options' is not checked.
+         * If 'options' is NULL, this just copies 'mount_flags' to *ret_mount_flags. */
+
+        assert(ret_mount_flags);
+        assert(ret_remaining_options);
+
+        map = mnt_get_builtin_optmap(MNT_LINUX_MAP);
+        if (!map)
+                return -EINVAL;
+
+        for (;;) {
+                const struct libmnt_optmap *match = NULL;
+                _cleanup_free_ char *word = NULL;
+
+                r = extract_first_word(&cursor, &word, ",", EXTRACT_KEEP_QUOTE);
+                if (r < 0)
+                        return r;
+                if (r == 0)
+                        break;
+
+                /* All entries in MNT_LINUX_MAP do not take any argument.
+                 * Thus, the entry names do not contain "=" or "[=]", and a plain comparison suffices. */
+                for (const struct libmnt_optmap *e = map; e->name; e++)
+                        if (streq(word, e->name)) {
+                                match = e;
+                                break;
+                        }
+
+                if (match) {
+                        /* A regular entry turns its flag on, an inverted one turns it off again */
+                        if (!(match->mask & MNT_INVERT))
+                                mount_flags |= match->id;
+                        else if (mount_flags & match->id)
+                                mount_flags ^= match->id;
+
+                        continue;
+                }
+
+                /* Not a mount flag: keep it as remaining option, unless it is an "x-" option */
+                if (startswith_no_case(word, "x-"))
+                        continue;
+
+                if (!strextend_with_separator(&remaining, ",", word))
+                        return -ENOMEM;
+        }
+
+        *ret_mount_flags = mount_flags;
+        *ret_remaining_options = TAKE_PTR(remaining);
+
+        return 0;
+}
+
+/* Installs a bind mount (or, if 'is_image' is set, a dissected disk image) into the mount namespace
+ * referenced by 'mntns_fd', without making use of the new mount API.
+ *
+ * The source ('chased_src_path' / 'chased_src_fd' / 'chased_src_st') is first mounted in a temporary
+ * MS_SLAVE playground below /tmp, optionally remounted read-only, then moved into 'propagate_path'. A
+ * child process that joined the target's namespaces finally moves it from 'incoming_path' to 'dest'. If
+ * 'make_file_or_directory' is set the child tries to create 'dest' first.
+ *
+ * Returns 0 on success, negative errno-style error otherwise. All temporary mounts and inodes created on
+ * the way are cleaned up in both cases. */
+static int mount_in_namespace_legacy(
+                const char *chased_src_path,
+                int chased_src_fd,
+                struct stat *chased_src_st,
+                const char *propagate_path,
+                const char *incoming_path,
+                const char *dest,
+                int pidns_fd,
+                int mntns_fd,
+                int root_fd,
+                bool read_only,
+                bool make_file_or_directory,
+                const MountOptions *options,
+                const ImagePolicy *image_policy,
+                bool is_image) {
+
+        _cleanup_close_pair_ int errno_pipe_fd[2] = EBADF_PAIR;
+        char mount_slave[] = "/tmp/propagate.XXXXXX", *mount_tmp, *mount_outside, *p;
+        bool mount_slave_created = false, mount_slave_mounted = false,
+                mount_tmp_created = false, mount_tmp_mounted = false,
+                mount_outside_created = false, mount_outside_mounted = false;
+        pid_t child;
+        int r;
+
+        assert(chased_src_path);
+        assert(chased_src_fd >= 0);
+        assert(chased_src_st);
+        assert(propagate_path);
+        assert(incoming_path);
+        assert(dest);
+        assert(pidns_fd >= 0);
+        assert(mntns_fd >= 0);
+        assert(root_fd >= 0);
+        assert(!options || is_image);
+
+        p = strjoina(propagate_path, "/");
+        r = laccess(p, F_OK);
+        if (r < 0)
+                return log_debug_errno(r == -ENOENT ? SYNTHETIC_ERRNO(EOPNOTSUPP) : r, "Target does not allow propagation of mount points");
+
+        /* Our goal is to install a new bind mount into the container,
+           possibly read-only. This is irritatingly complex
+           unfortunately, currently.
+
+           First, we start by creating a private playground in /tmp,
+           that we can mount MS_SLAVE. (Which is necessary, since
+           MS_MOVE cannot be applied to mounts with MS_SHARED parent
+           mounts.) */
+
+        if (!mkdtemp(mount_slave))
+                return log_debug_errno(errno, "Failed to create playground %s: %m", mount_slave);
+
+        mount_slave_created = true;
+
+        r = mount_nofollow_verbose(LOG_DEBUG, mount_slave, mount_slave, NULL, MS_BIND, NULL);
+        if (r < 0)
+                goto finish;
+
+        mount_slave_mounted = true;
+
+        r = mount_nofollow_verbose(LOG_DEBUG, NULL, mount_slave, NULL, MS_SLAVE, NULL);
+        if (r < 0)
+                goto finish;
+
+        /* Second, we mount the source file or directory to a directory inside of our MS_SLAVE playground. */
+        mount_tmp = strjoina(mount_slave, "/mount");
+        if (is_image)
+                r = mkdir_p(mount_tmp, 0700);
+        else
+                r = make_mount_point_inode_from_stat(chased_src_st, mount_tmp, 0700);
+        if (r < 0) {
+                log_debug_errno(r, "Failed to create temporary mount point %s: %m", mount_tmp);
+                goto finish;
+        }
+
+        mount_tmp_created = true;
+
+        if (is_image)
+                r = verity_dissect_and_mount(
+                                chased_src_fd,
+                                chased_src_path,
+                                mount_tmp,
+                                options,
+                                image_policy,
+                                /* required_host_os_release_id= */ NULL,
+                                /* required_host_os_release_version_id= */ NULL,
+                                /* required_host_os_release_sysext_level= */ NULL,
+                                /* required_host_os_release_confext_level= */ NULL,
+                                /* required_sysext_scope= */ NULL,
+                                /* ret_image= */ NULL);
+        else
+                r = mount_follow_verbose(LOG_DEBUG, FORMAT_PROC_FD_PATH(chased_src_fd), mount_tmp, NULL, MS_BIND, NULL);
+        if (r < 0)
+                goto finish;
+
+        mount_tmp_mounted = true;
+
+        /* Third, we remount the new bind mount read-only if requested. */
+        if (read_only) {
+                r = mount_nofollow_verbose(LOG_DEBUG, NULL, mount_tmp, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
+                if (r < 0)
+                        goto finish;
+        }
+
+        /* Fourth, we move the new bind mount into the propagation directory. This way it will appear there read-only
+         * right-away. */
+
+        mount_outside = strjoina(propagate_path, "/XXXXXX");
+        if (is_image || S_ISDIR(chased_src_st->st_mode))
+                r = mkdtemp(mount_outside) ? 0 : -errno;
+        else {
+                /* We only need the inode, not the open file, hence close the fd right away. On failure
+                 * 'r' is negative and safe_close() is a NOP. */
+                r = mkostemp_safe(mount_outside);
+                safe_close(r);
+        }
+        if (r < 0) {
+                log_debug_errno(r, "Cannot create propagation file or directory %s: %m", mount_outside);
+                goto finish;
+        }
+
+        mount_outside_created = true;
+
+        r = mount_nofollow_verbose(LOG_DEBUG, mount_tmp, mount_outside, NULL, MS_MOVE, NULL);
+        if (r < 0)
+                goto finish;
+
+        mount_outside_mounted = true;
+        mount_tmp_mounted = false;
+
+        /* The playground has done its job, tear it down again */
+        if (is_image || S_ISDIR(chased_src_st->st_mode))
+                (void) rmdir(mount_tmp);
+        else
+                (void) unlink(mount_tmp);
+        mount_tmp_created = false;
+
+        (void) umount_verbose(LOG_DEBUG, mount_slave, UMOUNT_NOFOLLOW);
+        mount_slave_mounted = false;
+
+        (void) rmdir(mount_slave);
+        mount_slave_created = false;
+
+        if (pipe2(errno_pipe_fd, O_CLOEXEC|O_NONBLOCK) < 0) {
+                /* Make sure to propagate the error: 'r' still carries the success of the previous step. */
+                r = log_debug_errno(errno, "Failed to create pipe: %m");
+                goto finish;
+        }
+
+        r = namespace_fork("(sd-bindmnt)", "(sd-bindmnt-inner)", NULL, 0, FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM,
+                           pidns_fd, mntns_fd, -1, -1, root_fd, &child);
+        if (r < 0)
+                goto finish;
+        if (r == 0) {
+                _cleanup_free_ char *mount_outside_fn = NULL, *mount_inside = NULL;
+
+                /* Child, running inside the target's namespaces. Errors are reported to the parent via
+                 * the pipe. */
+
+                errno_pipe_fd[0] = safe_close(errno_pipe_fd[0]);
+
+                if (make_file_or_directory) {
+                        if (!is_image) {
+                                (void) mkdir_parents(dest, 0755);
+                                (void) make_mount_point_inode_from_stat(chased_src_st, dest, 0700);
+                        } else
+                                (void) mkdir_p(dest, 0755);
+                }
+
+                /* Fifth, move the mount to the right place inside */
+                r = path_extract_filename(mount_outside, &mount_outside_fn);
+                if (r < 0) {
+                        log_debug_errno(r, "Failed to extract filename from propagation file or directory '%s': %m", mount_outside);
+                        goto child_fail;
+                }
+
+                mount_inside = path_join(incoming_path, mount_outside_fn);
+                if (!mount_inside) {
+                        r = log_oom_debug();
+                        goto child_fail;
+                }
+
+                r = mount_nofollow_verbose(LOG_DEBUG, mount_inside, dest, NULL, MS_MOVE, NULL);
+                if (r < 0)
+                        goto child_fail;
+
+                _exit(EXIT_SUCCESS);
+
+        child_fail:
+                (void) write(errno_pipe_fd[1], &r, sizeof(r));
+                errno_pipe_fd[1] = safe_close(errno_pipe_fd[1]);
+
+                _exit(EXIT_FAILURE);
+        }
+
+        errno_pipe_fd[1] = safe_close(errno_pipe_fd[1]);
+
+        r = wait_for_terminate_and_check("(sd-bindmnt)", child, 0);
+        if (r < 0) {
+                log_debug_errno(r, "Failed to wait for child: %m");
+                goto finish;
+        }
+        if (r != EXIT_SUCCESS) {
+                if (read(errno_pipe_fd[0], &r, sizeof(r)) == sizeof(r))
+                        log_debug_errno(r, "Failed to mount: %m");
+                else
+                        /* The child failed without telling us why. Don't return its (positive) exit
+                         * status, callers would take that for success. */
+                        r = log_debug_errno(SYNTHETIC_ERRNO(EPROTO), "Child failed.");
+                goto finish;
+        }
+
+finish:
+        /* Undo whatever is still around, in reverse order of creation. This is also reached on success,
+         * in order to remove the mount from the propagation directory again. */
+        if (mount_outside_mounted)
+                (void) umount_verbose(LOG_DEBUG, mount_outside, UMOUNT_NOFOLLOW);
+        if (mount_outside_created) {
+                if (is_image || S_ISDIR(chased_src_st->st_mode))
+                        (void) rmdir(mount_outside);
+                else
+                        (void) unlink(mount_outside);
+        }
+
+        if (mount_tmp_mounted)
+                (void) umount_verbose(LOG_DEBUG, mount_tmp, UMOUNT_NOFOLLOW);
+        if (mount_tmp_created) {
+                if (is_image || S_ISDIR(chased_src_st->st_mode))
+                        (void) rmdir(mount_tmp);
+                else
+                        (void) unlink(mount_tmp);
+        }
+
+        if (mount_slave_mounted)
+                (void) umount_verbose(LOG_DEBUG, mount_slave, UMOUNT_NOFOLLOW);
+        if (mount_slave_created)
+                (void) rmdir(mount_slave);
+
+        return r;
+}
+
+/* Mounts 'src' onto 'dest' inside the mount namespace of the process referenced by 'target'. If 'is_image'
+ * is true 'src' is dissected and mounted as disk image (honouring 'options' and 'image_policy'), otherwise
+ * it is bind mounted. 'read_only' requests a read-only mount, 'make_file_or_directory' requests that
+ * 'dest' (and its parent directories) are created inside the namespace first.
+ *
+ * If mount_new_api_supported() reports false, all work is delegated to mount_in_namespace_legacy(), which
+ * is the only user of 'propagate_path' and 'incoming_path'.
+ *
+ * Returns 0 on success, -ESRCH if 'target' is not set, -EINVAL if the target shares our mount namespace,
+ * -EPROTO if the helper process failed without reporting an error code, or another negative errno-style
+ * error. */
+static int mount_in_namespace(
+ const PidRef *target,
+ const char *propagate_path,
+ const char *incoming_path,
+ const char *src,
+ const char *dest,
+ bool read_only,
+ bool make_file_or_directory,
+ const MountOptions *options,
+ const ImagePolicy *image_policy,
+ bool is_image) {
+
+ _cleanup_(dissected_image_unrefp) DissectedImage *img = NULL;
+ _cleanup_close_pair_ int errno_pipe_fd[2] = EBADF_PAIR;
+ _cleanup_close_ int mntns_fd = -EBADF, root_fd = -EBADF, pidns_fd = -EBADF, chased_src_fd = -EBADF,
+ new_mount_fd = -EBADF;
+ _cleanup_free_ char *chased_src_path = NULL;
+ struct stat st;
+ pid_t child;
+ int r;
+
+ assert(propagate_path);
+ assert(incoming_path);
+ assert(src);
+ assert(dest);
+ assert(!options || is_image);
+
+ if (!pidref_is_set(target))
+ return -ESRCH;
+
+ /* Acquire the PID namespace, mount namespace and root directory fds of the target. */
+ r = namespace_open(target->pid, &pidns_fd, &mntns_fd, NULL, NULL, &root_fd);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to retrieve FDs of the target process' namespace: %m");
+
+ r = in_same_namespace(target->pid, 0, NAMESPACE_MOUNT);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to determine if mount namespaces are equal: %m");
+ /* We can't add new mounts at runtime if the process wasn't started in a namespace */
+ if (r > 0)
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to activate bind mount in target, not running in a mount namespace");
+
+ /* Note that the verification happens only after the namespace fds were opened via the PID. */
+ r = pidref_verify(target);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to verify target process '" PID_FMT "': %m", target->pid);
+
+ /* Resolve the source path, so that we have both the final path and an fd to the inode. */
+ r = chase(src, NULL, 0, &chased_src_path, &chased_src_fd);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to resolve source path of %s: %m", src);
+ log_debug("Chased source path of %s to %s", src, chased_src_path);
+
+ if (fstat(chased_src_fd, &st) < 0)
+ return log_debug_errno(errno, "Failed to stat() resolved source path %s: %m", src);
+ if (S_ISLNK(st.st_mode)) /* This shouldn't really happen, given that we just chased the symlinks above, but let's better be safe… */
+ return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Source directory %s can't be a symbolic link", src);
+
+ if (!mount_new_api_supported()) /* Fallback if we can't use the new mount API */
+ return mount_in_namespace_legacy(
+ chased_src_path,
+ chased_src_fd,
+ &st,
+ propagate_path,
+ incoming_path,
+ dest,
+ pidns_fd,
+ mntns_fd,
+ root_fd,
+ read_only,
+ make_file_or_directory,
+ options,
+ image_policy,
+ is_image);
+
+ /* Prepare the mount on our side: either a dissected image object (no destination is passed here, the
+ * image is only mounted in the child below) or a cloned mount fd of the source. */
+ if (is_image) {
+ r = verity_dissect_and_mount(
+ chased_src_fd,
+ chased_src_path,
+ /* dest= */ NULL,
+ options,
+ image_policy,
+ /* required_host_os_release_id= */ NULL,
+ /* required_host_os_release_version_id= */ NULL,
+ /* required_host_os_release_sysext_level= */ NULL,
+ /* required_host_os_release_confext_level= */ NULL,
+ /* required_sysext_scope= */ NULL,
+ &img);
+ if (r < 0)
+ return log_debug_errno(
+ r,
+ "Failed to dissect and mount image %s: %m",
+ chased_src_path);
+ } else {
+ new_mount_fd = open_tree(
+ chased_src_fd,
+ "",
+ OPEN_TREE_CLONE|OPEN_TREE_CLOEXEC|AT_SYMLINK_NOFOLLOW|AT_EMPTY_PATH);
+ if (new_mount_fd < 0)
+ return log_debug_errno(
+ errno,
+ "Failed to open mount point \"%s\": %m",
+ chased_src_path);
+
+ /* Apply the read-only flag to the cloned mount while it is still detached. */
+ if (read_only && mount_setattr(new_mount_fd, "", AT_EMPTY_PATH,
+ &(struct mount_attr) {
+ .attr_set = MOUNT_ATTR_RDONLY,
+ }, MOUNT_ATTR_SIZE_VER0) < 0)
+ return log_debug_errno(
+ errno,
+ "Failed to set mount flags for \"%s\": %m",
+ chased_src_path);
+ }
+
+ /* The pipe is used by the child to send a negative errno back to us on failure. */
+ if (pipe2(errno_pipe_fd, O_CLOEXEC|O_NONBLOCK) < 0)
+ return log_debug_errno(errno, "Failed to create pipe: %m");
+
+ r = namespace_fork("(sd-bindmnt)",
+ "(sd-bindmnt-inner)",
+ /* except_fds= */ NULL,
+ /* n_except_fds= */ 0,
+ FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM,
+ pidns_fd,
+ mntns_fd,
+ /* netns_fd= */ -1,
+ /* userns_fd= */ -1,
+ root_fd,
+ &child);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to fork off: %m");
+ if (r == 0) {
+ /* Child: attaches the prepared mount at 'dest' and never returns from this block. */
+ errno_pipe_fd[0] = safe_close(errno_pipe_fd[0]);
+
+ if (make_file_or_directory)
+ (void) mkdir_parents(dest, 0755);
+
+ if (img) {
+ DissectImageFlags f = DISSECT_IMAGE_TRY_ATOMIC_MOUNT_EXCHANGE;
+
+ if (make_file_or_directory)
+ f |= DISSECT_IMAGE_MKDIR;
+
+ if (read_only)
+ f |= DISSECT_IMAGE_READ_ONLY;
+
+ r = dissected_image_mount(
+ img,
+ dest,
+ /* uid_shift= */ UID_INVALID,
+ /* uid_range= */ UID_INVALID,
+ /* userns_fd= */ -EBADF,
+ f);
+ } else {
+ if (make_file_or_directory)
+ (void) make_mount_point_inode_from_stat(&st, dest, 0700);
+
+ r = mount_exchange_graceful(new_mount_fd, dest, /* mount_beneath= */ true);
+ }
+ if (r < 0) {
+ /* Tell the parent what went wrong (best effort) */
+ (void) write(errno_pipe_fd[1], &r, sizeof(r));
+ errno_pipe_fd[1] = safe_close(errno_pipe_fd[1]);
+
+ _exit(EXIT_FAILURE);
+ }
+
+ _exit(EXIT_SUCCESS);
+ }
+
+ /* Parent: close our copy of the write end before waiting for the child. */
+ errno_pipe_fd[1] = safe_close(errno_pipe_fd[1]);
+
+ r = wait_for_terminate_and_check("(sd-bindmnt)", child, 0);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to wait for child: %m");
+ if (r != EXIT_SUCCESS) {
+ /* Prefer the error code the child sent us, if a complete one arrived */
+ if (read(errno_pipe_fd[0], &r, sizeof(r)) == sizeof(r))
+ return log_debug_errno(r, "Failed to mount: %m");
+
+ return log_debug_errno(SYNTHETIC_ERRNO(EPROTO), "Child failed.");
+ }
+
+ return 0;
+}
+
+/* Bind mounts 'src' to 'dest' inside the mount namespace of the process 'target'. Thin wrapper around
+ * mount_in_namespace() with image handling turned off; returns whatever that returns. */
+int bind_mount_in_namespace(
+                PidRef *target,
+                const char *propagate_path,
+                const char *incoming_path,
+                const char *src,
+                const char *dest,
+                bool read_only,
+                bool make_file_or_directory) {
+
+        return mount_in_namespace(
+                        target,
+                        propagate_path,
+                        incoming_path,
+                        src,
+                        dest,
+                        read_only,
+                        make_file_or_directory,
+                        /* options= */ NULL,
+                        /* image_policy= */ NULL,
+                        /* is_image= */ false);
+}
+
+/* Mounts the disk image 'src' to 'dest' inside the mount namespace of the process 'target', using the
+ * given mount 'options' and 'image_policy'. Thin wrapper around mount_in_namespace() with image handling
+ * turned on; returns whatever that returns. */
+int mount_image_in_namespace(
+                PidRef *target,
+                const char *propagate_path,
+                const char *incoming_path,
+                const char *src,
+                const char *dest,
+                bool read_only,
+                bool make_file_or_directory,
+                const MountOptions *options,
+                const ImagePolicy *image_policy) {
+
+        return mount_in_namespace(
+                        target,
+                        propagate_path,
+                        incoming_path,
+                        src,
+                        dest,
+                        read_only,
+                        make_file_or_directory,
+                        options,
+                        image_policy,
+                        /* is_image= */ true);
+}
+
+/* Ensures 'path' is a mount point, by recursively bind mounting it onto itself if it is not one yet.
+ * Returns 0 if it already was a mount point, 1 if it was turned into one, negative errno-style error on
+ * failure. */
+int make_mount_point(const char *path) {
+        int is_mount, r;
+
+        assert(path);
+
+        is_mount = path_is_mount_point(path, NULL, 0);
+        if (is_mount < 0)
+                return log_debug_errno(is_mount, "Failed to determine whether '%s' is a mount point: %m", path);
+        if (is_mount > 0)
+                return 0; /* Nothing to do */
+
+        r = mount_nofollow_verbose(LOG_DEBUG, path, path, NULL, MS_BIND|MS_REC, NULL);
+
+        return r < 0 ? r : 1;
+}
+
+/* Like make_mount_point(), but operates on the inode referenced by 'fd': if it is not a mount point yet,
+ * it is recursively bind mounted onto itself via its /proc/ fd path. Returns 0 if it already was a mount
+ * point, 1 if it was turned into one, negative errno-style error on failure. */
+int fd_make_mount_point(int fd) {
+        const char *fd_path;
+        int is_mount, r;
+
+        assert(fd >= 0);
+
+        is_mount = fd_is_mount_point(fd, NULL, 0);
+        if (is_mount < 0)
+                return log_debug_errno(is_mount, "Failed to determine whether file descriptor is a mount point: %m");
+        if (is_mount > 0)
+                return 0; /* Nothing to do */
+
+        /* Source and target are the same inode, hence format the path only once */
+        fd_path = FORMAT_PROC_FD_PATH(fd);
+
+        r = mount_follow_verbose(LOG_DEBUG, fd_path, fd_path, NULL, MS_BIND|MS_REC, NULL);
+
+        return r < 0 ? r : 1;
+}
+
+int make_userns(uid_t uid_shift, uid_t uid_range, uid_t owner, RemountIdmapping idmapping) {
+        _cleanup_close_ int userns_fd = -EBADF;
+        _cleanup_free_ char *line = NULL;
+
+        /* Builds the mapping text selected by 'idmapping' and acquires a user namespace fd for it via
+         * userns_acquire(). The same text is used for both the UID and the GID map. Returns the fd
+         * (owned by the caller) on success, -EINVAL if shift/range are not valid, or another negative
+         * errno-style value on failure. */
+
+        if (!userns_shift_range_valid(uid_shift, uid_range))
+                return -EINVAL;
+
+        switch (idmapping) {
+
+        case REMOUNT_IDMAPPING_NONE:
+        case REMOUNT_IDMAPPING_HOST_ROOT:
+                if (asprintf(&line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0u, uid_shift, uid_range) < 0)
+                        return log_oom_debug();
+
+                if (idmapping != REMOUNT_IDMAPPING_HOST_ROOT)
+                        break;
+
+                /* On request, add an entry so that the host root user can modify the uidmapped mount
+                 * like it normally would: the user with UID_MAPPED_ROOT on the backing fs is mapped to
+                 * UID 0. nspawn wants to create various missing inodes in the OS tree before booting
+                 * into it, which is easy if it can do so under its own regular UID. The container's
+                 * runtime uidmap (the one payload processes run in) leaves this UID unmapped, so files
+                 * accidentally left owned by host root in the uidmapped tree show up as owned by
+                 * 'nobody', which is safe. (Such inodes should of course always be chown()ed to the
+                 * container's own UID range, but a safety net is good to have.) */
+                if (strextendf(&line,
+                               UID_FMT " " UID_FMT " " UID_FMT "\n",
+                               UID_MAPPED_ROOT, 0u, 1u) < 0)
+                        return log_oom_debug();
+
+                break;
+
+        case REMOUNT_IDMAPPING_HOST_OWNER:
+                /* Map the owner of the bind mounted directory to the root user within the container,
+                 * so that every file written by container root to the bind-mounted directory is owned
+                 * by the original user. All other users remain unmapped. */
+                if (asprintf(&line, UID_FMT " " UID_FMT " " UID_FMT "\n", owner, uid_shift, 1u) < 0)
+                        return log_oom_debug();
+
+                break;
+
+        default:
+                /* Any other value generates no mapping line. */
+                break;
+        }
+
+        /* We always assign the same UID and GID ranges */
+        userns_fd = userns_acquire(line, line);
+        if (userns_fd < 0)
+                return log_debug_errno(userns_fd, "Failed to acquire new userns: %m");
+
+        return TAKE_FD(userns_fd);
+}
+
+int remount_idmap_fd(
+                char **paths,
+                int userns_fd) {
+
+        int *fds = NULL;
+        size_t n_fds = 0, n_paths;
+        int r;
+
+        assert(userns_fd >= 0);
+
+        /* Remounts all specified paths with the specified userns as idmap, in the order given by the
+         * strv: top-level directories are expected first and nested directories after them, so that
+         * the tree can be rebuilt correctly from left to right. Returns 0 on success (also when the
+         * list is empty), negative on failure. */
+
+        n_paths = strv_length(paths);
+        if (n_paths == 0) /* Nothing to do? */
+                return 0;
+
+        fds = new(int, n_paths);
+        if (!fds)
+                return log_oom_debug();
+
+        /* From here on, the first n_fds entries of the array are closed and the array freed on return */
+        CLEANUP_ARRAY(fds, n_fds, close_many_and_free);
+
+        struct mount_attr attr = {
+                .attr_set = MOUNT_ATTR_IDMAP,
+                .userns_fd = userns_fd,
+        };
+
+        /* Step 1: clone every mount and put the idmapping on the clone */
+        while (n_fds < n_paths) {
+                const char *p = paths[n_fds];
+                int fd;
+
+                fd = open_tree(-EBADF, p, OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
+                if (fd < 0)
+                        return log_debug_errno(errno, "Failed to open tree of mounted filesystem '%s': %m", p);
+
+                fds[n_fds++] = fd;
+
+                if (mount_setattr(fd, "", AT_EMPTY_PATH, &attr, sizeof(attr)) < 0)
+                        return log_debug_errno(errno, "Failed to change bind mount attributes for clone of '%s': %m", p);
+        }
+
+        /* Step 2: with replacements for all of them ready, drop the old mounts, right-to-left */
+        for (size_t i = n_paths; i-- > 0;) {
+                r = umount_verbose(LOG_DEBUG, paths[i], UMOUNT_NOFOLLOW);
+                if (r < 0)
+                        return r;
+        }
+
+        /* Step 3: attach the idmapped clones in their place, left-to-right */
+        for (size_t i = 0; i < n_paths; i++) {
+                log_debug("Mounting idmapped fs to '%s'", paths[i]);
+
+                if (move_mount(fds[i], "", -EBADF, paths[i], MOVE_MOUNT_F_EMPTY_PATH) < 0)
+                        return log_debug_errno(errno, "Failed to attach UID mapped mount to '%s': %m", paths[i]);
+        }
+
+        return 0;
+}
+
+/* Creates a user namespace with make_userns() and applies it to the paths in 'p' with
+ * remount_idmap_fd(). The userns fd is closed again on return. Returns the result of
+ * remount_idmap_fd(), or the negative error from make_userns(). */
+int remount_idmap(char **p, uid_t uid_shift, uid_t uid_range, uid_t owner, RemountIdmapping idmapping) {
+        _cleanup_close_ int userns_fd = make_userns(uid_shift, uid_range, owner, idmapping);
+
+        if (userns_fd < 0)
+                return userns_fd;
+
+        return remount_idmap_fd(p, userns_fd);
+}
+
+typedef struct SubMount { /* One entry collected by get_sub_mounts() */
+        char *path;       /* strdup()ed mount point path; set to NULL by sub_mount_clear() */
+        int mount_fd;     /* O_PATH fd opened on 'path'; closed by sub_mount_clear() */
+} SubMount;
+
+/* Releases what a SubMount entry owns: closes its fd and frees its path. The entry itself is not
+ * freed; afterwards 'path' is NULL and 'mount_fd' holds whatever safe_close() returned. */
+static void sub_mount_clear(SubMount *s) {
+        assert(s);
+
+        s->mount_fd = safe_close(s->mount_fd);
+        s->path = mfree(s->path);
+}
+
+/* Clears the first 'n' entries of the array 's', then frees the array itself. 's' may be NULL only
+ * if 'n' is zero. */
+static void sub_mount_array_free(SubMount *s, size_t n) {
+        assert(s || n == 0);
+
+        FOREACH_ARRAY(entry, s, n)
+                sub_mount_clear(entry);
+
+        free(s);
+}
+
+/* Ordering callback for sorting SubMount arrays: orders entries by path, as defined by
+ * path_compare(). Both entries must have a non-NULL path. */
+static int sub_mount_compare(const SubMount *a, const SubMount *b) {
+        int c;
+
+        assert(a);
+        assert(b);
+        assert(a->path);
+        assert(b->path);
+
+        c = path_compare(a->path, b->path);
+        return c;
+}
+
+/* Walks the array and clears every entry whose path has the path of the most recent kept entry as a
+ * prefix (per path_startswith()). The first entry is always kept. Cleared entries stay in the array
+ * with a NULL path; the array is not compacted. */
+static void sub_mount_drop(SubMount *s, size_t n) {
+        SubMount *kept;
+
+        assert(s || n == 0);
+
+        if (n < 2)
+                return; /* nothing that could be dropped */
+
+        kept = s;
+        for (SubMount *cur = s + 1; cur < s + n; cur++) {
+                if (!path_startswith(cur->path, kept->path)) {
+                        kept = cur;
+                        continue;
+                }
+
+                sub_mount_clear(cur);
+        }
+}
+
+/* Collects the mounts listed in /proc/self/mountinfo that are located strictly below 'prefix'. For
+ * each one an O_PATH fd is opened and the path copied. The result is sorted by path and passed through
+ * sub_mount_drop(), so it may contain cleared entries (NULL path). On success returns 0 and hands
+ * ownership of the array to the caller through ret_mounts/ret_n_mounts; on failure returns a
+ * negative errno-style value. */
+static int get_sub_mounts(
+                const char *prefix,
+                SubMount **ret_mounts,
+                size_t *ret_n_mounts) {
+
+        _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL;
+        _cleanup_(mnt_free_iterp) struct libmnt_iter *iter = NULL;
+        SubMount *found = NULL;
+        size_t n_found = 0;
+        int r;
+
+        CLEANUP_ARRAY(found, n_found, sub_mount_array_free);
+
+        assert(prefix);
+        assert(ret_mounts);
+        assert(ret_n_mounts);
+
+        r = libmount_parse("/proc/self/mountinfo", NULL, &table, &iter);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to parse /proc/self/mountinfo: %m");
+
+        for (;;) {
+                _cleanup_close_ int fd = -EBADF;
+                _cleanup_free_ char *copy = NULL;
+                struct libmnt_fs *fs;
+                const char *where;
+                int table_id, path_id;
+
+                r = mnt_table_next_fs(table, iter, &fs);
+                if (r == 1) /* EOF */
+                        break;
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to get next entry from /proc/self/mountinfo: %m");
+
+                where = mnt_fs_get_target(fs);
+                if (!where)
+                        continue;
+
+                /* Skip everything that is not below the prefix, and the prefix itself */
+                if (isempty(path_startswith(where, prefix)))
+                        continue;
+
+                table_id = mnt_fs_get_id(fs);
+                r = path_get_mnt_id(where, &path_id);
+                if (r < 0) {
+                        log_debug_errno(r, "Failed to get mount ID of '%s', ignoring: %m", where);
+                        continue;
+                }
+                if (table_id != path_id) {
+                        /* The path may be hidden by another over-mount or already remounted. */
+                        log_debug("The mount IDs of '%s' obtained by libmount and path_get_mnt_id() are different (%i vs %i), ignoring.",
+                                  where, table_id, path_id);
+                        continue;
+                }
+
+                fd = open(where, O_CLOEXEC|O_PATH);
+                if (fd < 0) {
+                        if (errno != ENOENT)
+                                return log_debug_errno(errno, "Failed to open subtree of mounted filesystem '%s': %m", where);
+
+                        /* The path may be hidden by another over-mount or already unmounted. */
+                        continue;
+                }
+
+                copy = strdup(where);
+                if (!copy)
+                        return log_oom_debug();
+
+                if (!GREEDY_REALLOC(found, n_found + 1))
+                        return log_oom_debug();
+
+                /* Hand ownership of both the string and the fd over to the array */
+                SubMount *slot = found + n_found++;
+                slot->path = TAKE_PTR(copy);
+                slot->mount_fd = TAKE_FD(fd);
+        }
+
+        typesafe_qsort(found, n_found, sub_mount_compare);
+        sub_mount_drop(found, n_found);
+
+        *ret_n_mounts = n_found;
+        *ret_mounts = TAKE_PTR(found);
+        return 0;
+}
+
+int bind_mount_submounts(
+                const char *source,
+                const char *target) {
+
+        SubMount *mounts = NULL;
+        size_t n_mounts = 0;
+        int first_error = 0, r;
+
+        /* Bind mounts all child mounts of 'source' to 'target'. Useful when setting up a new procfs
+         * instance with new mount options to copy the original submounts over. A failing bind mount
+         * does not stop the loop; the first such error is returned at the end. Returns 0 if all bind
+         * mounts succeeded. */
+
+        assert(source);
+        assert(target);
+
+        CLEANUP_ARRAY(mounts, n_mounts, sub_mount_array_free);
+
+        r = get_sub_mounts(source, &mounts, &n_mounts);
+        if (r < 0)
+                return r;
+
+        for (size_t i = 0; i < n_mounts; i++) {
+                _cleanup_free_ char *where = NULL;
+                const SubMount *m = mounts + i;
+                const char *suffix;
+
+                /* Entries cleared by get_sub_mounts() have no path */
+                if (isempty(m->path))
+                        continue;
+
+                assert_se(suffix = path_startswith(m->path, source));
+
+                where = path_join(target, suffix);
+                if (!where)
+                        return -ENOMEM;
+
+                r = path_is_mount_point(where, NULL, 0);
+                if (r < 0) {
+                        log_debug_errno(r, "Failed to detect if '%s' already is a mount point, ignoring: %m", where);
+                        continue;
+                }
+                if (r > 0) {
+                        log_debug("Not bind mounting '%s' from '%s' to '%s', since there's already a mountpoint.", suffix, source, target);
+                        continue;
+                }
+
+                r = mount_follow_verbose(LOG_DEBUG, FORMAT_PROC_FD_PATH(m->mount_fd), where, NULL, MS_BIND|MS_REC, NULL);
+                if (first_error == 0 && r < 0)
+                        first_error = r;
+        }
+
+        return first_error;
+}
+
+/* Creates an inode at 'dest' matching the kind described by 'st': a directory (via mkdir_label()) if
+ * 'st' is a directory, otherwise a regular file created with mknod() and with all execute bits
+ * removed from 'mode'. Returns the result of mkdir_label(), or of mknod() converted by RET_NERRNO(). */
+int make_mount_point_inode_from_stat(const struct stat *st, const char *dest, mode_t mode) {
+        assert(st);
+        assert(dest);
+
+        if (!S_ISDIR(st->st_mode))
+                return RET_NERRNO(mknod(dest, S_IFREG|(mode & ~0111), 0));
+
+        return mkdir_label(dest, mode);
+}
+
+/* stat()s 'source' (following symlinks) and creates a matching inode at 'dest' with
+ * make_mount_point_inode_from_stat(). Returns -errno if the stat() fails. */
+int make_mount_point_inode_from_path(const char *source, const char *dest, mode_t mode) {
+        struct stat st;
+
+        assert(source);
+        assert(dest);
+
+        if (stat(source, &st) >= 0)
+                return make_mount_point_inode_from_stat(&st, dest, mode);
+
+        return -errno;
+}
+
+/* Probes for an entry named "a" inside 'path' (resolved relative to 'dir_fd') with faccessat(). The
+ * outcome of the probe is deliberately ignored; only the access attempt matters. Returns 0, or
+ * -ENOMEM if the probe path cannot be allocated. */
+int trigger_automount_at(int dir_fd, const char *path) {
+        _cleanup_free_ char *probe = NULL;
+
+        assert(dir_fd >= 0 || dir_fd == AT_FDCWD);
+
+        probe = path_join(path, "a");
+        if (!probe)
+                return -ENOMEM;
+
+        (void) faccessat(dir_fd, probe, F_OK, 0);
+        return 0;
+}
+
+/* Returns the tight set of mount flags used for credentials mounts: nodev, noexec, nosuid, plus
+ * whatever ms_nosymfollow_supported() reports, plus read-only if 'ro' is set. */
+unsigned long credentials_fs_mount_flags(bool ro) {
+        unsigned long flags = MS_NODEV|MS_NOEXEC|MS_NOSUID|ms_nosymfollow_supported();
+
+        if (ro)
+                flags |= MS_RDONLY;
+
+        return flags;
+}
+
+int mount_credentials_fs(const char *path, size_t size, bool ro) {
+        _cleanup_free_ char *plain_opts = NULL;
+        int have_noswap, r;
+
+        /* Mounts a file system suitable for credentials on 'path', i.e. with tight access modes right
+         * from the beginning, and ideally with swapping turned off. Attempts, in order of preference:
+         *
+         * 1. tmpfs with "noswap", if supported
+         * 2. ramfs
+         * 3. tmpfs without "noswap"
+         *
+         * Returns the result of the first mount attempt that succeeds, otherwise the result of the
+         * last attempt (or -ENOMEM). */
+
+        /* Check explicitly to avoid kmsg noise */
+        have_noswap = mount_option_supported("tmpfs", "noswap", NULL);
+        if (have_noswap > 0) {
+                _cleanup_free_ char *noswap_opts = NULL;
+
+                /* Best case: tmpfs with noswap (needs kernel >= 6.3) */
+
+                if (asprintf(&noswap_opts, "mode=0700,nr_inodes=1024,size=%zu,noswap", size) < 0)
+                        return -ENOMEM;
+
+                r = mount_nofollow_verbose(
+                                LOG_DEBUG,
+                                "tmpfs",
+                                path,
+                                "tmpfs",
+                                credentials_fs_mount_flags(ro),
+                                noswap_opts);
+                if (r >= 0)
+                        return r;
+        }
+
+        /* Second choice: ramfs */
+        r = mount_nofollow_verbose(
+                        LOG_DEBUG,
+                        "ramfs",
+                        path,
+                        "ramfs",
+                        credentials_fs_mount_flags(ro),
+                        "mode=0700");
+        if (r >= 0)
+                return r;
+
+        /* Last resort: tmpfs without noswap */
+        if (asprintf(&plain_opts, "mode=0700,nr_inodes=1024,size=%zu", size) < 0)
+                return -ENOMEM;
+
+        return mount_nofollow_verbose(
+                        LOG_DEBUG,
+                        "tmpfs",
+                        path,
+                        "tmpfs",
+                        credentials_fs_mount_flags(ro),
+                        plain_opts);
+}
+
+/* Creates a detached mount of file system 'type' with source 'what' using fsopen(), fsconfig() and
+ * fsmount(). 'options' and 'flags' are first passed through mount_option_mangle(); the remaining
+ * option string is applied word by word with fsconfig(), the resulting flags are applied with
+ * mount_setattr(). If 'userns_fd' is >= 0, MOUNT_ATTR_IDMAP is requested with that user namespace.
+ * Failures are logged at 'error_log_level'. Returns the mount fd (owned by the caller) on success,
+ * a negative errno-style value on failure. */
+int make_fsmount(
+                int error_log_level,
+                const char *what,
+                const char *type,
+                unsigned long flags,
+                const char *options,
+                int userns_fd) {
+
+        _cleanup_close_ int fs_fd = -EBADF, mnt_fd = -EBADF;
+        _cleanup_free_ char *remaining = NULL;
+        unsigned long mangled;
+        const char *cursor;
+        int r;
+
+        assert(type);
+        assert(what);
+
+        r = mount_option_mangle(options, flags, &mangled, &remaining);
+        if (r < 0)
+                return log_full_errno(
+                                error_log_level, r, "Failed to mangle mount options %s: %m",
+                                strempty(options));
+
+        if (DEBUG_LOGGING) {
+                _cleanup_free_ char *fl = NULL;
+
+                (void) mount_flags_to_string(mangled, &fl);
+
+                log_debug("Creating mount fd for %s (%s) (%s \"%s\")...",
+                          strna(what), strna(type), strnull(fl), strempty(remaining));
+        }
+
+        fs_fd = fsopen(type, FSOPEN_CLOEXEC);
+        if (fs_fd < 0)
+                return log_full_errno(error_log_level, errno, "Failed to open superblock for \"%s\": %m", type);
+
+        if (fsconfig(fs_fd, FSCONFIG_SET_STRING, "source", what, 0) < 0)
+                return log_full_errno(error_log_level, errno, "Failed to set mount source for \"%s\" to \"%s\": %m", type, what);
+
+        if (FLAGS_SET(mangled, MS_RDONLY) &&
+            fsconfig(fs_fd, FSCONFIG_SET_FLAG, "ro", NULL, 0) < 0)
+                return log_full_errno(error_log_level, errno, "Failed to set read only mount flag for \"%s\": %m", type);
+
+        /* Apply the remaining options one by one: "key=value" as string, bare words as flag */
+        cursor = remaining;
+        for (;;) {
+                _cleanup_free_ char *word = NULL;
+                char *value;
+
+                r = extract_first_word(&cursor, &word, ",", EXTRACT_KEEP_QUOTE);
+                if (r < 0)
+                        return log_full_errno(error_log_level, r, "Failed to parse mount option string \"%s\": %m", remaining);
+                if (r == 0)
+                        break;
+
+                value = strchr(word, '=');
+                if (!value) {
+                        if (fsconfig(fs_fd, FSCONFIG_SET_FLAG, word, NULL, 0) < 0)
+                                return log_full_errno(error_log_level, errno, "Failed to set mount flag \"%s\" for \"%s\": %m", word, type);
+
+                        continue;
+                }
+
+                /* Split in place: 'word' becomes the key, 'value' points right behind the '=' */
+                *value++ = 0;
+
+                if (fsconfig(fs_fd, FSCONFIG_SET_STRING, word, value, 0) < 0)
+                        return log_full_errno(error_log_level, errno, "Failed to set mount option \"%s=%s\" for \"%s\": %m", word, value, type);
+        }
+
+        if (fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0)
+                return log_full_errno(error_log_level, errno, "Failed to realize fs fd for \"%s\" (\"%s\"): %m", what, type);
+
+        mnt_fd = fsmount(fs_fd, FSMOUNT_CLOEXEC, 0);
+        if (mnt_fd < 0)
+                return log_full_errno(error_log_level, errno, "Failed to create mount fd for \"%s\" (\"%s\"): %m", what, type);
+
+        struct mount_attr attr = {
+                .attr_set = ms_flags_to_mount_attr(mangled) | (userns_fd >= 0 ? MOUNT_ATTR_IDMAP : 0),
+                .userns_fd = userns_fd,
+        };
+
+        if (mount_setattr(mnt_fd, "", AT_EMPTY_PATH|AT_RECURSIVE, &attr, MOUNT_ATTR_SIZE_VER0) < 0)
+                return log_full_errno(error_log_level,
+                                      errno,
+                                      "Failed to set mount flags for \"%s\" (\"%s\"): %m",
+                                      what,
+                                      type);
+
+        return TAKE_FD(mnt_fd);
+}
diff --git a/src/shared/mount-util.h b/src/shared/mount-util.h
new file mode 100644
index 0000000..ef31104
--- /dev/null
+++ b/src/shared/mount-util.h
@@ -0,0 +1,143 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <mntent.h>
+#include <stdio.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "dissect-image.h"
+#include "errno-util.h"
+#include "macro.h"
+#include "pidref.h"
+
+int repeat_unmount(const char *path, int flags);
+
+int umount_recursive_full(const char *target, int flags, char **keep);
+
+/* Shorthand for umount_recursive_full() with NULL passed for 'keep'. */
+static inline int umount_recursive(const char *target, int flags) {
+        char **keep = NULL;
+
+        return umount_recursive_full(target, flags, keep);
+}
+
+int bind_remount_recursive_with_mountinfo(const char *prefix, unsigned long new_flags, unsigned long flags_mask, char **deny_list, FILE *proc_self_mountinfo);
+/* Shorthand for bind_remount_recursive_with_mountinfo() with NULL passed for 'proc_self_mountinfo'. */
+static inline int bind_remount_recursive(
+                const char *prefix,
+                unsigned long new_flags,
+                unsigned long flags_mask,
+                char **deny_list) {
+
+        return bind_remount_recursive_with_mountinfo(
+                        prefix,
+                        new_flags,
+                        flags_mask,
+                        deny_list,
+                        /* proc_self_mountinfo= */ NULL);
+}
+
+int bind_remount_one_with_mountinfo(const char *path, unsigned long new_flags, unsigned long flags_mask, FILE *proc_self_mountinfo);
+
+int mount_switch_root_full(const char *path, unsigned long mount_propagation_flag, bool force_ms_move);
+/* Shorthand for mount_switch_root_full() with force_ms_move=false. */
+static inline int mount_switch_root(const char *path, unsigned long mount_propagation_flag) {
+        return mount_switch_root_full(
+                        path,
+                        mount_propagation_flag,
+                        /* force_ms_move= */ false);
+}
+
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(FILE*, endmntent, NULL);
+#define _cleanup_endmntent_ _cleanup_(endmntentp)
+
+int mount_verbose_full(
+ int error_log_level,
+ const char *what,
+ const char *where,
+ const char *type,
+ unsigned long flags,
+ const char *options,
+ bool follow_symlink);
+
+/* Shorthand for mount_verbose_full() with follow_symlink=true. */
+static inline int mount_follow_verbose(
+                int error_log_level,
+                const char *what,
+                const char *where,
+                const char *type,
+                unsigned long flags,
+                const char *options) {
+
+        return mount_verbose_full(
+                        error_log_level,
+                        what,
+                        where,
+                        type,
+                        flags,
+                        options,
+                        /* follow_symlink= */ true);
+}
+
+/* Shorthand for mount_verbose_full() with follow_symlink=false. */
+static inline int mount_nofollow_verbose(
+                int error_log_level,
+                const char *what,
+                const char *where,
+                const char *type,
+                unsigned long flags,
+                const char *options) {
+
+        return mount_verbose_full(
+                        error_log_level,
+                        what,
+                        where,
+                        type,
+                        flags,
+                        options,
+                        /* follow_symlink= */ false);
+}
+
+int umount_verbose(
+ int error_log_level,
+ const char *where,
+ int flags);
+
+int mount_exchange_graceful(int fsmount_fd, const char *dest, bool mount_beneath);
+
+int mount_option_mangle(
+ const char *options,
+ unsigned long mount_flags,
+ unsigned long *ret_mount_flags,
+ char **ret_remaining_options);
+
+int mode_to_inaccessible_node(const char *runtime_dir, mode_t mode, char **dest);
+int mount_flags_to_string(unsigned long flags, char **ret);
+
+/* Intended for use with _cleanup_(): recursively unmounts 'p', removes the directory and frees the
+ * string. errno is preserved across the call. NULL is accepted and ignored. Always returns NULL. */
+static inline char* umount_and_rmdir_and_free(char *p) {
+        PROTECT_ERRNO;
+
+        if (!p)
+                return NULL;
+
+        (void) umount_recursive(p, 0);
+        (void) rmdir(p);
+
+        return mfree(p);
+}
+DEFINE_TRIVIAL_CLEANUP_FUNC(char*, umount_and_rmdir_and_free);
+
+/* Intended for use with _cleanup_(): recursively unmounts 'p' and frees the string, but leaves the
+ * directory in place. errno is preserved across the call. NULL is accepted and ignored. Always
+ * returns NULL. */
+static inline char *umount_and_free(char *p) {
+        PROTECT_ERRNO;
+
+        if (!p)
+                return NULL;
+
+        (void) umount_recursive(p, 0);
+
+        return mfree(p);
+}
+DEFINE_TRIVIAL_CLEANUP_FUNC(char*, umount_and_free);
+
+int bind_mount_in_namespace(PidRef *target, const char *propagate_path, const char *incoming_path, const char *src, const char *dest, bool read_only, bool make_file_or_directory);
+int mount_image_in_namespace(PidRef *target, const char *propagate_path, const char *incoming_path, const char *src, const char *dest, bool read_only, bool make_file_or_directory, const MountOptions *options, const ImagePolicy *image_policy);
+
+int make_mount_point(const char *path);
+int fd_make_mount_point(int fd);
+
+typedef enum RemountIdmapping { /* Selects which mapping lines make_userns() generates */
+        REMOUNT_IDMAPPING_NONE, /* Only the single "0 <uid_shift> <uid_range>" line, no extra entries */
+        /* Include a mapping from UID_MAPPED_ROOT (i.e. UID 2^31-2) on the backing fs to UID 0 on the
+         * uidmapped fs. This is useful to ensure that the host root user can safely add inodes to the
+         * uidmapped fs (which otherwise wouldn't work as the host root user is not defined on the uidmapped
+         * mount and any attempts to create inodes will then be refused with EOVERFLOW). The idea is that
+         * these inodes are quickly re-chown()ed to more suitable UIDs/GIDs. Any code that intends to be able
+         * to add inodes to file systems mapped this way should set this flag, but given it comes with
+         * certain security implications defaults to off, and requires explicit opt-in. */
+        REMOUNT_IDMAPPING_HOST_ROOT,
+        /* Define a mapping from root user within the container to the owner of the bind mounted directory.
+         * This ensures no root-owned files will be written in a bind-mounted directory owned by a different
+         * user. No other users are mapped. */
+        REMOUNT_IDMAPPING_HOST_OWNER,
+        _REMOUNT_IDMAPPING_MAX,
+        _REMOUNT_IDMAPPING_INVALID = -EINVAL,
+} RemountIdmapping;
+
+int make_userns(uid_t uid_shift, uid_t uid_range, uid_t owner, RemountIdmapping idmapping);
+int remount_idmap_fd(char **p, int userns_fd);
+int remount_idmap(char **p, uid_t uid_shift, uid_t uid_range, uid_t owner, RemountIdmapping idmapping);
+
+int bind_mount_submounts(
+ const char *source,
+ const char *target);
+
+/* Creates a mount point (not parents) based on the source path or stat - ie, a file or a directory */
+int make_mount_point_inode_from_stat(const struct stat *st, const char *dest, mode_t mode);
+int make_mount_point_inode_from_path(const char *source, const char *dest, mode_t mode);
+
+int trigger_automount_at(int dir_fd, const char *path);
+
+unsigned long credentials_fs_mount_flags(bool ro);
+int mount_credentials_fs(const char *path, size_t size, bool ro);
+
+int make_fsmount(int error_log_level, const char *what, const char *type, unsigned long flags, const char *options, int userns_fd);
diff --git a/src/shared/net-condition.c b/src/shared/net-condition.c
new file mode 100644
index 0000000..d8b0fef
--- /dev/null
+++ b/src/shared/net-condition.c
@@ -0,0 +1,399 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <netinet/ether.h>
+
+#include "condition.h"
+#include "env-util.h"
+#include "log.h"
+#include "net-condition.h"
+#include "netif-util.h"
+#include "network-util.h"
+#include "socket-util.h"
+#include "string-table.h"
+#include "strv.h"
+#include "wifi-util.h"
+
+/* Frees everything a NetMatch owns and resets each member to what the matching free function
+ * returns. The NetMatch object itself is not freed. NULL is accepted and ignored. */
+void net_match_clear(NetMatch *match) {
+        if (!match)
+                return;
+
+        /* Address sets */
+        match->hw_addr = set_free(match->hw_addr);
+        match->permanent_hw_addr = set_free(match->permanent_hw_addr);
+        match->bssid = set_free(match->bssid);
+
+        /* Pattern lists */
+        match->path = strv_free(match->path);
+        match->driver = strv_free(match->driver);
+        match->iftype = strv_free(match->iftype);
+        match->kind = strv_free(match->kind);
+        match->ifname = strv_free(match->ifname);
+        match->property = strv_free(match->property);
+        match->wlan_iftype = strv_free(match->wlan_iftype);
+        match->ssid = strv_free(match->ssid);
+}
+
+/* Returns true if none of the sets and pattern lists of 'match' has any entry. */
+bool net_match_is_empty(const NetMatch *match) {
+        assert(match);
+
+        /* Address sets */
+        if (!set_isempty(match->hw_addr) ||
+            !set_isempty(match->permanent_hw_addr) ||
+            !set_isempty(match->bssid))
+                return false;
+
+        /* Pattern lists */
+        if (!strv_isempty(match->path) ||
+            !strv_isempty(match->driver) ||
+            !strv_isempty(match->iftype) ||
+            !strv_isempty(match->kind) ||
+            !strv_isempty(match->ifname) ||
+            !strv_isempty(match->property) ||
+            !strv_isempty(match->wlan_iftype) ||
+            !strv_isempty(match->ssid))
+                return false;
+
+        return true;
+}
+
+/* Tests 'string' against a list of fnmatch() patterns, each optionally prefixed with '!' to negate it.
+ *
+ * - An empty list accepts everything.
+ * - A matching negated pattern rejects immediately.
+ * - If the list contains at least one non-negated pattern, one of them has to match.
+ * - If the list contains only negated patterns and none matched, the string is accepted.
+ *
+ * A NULL 'string' matches no pattern. */
+static bool net_condition_test_strv(char * const *patterns, const char *string) {
+        bool matched = false, any_positive = false;
+
+        if (strv_isempty(patterns))
+                return true;
+
+        STRV_FOREACH(p, patterns) {
+                bool negated = **p == '!';
+                const char *glob = *p + negated; /* skip the '!' if there is one */
+
+                any_positive = any_positive || !negated;
+
+                if (!string || fnmatch(glob, string, 0) != 0)
+                        continue;
+
+                if (negated)
+                        return false;
+
+                matched = true;
+        }
+
+        return !any_positive || matched;
+}
+
+/* Returns true if the pattern list accepts the interface name itself or any of its alternative
+ * names, each tested separately with net_condition_test_strv(). */
+static bool net_condition_test_ifname(char * const *patterns, const char *ifname, char * const *alternative_names) {
+        if (!net_condition_test_strv(patterns, ifname)) {
+                STRV_FOREACH(alt, alternative_names)
+                        if (net_condition_test_strv(patterns, *alt))
+                                return true;
+
+                return false;
+        }
+
+        return true;
+}
+
+/* Tests the device against a list of "KEY=PATTERN" entries, each optionally prefixed with '!'. An
+ * entry holds if the device has property KEY and its value matches PATTERN per fnmatch(); with '!'
+ * the sense is inverted. All entries have to hold. With a NULL device no property ever matches.
+ *
+ * Returns true/false, -EINVAL if an entry has no '=', -ENOMEM on allocation failure. An empty list
+ * yields true. */
+static int net_condition_test_property(char * const *match_property, sd_device *device) {
+        if (strv_isempty(match_property))
+                return true;
+
+        STRV_FOREACH(p, match_property) {
+                _cleanup_free_ char *key = NULL;
+                const char *spec, *eq, *pattern, *dev_val;
+                bool invert, matched;
+
+                invert = **p == '!';
+                spec = *p + invert; /* the entry without the '!' prefix */
+
+                eq = strchr(spec, '=');
+                if (!eq)
+                        return -EINVAL;
+
+                key = strndup(spec, eq - spec);
+                if (!key)
+                        return -ENOMEM;
+
+                pattern = eq + 1;
+
+                matched = device &&
+                        sd_device_get_property_value(device, key, &dev_val) >= 0 &&
+                        fnmatch(pattern, dev_val, 0) == 0;
+
+                /* Fails if a plain entry did not match, or an inverted one did */
+                if (matched == invert)
+                        return false;
+        }
+
+        return true;
+}
+
+/* Checks whether an interface, described by the remaining arguments, satisfies every criterion set
+ * in 'match'. Criteria that are unset (NULL set, empty list) are not checked.
+ *
+ * Returns > 0 if all criteria are satisfied, 0 if at least one is not, and a negative errno-style
+ * value on failure (-ENOMEM from net_get_type_string(), or an error from evaluating the property
+ * list). */
+int net_match_config(
+                const NetMatch *match,
+                sd_device *device,
+                const struct hw_addr_data *hw_addr,
+                const struct hw_addr_data *permanent_hw_addr,
+                const char *driver,
+                unsigned short iftype,
+                const char *kind,
+                const char *ifname,
+                char * const *alternative_names,
+                enum nl80211_iftype wlan_iftype,
+                const char *ssid,
+                const struct ether_addr *bssid) {
+
+        _cleanup_free_ char *iftype_str = NULL;
+        const char *path = NULL;
+        int r;
+
+        assert(match);
+
+        /* Only OOM is fatal here; on any other failure iftype_str may simply remain unset */
+        if (net_get_type_string(device, iftype, &iftype_str) == -ENOMEM)
+                return -ENOMEM;
+
+        if (device)
+                (void) sd_device_get_property_value(device, "ID_PATH", &path);
+
+        if (match->hw_addr && (!hw_addr || !set_contains(match->hw_addr, hw_addr)))
+                return false;
+
+        if (match->permanent_hw_addr &&
+            (!permanent_hw_addr ||
+             !set_contains(match->permanent_hw_addr, permanent_hw_addr)))
+                return false;
+
+        if (!net_condition_test_strv(match->path, path))
+                return false;
+
+        if (!net_condition_test_strv(match->driver, driver))
+                return false;
+
+        if (!net_condition_test_strv(match->iftype, iftype_str))
+                return false;
+
+        if (!net_condition_test_strv(match->kind, kind))
+                return false;
+
+        if (!net_condition_test_ifname(match->ifname, ifname, alternative_names))
+                return false;
+
+        /* net_condition_test_property() returns an int that may be a negative error. Testing it with
+         * '!' would treat an error as a successful match, so check it explicitly: 0 means "does not
+         * match", negative is passed on to the caller. */
+        r = net_condition_test_property(match->property, device);
+        if (r <= 0)
+                return r;
+
+        if (!net_condition_test_strv(match->wlan_iftype, nl80211_iftype_to_string(wlan_iftype)))
+                return false;
+
+        if (!net_condition_test_strv(match->ssid, ssid))
+                return false;
+
+        if (match->bssid && (!bssid || !set_contains(match->bssid, bssid)))
+                return false;
+
+        return true;
+}
+
+/* Config parser callback for a condition setting. 'ltype' carries the ConditionType, 'data' points
+ * to the condition list. An empty value drops all conditions of that type from the list. Otherwise
+ * a new condition (negated if the value starts with '!') replaces any previous conditions of the
+ * same type. Returns 0, or the result of log_oom() if the condition cannot be allocated. */
+int config_parse_net_condition(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        Condition **list = data, *c;
+        ConditionType cond = ltype;
+        bool negate;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+        assert(data);
+
+        if (isempty(rvalue)) {
+                *list = condition_free_list_type(*list, cond);
+                return 0;
+        }
+
+        negate = *rvalue == '!';
+
+        /* If negated, the parameter starts right behind the '!' */
+        c = condition_new(cond, rvalue + negate, false, negate);
+        if (!c)
+                return log_oom();
+
+        /* Drop previous assignment. */
+        *list = condition_free_list_type(*list, cond);
+
+        LIST_PREPEND(conditions, *list, c);
+        return 0;
+}
+
+/* Config parser callback that appends the words of 'rvalue' to the string list 'data' points to.
+ * An empty value clears the list. If the value starts with '!', every word is stored with a '!'
+ * prefix. A syntax error is logged and ends parsing of this value (words appended so far are kept)
+ * with return value 0; OOM returns the result of log_oom(). */
+int config_parse_match_strv(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        const char *p = ASSERT_PTR(rvalue);
+        char ***sv = ASSERT_PTR(data);
+        bool invert;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+
+        if (isempty(rvalue)) {
+                *sv = strv_free(*sv);
+                return 0;
+        }
+
+        /* A leading '!' applies to the whole list */
+        invert = *p == '!';
+        if (invert)
+                p++;
+
+        for (;;) {
+                _cleanup_free_ char *word = NULL, *entry = NULL;
+
+                r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE|EXTRACT_RETAIN_ESCAPE);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Invalid syntax, ignoring: %s", rvalue);
+                        return 0;
+                }
+                if (r == 0) /* end of input */
+                        return 0;
+
+                if (!invert)
+                        entry = TAKE_PTR(word);
+                else {
+                        entry = strjoin("!", word);
+                        if (!entry)
+                                return log_oom();
+                }
+
+                r = strv_consume(sv, TAKE_PTR(entry));
+                if (r < 0)
+                        return log_oom();
+        }
+}
+
+/* Config parser callback that appends the interface names listed in 'rvalue' to the string list
+ * 'data' points to. An empty value clears the list. If the value starts with '!', every name is
+ * stored with a '!' prefix. Names rejected by ifname_valid_full(word, ltype) are logged and skipped.
+ * A syntax error is logged and ends parsing of this value (names appended so far are kept) with
+ * return value 0; OOM returns the result of log_oom(). */
+int config_parse_match_ifnames(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        const char *p = ASSERT_PTR(rvalue);
+        char ***sv = ASSERT_PTR(data);
+        bool invert;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+
+        if (isempty(rvalue)) {
+                *sv = strv_free(*sv);
+                return 0;
+        }
+
+        /* A leading '!' applies to the whole list */
+        invert = *p == '!';
+        p += invert;
+
+        for (;;) {
+                _cleanup_free_ char *word = NULL, *k = NULL;
+
+                r = extract_first_word(&p, &word, NULL, 0);
+                if (r == 0)
+                        return 0;
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        /* Hand the parser's error code to log_syntax() instead of dropping it */
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Failed to parse interface name list, ignoring: %s", rvalue);
+                        return 0;
+                }
+
+                if (!ifname_valid_full(word, ltype)) {
+                        /* No error code here: this is a validation failure, not a failed call */
+                        log_syntax(unit, LOG_WARNING, filename, line, 0,
+                                   "Interface name is not valid or too long, ignoring assignment: %s", word);
+                        continue;
+                }
+
+                if (invert) {
+                        k = strjoin("!", word);
+                        if (!k)
+                                return log_oom();
+                } else
+                        k = TAKE_PTR(word);
+
+                r = strv_consume(sv, TAKE_PTR(k));
+                if (r < 0)
+                        return log_oom();
+        }
+}
+
+/* Config parser callback that appends the "KEY=VALUE" words of 'rvalue' to the string list 'data'
+ * points to. An empty value clears the list. If the value starts with '!', every word is stored with
+ * a '!' prefix. Words rejected by env_assignment_is_valid() are logged and skipped. A syntax error
+ * is logged and ends parsing of this value (words appended so far are kept) with return value 0; OOM
+ * returns the result of log_oom(). */
+int config_parse_match_property(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        const char *p = ASSERT_PTR(rvalue);
+        char ***sv = ASSERT_PTR(data);
+        bool invert;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+
+        if (isempty(rvalue)) {
+                *sv = strv_free(*sv);
+                return 0;
+        }
+
+        /* A leading '!' applies to the whole list */
+        invert = *p == '!';
+        p += invert;
+
+        for (;;) {
+                _cleanup_free_ char *word = NULL, *k = NULL;
+
+                r = extract_first_word(&p, &word, NULL, EXTRACT_CUNESCAPE|EXTRACT_UNQUOTE);
+                if (r == 0)
+                        return 0;
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        /* Hand the parser's error code to log_syntax() instead of dropping it */
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Invalid syntax, ignoring: %s", rvalue);
+                        return 0;
+                }
+
+                if (!env_assignment_is_valid(word)) {
+                        /* No error code here: this is a validation failure, not a failed call */
+                        log_syntax(unit, LOG_WARNING, filename, line, 0,
+                                   "Invalid property or value, ignoring assignment: %s", word);
+                        continue;
+                }
+
+                if (invert) {
+                        k = strjoin("!", word);
+                        if (!k)
+                                return log_oom();
+                } else
+                        k = TAKE_PTR(word);
+
+                r = strv_consume(sv, TAKE_PTR(k));
+                if (r < 0)
+                        return log_oom();
+        }
+}
diff --git a/src/shared/net-condition.h b/src/shared/net-condition.h
new file mode 100644
index 0000000..0884d43
--- /dev/null
+++ b/src/shared/net-condition.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <linux/nl80211.h>
+#include <stdbool.h>
+
+#include "sd-device.h"
+
+#include "conf-parser.h"
+#include "ether-addr-util.h"
+#include "set.h"
+
+typedef struct NetMatch { /* Match criteria evaluated by net_match_config(); NULL/empty members are not checked */
+        Set *hw_addr;           /* the interface's hardware address has to be in this set */
+        Set *permanent_hw_addr; /* the interface's permanent hardware address has to be in this set */
+        char **path;            /* patterns for the device's ID_PATH property */
+        char **driver;          /* patterns for the driver name */
+        char **iftype; /* udev's DEVTYPE field or ARPHRD_XXX, e.g. ether, wlan. */
+        char **kind; /* IFLA_INFO_KIND attribute, e.g. gre, gretap, erspan. */
+        char **ifname;          /* patterns for the interface name and its alternative names */
+        char **property;        /* "KEY=PATTERN" entries tested against device properties */
+        char **wlan_iftype;     /* patterns for the nl80211 interface type string */
+        char **ssid;            /* patterns for the SSID */
+        Set *bssid;             /* the BSSID has to be in this set */
+} NetMatch;
+
+void net_match_clear(NetMatch *match);
+bool net_match_is_empty(const NetMatch *match);
+
+int net_match_config(
+ const NetMatch *match,
+ sd_device *device,
+ const struct hw_addr_data *hw_addr,
+ const struct hw_addr_data *permanent_hw_addr,
+ const char *driver,
+ unsigned short iftype,
+ const char *kind,
+ const char *ifname,
+ char * const *alternative_names,
+ enum nl80211_iftype wlan_iftype,
+ const char *ssid,
+ const struct ether_addr *bssid);
+
+CONFIG_PARSER_PROTOTYPE(config_parse_net_condition);
+CONFIG_PARSER_PROTOTYPE(config_parse_match_strv);
+CONFIG_PARSER_PROTOTYPE(config_parse_match_ifnames);
+CONFIG_PARSER_PROTOTYPE(config_parse_match_property);
diff --git a/src/shared/netif-naming-scheme.c b/src/shared/netif-naming-scheme.c
new file mode 100644
index 0000000..fbaf5c5
--- /dev/null
+++ b/src/shared/netif-naming-scheme.c
@@ -0,0 +1,103 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "netif-naming-scheme.h"
+#include "proc-cmdline.h"
+#include "string-util.h"
+#include "string-table.h"
+
+static const NamingScheme naming_schemes[] = { /* Order matters: naming_scheme_from_name() resolves an unmapped "latest" to the last entry */
+        { "v238", NAMING_V238 },
+        { "v239", NAMING_V239 },
+        { "v240", NAMING_V240 },
+        { "v241", NAMING_V241 },
+        { "v243", NAMING_V243 },
+        { "v245", NAMING_V245 },
+        { "v247", NAMING_V247 },
+        { "v249", NAMING_V249 },
+        { "v250", NAMING_V250 },
+        { "v251", NAMING_V251 },
+        { "v252", NAMING_V252 },
+        { "v253", NAMING_V253 },
+        { "v254", NAMING_V254 },
+        { "v255", NAMING_V255 },
+        /* … add more schemes here, as the logic to name devices is updated … */
+
+        EXTRA_NET_NAMING_MAP /* NOTE(review): presumably expands to additional build-time entries — defined outside this file, confirm */
+};
+
+/* Looks up a naming scheme by name. Returns a pointer into the static table, or NULL if the name is
+ * unknown. */
+const NamingScheme* naming_scheme_from_name(const char *name) {
+        const NamingScheme *end = naming_schemes + ELEMENTSOF(naming_schemes);
+
+        /* "latest" may be defined explicitly by the extra map, in which case it is found in the table
+         * like any other name. Only if it is not mapped explicitly it resolves to the last defined
+         * entry, whatever that is. */
+
+        for (const NamingScheme *s = naming_schemes; s < end; s++)
+                if (streq(s->name, name))
+                        return s;
+
+        return streq(name, "latest") ? end - 1 : NULL;
+}
+
+/* Returns the interface naming scheme in effect. The result is determined once and cached in a
+ * static variable. The scheme name comes from the "net.naming-scheme" kernel command line key and/or
+ * the $NET_NAMING_SCHEME environment variable; if none is requested, or the requested one is unknown,
+ * DEFAULT_NET_NAMING_SCHEME is used. */
+const NamingScheme* naming_scheme(void) {
+        static const NamingScheme *cache = NULL;
+        _cleanup_free_ char *cmdline = NULL;
+        const char *env, *requested;
+
+        if (cache)
+                return cache;
+
+        /* Acquire setting from the kernel command line */
+        (void) proc_cmdline_get_key("net.naming-scheme", 0, &cmdline);
+
+        /* Also acquire it from an env var */
+        env = getenv("NET_NAMING_SCHEME");
+
+        if (!env)
+                requested = cmdline;
+        else if (*env != ':')
+                requested = env; /* Without ':' prefix the env var takes precedence */
+        else
+                requested = cmdline ?: env + 1; /* With ':' prefix the kernel cmdline takes precedence */
+
+        if (requested) {
+                cache = naming_scheme_from_name(requested);
+                if (!cache)
+                        log_warning("Unknown interface naming scheme '%s' requested, ignoring.", requested);
+                else {
+                        log_info("Using interface naming scheme '%s'.", cache->name);
+                        return cache;
+                }
+        }
+
+        cache = naming_scheme_from_name(DEFAULT_NET_NAMING_SCHEME);
+        assert(cache);
+        log_info("Using default interface naming scheme '%s'.", cache->name);
+
+        return cache;
+}
+
+static const char* const name_policy_table[_NAMEPOLICY_MAX] = { /* NamePolicy value → string, consumed by DEFINE_STRING_TABLE_LOOKUP() below */
+        [NAMEPOLICY_KERNEL] = "kernel",
+        [NAMEPOLICY_KEEP] = "keep",
+        [NAMEPOLICY_DATABASE] = "database",
+        [NAMEPOLICY_ONBOARD] = "onboard",
+        [NAMEPOLICY_SLOT] = "slot",
+        [NAMEPOLICY_PATH] = "path",
+        [NAMEPOLICY_MAC] = "mac",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(name_policy, NamePolicy);
+
+static const char* const alternative_names_policy_table[_NAMEPOLICY_MAX] = { /* Like name_policy_table, but with no entries for NAMEPOLICY_KERNEL and NAMEPOLICY_KEEP */
+        [NAMEPOLICY_DATABASE] = "database",
+        [NAMEPOLICY_ONBOARD] = "onboard",
+        [NAMEPOLICY_SLOT] = "slot",
+        [NAMEPOLICY_PATH] = "path",
+        [NAMEPOLICY_MAC] = "mac",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(alternative_names_policy, NamePolicy);
diff --git a/src/shared/netif-naming-scheme.h b/src/shared/netif-naming-scheme.h
new file mode 100644
index 0000000..3f7be08
--- /dev/null
+++ b/src/shared/netif-naming-scheme.h
@@ -0,0 +1,97 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "macro.h"
+
+/* So here's the deal: net_id is supposed to be an exercise in providing stable names for network devices. However, we
+ * also want to keep updating the naming scheme used in future versions of net_id. These two goals of course are
+ * contradictory: on one hand we want things to not change and on the other hand we want them to improve. Our way out
+ * of this dilemma is to introduce the "naming scheme" concept: each time we improve the naming logic we define a new
+ * flag for it. Then, we keep a list of schemes, each identified by a name associated with the flags it implements. Via
+ * a kernel command line option and an environment variable we then allow the user to pick the scheme they want us to follow:
+ * installers could "freeze" the used scheme at the moment of installation this way.
+ *
+ * Developers: each time you tweak the naming logic here, define a new flag below, and condition the tweak with
+ * it. Each time we do a release we'll then add a new scheme entry and include all newly defined flags.
+ *
+ * Note that this is only half a solution to the problem though: not only does udev/net_id get updated all the time, the
+ * kernel does too. And thus a kernel that previously didn't expose some sysfs attribute we look for might eventually
+ * do so, and thus affect our naming scheme too. Thus, enforcing a naming scheme will make interface names more stable
+ * across OS versions, but not fully stabilize them. */
+/* Bit flags for individual naming features, followed by per-release masks that OR those features together. */
+typedef enum NamingSchemeFlags {
+ /* First, the individual features */
+ NAMING_SR_IOV_V = 1 << 0, /* Use "v" suffix for SR-IOV, see 609948c7043a */
+ NAMING_NPAR_ARI = 1 << 1, /* Use NPAR "ARI", see 6bc04997b6ea */
+ NAMING_INFINIBAND = 1 << 2, /* Use "ib" prefix for infiniband, see 938d30aa98df */
+ NAMING_ZERO_ACPI_INDEX = 1 << 3, /* Use zero acpi_index field, see d81186ef4f6a */
+ NAMING_ALLOW_RERENAMES = 1 << 4, /* Allow re-renaming of devices, see #9006 */
+ NAMING_STABLE_VIRTUAL_MACS = 1 << 5, /* Use device name to generate MAC, see 6d3646406560 */
+ NAMING_NETDEVSIM = 1 << 6, /* Generate names for netdevsim devices, see eaa9d507d855 */
+ NAMING_LABEL_NOPREFIX = 1 << 7, /* Don't prepend ID_NET_LABEL_ONBOARD with interface type prefix */
+ NAMING_NSPAWN_LONG_HASH = 1 << 8, /* Shorten nspawn interfaces by including 24bit hash, instead of simple truncation */
+ NAMING_BRIDGE_NO_SLOT = 1 << 9, /* Don't use PCI hotplug slot information if the corresponding device is a PCI bridge */
+ NAMING_SLOT_FUNCTION_ID = 1 << 10, /* Use function_id if present to identify PCI hotplug slots */
+ NAMING_16BIT_INDEX = 1 << 11, /* Allow full 16-bit for the onboard index */
+ NAMING_REPLACE_STRICTLY = 1 << 12, /* Use udev_replace_ifname() for NAME= rule */
+ NAMING_XEN_VIF = 1 << 13, /* Generate names for Xen netfront devices */
+ NAMING_BRIDGE_MULTIFUNCTION_SLOT = 1 << 14, /* Use PCI hotplug slot information associated with bridge, but only if PCI device is multifunction.
+ * This is disabled since v255, as it seems not to work at least for some setups. See issue #28929. */
+ NAMING_DEVICETREE_ALIASES = 1 << 15, /* Generate names from devicetree aliases */
+ NAMING_USB_HOST = 1 << 16, /* Generate names for usb host */
+ NAMING_SR_IOV_R = 1 << 17, /* Use "r" suffix for SR-IOV VF representors */
+
+ /* And now the masks that combine the features above */
+ NAMING_V238 = 0, /* Baseline: none of the features above */
+ NAMING_V239 = NAMING_V238 | NAMING_SR_IOV_V | NAMING_NPAR_ARI,
+ NAMING_V240 = NAMING_V239 | NAMING_INFINIBAND | NAMING_ZERO_ACPI_INDEX | NAMING_ALLOW_RERENAMES,
+ NAMING_V241 = NAMING_V240 | NAMING_STABLE_VIRTUAL_MACS,
+ NAMING_V243 = NAMING_V241 | NAMING_NETDEVSIM | NAMING_LABEL_NOPREFIX,
+ NAMING_V245 = NAMING_V243 | NAMING_NSPAWN_LONG_HASH,
+ NAMING_V247 = NAMING_V245 | NAMING_BRIDGE_NO_SLOT,
+ NAMING_V249 = NAMING_V247 | NAMING_SLOT_FUNCTION_ID | NAMING_16BIT_INDEX | NAMING_REPLACE_STRICTLY,
+ NAMING_V250 = NAMING_V249 | NAMING_XEN_VIF,
+ NAMING_V251 = NAMING_V250 | NAMING_BRIDGE_MULTIFUNCTION_SLOT,
+ NAMING_V252 = NAMING_V251 | NAMING_DEVICETREE_ALIASES,
+ NAMING_V253 = NAMING_V252 | NAMING_USB_HOST,
+ NAMING_V254 = NAMING_V253 | NAMING_SR_IOV_R, /* Despite the name, "v254" is NOT the default scheme
+ * for systemd version 254. It was added in a follow-up
+ * patch later. NAMING_SR_IOV_R is enabled by default in
+ * systemd version 255, naming scheme "v255". */
+ NAMING_V255 = NAMING_V254 & ~NAMING_BRIDGE_MULTIFUNCTION_SLOT, /* v254 minus the bridge multifunction slot feature */
+
+ /* Macro defined outside this header; presumably expands to additional scheme masks supplied at build
+ * time. TODO confirm against the build configuration. */
+ EXTRA_NET_NAMING_SCHEMES
+
+ _NAMING_SCHEME_FLAGS_INVALID = -EINVAL, /* Negative sentinel, not a valid flag combination */
+} NamingSchemeFlags;
+
+/* One named naming scheme: the name users select it by, and the feature mask it enables. */
+typedef struct NamingScheme {
+ const char *name; /* Compared with streq() by naming_scheme_from_name() */
+ NamingSchemeFlags flags; /* Tested by naming_scheme_has() */
+} NamingScheme;
+
+/* Looks up a scheme by name. "latest" maps to the last entry of the scheme table. Returns NULL if the name is
+ * unknown. */
+const NamingScheme* naming_scheme_from_name(const char *name);
+/* Returns the scheme in effect for this process (kernel command line, environment, or the built-in default).
+ * The result is cached after the first call. */
+const NamingScheme* naming_scheme(void);
+
+/* Tells whether 'flags' is set, as judged by FLAGS_SET(), in the naming scheme currently in effect. */
+static inline bool naming_scheme_has(NamingSchemeFlags flags) {
+        const NamingScheme *active = naming_scheme();
+
+        return FLAGS_SET(active->flags, flags);
+}
+
+/* Interface name policies. The values double as indices into name_policy_table and
+ * alternative_names_policy_table, so they must stay contiguous starting at 0. */
+typedef enum NamePolicy {
+ NAMEPOLICY_KERNEL,
+ NAMEPOLICY_KEEP,
+ NAMEPOLICY_DATABASE,
+ NAMEPOLICY_ONBOARD,
+ NAMEPOLICY_SLOT,
+ NAMEPOLICY_PATH,
+ NAMEPOLICY_MAC,
+ _NAMEPOLICY_MAX, /* Number of policies; used as the size of the string tables */
+ _NAMEPOLICY_INVALID = -EINVAL, /* Negative sentinel, not a valid policy */
+} NamePolicy;
+
+/* String <-> enum conversion over the full policy table ("kernel", "keep", "database", "onboard", "slot",
+ * "path", "mac"). */
+const char *name_policy_to_string(NamePolicy p) _const_;
+NamePolicy name_policy_from_string(const char *p) _pure_;
+
+/* Same conversion over the alternative-names table, which has no "kernel" and "keep" entries. */
+const char *alternative_names_policy_to_string(NamePolicy p) _const_;
+NamePolicy alternative_names_policy_from_string(const char *p) _pure_;
diff --git a/src/shared/netif-sriov.c b/src/shared/netif-sriov.c
new file mode 100644
index 0000000..7559b0d
--- /dev/null
+++ b/src/shared/netif-sriov.c
@@ -0,0 +1,643 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "device-util.h"
+#include "netlink-util.h"
+#include "netif-sriov.h"
+#include "parse-util.h"
+#include "set.h"
+#include "stdio-util.h"
+#include "string-util.h"
+
+/* Allocates an SRIOV entry with every setting in its "not configured" state and hands it out via *ret.
+ * Returns 0 on success, -ENOMEM if the allocation fails. */
+static int sr_iov_new(SRIOV **ret) {
+        SRIOV *entry;
+
+        assert(ret);
+
+        entry = new(SRIOV, 1);
+        if (!entry)
+                return -ENOMEM;
+
+        /* Members not listed here are zero-initialized by the compound literal. */
+        *entry = (SRIOV) {
+                .vf = UINT32_MAX,
+                .vlan_proto = ETH_P_8021Q,
+                .vf_spoof_check_setting = -1,
+                .trust = -1,
+                .query_rss = -1,
+                .link_state = _SR_IOV_LINK_STATE_INVALID,
+        };
+
+        *ret = entry;
+        return 0;
+}
+
+/* Returns in *ret the SRIOV entry that belongs to the config section at (filename, section_line). If no entry
+ * is registered for that section yet, a new one is allocated and inserted into *sr_iov_by_section.
+ * Returns 0 on success, or the negative error of the failing helper. */
+static int sr_iov_new_static(OrderedHashmap **sr_iov_by_section, const char *filename, unsigned section_line, SRIOV **ret) {
+ _cleanup_(config_section_freep) ConfigSection *n = NULL;
+ _cleanup_(sr_iov_freep) SRIOV *sr_iov = NULL;
+ SRIOV *existing = NULL;
+ int r;
+
+ assert(sr_iov_by_section);
+ assert(filename);
+ assert(section_line > 0);
+ assert(ret);
+
+ /* Build the lookup key for this section. */
+ r = config_section_new(filename, section_line, &n);
+ if (r < 0)
+ return r;
+
+ /* Reuse the entry already registered for this section; the temporary key 'n' is left to its cleanup
+ * handler. */
+ existing = ordered_hashmap_get(*sr_iov_by_section, n);
+ if (existing) {
+ *ret = existing;
+ return 0;
+ }
+
+ r = sr_iov_new(&sr_iov);
+ if (r < 0)
+ return r;
+
+ r = ordered_hashmap_ensure_put(sr_iov_by_section, &config_section_hash_ops, n, sr_iov);
+ if (r < 0)
+ return r;
+
+ /* The entry is now in the map: move the key into it and remember the owning map, which sr_iov_free()
+ * uses to unregister the entry. */
+ sr_iov->section = TAKE_PTR(n);
+ sr_iov->sr_iov_by_section = *sr_iov_by_section;
+
+ *ret = TAKE_PTR(sr_iov);
+ return 0;
+}
+
+/* Unregisters the entry from its owning hashmap (if it has one), frees its section key and the entry itself.
+ * Accepts NULL. Returns the result of mfree(), or NULL for a NULL argument. */
+SRIOV *sr_iov_free(SRIOV *sr_iov) {
+ if (!sr_iov)
+ return NULL;
+
+ /* Remove from the map first: the section is the map key and must still be alive for the lookup. */
+ if (sr_iov->sr_iov_by_section && sr_iov->section)
+ ordered_hashmap_remove(sr_iov->sr_iov_by_section, sr_iov->section);
+
+ config_section_free(sr_iov->section);
+
+ return mfree(sr_iov);
+}
+
+/* Hash callback: feeds only the virtual function index into the siphash state, so two entries for the same
+ * VF hash identically regardless of their other settings. */
+void sr_iov_hash_func(const SRIOV *sr_iov, struct siphash *state) {
+ assert(sr_iov);
+ assert(state);
+
+ siphash24_compress(&sr_iov->vf, sizeof(sr_iov->vf), state);
+}
+
+/* Comparison callback matching sr_iov_hash_func(): orders entries by virtual function index only. */
+int sr_iov_compare_func(const SRIOV *s1, const SRIOV *s2) {
+ assert(s1);
+ assert(s2);
+
+ return CMP(s1->vf, s2->vf);
+}
+
+/* Hash operations keyed on the VF index; sr_iov_drop_invalid_sections() uses them to detect sections that
+ * configure the same virtual function. */
+DEFINE_PRIVATE_HASH_OPS(
+ sr_iov_hash_ops,
+ SRIOV,
+ sr_iov_hash_func,
+ sr_iov_compare_func);
+
+/* Appends the settings of one virtual function to the netlink message 'req', wrapped in an
+ * IFLA_VFINFO_LIST / IFLA_VF_INFO container pair. Only settings that are configured are emitted:
+ * a non-null MAC, non-negative spoof check / RSS query / trust / link state values, and a VLAN id > 0.
+ * Returns 0 on success, or the negative error of the first failing netlink call. */
+int sr_iov_set_netlink_message(SRIOV *sr_iov, sd_netlink_message *req) {
+ int r;
+
+ assert(sr_iov);
+ assert(req);
+
+ r = sd_netlink_message_open_container(req, IFLA_VFINFO_LIST);
+ if (r < 0)
+ return r;
+
+ r = sd_netlink_message_open_container(req, IFLA_VF_INFO);
+ if (r < 0)
+ return r;
+
+ if (!ether_addr_is_null(&sr_iov->mac)) {
+ struct ifla_vf_mac ivm = {
+ .vf = sr_iov->vf,
+ };
+
+ memcpy(ivm.mac, &sr_iov->mac, ETH_ALEN);
+ r = sd_netlink_message_append_data(req, IFLA_VF_MAC, &ivm, sizeof(struct ifla_vf_mac));
+ if (r < 0)
+ return r;
+ }
+
+ /* The three tristate settings below use a negative value for "not configured". */
+ if (sr_iov->vf_spoof_check_setting >= 0) {
+ struct ifla_vf_spoofchk ivs = {
+ .vf = sr_iov->vf,
+ .setting = sr_iov->vf_spoof_check_setting,
+ };
+
+ r = sd_netlink_message_append_data(req, IFLA_VF_SPOOFCHK, &ivs, sizeof(struct ifla_vf_spoofchk));
+ if (r < 0)
+ return r;
+ }
+
+ if (sr_iov->query_rss >= 0) {
+ struct ifla_vf_rss_query_en ivs = {
+ .vf = sr_iov->vf,
+ .setting = sr_iov->query_rss,
+ };
+
+ r = sd_netlink_message_append_data(req, IFLA_VF_RSS_QUERY_EN, &ivs, sizeof(struct ifla_vf_rss_query_en));
+ if (r < 0)
+ return r;
+ }
+
+ if (sr_iov->trust >= 0) {
+ struct ifla_vf_trust ivt = {
+ .vf = sr_iov->vf,
+ .setting = sr_iov->trust,
+ };
+
+ r = sd_netlink_message_append_data(req, IFLA_VF_TRUST, &ivt, sizeof(struct ifla_vf_trust));
+ if (r < 0)
+ return r;
+ }
+
+ /* _SR_IOV_LINK_STATE_INVALID is negative, so an unset link state is skipped here. */
+ if (sr_iov->link_state >= 0) {
+ struct ifla_vf_link_state ivl = {
+ .vf = sr_iov->vf,
+ .link_state = sr_iov->link_state,
+ };
+
+ r = sd_netlink_message_append_data(req, IFLA_VF_LINK_STATE, &ivl, sizeof(struct ifla_vf_link_state));
+ if (r < 0)
+ return r;
+ }
+
+ /* VLAN id 0 means "no VLAN filter", in which case QoS and protocol are not sent either. */
+ if (sr_iov->vlan > 0) {
+ /* Because of padding, first the buffer must be initialized with 0. */
+ struct ifla_vf_vlan_info ivvi = {};
+ ivvi.vf = sr_iov->vf;
+ ivvi.vlan = sr_iov->vlan;
+ ivvi.qos = sr_iov->qos;
+ ivvi.vlan_proto = htobe16(sr_iov->vlan_proto);
+
+ r = sd_netlink_message_open_container(req, IFLA_VF_VLAN_LIST);
+ if (r < 0)
+ return r;
+
+ r = sd_netlink_message_append_data(req, IFLA_VF_VLAN_INFO, &ivvi, sizeof(struct ifla_vf_vlan_info));
+ if (r < 0)
+ return r;
+
+ /* Closes IFLA_VF_VLAN_LIST. */
+ r = sd_netlink_message_close_container(req);
+ if (r < 0)
+ return r;
+ }
+
+ /* Closes IFLA_VF_INFO. */
+ r = sd_netlink_message_close_container(req);
+ if (r < 0)
+ return r;
+
+ /* Closes IFLA_VFINFO_LIST. */
+ r = sd_netlink_message_close_container(req);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+/* Reads the "device/sriov_numvfs" sysfs attribute of 'device' and stores it, parsed as uint32_t, in *ret.
+ * Returns 0 on success, or the negative error from reading or parsing the attribute; *ret is left
+ * untouched on failure. */
+int sr_iov_get_num_vfs(sd_device *device, uint32_t *ret) {
+        const char *attr;
+        uint32_t parsed;
+        int r;
+
+        assert(device);
+        assert(ret);
+
+        r = sd_device_get_sysattr_value(device, "device/sriov_numvfs", &attr);
+        if (r >= 0)
+                r = safe_atou32(attr, &parsed);
+        if (r < 0)
+                return r;
+
+        *ret = parsed;
+        return 0;
+}
+
+/* Sets the number of SR-IOV virtual functions of 'device' through the "device/sriov_numvfs" sysfs attribute.
+ *
+ * num_vfs == UINT32_MAX: derive the count from the configured sections (highest VF index + 1) and only ever
+ *                        increase the current count.
+ * num_vfs == 0:          disable VFs; a missing attribute (-ENOENT) is not treated as an error.
+ * otherwise:             write the requested count after checking it against "device/sriov_totalvfs".
+ *
+ * Returns 0 on success or when nothing needs to be done, a negative error otherwise. */
+int sr_iov_set_num_vfs(sd_device *device, uint32_t num_vfs, OrderedHashmap *sr_iov_by_section) {
+ char val[DECIMAL_STR_MAX(uint32_t)];
+ const char *str;
+ int r;
+
+ assert(device);
+
+ if (num_vfs == UINT32_MAX) {
+ uint32_t current_num_vfs;
+ SRIOV *sr_iov;
+
+ /* If the number of virtual functions is not specified, then use the maximum number of VF + 1. */
+
+ num_vfs = 0;
+ ORDERED_HASHMAP_FOREACH(sr_iov, sr_iov_by_section)
+ num_vfs = MAX(num_vfs, sr_iov->vf + 1);
+
+ if (num_vfs == 0) /* No VF is configured. */
+ return 0;
+
+ r = sr_iov_get_num_vfs(device, &current_num_vfs);
+ if (r < 0)
+ return log_device_debug_errno(device, r, "Failed to get the current number of SR-IOV virtual functions: %m");
+
+ /* Enough VFs already exist. */
+ if (num_vfs <= current_num_vfs)
+ return 0;
+
+ } else if (num_vfs == 0) {
+ r = sd_device_set_sysattr_value(device, "device/sriov_numvfs", "0");
+ if (r < 0)
+ log_device_debug_errno(device, r, "Failed to write device/sriov_numvfs sysfs attribute, ignoring: %m");
+
+ /* Gracefully handle the error in disabling VFs when the interface does not support SR-IOV. */
+ return r == -ENOENT ? 0 : r;
+ }
+
+ /* So, the interface does not have enough VFs. Before increasing the number of VFs, check the
+ * maximum allowed number of VFs from the sriov_totalvfs sysattr. Note that the sysattr
+ * currently exists only for PCI drivers. Hence, ignore -ENOENT.
+ * TODO: netdevsim provides the information in debugfs. */
+ r = sd_device_get_sysattr_value(device, "device/sriov_totalvfs", &str);
+ if (r >= 0) {
+ uint32_t max_num_vfs;
+
+ r = safe_atou32(str, &max_num_vfs);
+ if (r < 0)
+ return log_device_debug_errno(device, r, "Failed to parse device/sriov_totalvfs sysfs attribute '%s': %m", str);
+
+ if (num_vfs > max_num_vfs)
+ return log_device_debug_errno(device, SYNTHETIC_ERRNO(ERANGE),
+ "Specified number of virtual functions is out of range. "
+ "The maximum allowed value is %"PRIu32".",
+ max_num_vfs);
+
+ } else if (r != -ENOENT) /* Currently, only PCI driver has the attribute. */
+ return log_device_debug_errno(device, r, "Failed to read device/sriov_totalvfs sysfs attribute: %m");
+
+ xsprintf(val, "%"PRIu32, num_vfs);
+ r = sd_device_set_sysattr_value(device, "device/sriov_numvfs", val);
+ if (r == -EBUSY) {
+ /* Some devices e.g. netdevsim refuse to set sriov_numvfs if it has non-zero value. */
+ /* Retry once: reset the count to 0, then write the requested value again. */
+ r = sd_device_set_sysattr_value(device, "device/sriov_numvfs", "0");
+ if (r >= 0)
+ r = sd_device_set_sysattr_value(device, "device/sriov_numvfs", val);
+ }
+ if (r < 0)
+ return log_device_debug_errno(device, r, "Failed to write device/sriov_numvfs sysfs attribute: %m");
+
+ log_device_debug(device, "device/sriov_numvfs sysfs attribute set to '%s'.", val);
+ return 0;
+}
+
+/* Checks one parsed [SR-IOV] section. Returns 0 if it is usable, -EINVAL if the section was marked invalid,
+ * has no VirtualFunction= set (vf is still UINT32_MAX), or names a VF index that is not below num_vfs.
+ * The latter two cases are logged as warnings. */
+static int sr_iov_section_verify(uint32_t num_vfs, SRIOV *sr_iov) {
+ assert(sr_iov);
+
+ if (section_is_invalid(sr_iov->section))
+ return -EINVAL;
+
+ if (sr_iov->vf == UINT32_MAX)
+ return log_warning_errno(SYNTHETIC_ERRNO(EINVAL),
+ "%s: [SR-IOV] section without VirtualFunction= field configured. "
+ "Ignoring [SR-IOV] section from line %u.",
+ sr_iov->section->filename, sr_iov->section->line);
+
+ if (sr_iov->vf >= num_vfs)
+ return log_warning_errno(SYNTHETIC_ERRNO(EINVAL),
+ "%s: VirtualFunction= must be smaller than the value specified in SR-IOVVirtualFunctions=. "
+ "Ignoring [SR-IOV] section from line %u.",
+ sr_iov->section->filename, sr_iov->section->line);
+
+ return 0;
+}
+
+/* Walks all parsed [SR-IOV] sections and frees those that fail sr_iov_section_verify(). If several sections
+ * configure the same virtual function, the one seen earlier is freed and the later one is kept.
+ * Returns 0 on success; any failure to track a section in the temporary set is reported via log_oom(). */
+int sr_iov_drop_invalid_sections(uint32_t num_vfs, OrderedHashmap *sr_iov_by_section) {
+ _cleanup_set_free_ Set *set = NULL;
+ SRIOV *sr_iov;
+ int r;
+
+ ORDERED_HASHMAP_FOREACH(sr_iov, sr_iov_by_section) {
+ SRIOV *dup;
+
+ /* sr_iov_free() also removes the entry from sr_iov_by_section. */
+ if (sr_iov_section_verify(num_vfs, sr_iov) < 0) {
+ sr_iov_free(sr_iov);
+ continue;
+ }
+
+ /* The set is keyed on the VF index (sr_iov_hash_ops), so a hit is an earlier section for the
+ * same virtual function. */
+ dup = set_remove(set, sr_iov);
+ if (dup) {
+ log_warning("%s: Conflicting [SR-IOV] section is specified at line %u and %u, "
+ "dropping the [SR-IOV] section specified at line %u.",
+ dup->section->filename, sr_iov->section->line,
+ dup->section->line, dup->section->line);
+ sr_iov_free(dup);
+ }
+
+ r = set_ensure_put(&set, &sr_iov_hash_ops, sr_iov);
+ if (r < 0)
+ return log_oom();
+ /* Any earlier entry for this VF was removed above, so the insert must have added a new entry. */
+ assert(r > 0);
+ }
+
+ return 0;
+}
+
+/* Config parser for the unsigned integer settings of an [SR-IOV] section: VirtualFunction=, VLANId= and
+ * QualityOfService=. 'data' points to the OrderedHashmap* holding the sections. An empty value resets the
+ * setting to its default. Returns 0 in all parse outcomes (invalid values are logged and ignored), or a
+ * negative error if the section entry cannot be obtained.
+ *
+ * NOTE(review): on the "ignored" paths the function returns without TAKE_PTR(), so the
+ * sr_iov_free_or_set_invalidp cleanup handler runs on the entry. It presumably frees it or marks its
+ * section invalid; confirm against DEFINE_SECTION_CLEANUP_FUNCTIONS. */
+int config_parse_sr_iov_uint32(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_(sr_iov_free_or_set_invalidp) SRIOV *sr_iov = NULL;
+ OrderedHashmap **sr_iov_by_section = ASSERT_PTR(data);
+ uint32_t k;
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ r = sr_iov_new_static(sr_iov_by_section, filename, section_line, &sr_iov);
+ if (r < 0)
+ return r;
+
+ /* Empty assignment: reset to the same defaults sr_iov_new() uses. */
+ if (isempty(rvalue)) {
+ if (streq(lvalue, "VirtualFunction"))
+ sr_iov->vf = UINT32_MAX;
+ else if (streq(lvalue, "VLANId"))
+ sr_iov->vlan = 0;
+ else if (streq(lvalue, "QualityOfService"))
+ sr_iov->qos = 0;
+ else
+ assert_not_reached();
+
+ TAKE_PTR(sr_iov);
+ return 0;
+ }
+
+ r = safe_atou32(rvalue, &k);
+ if (r < 0) {
+ log_syntax(unit, LOG_WARNING, filename, line, r,
+ "Failed to parse SR-IOV '%s=', ignoring assignment: %s", lvalue, rvalue);
+ return 0;
+ }
+
+ if (streq(lvalue, "VLANId")) {
+ /* Accepts 1..4095 only; 0 can only be set through an empty assignment. */
+ if (k == 0 || k > 4095) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid SR-IOV VLANId: %u", k);
+ return 0;
+ }
+ sr_iov->vlan = k;
+ } else if (streq(lvalue, "VirtualFunction")) {
+ /* Accepts 0..INT_MAX-1. */
+ if (k >= INT_MAX) {
+ log_syntax(unit, LOG_WARNING, filename, line, 0, "Invalid SR-IOV virtual function: %u", k);
+ return 0;
+ }
+ sr_iov->vf = k;
+ } else if (streq(lvalue, "QualityOfService"))
+ sr_iov->qos = k; /* No range check is applied here. */
+ else
+ assert_not_reached();
+
+ TAKE_PTR(sr_iov);
+ return 0;
+}
+
+/* Config parser for the VLAN protocol of an [SR-IOV] section. Accepts "802.1Q" (also the result of an empty
+ * value) and "802.1ad"; anything else is logged and ignored. 'data' points to the OrderedHashmap* holding
+ * the sections. Returns 0, or a negative error if the section entry cannot be obtained. */
+int config_parse_sr_iov_vlan_proto(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_(sr_iov_free_or_set_invalidp) SRIOV *sr_iov = NULL;
+ OrderedHashmap **sr_iov_by_section = ASSERT_PTR(data);
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ r = sr_iov_new_static(sr_iov_by_section, filename, section_line, &sr_iov);
+ if (r < 0)
+ return r;
+
+ if (isempty(rvalue) || streq(rvalue, "802.1Q"))
+ sr_iov->vlan_proto = ETH_P_8021Q;
+ else if (streq(rvalue, "802.1ad"))
+ sr_iov->vlan_proto = ETH_P_8021AD;
+ else {
+ /* Returning without TAKE_PTR() leaves the entry to the cleanup handler declared above. */
+ log_syntax(unit, LOG_WARNING, filename, line, 0,
+ "Invalid SR-IOV '%s=', ignoring assignment: %s", lvalue, rvalue);
+ return 0;
+ }
+
+ TAKE_PTR(sr_iov);
+ return 0;
+}
+
+/* Config parser for the link state of an [SR-IOV] section. An empty value unsets the state, "auto" selects
+ * SR_IOV_LINK_STATE_AUTO, and any other value is parsed as a boolean (true: enable, false: disable).
+ * Unparsable values are logged and ignored. 'data' points to the OrderedHashmap* holding the sections.
+ * Returns 0, or a negative error if the section entry cannot be obtained. */
+int config_parse_sr_iov_link_state(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        _cleanup_(sr_iov_free_or_set_invalidp) SRIOV *sr_iov = NULL;
+        OrderedHashmap **sr_iov_by_section = ASSERT_PTR(data);
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        r = sr_iov_new_static(sr_iov_by_section, filename, section_line, &sr_iov);
+        if (r < 0)
+                return r;
+
+        /* SR_IOV_LINK_STATE_DISABLE is 2 rather than 0, so the boolean string-table lookup macro
+         * (DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN) cannot be used and the mapping is spelled out here. */
+        if (isempty(rvalue))
+                sr_iov->link_state = _SR_IOV_LINK_STATE_INVALID;
+        else if (streq(rvalue, "auto"))
+                sr_iov->link_state = SR_IOV_LINK_STATE_AUTO;
+        else {
+                r = parse_boolean(rvalue);
+                if (r < 0) {
+                        /* No TAKE_PTR() on this path: the entry is left to the cleanup handler. */
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Failed to parse SR-IOV '%s=', ignoring assignment: %s", lvalue, rvalue);
+                        return 0;
+                }
+
+                sr_iov->link_state = r ? SR_IOV_LINK_STATE_ENABLE : SR_IOV_LINK_STATE_DISABLE;
+        }
+
+        TAKE_PTR(sr_iov);
+        return 0;
+}
+
+/* Config parser for the tristate settings of an [SR-IOV] section: MACSpoofCheck=, QueryReceiveSideScaling=
+ * and Trust=. An empty value resets the setting to -1 (not configured); otherwise the value is parsed as a
+ * boolean. Unparsable values are logged and ignored. 'data' points to the OrderedHashmap* holding the
+ * sections. Returns 0, or a negative error if the section entry cannot be obtained. */
+int config_parse_sr_iov_boolean(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_(sr_iov_free_or_set_invalidp) SRIOV *sr_iov = NULL;
+ OrderedHashmap **sr_iov_by_section = ASSERT_PTR(data);
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ r = sr_iov_new_static(sr_iov_by_section, filename, section_line, &sr_iov);
+ if (r < 0)
+ return r;
+
+ if (isempty(rvalue)) {
+ if (streq(lvalue, "MACSpoofCheck"))
+ sr_iov->vf_spoof_check_setting = -1;
+ else if (streq(lvalue, "QueryReceiveSideScaling"))
+ sr_iov->query_rss = -1;
+ else if (streq(lvalue, "Trust"))
+ sr_iov->trust = -1;
+ else
+ assert_not_reached();
+
+ TAKE_PTR(sr_iov);
+ return 0;
+ }
+
+ r = parse_boolean(rvalue);
+ if (r < 0) {
+ /* Returning without TAKE_PTR() leaves the entry to the cleanup handler declared above. */
+ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse '%s=', ignoring: %s", lvalue, rvalue);
+ return 0;
+ }
+
+ /* Store the boolean result of parse_boolean() in the field selected by lvalue. */
+ if (streq(lvalue, "MACSpoofCheck"))
+ sr_iov->vf_spoof_check_setting = r;
+ else if (streq(lvalue, "QueryReceiveSideScaling"))
+ sr_iov->query_rss = r;
+ else if (streq(lvalue, "Trust"))
+ sr_iov->trust = r;
+ else
+ assert_not_reached();
+
+ TAKE_PTR(sr_iov);
+ return 0;
+}
+
+/* Config parser for the MAC address of an [SR-IOV] section. An empty value resets the address to
+ * ETHER_ADDR_NULL; otherwise the value is parsed directly into the entry. Unparsable values are logged and
+ * ignored. 'data' points to the OrderedHashmap* holding the sections. Returns 0, or a negative error if the
+ * section entry cannot be obtained. */
+int config_parse_sr_iov_mac(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        _cleanup_(sr_iov_free_or_set_invalidp) SRIOV *sr_iov = NULL;
+        OrderedHashmap **sr_iov_by_section = ASSERT_PTR(data);
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        r = sr_iov_new_static(sr_iov_by_section, filename, section_line, &sr_iov);
+        if (r < 0)
+                return r;
+
+        if (isempty(rvalue))
+                sr_iov->mac = ETHER_ADDR_NULL;
+        else {
+                r = parse_ether_addr(rvalue, &sr_iov->mac);
+                if (r < 0) {
+                        /* No TAKE_PTR() on this path: the entry is left to the cleanup handler. */
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Failed to parse SR-IOV '%s=', ignoring assignment: %s", lvalue, rvalue);
+                        return 0;
+                }
+        }
+
+        TAKE_PTR(sr_iov);
+        return 0;
+}
+
+/* Config parser for the number of SR-IOV virtual functions. 'data' points to the uint32_t to fill in.
+ * An empty value stores UINT32_MAX ("not specified"); values that fail to parse or exceed INT_MAX are
+ * logged and leave the target untouched. Always returns 0. */
+int config_parse_sr_iov_num_vfs(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        uint32_t *num_vfs = ASSERT_PTR(data);
+        uint32_t parsed;
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                *num_vfs = UINT32_MAX;
+                return 0;
+        }
+
+        r = safe_atou32(rvalue, &parsed);
+        if (r < 0)
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to parse %s=, ignoring assignment: %s", lvalue, rvalue);
+        else if (parsed > INT_MAX)
+                log_syntax(unit, LOG_WARNING, filename, line, 0,
+                           "The number of SR-IOV virtual functions is too large. It must be equal to "
+                           "or smaller than 2147483647. Ignoring assignment: %"PRIu32, parsed);
+        else
+                *num_vfs = parsed;
+
+        return 0;
+}
diff --git a/src/shared/netif-sriov.h b/src/shared/netif-sriov.h
new file mode 100644
index 0000000..ee76957
--- /dev/null
+++ b/src/shared/netif-sriov.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <linux/if_link.h>
+
+#include "sd-device.h"
+
+#include "conf-parser.h"
+#include "ether-addr-util.h"
+#include "hashmap.h"
+
+/* Link state of a virtual function. The values mirror the kernel's IFLA_VF_LINK_STATE_* constants, so they
+ * can be placed into struct ifla_vf_link_state unchanged. */
+typedef enum SRIOVLinkState {
+ SR_IOV_LINK_STATE_AUTO = IFLA_VF_LINK_STATE_AUTO,
+ SR_IOV_LINK_STATE_ENABLE = IFLA_VF_LINK_STATE_ENABLE,
+ SR_IOV_LINK_STATE_DISABLE = IFLA_VF_LINK_STATE_DISABLE,
+ _SR_IOV_LINK_STATE_MAX,
+ _SR_IOV_LINK_STATE_INVALID = -EINVAL, /* Negative: treated as "not configured" */
+} SRIOVLinkState;
+
+/* Settings of one virtual function, as parsed from one [SR-IOV] config section. */
+typedef struct SRIOV {
+ ConfigSection *section; /* Key under which this entry is registered; freed by sr_iov_free() */
+ OrderedHashmap *sr_iov_by_section; /* Map this entry is registered in, or NULL */
+
+ uint32_t vf; /* 0 - 2147483646 */
+ uint32_t vlan; /* 0 - 4095, 0 disables VLAN filter */
+ uint32_t qos; /* Not range-checked by the parser */
+ uint16_t vlan_proto; /* ETH_P_8021Q or ETH_P_8021AD */
+ int vf_spoof_check_setting; /* Tristate: negative means not configured */
+ int query_rss; /* Tristate: negative means not configured */
+ int trust; /* Tristate: negative means not configured */
+ SRIOVLinkState link_state; /* _SR_IOV_LINK_STATE_INVALID means not configured */
+ struct ether_addr mac; /* A null address means not configured */
+} SRIOV;
+
+/* Frees an entry and unregisters it from its owning map. Accepts NULL. */
+SRIOV *sr_iov_free(SRIOV *sr_iov);
+/* Hash and compare callbacks that consider the VF index only. */
+void sr_iov_hash_func(const SRIOV *sr_iov, struct siphash *state);
+int sr_iov_compare_func(const SRIOV *s1, const SRIOV *s2);
+/* Appends the configured settings of one VF to a netlink message. */
+int sr_iov_set_netlink_message(SRIOV *sr_iov, sd_netlink_message *req);
+/* Read and write the "device/sriov_numvfs" sysfs attribute. */
+int sr_iov_get_num_vfs(sd_device *device, uint32_t *ret);
+int sr_iov_set_num_vfs(sd_device *device, uint32_t num_vfs, OrderedHashmap *sr_iov_by_section);
+/* Frees sections that are invalid or that duplicate the VF index of a later section. */
+int sr_iov_drop_invalid_sections(uint32_t num_vfs, OrderedHashmap *sr_iov_by_section);
+
+/* Presumably generates the sr_iov_free_or_set_invalidp cleanup handler used by the config parsers; confirm
+ * against the macro definition in conf-parser.h. */
+DEFINE_SECTION_CLEANUP_FUNCTIONS(SRIOV, sr_iov_free);
+
+/* Config parsers for the [SR-IOV] section settings. */
+CONFIG_PARSER_PROTOTYPE(config_parse_sr_iov_uint32);
+CONFIG_PARSER_PROTOTYPE(config_parse_sr_iov_boolean);
+CONFIG_PARSER_PROTOTYPE(config_parse_sr_iov_link_state);
+CONFIG_PARSER_PROTOTYPE(config_parse_sr_iov_vlan_proto);
+CONFIG_PARSER_PROTOTYPE(config_parse_sr_iov_mac);
+CONFIG_PARSER_PROTOTYPE(config_parse_sr_iov_num_vfs);
diff --git a/src/shared/netif-util.c b/src/shared/netif-util.c
new file mode 100644
index 0000000..f56c564
--- /dev/null
+++ b/src/shared/netif-util.c
@@ -0,0 +1,206 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <linux/if.h>
+#include <linux/if_arp.h>
+
+#include "arphrd-util.h"
+#include "device-util.h"
+#include "log-link.h"
+#include "memory-util.h"
+#include "netif-util.h"
+#include "siphash24.h"
+#include "sparse-endian.h"
+#include "strv.h"
+
+/* Decides whether an interface has carrier, from its operational state and, where that state is unknown,
+ * from its IFF_* flags. See Documentation/networking/operstates.txt in the kernel sources. */
+bool netif_has_carrier(uint8_t operstate, unsigned flags) {
+        switch (operstate) {
+
+        case IF_OPER_UP:
+                return true;
+
+        case IF_OPER_UNKNOWN:
+                /* operstate may not be implemented, so judge by the flags instead */
+                return FLAGS_SET(flags, IFF_LOWER_UP | IFF_RUNNING) &&
+                        !FLAGS_SET(flags, IFF_DORMANT);
+
+        default:
+                /* Any other known state means no carrier. */
+                return false;
+        }
+}
+
+/* Stores a newly allocated type string for an interface in *ret. The udev DEVTYPE of 'device' is used
+ * verbatim if the device is given and has a non-empty one; otherwise the ARPHRD name of 'iftype' is used,
+ * converted with ascii_strlower(). Returns 0 on success, -ENOENT if neither source yields a name, -ENOMEM
+ * if the copy cannot be allocated. */
+int net_get_type_string(sd_device *device, uint16_t iftype, char **ret) {
+        const char *t;
+        char *p;
+        bool from_devtype;
+
+        from_devtype =
+                device &&
+                sd_device_get_devtype(device, &t) >= 0 &&
+                !isempty(t);
+
+        if (!from_devtype) {
+                t = arphrd_to_name(iftype);
+                if (!t)
+                        return -ENOENT;
+        }
+
+        p = strdup(t);
+        if (!p)
+                return -ENOMEM;
+
+        /* Only the ARPHRD-derived name is lowercased; a DEVTYPE is passed through as is. */
+        *ret = from_devtype ? p : ascii_strlower(p);
+        return 0;
+}
+
+/* Returns the first available persistent name property of 'device', trying ID_NET_NAME_ONBOARD,
+ * ID_NET_NAME_SLOT, ID_NET_NAME_PATH and ID_NET_NAME_MAC in that order, or NULL if none is set. The
+ * returned string is whatever sd_device_get_property_value() hands out; it is not copied here. */
+const char *net_get_persistent_name(sd_device *device) {
+ assert(device);
+
+ /* fetch some persistent data unique (on this machine) to this device */
+ FOREACH_STRING(field, "ID_NET_NAME_ONBOARD", "ID_NET_NAME_SLOT", "ID_NET_NAME_PATH", "ID_NET_NAME_MAC") {
+ const char *name;
+
+ if (sd_device_get_property_value(device, field, &name) >= 0)
+ return name;
+ }
+
+ return NULL;
+}
+
+/* Fixed hash key, used when udev generates a hardware address and when networkd seeds IPv4LL. */
+#define HASH_KEY SD_ID128_MAKE(d3,1e,48,fa,90,fe,4b,4c,9d,af,d5,d7,a1,b1,2e,8a)
+
+/* Derives a 64-bit value for 'device' from stable identifying information and stores it in *ret.
+ * The persistent udev name is preferred; the kernel device name is used as a fallback only if
+ * 'use_sysname' is true. Returns -ENODATA if no name is available, otherwise the result of
+ * net_get_unique_predictable_data_from_name(). */
+int net_get_unique_predictable_data(sd_device *device, bool use_sysname, uint64_t *ret) {
+        const char *name;
+
+        assert(device);
+        assert(ret);
+
+        name = net_get_persistent_name(device);
+        if (!name) {
+                /* No name based on stable device information: optionally settle for the actual device
+                 * name. */
+                if (use_sysname)
+                        (void) sd_device_get_sysname(device, &name);
+
+                if (!name)
+                        return log_device_debug_errno(device, SYNTHETIC_ERRNO(ENODATA),
+                                                      "No stable identifying information found");
+        }
+
+        log_device_debug(device, "Using \"%s\" as stable identifying information", name);
+
+        return net_get_unique_predictable_data_from_name(name, &HASH_KEY, ret);
+}
+
+/* Hashes the machine ID concatenated with 'name' using siphash24 and 'key', and stores the result,
+ * converted with htole64(), in *ret. Returns 0 on success, or the negative error of
+ * sd_id128_get_machine(). */
+int net_get_unique_predictable_data_from_name(
+ const char *name,
+ const sd_id128_t *key,
+ uint64_t *ret) {
+
+ size_t l, sz;
+ uint8_t *v;
+ int r;
+
+ assert(name);
+ assert(key);
+ assert(ret);
+
+ /* Buffer layout: [machine ID][name bytes, without the trailing NUL]. */
+ l = strlen(name);
+ sz = sizeof(sd_id128_t) + l;
+ v = newa(uint8_t, sz);
+
+ /* Fetch some persistent data unique to this machine */
+ r = sd_id128_get_machine((sd_id128_t*) v);
+ if (r < 0)
+ return r;
+
+ memcpy(v + sizeof(sd_id128_t), name, l);
+
+ /* Let's hash the machine ID plus the device name. We use
+ * a fixed, but originally randomly created hash key here. */
+ *ret = htole64(siphash24(v, sz, key->bytes));
+ return 0;
+}
+
+/* Minimal file-local Link type carrying only the interface name, so that net_verify_hardware_address() can
+ * pass an object to log_link_warning(). */
+typedef struct Link {
+ const char *ifname;
+} Link;
+
+/* Validates, and where possible sanitizes, a hardware address that is about to be assigned to an interface.
+ *
+ * ifname      - interface name, used for log messages only
+ * is_static   - true if the address was configured explicitly. Warnings are logged only in that case, and
+ *               for Ethernet the locally-administered bit is forced only when this is false.
+ * iftype      - ARPHRD_* type of the interface
+ * ib_hw_addr  - current or parent hardware address; required (asserted) for ARPHRD_INFINIBAND
+ * new_hw_addr - address to check; may be adjusted in place
+ *
+ * Returns 0 if the address is empty or acceptable (possibly after adjustment), -EINVAL otherwise. */
+int net_verify_hardware_address(
+                const char *ifname,
+                bool is_static,
+                uint16_t iftype,
+                const struct hw_addr_data *ib_hw_addr, /* current or parent HW address */
+                struct hw_addr_data *new_hw_addr) {
+
+        Link link = { .ifname = ifname };
+
+        assert(new_hw_addr);
+
+        /* Nothing was specified, hence there is nothing to verify. */
+        if (new_hw_addr->length == 0)
+                return 0;
+
+        if (new_hw_addr->length != arphrd_to_hw_addr_len(iftype)) {
+                if (is_static)
+                        log_link_warning(&link,
+                                         "Specified MAC address with invalid length (%zu, expected %zu), refusing.",
+                                         new_hw_addr->length, arphrd_to_hw_addr_len(iftype));
+                return -EINVAL;
+        }
+
+        switch (iftype) {
+        case ARPHRD_ETHER:
+                /* see eth_random_addr() in the kernel */
+
+                if (ether_addr_is_null(&new_hw_addr->ether)) {
+                        if (is_static)
+                                log_link_warning(&link, "Specified MAC address is null, refusing.");
+                        return -EINVAL;
+                }
+
+                if (ether_addr_is_broadcast(&new_hw_addr->ether)) {
+                        if (is_static)
+                                log_link_warning(&link, "Specified MAC address is broadcast, refusing.");
+                        return -EINVAL;
+                }
+
+                /* A multicast address is repaired rather than refused: bit 0 of the first byte is cleared. */
+                if (ether_addr_is_multicast(&new_hw_addr->ether)) {
+                        if (is_static)
+                                log_link_warning(&link, "Specified MAC address has the multicast bit set, clearing the bit.");
+
+                        new_hw_addr->bytes[0] &= 0xfe;
+                }
+
+                if (!is_static && !ether_addr_is_local(&new_hw_addr->ether))
+                        /* Adjust local assignment bit when the MAC address is generated randomly. */
+                        new_hw_addr->bytes[0] |= 0x02;
+
+                break;
+
+        case ARPHRD_INFINIBAND:
+                /* see ipoib_check_lladdr() in the kernel */
+
+                assert(ib_hw_addr);
+                assert(ib_hw_addr->length == INFINIBAND_ALEN);
+
+                /* NOTE(review): with '||' this warns whenever the leading bytes are non-zero OR differ from
+                 * the current address, i.e. also when the user left them zero. '&&' may have been intended;
+                 * the logic is left as is pending confirmation. */
+                if (is_static &&
+                    (!memeqzero(new_hw_addr->bytes, INFINIBAND_ALEN - 8) ||
+                     memcmp(new_hw_addr->bytes, ib_hw_addr->bytes, INFINIBAND_ALEN - 8) != 0))
+                        log_link_warning(&link, "Only the last 8 bytes of the InfiniBand MAC address can be changed, ignoring the first 12 bytes.");
+
+                if (memeqzero(new_hw_addr->bytes + INFINIBAND_ALEN - 8, 8)) {
+                        if (is_static)
+                                log_link_warning(&link, "The last 8 bytes of the InfiniBand MAC address cannot be null, refusing.");
+                        return -EINVAL;
+                }
+
+                /* Always carry over the leading bytes from the current/parent address. */
+                memcpy(new_hw_addr->bytes, ib_hw_addr->bytes, INFINIBAND_ALEN - 8);
+                break;
+
+        default:
+                if (is_static)
+                        log_link_warning(&link, "Unsupported interface type %s%u to set MAC address, refusing.",
+                                         strna(arphrd_to_name(iftype)), iftype);
+                return -EINVAL;
+        }
+
+        return 0;
+}
diff --git a/src/shared/netif-util.h b/src/shared/netif-util.h
new file mode 100644
index 0000000..fb6a27c
--- /dev/null
+++ b/src/shared/netif-util.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <inttypes.h>
+#include <stdbool.h>
+
+#include "sd-device.h"
+#include "sd-id128.h"
+
+#include "ether-addr-util.h"
+
+/* Tells whether an interface has carrier, based on operstate with a fallback to the IFF_* flags. */
+bool netif_has_carrier(uint8_t operstate, unsigned flags);
+/* Stores a newly allocated type string (udev DEVTYPE, else lowercased ARPHRD name) in *ret. */
+int net_get_type_string(sd_device *device, uint16_t iftype, char **ret);
+/* Returns the first set ID_NET_NAME_{ONBOARD,SLOT,PATH,MAC} property of the device, or NULL. */
+const char *net_get_persistent_name(sd_device *device);
+/* Derive a 64-bit value from the machine ID and a stable device name (or the given name and key). */
+int net_get_unique_predictable_data(sd_device *device, bool use_sysname, uint64_t *ret);
+int net_get_unique_predictable_data_from_name(const char *name, const sd_id128_t *key, uint64_t *ret);
+/* Validates a hardware address for the given interface type and may adjust it in place. Returns 0 or
+ * -EINVAL. */
+int net_verify_hardware_address(
+ const char *ifname,
+ bool is_static,
+ uint16_t iftype,
+ const struct hw_addr_data *ib_hw_addr,
+ struct hw_addr_data *new_hw_addr);
diff --git a/src/shared/nscd-flush.c b/src/shared/nscd-flush.c
new file mode 100644
index 0000000..6df18d7
--- /dev/null
+++ b/src/shared/nscd-flush.c
@@ -0,0 +1,142 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <fcntl.h>
+#include <poll.h>
+
+#include "fd-util.h"
+#include "io-util.h"
+#include "nscd-flush.h"
+#include "socket-util.h"
+#include "strv.h"
+#include "time-util.h"
+
+#define NSCD_FLUSH_CACHE_TIMEOUT_USEC (5*USEC_PER_SEC)
+
+struct nscdInvalidateRequest { /* Request that nscd_flush_cache_one() sends to nscd; the name travels in the flexible array member */
+ int32_t version; /* filled in as 2 by nscd_flush_cache_one() */
+ int32_t type; /* in glibc this is an enum. We don't replicate this here 1:1. Also, wtf, how unportable is that
+ * even? nscd_flush_cache_one() fills in 10 here. */
+ int32_t key_len; /* strlen() of the database name plus one for the trailing NUL */
+ char dbname[]; /* NUL-terminated database name */
+};
+/* Sends an invalidation request for one database to nscd via /run/nscd/socket and waits for the 32-bit reply, giving
+ * up once the CLOCK_MONOTONIC timestamp 'end' has passed. Returns 1 on success (a reply of 0, or EOF right after the
+ * whole request was written), -ETIMEDOUT when the deadline is hit, and the result of the failing call/log statement
+ * otherwise. */
+static int nscd_flush_cache_one(const char *database, usec_t end) {
+ size_t req_size, has_written = 0, has_read = 0, l;
+ struct nscdInvalidateRequest *req;
+ _cleanup_close_ int fd = -EBADF;
+ int32_t resp;
+ int events, r;
+
+ assert(database);
+
+ l = strlen(database);
+ req_size = offsetof(struct nscdInvalidateRequest, dbname) + l + 1; /* fixed header + name + NUL */
+
+ req = alloca_safe(req_size);
+ *req = (struct nscdInvalidateRequest) {
+ .version = 2,
+ .type = 10, /* NOTE(review): presumably glibc's INVALIDATE request type — confirm against nscd-client.h */
+ .key_len = l + 1,
+ };
+
+ strcpy(req->dbname, database);
+
+ fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
+ if (fd < 0)
+ return log_debug_errno(errno, "Failed to allocate nscd socket: %m");
+
+ /* Note: connect() returns EINPROGRESS if O_NONBLOCK is set and establishing a connection takes time. The
+ * kernel lets us know this way that the connection is now being established, and we should watch with poll()
+ * to learn when it is fully established. That said, AF_UNIX on Linux never triggers this IRL (connect() is
+ * always instant on AF_UNIX), hence handling this is mostly just an exercise in defensive, protocol-agnostic
+ * programming.
+ *
+ * connect() returns EAGAIN if the socket's backlog limit has been reached. When we see this we give up right
+ * away, after all this entire function here is written in a defensive style so that a non-responding nscd
+ * doesn't stall us for good. (Even if we wanted to handle this better: the Linux kernel doesn't really have a
+ * nice way to connect() to a server synchronously with a time limit that would also cover dealing with the
+ * backlog limit. After all SO_RCVTIMEO and SO_SNDTIMEO don't apply to connect(), and alarm() is frickin' ugly
+ * and not really reasonably usable from threads-aware code.) */
+ r = connect_unix_path(fd, AT_FDCWD, "/run/nscd/socket");
+ if (r < 0) {
+ if (r == -EAGAIN)
+ return log_debug_errno(r, "nscd is overloaded (backlog limit reached) and refuses to take further connections: %m");
+ if (r != -EINPROGRESS)
+ return log_debug_errno(r, "Failed to connect to nscd socket: %m");
+
+ /* Continue in case of EINPROGRESS, but don't bother with send() or recv() until being notified that
+ * establishing the connection is complete. */
+ events = 0;
+ } else
+ events = POLLIN|POLLOUT; /* Let's assume initially that we can write and read to the fd, to suppress
+ * one poll() invocation */
+ for (;;) { /* each iteration: write what we can, read what we can, check for completion, then wait */
+ usec_t p;
+
+ if (events & POLLOUT) {
+ ssize_t m;
+
+ assert(has_written < req_size); /* POLLOUT is only requested below while data is still pending */
+
+ m = send(fd, (uint8_t*) req + has_written, req_size - has_written, MSG_NOSIGNAL);
+ if (m < 0) {
+ if (errno != EAGAIN) /* Note that EAGAIN is returned by the kernel whenever it can't
+ * take the data right now, and that includes if the connect() is
+ * asynchronous and we saw EINPROGRESS on it, and it hasn't
+ * completed yet. */
+ return log_debug_errno(errno, "Failed to write to nscd socket: %m");
+ } else
+ has_written += m;
+ }
+
+ if (events & (POLLIN|POLLERR|POLLHUP)) {
+ ssize_t m;
+
+ if (has_read >= sizeof(resp))
+ return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Response from nscd longer than expected: %m");
+
+ m = recv(fd, (uint8_t*) &resp + has_read, sizeof(resp) - has_read, 0);
+ if (m < 0) {
+ if (errno != EAGAIN)
+ return log_debug_errno(errno, "Failed to read from nscd socket: %m");
+ } else if (m == 0) { /* EOF */
+ if (has_read == 0 && has_written >= req_size) /* Older nscd immediately terminated the
+ * connection, accept that as OK */
+ return 1;
+
+ return log_debug_errno(SYNTHETIC_ERRNO(EIO), "nscd prematurely ended connection.");
+ } else
+ has_read += m;
+ }
+
+ if (has_written >= req_size && has_read >= sizeof(resp)) { /* done? */
+ if (resp < 0)
+ return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "nscd sent us a negative error number: %i", resp);
+ if (resp > 0) /* a positive reply is passed to log_debug_errno() as the error number */
+ return log_debug_errno(resp, "nscd return failure code on invalidating '%s'.", database);
+ return 1;
+ }
+
+ p = now(CLOCK_MONOTONIC);
+ if (p >= end)
+ return -ETIMEDOUT;
+
+ events = fd_wait_for_event(fd, POLLIN | (has_written < req_size ? POLLOUT : 0), end - p); /* wait at most until the deadline */
+ if (events < 0)
+ return events;
+ }
+}
+
+/* Asks nscd to drop its cache for each listed database. All requests share a single deadline of
+ * NSCD_FLUSH_CACHE_TIMEOUT_USEC from now, so that an unresponsive nscd cannot stall us indefinitely. The
+ * per-database results are collected with RET_GATHER(). */
+int nscd_flush_cache(char **databases) {
+        usec_t deadline;
+        int ret = 0;
+
+        deadline = usec_add(now(CLOCK_MONOTONIC), NSCD_FLUSH_CACHE_TIMEOUT_USEC);
+
+        STRV_FOREACH(db, databases) {
+                RET_GATHER(ret, nscd_flush_cache_one(*db, deadline));
+        }
+
+        return ret;
+}
diff --git a/src/shared/nscd-flush.h b/src/shared/nscd-flush.h
new file mode 100644
index 0000000..dac223e
--- /dev/null
+++ b/src/shared/nscd-flush.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+/* The real nscd_flush_cache() is declared only when ENABLE_NSCD is set; otherwise an empty inline stub is used.
+ * NOTE(review): the stub returns void while the real function returns int, so callers cannot use the return value
+ * in both configurations — confirm this is intended. */
+#if ENABLE_NSCD
+int nscd_flush_cache(char **databases);
+#else
+static inline void nscd_flush_cache(char **databases) {}
+#endif
diff --git a/src/shared/nsflags.c b/src/shared/nsflags.c
new file mode 100644
index 0000000..d4cee06
--- /dev/null
+++ b/src/shared/nsflags.c
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+
+#include "alloc-util.h"
+#include "extract-word.h"
+#include "namespace-util.h"
+#include "nsflags.h"
+#include "string-util.h"
+
+/* Parses a whitespace-separated list of namespace names (matched against namespace_info[].proc_name) and
+ * stores the OR of the matching clone flags in *ret. Returns 0 on success, -EINVAL if a word is unknown or
+ * maps to a zero flag, or the error returned by extract_first_word(). */
+int namespace_flags_from_string(const char *name, unsigned long *ret) {
+        unsigned long accumulated = 0;
+
+        assert_se(ret);
+
+        for (;;) {
+                _cleanup_free_ char *token = NULL;
+                unsigned long matched = 0;
+                int r;
+
+                r = extract_first_word(&name, &token, NULL, 0);
+                if (r < 0)
+                        return r;
+                if (r == 0) {
+                        /* Input exhausted, hand out what we collected. */
+                        *ret = accumulated;
+                        return 0;
+                }
+
+                for (unsigned k = 0; namespace_info[k].proc_name; k++) {
+                        if (!streq(token, namespace_info[k].proc_name))
+                                continue;
+
+                        matched = namespace_info[k].clone_flag;
+                        break;
+                }
+
+                if (matched == 0)
+                        return -EINVAL;
+
+                accumulated |= matched;
+        }
+}
+
+/* Builds a space-separated list of the names of all namespace_info[] entries whose clone flag is fully
+ * contained in 'flags'. On success stores the newly allocated string in *ret (left NULL if nothing was
+ * appended) and returns 0; returns -ENOMEM if extending the string fails. */
+int namespace_flags_to_string(unsigned long flags, char **ret) {
+        _cleanup_free_ char *joined = NULL;
+
+        for (unsigned k = 0; namespace_info[k].proc_name; k++) {
+                unsigned long bit = namespace_info[k].clone_flag;
+
+                if ((flags & bit) == bit &&
+                    !strextend_with_separator(&joined, " ", namespace_info[k].proc_name))
+                        return -ENOMEM;
+        }
+
+        *ret = TAKE_PTR(joined);
+        return 0;
+}
+
+/* Returns the name of the namespace_info[] entry whose clone flag equals 'flag' exactly, or NULL if there
+ * is no such entry. */
+const char *namespace_single_flag_to_string(unsigned long flag) {
+        unsigned k = 0;
+
+        while (namespace_info[k].proc_name) {
+                if (flag == namespace_info[k].clone_flag)
+                        return namespace_info[k].proc_name;
+                k++;
+        }
+
+        return NULL;
+}
diff --git a/src/shared/nsflags.h b/src/shared/nsflags.h
new file mode 100644
index 0000000..b59740c
--- /dev/null
+++ b/src/shared/nsflags.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "missing_sched.h"
+
+/* The combination of all namespace flags defined by the kernel. The right type for this isn't clear. setns() and
+ * unshare() expect these flags to be passed as (signed) "int", while clone() wants them as "unsigned long". The latter
+ * is definitely more appropriate for a flags parameter, and also the larger type of the two, hence let's stick to that
+ * here. */
+#define NAMESPACE_FLAGS_ALL \
+ ((unsigned long) (CLONE_NEWCGROUP| \
+ CLONE_NEWIPC| \
+ CLONE_NEWNET| \
+ CLONE_NEWNS| \
+ CLONE_NEWPID| \
+ CLONE_NEWUSER| \
+ CLONE_NEWUTS))
+
+#define NAMESPACE_FLAGS_INITIAL ULONG_MAX
+
+int namespace_flags_from_string(const char *name, unsigned long *ret);
+int namespace_flags_to_string(unsigned long flags, char **ret);
+const char *namespace_single_flag_to_string(unsigned long flag);
diff --git a/src/shared/numa-util.c b/src/shared/numa-util.c
new file mode 100644
index 0000000..a954ea3
--- /dev/null
+++ b/src/shared/numa-util.c
@@ -0,0 +1,188 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <sched.h>
+
+#include "alloc-util.h"
+#include "cpu-set-util.h"
+#include "dirent-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "macro.h"
+#include "missing_syscall.h"
+#include "numa-util.h"
+#include "stdio-util.h"
+#include "string-table.h"
+
+/* Checks that the policy type is a known MPOL_* value and that the node mask fits it: without a mask only
+ * MPOL_DEFAULT, MPOL_LOCAL and MPOL_PREFERRED are accepted; with a mask MPOL_PREFERRED needs exactly one
+ * node set. */
+bool numa_policy_is_valid(const NUMAPolicy *policy) {
+        int type;
+
+        assert(policy);
+
+        type = numa_policy_get_type(policy);
+        if (!mpol_is_valid(type))
+                return false;
+
+        if (!policy->nodes.set)
+                return IN_SET(type, MPOL_DEFAULT, MPOL_LOCAL, MPOL_PREFERRED);
+
+        return type != MPOL_PREFERRED ||
+               CPU_COUNT_S(policy->nodes.allocated, policy->nodes.set) == 1;
+}
+
+/* Converts the node set of 'policy' into the unsigned long bitmask + maxnode pair that is later passed to
+ * set_mempolicy(). For policies that take no node mask, NULL/0 is returned. On success *ret_nodes is a
+ * newly allocated array the caller must free. Returns 0 on success, -ENOMEM on allocation failure. */
+static int numa_policy_to_mempolicy(const NUMAPolicy *policy, unsigned long *ret_maxnode, unsigned long **ret_nodes) {
+        _cleanup_free_ unsigned long *mask = NULL;
+        unsigned n_bits, bits_per_ulong;
+        int type;
+
+        assert(policy);
+        assert(ret_maxnode);
+        assert(ret_nodes);
+
+        type = numa_policy_get_type(policy);
+        if (IN_SET(type, MPOL_DEFAULT, MPOL_LOCAL) ||
+            (type == MPOL_PREFERRED && !policy->nodes.set)) {
+                *ret_nodes = NULL;
+                *ret_maxnode = 0;
+                return 0;
+        }
+
+        n_bits = policy->nodes.allocated * 8;
+        bits_per_ulong = sizeof(unsigned long) * 8;
+
+        mask = new0(unsigned long, DIV_ROUND_UP(policy->nodes.allocated, sizeof(unsigned long)));
+        if (!mask)
+                return -ENOMEM;
+
+        /* libc's internal representation of the node set is opaque to us, hence copy it bit by bit into
+         * the plain unsigned long array layout. */
+        for (unsigned n = 0; n < n_bits; n++) {
+                if (!CPU_ISSET_S(n, policy->nodes.allocated, policy->nodes.set))
+                        continue;
+
+                mask[n / bits_per_ulong] |= 1ul << (n % bits_per_ulong);
+        }
+
+        *ret_nodes = TAKE_PTR(mask);
+        *ret_maxnode = n_bits + 1;
+        return 0;
+}
+
+/* Applies 'policy' to the calling process via set_mempolicy(). Returns 0 on success, -EOPNOTSUPP if the
+ * get_mempolicy() probe fails with ENOSYS, -EINVAL if the policy does not validate, or a negative errno
+ * from the conversion or from set_mempolicy(). */
+int apply_numa_policy(const NUMAPolicy *policy) {
+        _cleanup_free_ unsigned long *nodemask = NULL;
+        unsigned long n_nodes;
+        int r;
+
+        assert(policy);
+
+        if (get_mempolicy(NULL, NULL, 0, 0, 0) < 0 && errno == ENOSYS)
+                return -EOPNOTSUPP;
+
+        if (!numa_policy_is_valid(policy))
+                return -EINVAL;
+
+        r = numa_policy_to_mempolicy(policy, &n_nodes, &nodemask);
+        if (r < 0)
+                return r;
+
+        if (set_mempolicy(numa_policy_get_type(policy), nodemask, n_nodes) < 0)
+                return -errno;
+
+        return 0;
+}
+
+/* Collects the CPUs of every NUMA node set in 'policy' by reading and parsing
+ * /sys/devices/system/node/node<N>/cpulist for each of them. On success the combined set is moved into
+ * *ret and 0 is returned; otherwise the first error from reading, parsing or merging is returned. */
+int numa_to_cpu_set(const NUMAPolicy *policy, CPUSet *ret) {
+        _cleanup_(cpu_set_reset) CPUSet combined = {};
+        size_t n_nodes;
+        int r;
+
+        assert(policy);
+        assert(ret);
+
+        n_nodes = policy->nodes.allocated * 8;
+
+        for (size_t node = 0; node < n_nodes; node++) {
+                char path[STRLEN("/sys/devices/system/node/node//cpulist") + DECIMAL_STR_MAX(size_t) + 1];
+                _cleanup_(cpu_set_reset) CPUSet cpus = {};
+                _cleanup_free_ char *line = NULL;
+
+                if (!CPU_ISSET_S(node, policy->nodes.allocated, policy->nodes.set))
+                        continue;
+
+                xsprintf(path, "/sys/devices/system/node/node%zu/cpulist", node);
+
+                r = read_one_line_file(path, &line);
+                if (r < 0)
+                        return r;
+
+                r = parse_cpu_set(line, &cpus);
+                if (r < 0)
+                        return r;
+
+                r = cpu_set_add_all(&combined, &cpus);
+                if (r < 0)
+                        return r;
+        }
+
+        *ret = TAKE_STRUCT(combined);
+        return 0;
+}
+
+/* Scans /sys/devices/system/node for "node<N>" directories and returns the largest N found (0 if none
+ * parse), or a negative errno if the directory cannot be opened. */
+static int numa_max_node(void) {
+        _cleanup_closedir_ DIR *dir = NULL;
+        int highest = 0;
+
+        dir = opendir("/sys/devices/system/node");
+        if (!dir)
+                return -errno;
+
+        FOREACH_DIRENT(entry, dir, break) {
+                const char *suffix;
+                int idx;
+
+                if (entry->d_type != DT_DIR)
+                        continue;
+
+                suffix = startswith(entry->d_name, "node");
+                if (!suffix)
+                        continue;
+
+                /* Entries whose suffix is not a plain integer are ignored. */
+                if (safe_atoi(suffix, &idx) < 0)
+                        continue;
+
+                if (idx > highest)
+                        highest = idx;
+        }
+
+        return highest;
+}
+
+/* Adds every node index from 0 up to and including the highest NUMA node to 'mask'. If the highest node
+ * cannot be determined, 1023 is assumed. Returns 0 on success or the first error from cpu_set_add(). */
+int numa_mask_add_all(CPUSet *mask) {
+        int last;
+
+        assert(mask);
+
+        last = numa_max_node();
+        if (last < 0) {
+                log_debug_errno(last, "Failed to determine maximum NUMA node index, assuming 1023: %m");
+                /* CONFIG_NODES_SHIFT is set to 10 on x86_64, i.e. 1024 NUMA nodes in total */
+                last = 1023;
+        }
+
+        for (int node = 0; node <= last; node++) {
+                int r = cpu_set_add(mask, node);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+/* Textual names of the MPOL_* memory policy constants, indexed by constant value. NOTE(review): the
+ * DEFINE_STRING_TABLE_LOOKUP() below presumably generates the mpol_to_string()/mpol_from_string() functions declared
+ * in numa-util.h — confirm in string-table.h. */
+static const char* const mpol_table[] = {
+ [MPOL_DEFAULT] = "default",
+ [MPOL_PREFERRED] = "preferred",
+ [MPOL_BIND] = "bind",
+ [MPOL_INTERLEAVE] = "interleave",
+ [MPOL_LOCAL] = "local",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(mpol, int);
diff --git a/src/shared/numa-util.h b/src/shared/numa-util.h
new file mode 100644
index 0000000..2f736c9
--- /dev/null
+++ b/src/shared/numa-util.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "cpu-set-util.h"
+#include "missing_syscall.h"
+
+/* True if 't' lies within the inclusive range MPOL_DEFAULT..MPOL_LOCAL. */
+static inline bool mpol_is_valid(int t) {
+        if (t < MPOL_DEFAULT)
+                return false;
+
+        return t <= MPOL_LOCAL;
+}
+
+typedef struct NUMAPolicy {
+ /* Always use numa_policy_get_type() to read the value */
+ int type; /* MPOL_* constant; a negative value means "not set" (numa_policy_reset() stores -1) */
+ CPUSet nodes; /* set of NUMA node indices the policy applies to, kept in a CPUSet */
+} NUMAPolicy;
+
+bool numa_policy_is_valid(const NUMAPolicy *p);
+
+/* Returns the effective policy type: the stored type if it is non-negative; otherwise MPOL_PREFERRED when
+ * a node mask is present, and -1 when it is not. */
+static inline int numa_policy_get_type(const NUMAPolicy *p) {
+        if (p->type >= 0)
+                return p->type;
+
+        if (p->nodes.set)
+                return MPOL_PREFERRED;
+
+        return -1;
+}
+
+static inline void numa_policy_reset(NUMAPolicy *p) { /* Puts the policy back into the "not set" state */
+ assert(p);
+ cpu_set_reset(&p->nodes); /* reset the node set via the CPUSet helper */
+ p->type = -1; /* negative type is treated as unset by numa_policy_get_type() */
+}
+
+int apply_numa_policy(const NUMAPolicy *policy);
+int numa_to_cpu_set(const NUMAPolicy *policy, CPUSet *set);
+
+int numa_mask_add_all(CPUSet *mask);
+
+const char* mpol_to_string(int i) _const_;
+int mpol_from_string(const char *s) _pure_;
diff --git a/src/shared/open-file.c b/src/shared/open-file.c
new file mode 100644
index 0000000..42772bd
--- /dev/null
+++ b/src/shared/open-file.c
@@ -0,0 +1,147 @@
+
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <fcntl.h>
+
+#include "escape.h"
+#include "extract-word.h"
+#include "fd-util.h"
+#include "open-file.h"
+#include "path-util.h"
+#include "string-table.h"
+#include "string-util.h"
+
+/* Parses a "path[:fdname[:option,option,...]]" specification into a newly allocated OpenFile. An empty
+ * fdname is replaced by the file name component of the path. Returns 0 and stores the object in *ret on
+ * success; -ENOMEM on allocation failure; -EINVAL for empty input, more than three fields or a repeated
+ * option; otherwise the error of the failing helper. */
+int open_file_parse(const char *v, OpenFile **ret) {
+        _cleanup_free_ char *opts = NULL;
+        _cleanup_(open_file_freep) OpenFile *parsed = NULL;
+        const char *cursor;
+        int r;
+
+        assert(v);
+        assert(ret);
+
+        parsed = new0(OpenFile, 1);
+        if (!parsed)
+                return -ENOMEM;
+
+        r = extract_many_words(
+                        &v,
+                        ":",
+                        EXTRACT_DONT_COALESCE_SEPARATORS|EXTRACT_CUNESCAPE,
+                        &parsed->path,
+                        &parsed->fdname,
+                        &opts,
+                        NULL);
+        if (r < 0)
+                return r;
+
+        /* Nothing extracted at all, or trailing data after the third colon-separated word. */
+        if (r == 0 || !isempty(v))
+                return -EINVAL;
+
+        cursor = opts;
+        for (;;) {
+                _cleanup_free_ char *item = NULL;
+                OpenFileFlag item_flag;
+
+                r = extract_first_word(&cursor, &item, ",", 0);
+                if (r < 0)
+                        return r;
+                if (r == 0)
+                        break;
+
+                item_flag = open_file_flags_from_string(item);
+                if (item_flag < 0)
+                        return item_flag;
+
+                /* Every option may be listed only once. */
+                if ((parsed->flags & item_flag) != 0)
+                        return -EINVAL;
+
+                parsed->flags |= item_flag;
+        }
+
+        if (isempty(parsed->fdname)) {
+                parsed->fdname = mfree(parsed->fdname);
+
+                r = path_extract_filename(parsed->path, &parsed->fdname);
+                if (r < 0)
+                        return r;
+        }
+
+        r = open_file_validate(parsed);
+        if (r < 0)
+                return r;
+
+        *ret = TAKE_PTR(parsed);
+        return 0;
+}
+
+/* Validates an OpenFile: the path must be valid and absolute, the fdname valid, at most one of
+ * read-only/append/truncate may be set, and no bits outside _OPENFILE_MASK_PUBLIC are allowed. Returns 0
+ * if everything checks out, -EINVAL otherwise. */
+int open_file_validate(const OpenFile *of) {
+        int n_modes;
+
+        assert(of);
+
+        if (!(path_is_valid(of->path) && path_is_absolute(of->path)))
+                return -EINVAL;
+
+        if (!fdname_is_valid(of->fdname))
+                return -EINVAL;
+
+        /* The three access modes are mutually exclusive. */
+        n_modes = FLAGS_SET(of->flags, OPENFILE_READ_ONLY) +
+                  FLAGS_SET(of->flags, OPENFILE_APPEND) +
+                  FLAGS_SET(of->flags, OPENFILE_TRUNCATE);
+        if (n_modes > 1)
+                return -EINVAL;
+
+        if ((of->flags & ~_OPENFILE_MASK_PUBLIC) != 0)
+                return -EINVAL;
+
+        return 0;
+}
+
+/* Serializes an OpenFile back into "path[:fdname][:options]" form. The fdname is emitted only if it
+ * differs from the file name of the path; if options follow an omitted fdname the field is left empty
+ * ("path::options"). Returns 0 and a newly allocated string in *ret, or a negative errno. */
+int open_file_to_string(const OpenFile *of, char **ret) {
+        _cleanup_free_ char *flag_list = NULL, *filename = NULL, *out = NULL;
+        bool explicit_fdname;
+        int r;
+
+        assert(of);
+        assert(ret);
+
+        out = shell_escape(of->path, ":");
+        if (!out)
+                return -ENOMEM;
+
+        r = path_extract_filename(of->path, &filename);
+        if (r < 0)
+                return r;
+
+        explicit_fdname = !streq(filename, of->fdname);
+        if (explicit_fdname && !strextend(&out, ":", of->fdname))
+                return -ENOMEM;
+
+        for (OpenFileFlag bit = OPENFILE_READ_ONLY; bit < _OPENFILE_MAX; bit <<= 1) {
+                if (!FLAGS_SET(of->flags, bit))
+                        continue;
+
+                if (!strextend_with_separator(&flag_list, ",", open_file_flags_to_string(bit)))
+                        return -ENOMEM;
+        }
+
+        if (flag_list && !strextend(&out, explicit_fdname ? ":" : "::", flag_list))
+                return -ENOMEM;
+
+        *ret = TAKE_PTR(out);
+        return 0;
+}
+
+/* Frees an OpenFile together with its path and fdname strings. Accepts NULL. Always returns NULL. */
+OpenFile *open_file_free(OpenFile *of) {
+        if (of) {
+                free(of->path);
+                free(of->fdname);
+        }
+
+        return mfree(of);
+}
+
+void open_file_free_many(OpenFile **head) { /* Releases a whole list of OpenFile objects linked via their 'open_files' field */
+ assert(head);
+
+ LIST_CLEAR(open_files, *head, open_file_free); /* NOTE(review): presumably frees each item with open_file_free() and leaves *head empty — confirm in list.h */
+}
+/* Textual name of each OpenFileFlag bit. The table is indexed by the flag value itself, hence only the slots 1, 2, 4
+ * and 8 are populated and the remaining ones stay NULL. */
+static const char * const open_file_flags_table[_OPENFILE_MAX] = {
+ [OPENFILE_READ_ONLY] = "read-only",
+ [OPENFILE_APPEND] = "append",
+ [OPENFILE_TRUNCATE] = "truncate",
+ [OPENFILE_GRACEFUL] = "graceful",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(open_file_flags, OpenFileFlag);
diff --git a/src/shared/open-file.h b/src/shared/open-file.h
new file mode 100644
index 0000000..bb63ec8
--- /dev/null
+++ b/src/shared/open-file.h
@@ -0,0 +1,36 @@
+
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "list.h"
+
+typedef enum OpenFileFlag { /* Bit flags that may be attached to an OpenFile entry */
+ OPENFILE_READ_ONLY = 1 << 0, /* spelled "read-only" in the string table */
+ OPENFILE_APPEND = 1 << 1, /* spelled "append" */
+ OPENFILE_TRUNCATE = 1 << 2, /* spelled "truncate" */
+ OPENFILE_GRACEFUL = 1 << 3, /* spelled "graceful" */
+ _OPENFILE_MAX, /* not a flag: one more than the highest flag value, used as table size and loop bound */
+ _OPENFILE_INVALID = -EINVAL,
+ _OPENFILE_MASK_PUBLIC = OPENFILE_READ_ONLY | OPENFILE_APPEND | OPENFILE_TRUNCATE | OPENFILE_GRACEFUL, /* bits accepted by open_file_validate() */
+} OpenFileFlag;
+
+typedef struct OpenFile { /* One parsed "path:fdname:options" entry */
+ char *path; /* must be a valid absolute path to pass open_file_validate() */
+ char *fdname; /* open_file_parse() fills in the file name of 'path' when this is left empty */
+ OpenFileFlag flags; /* combination of OPENFILE_* bits */
+ LIST_FIELDS(struct OpenFile, open_files); /* list linkage, used by open_file_free_many() */
+} OpenFile;
+
+int open_file_parse(const char *v, OpenFile **ret);
+
+int open_file_validate(const OpenFile *of);
+
+int open_file_to_string(const OpenFile *of, char **ret);
+
+OpenFile *open_file_free(OpenFile *of);
+DEFINE_TRIVIAL_CLEANUP_FUNC(OpenFile*, open_file_free);
+
+void open_file_free_many(OpenFile **head);
+
+const char *open_file_flags_to_string(OpenFileFlag t) _const_;
+OpenFileFlag open_file_flags_from_string(const char *t) _pure_;
diff --git a/src/shared/openssl-util.c b/src/shared/openssl-util.c
new file mode 100644
index 0000000..b0a5563
--- /dev/null
+++ b/src/shared/openssl-util.c
@@ -0,0 +1,1149 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "fd-util.h"
+#include "hexdecoct.h"
+#include "openssl-util.h"
+#include "string-util.h"
+
+#if HAVE_OPENSSL
+/* For each error in the OpenSSL thread error queue, log the provided message and the OpenSSL error
+ * string. If there are no errors in the OpenSSL thread queue, this logs the message with "No OpenSSL
+ * errors." This logs at level debug. Returns -EIO (or -ENOMEM). Note that 'fmt' must be a string literal,
+ * since a ": ..." suffix is appended to it by literal concatenation; it should hence not end in
+ * punctuation. */
+#define log_openssl_errors(fmt, ...) _log_openssl_errors(UNIQ, fmt, ##__VA_ARGS__)
+#define _log_openssl_errors(u, fmt, ...) \
+ ({ \
+ size_t UNIQ_T(MAX, u) = 512 /* arbitrary, but openssl doc states it must be >= 256 */; \
+ _cleanup_free_ char *UNIQ_T(BUF, u) = malloc(UNIQ_T(MAX, u)); \
+ !UNIQ_T(BUF, u) \
+ ? log_oom_debug() \
+ : __log_openssl_errors(u, UNIQ_T(BUF, u), UNIQ_T(MAX, u), fmt, ##__VA_ARGS__) \
+ ?: log_debug_errno(SYNTHETIC_ERRNO(EIO), fmt ": No OpenSSL errors.", ##__VA_ARGS__); \
+ })
+#define __log_openssl_errors(u, buf, max, fmt, ...) \
+ ({ \
+ int UNIQ_T(R, u) = 0; \
+ for (;;) { \
+ unsigned long UNIQ_T(E, u) = ERR_get_error(); \
+ if (UNIQ_T(E, u) == 0) \
+ break; \
+ ERR_error_string_n(UNIQ_T(E, u), buf, max); \
+ UNIQ_T(R, u) = log_debug_errno(SYNTHETIC_ERRNO(EIO), fmt ": %s", ##__VA_ARGS__, buf); \
+ } \
+ UNIQ_T(R, u); \
+ })
+
+/* Parses a PEM encoded public key from the memory area pem/pem_size. On success stores the key in *ret
+ * (the caller owns it) and returns 0. Returns the result of log_oom_debug() if the memory stream cannot
+ * be opened, and the result of log_openssl_errors() if parsing fails. */
+int openssl_pkey_from_pem(const void *pem, size_t pem_size, EVP_PKEY **ret) {
+        _cleanup_fclose_ FILE *stream = NULL;
+        _cleanup_(EVP_PKEY_freep) EVP_PKEY *key = NULL;
+
+        assert(pem);
+        assert(ret);
+
+        stream = fmemopen((void*) pem, pem_size, "r");
+        if (!stream)
+                return log_oom_debug();
+
+        key = PEM_read_PUBKEY(stream, NULL, NULL, NULL);
+        if (!key)
+                return log_openssl_errors("Failed to parse PEM");
+
+        *ret = TAKE_PTR(key);
+        return 0;
+}
+
+/* Returns the number of bytes generated by the specified digest algorithm. This can be used only for
+ * fixed-size algorithms, e.g. md5, sha1, sha256, etc. Do not use this for variable-sized digest algorithms,
+ * e.g. shake128. Returns 0 on success and stores the size in *ret_digest_size, -EOPNOTSUPP if the algorithm
+ * is not supported, or < 0 for any other error. */
+int openssl_digest_size(const char *digest_alg, size_t *ret_digest_size) {
+        assert(digest_alg);
+        assert(ret_digest_size);
+
+#if OPENSSL_VERSION_MAJOR >= 3
+        _cleanup_(EVP_MD_freep) EVP_MD *md = EVP_MD_fetch(NULL, digest_alg, NULL);
+#else
+        const EVP_MD *md = EVP_get_digestbyname(digest_alg);
+#endif
+        if (!md)
+                return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                       "Digest algorithm '%s' not supported.", digest_alg);
+
+        /* The OpenSSL size getters return a signed int and signal failure with a negative value. Keep the
+         * result signed until it is validated, so that an error is not turned into a huge size_t. */
+        int digest_size;
+#if OPENSSL_VERSION_MAJOR >= 3
+        digest_size = EVP_MD_get_size(md);
+#else
+        digest_size = EVP_MD_size(md);
+#endif
+        if (digest_size <= 0)
+                return log_openssl_errors("Failed to get Digest size");
+
+        *ret_digest_size = (size_t) digest_size;
+
+        return 0;
+}
+
+/* Hashes the concatenation of all 'data' buffers with the named digest algorithm. On success returns 0,
+ * stores a newly allocated buffer with the digest in *ret_digest and, if ret_digest_size is non-NULL, its
+ * length there. Returns -EOPNOTSUPP if the digest algorithm is not supported, or < 0 for any other
+ * error. */
+int openssl_digest_many(
+                const char *digest_alg,
+                const struct iovec data[],
+                size_t n_data,
+                void **ret_digest,
+                size_t *ret_digest_size) {
+
+        assert(digest_alg);
+        assert(data || n_data == 0);
+        assert(ret_digest);
+        /* ret_digest_size may be NULL, for callers that know the digest size already */
+
+#if OPENSSL_VERSION_MAJOR >= 3
+        _cleanup_(EVP_MD_freep) EVP_MD *md = EVP_MD_fetch(NULL, digest_alg, NULL);
+#else
+        const EVP_MD *md = EVP_get_digestbyname(digest_alg);
+#endif
+        if (!md)
+                return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                       "Digest algorithm '%s' not supported.", digest_alg);
+
+        _cleanup_(EVP_MD_CTX_freep) EVP_MD_CTX *md_ctx = EVP_MD_CTX_new();
+        if (!md_ctx)
+                return log_openssl_errors("Failed to create new EVP_MD_CTX");
+
+        if (!EVP_DigestInit_ex(md_ctx, md, NULL))
+                return log_openssl_errors("Failed to initialize EVP_MD_CTX");
+
+        size_t idx = 0;
+        while (idx < n_data) {
+                if (!EVP_DigestUpdate(md_ctx, data[idx].iov_base, data[idx].iov_len))
+                        return log_openssl_errors("Failed to update Digest");
+                idx++;
+        }
+
+        size_t expected_size;
+        int r = openssl_digest_size(digest_alg, &expected_size);
+        if (r < 0)
+                return r;
+
+        _cleanup_free_ void *out = malloc(expected_size);
+        if (!out)
+                return log_oom_debug();
+
+        unsigned int produced;
+        if (!EVP_DigestFinal_ex(md_ctx, out, &produced))
+                return log_openssl_errors("Failed to finalize Digest");
+
+        assert(produced == expected_size);
+
+        *ret_digest = TAKE_PTR(out);
+        if (ret_digest_size)
+                *ret_digest_size = produced;
+
+        return 0;
+}
+
+/* Calculate the HMAC digest hash value for the provided data, using the provided key and specified digest
+ * algorithm. Returns 0 on success, -EOPNOTSUPP if the digest algorithm is not supported, or < 0 for any
+ * other error. On success *ret_digest receives a newly allocated buffer holding the MAC, and, if
+ * ret_digest_size is non-NULL, its length is stored there. With OpenSSL >= 3 the EVP_MAC API is used,
+ * otherwise the HMAC_* API. */
+int openssl_hmac_many(
+ const char *digest_alg,
+ const void *key,
+ size_t key_size,
+ const struct iovec data[],
+ size_t n_data,
+ void **ret_digest,
+ size_t *ret_digest_size) {
+
+ assert(digest_alg);
+ assert(key);
+ assert(data || n_data == 0);
+ assert(ret_digest);
+ /* ret_digest_size is optional, as caller may already know the digest size */
+
+#if OPENSSL_VERSION_MAJOR >= 3
+ _cleanup_(EVP_MD_freep) EVP_MD *md = EVP_MD_fetch(NULL, digest_alg, NULL);
+#else
+ const EVP_MD *md = EVP_get_digestbyname(digest_alg);
+#endif
+ if (!md) /* checked up front so that an unknown digest is reported as -EOPNOTSUPP */
+ return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "Digest algorithm '%s' not supported.", digest_alg);
+
+#if OPENSSL_VERSION_MAJOR >= 3
+ _cleanup_(EVP_MAC_freep) EVP_MAC *mac = EVP_MAC_fetch(NULL, "HMAC", NULL);
+ if (!mac)
+ return log_openssl_errors("Failed to create new EVP_MAC");
+
+ _cleanup_(EVP_MAC_CTX_freep) EVP_MAC_CTX *ctx = EVP_MAC_CTX_new(mac);
+ if (!ctx)
+ return log_openssl_errors("Failed to create new EVP_MAC_CTX");
+
+ _cleanup_(OSSL_PARAM_BLD_freep) OSSL_PARAM_BLD *bld = OSSL_PARAM_BLD_new();
+ if (!bld)
+ return log_openssl_errors("Failed to create new OSSL_PARAM_BLD");
+
+ if (!OSSL_PARAM_BLD_push_utf8_string(bld, OSSL_MAC_PARAM_DIGEST, (char*) digest_alg, 0)) /* the digest is selected by name via a parameter */
+ return log_openssl_errors("Failed to set HMAC OSSL_MAC_PARAM_DIGEST");
+
+ _cleanup_(OSSL_PARAM_freep) OSSL_PARAM *params = OSSL_PARAM_BLD_to_param(bld);
+ if (!params)
+ return log_openssl_errors("Failed to build HMAC OSSL_PARAM");
+
+ if (!EVP_MAC_init(ctx, key, key_size, params))
+ return log_openssl_errors("Failed to initialize EVP_MAC_CTX");
+#else
+ _cleanup_(HMAC_CTX_freep) HMAC_CTX *ctx = HMAC_CTX_new();
+ if (!ctx)
+ return log_openssl_errors("Failed to create new HMAC_CTX");
+
+ if (!HMAC_Init_ex(ctx, key, key_size, md, NULL))
+ return log_openssl_errors("Failed to initialize HMAC_CTX");
+#endif
+
+ for (size_t i = 0; i < n_data; i++) /* feed the buffers in order; only the update call differs per OpenSSL version */
+#if OPENSSL_VERSION_MAJOR >= 3
+ if (!EVP_MAC_update(ctx, data[i].iov_base, data[i].iov_len))
+#else
+ if (!HMAC_Update(ctx, data[i].iov_base, data[i].iov_len))
+#endif
+ return log_openssl_errors("Failed to update HMAC");
+
+ size_t digest_size;
+#if OPENSSL_VERSION_MAJOR >= 3
+ digest_size = EVP_MAC_CTX_get_mac_size(ctx);
+#else
+ digest_size = HMAC_size(ctx);
+#endif
+ if (digest_size == 0)
+ return log_openssl_errors("Failed to get HMAC digest size");
+
+ _cleanup_free_ void *buf = malloc(digest_size);
+ if (!buf)
+ return log_oom_debug();
+
+#if OPENSSL_VERSION_MAJOR >= 3
+ size_t size;
+ if (!EVP_MAC_final(ctx, buf, &size, digest_size))
+#else
+ unsigned int size;
+ if (!HMAC_Final(ctx, buf, &size))
+#endif
+ return log_openssl_errors("Failed to finalize HMAC");
+
+ assert(size == digest_size);
+
+ *ret_digest = TAKE_PTR(buf); /* ownership of the buffer passes to the caller */
+ if (ret_digest_size)
+ *ret_digest_size = size;
+
+ return 0;
+}
+
+/* Symmetric Cipher encryption using the alg-bits-mode cipher, e.g. AES-128-CFB. The key is required and must
+ * be at least the minimum required key length for the cipher. The IV is optional but, if provided, it must
+ * be at least the minimum iv length for the cipher. If no IV is provided and the cipher requires one, a
+ * buffer of zeroes is used. On success a newly allocated buffer with the ciphertext of all 'data' buffers
+ * (encrypted in order) is stored in *ret and its length in *ret_size. Returns 0 on success, -EOPNOTSUPP if
+ * the cipher algorithm is not supported, -EINVAL if too little key or IV data was provided, or < 0 on any
+ * other error. */
+int openssl_cipher_many(
+                const char *alg,
+                size_t bits,
+                const char *mode,
+                const void *key,
+                size_t key_size,
+                const void *iv,
+                size_t iv_size,
+                const struct iovec data[],
+                size_t n_data,
+                void **ret,
+                size_t *ret_size) {
+
+        assert(alg);
+        assert(bits > 0);
+        assert(mode);
+        assert(key);
+        assert(iv || iv_size == 0);
+        assert(data || n_data == 0);
+        assert(ret);
+        assert(ret_size);
+
+        /* OpenSSL looks ciphers up by their combined name, e.g. "AES-128-CFB" */
+        _cleanup_free_ char *cipher_alg = NULL;
+        if (asprintf(&cipher_alg, "%s-%zu-%s", alg, bits, mode) < 0)
+                return log_oom_debug();
+
+#if OPENSSL_VERSION_MAJOR >= 3
+        _cleanup_(EVP_CIPHER_freep) EVP_CIPHER *cipher = EVP_CIPHER_fetch(NULL, cipher_alg, NULL);
+#else
+        const EVP_CIPHER *cipher = EVP_get_cipherbyname(cipher_alg);
+#endif
+        if (!cipher)
+                return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                       "Cipher algorithm '%s' not supported.", cipher_alg);
+
+        _cleanup_(EVP_CIPHER_CTX_freep) EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();
+        if (!ctx)
+                return log_openssl_errors("Failed to create new EVP_CIPHER_CTX");
+
+        /* Verify enough key data was provided. */
+        int cipher_key_length = EVP_CIPHER_key_length(cipher);
+        assert(cipher_key_length >= 0);
+        if ((size_t) cipher_key_length > key_size)
+                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Not enough key bytes provided, require %d", cipher_key_length);
+
+        /* Verify enough IV data was provided or, if no IV was provided, use a zeroed buffer for IV data. */
+        int cipher_iv_length = EVP_CIPHER_iv_length(cipher);
+        assert(cipher_iv_length >= 0);
+        _cleanup_free_ void *zero_iv = NULL;
+        if (iv_size == 0) {
+                zero_iv = malloc0(cipher_iv_length);
+                if (!zero_iv)
+                        return log_oom_debug();
+
+                iv = zero_iv;
+                iv_size = (size_t) cipher_iv_length;
+        }
+        if ((size_t) cipher_iv_length > iv_size)
+                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Not enough IV bytes provided, require %d", cipher_iv_length);
+
+        /* Note: log_openssl_errors() appends ": <details>" to the message, hence no trailing full stop. */
+        if (!EVP_EncryptInit(ctx, cipher, key, iv))
+                return log_openssl_errors("Failed to initialize EVP_CIPHER_CTX");
+
+        int cipher_block_size = EVP_CIPHER_CTX_block_size(ctx);
+        assert(cipher_block_size > 0);
+
+        _cleanup_free_ uint8_t *buf = NULL;
+        size_t size = 0;
+
+        for (size_t i = 0; i < n_data; i++) {
+                /* Cipher may produce (up to) input length + cipher block size of output. */
+                if (!GREEDY_REALLOC(buf, size + data[i].iov_len + cipher_block_size))
+                        return log_oom_debug();
+
+                int update_size;
+                if (!EVP_EncryptUpdate(ctx, &buf[size], &update_size, data[i].iov_base, data[i].iov_len))
+                        return log_openssl_errors("Failed to update Cipher");
+
+                size += update_size;
+        }
+
+        /* Finalization may emit up to one more block */
+        if (!GREEDY_REALLOC(buf, size + cipher_block_size))
+                return log_oom_debug();
+
+        int final_size;
+        if (!EVP_EncryptFinal_ex(ctx, &buf[size], &final_size))
+                return log_openssl_errors("Failed to finalize Cipher");
+
+        *ret = TAKE_PTR(buf);
+        *ret_size = size + final_size;
+
+        return 0;
+}
+
+/* Perform Single-Step (aka "Concat") KDF. Currently, this only supports using the digest for the auxiliary
+ * function. The derive_size parameter specifies how many bytes are derived. The salt and info parameters are
+ * optional and are only passed on if non-NULL. On success returns 0 and stores a newly allocated buffer of
+ * derive_size bytes in *ret. When built against OpenSSL < 3 this always fails with -EOPNOTSUPP.
+ *
+ * For more details see: https://www.openssl.org/docs/manmaster/man7/EVP_KDF-SS.html */
+int kdf_ss_derive(
+ const char *digest,
+ const void *key,
+ size_t key_size,
+ const void *salt,
+ size_t salt_size,
+ const void *info,
+ size_t info_size,
+ size_t derive_size,
+ void **ret) {
+
+#if OPENSSL_VERSION_MAJOR >= 3
+ assert(digest);
+ assert(key);
+ assert(derive_size > 0);
+ assert(ret);
+
+ _cleanup_(EVP_KDF_freep) EVP_KDF *kdf = EVP_KDF_fetch(NULL, "SSKDF", NULL);
+ if (!kdf)
+ return log_openssl_errors("Failed to create new EVP_KDF");
+
+ _cleanup_(EVP_KDF_CTX_freep) EVP_KDF_CTX *ctx = EVP_KDF_CTX_new(kdf);
+ if (!ctx)
+ return log_openssl_errors("Failed to create new EVP_KDF_CTX");
+
+ _cleanup_(OSSL_PARAM_BLD_freep) OSSL_PARAM_BLD *bld = OSSL_PARAM_BLD_new();
+ if (!bld)
+ return log_openssl_errors("Failed to create new OSSL_PARAM_BLD");
+
+ _cleanup_free_ void *buf = malloc(derive_size); /* output buffer for the derived bytes */
+ if (!buf)
+ return log_oom_debug();
+
+ if (!OSSL_PARAM_BLD_push_utf8_string(bld, OSSL_KDF_PARAM_DIGEST, (char*) digest, 0))
+ return log_openssl_errors("Failed to add KDF-SS OSSL_KDF_PARAM_DIGEST");
+
+ if (!OSSL_PARAM_BLD_push_octet_string(bld, OSSL_KDF_PARAM_KEY, (char*) key, key_size))
+ return log_openssl_errors("Failed to add KDF-SS OSSL_KDF_PARAM_KEY");
+
+ if (salt) /* optional */
+ if (!OSSL_PARAM_BLD_push_octet_string(bld, OSSL_KDF_PARAM_SALT, (char*) salt, salt_size))
+ return log_openssl_errors("Failed to add KDF-SS OSSL_KDF_PARAM_SALT");
+
+ if (info) /* optional */
+ if (!OSSL_PARAM_BLD_push_octet_string(bld, OSSL_KDF_PARAM_INFO, (char*) info, info_size))
+ return log_openssl_errors("Failed to add KDF-SS OSSL_KDF_PARAM_INFO");
+
+ _cleanup_(OSSL_PARAM_freep) OSSL_PARAM *params = OSSL_PARAM_BLD_to_param(bld);
+ if (!params)
+ return log_openssl_errors("Failed to build KDF-SS OSSL_PARAM");
+
+ if (EVP_KDF_derive(ctx, buf, derive_size, params) <= 0)
+ return log_openssl_errors("OpenSSL KDF-SS derive failed");
+
+ *ret = TAKE_PTR(buf); /* ownership of the buffer passes to the caller */
+
+ return 0;
+#else
+ return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "KDF-SS requires OpenSSL >= 3.");
+#endif
+}
+
+/* Runs the OpenSSL "KBKDF" key derivation with HMAC as the MAC. 'mode' has to be "COUNTER" or "FEEDBACK"
+ * (checked via strcaseeq()). Parameter names follow the OpenSSL API; in SP800-108 terms "...key, salt,
+ * info, and seed correspond to KI, Label, Context, and IV (respectively)...". key, salt, info and seed are
+ * only passed on to OpenSSL when non-NULL. On success 0 is returned and *ret points to a newly allocated
+ * buffer of derive_size bytes, which the caller has to free. Without OpenSSL 3 this fails with
+ * -EOPNOTSUPP.
+ *
+ * For more details see: https://www.openssl.org/docs/manmaster/man7/EVP_KDF-KB.html */
+int kdf_kb_hmac_derive(
+                const char *mode,
+                const char *digest,
+                const void *key,
+                size_t key_size,
+                const void *salt,
+                size_t salt_size,
+                const void *info,
+                size_t info_size,
+                const void *seed,
+                size_t seed_size,
+                size_t derive_size,
+                void **ret) {
+
+#if OPENSSL_VERSION_MAJOR >= 3
+        assert(mode);
+        assert(strcaseeq(mode, "COUNTER") || strcaseeq(mode, "FEEDBACK"));
+        assert(digest);
+        assert(key || key_size == 0);
+        assert(salt || salt_size == 0);
+        assert(info || info_size == 0);
+        assert(seed || seed_size == 0);
+        assert(derive_size > 0);
+        assert(ret);
+
+        _cleanup_(EVP_KDF_freep) EVP_KDF *kbkdf = EVP_KDF_fetch(NULL, "KBKDF", NULL);
+        if (!kbkdf)
+                return log_openssl_errors("Failed to create new EVP_KDF");
+
+        _cleanup_(EVP_KDF_CTX_freep) EVP_KDF_CTX *kdf_ctx = EVP_KDF_CTX_new(kbkdf);
+        if (!kdf_ctx)
+                return log_openssl_errors("Failed to create new EVP_KDF_CTX");
+
+        _cleanup_(OSSL_PARAM_BLD_freep) OSSL_PARAM_BLD *builder = OSSL_PARAM_BLD_new();
+        if (!builder)
+                return log_openssl_errors("Failed to create new OSSL_PARAM_BLD");
+
+        /* Mandatory string parameters first: MAC type, KDF mode and digest algorithm. */
+        if (!OSSL_PARAM_BLD_push_utf8_string(builder, OSSL_KDF_PARAM_MAC, (char*) "HMAC", 0))
+                return log_openssl_errors("Failed to add KDF-KB OSSL_KDF_PARAM_MAC");
+
+        if (!OSSL_PARAM_BLD_push_utf8_string(builder, OSSL_KDF_PARAM_MODE, (char*) mode, 0))
+                return log_openssl_errors("Failed to add KDF-KB OSSL_KDF_PARAM_MODE");
+
+        if (!OSSL_PARAM_BLD_push_utf8_string(builder, OSSL_KDF_PARAM_DIGEST, (char*) digest, 0))
+                return log_openssl_errors("Failed to add KDF-KB OSSL_KDF_PARAM_DIGEST");
+
+        /* The octet string parameters are optional, each is only added if the caller supplied it. */
+        if (key && !OSSL_PARAM_BLD_push_octet_string(builder, OSSL_KDF_PARAM_KEY, (char*) key, key_size))
+                return log_openssl_errors("Failed to add KDF-KB OSSL_KDF_PARAM_KEY");
+
+        if (salt && !OSSL_PARAM_BLD_push_octet_string(builder, OSSL_KDF_PARAM_SALT, (char*) salt, salt_size))
+                return log_openssl_errors("Failed to add KDF-KB OSSL_KDF_PARAM_SALT");
+
+        if (info && !OSSL_PARAM_BLD_push_octet_string(builder, OSSL_KDF_PARAM_INFO, (char*) info, info_size))
+                return log_openssl_errors("Failed to add KDF-KB OSSL_KDF_PARAM_INFO");
+
+        if (seed && !OSSL_PARAM_BLD_push_octet_string(builder, OSSL_KDF_PARAM_SEED, (char*) seed, seed_size))
+                return log_openssl_errors("Failed to add KDF-KB OSSL_KDF_PARAM_SEED");
+
+        _cleanup_(OSSL_PARAM_freep) OSSL_PARAM *kdf_params = OSSL_PARAM_BLD_to_param(builder);
+        if (!kdf_params)
+                return log_openssl_errors("Failed to build KDF-KB OSSL_PARAM");
+
+        _cleanup_free_ void *derived = malloc(derive_size);
+        if (!derived)
+                return log_oom_debug();
+
+        if (EVP_KDF_derive(kdf_ctx, derived, derive_size, kdf_params) <= 0)
+                return log_openssl_errors("OpenSSL KDF-KB derive failed");
+
+        *ret = TAKE_PTR(derived);
+
+        return 0;
+#else
+        return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "KDF-KB requires OpenSSL >= 3.");
+#endif
+}
+
+/* Encrypts decrypted_key with the public key in pkey, using PKCS#1 padding (RSA_PKCS1_PADDING). The
+ * ciphertext size is first queried from OpenSSL by passing a NULL output buffer, then a buffer of that
+ * size is allocated and the encryption is done for real. On success returns 0 and passes the newly
+ * allocated ciphertext and its size to the caller, who has to free it. */
+int rsa_encrypt_bytes(
+                EVP_PKEY *pkey,
+                const void *decrypted_key,
+                size_t decrypted_key_size,
+                void **ret_encrypt_key,
+                size_t *ret_encrypt_key_size) {
+
+        _cleanup_(EVP_PKEY_CTX_freep) EVP_PKEY_CTX *ctx = NULL;
+        _cleanup_free_ void *b = NULL;
+        size_t l;
+
+        ctx = EVP_PKEY_CTX_new(pkey, NULL);
+        if (!ctx)
+                return log_openssl_errors("Failed to allocate public key context");
+
+        if (EVP_PKEY_encrypt_init(ctx) <= 0)
+                return log_openssl_errors("Failed to initialize public key context");
+
+        if (EVP_PKEY_CTX_set_rsa_padding(ctx, RSA_PKCS1_PADDING) <= 0)
+                return log_openssl_errors("Failed to configure PKCS#1 padding");
+
+        /* First pass: NULL output buffer, only determines the required size */
+        if (EVP_PKEY_encrypt(ctx, NULL, &l, decrypted_key, decrypted_key_size) <= 0)
+                return log_openssl_errors("Failed to determine encrypted key size");
+
+        b = malloc(l);
+        if (!b)
+                return log_oom_debug();
+
+        /* Second pass: the actual encryption. Report it as such, so that it can be told apart from a
+         * failure of the size query above. */
+        if (EVP_PKEY_encrypt(ctx, b, &l, decrypted_key, decrypted_key_size) <= 0)
+                return log_openssl_errors("Failed to encrypt key");
+
+        *ret_encrypt_key = TAKE_PTR(b);
+        *ret_encrypt_key_size = l;
+
+        return 0;
+}
+
+/* RSA-OAEP encryption of the key data, with the given label and digest algorithm. Note that the label is
+ * handed to OpenSSL including its trailing NUL byte. Returns 0 on success (the caller then owns the newly
+ * allocated ciphertext), -EOPNOTSUPP if the digest algorithm is not supported, or < 0 for any other
+ * error. */
+int rsa_oaep_encrypt_bytes(
+                const EVP_PKEY *pkey,
+                const char *digest_alg,
+                const char *label,
+                const void *decrypted_key,
+                size_t decrypted_key_size,
+                void **ret_encrypt_key,
+                size_t *ret_encrypt_key_size) {
+
+        assert(pkey);
+        assert(digest_alg);
+        assert(label);
+        assert(decrypted_key);
+        assert(decrypted_key_size > 0);
+        assert(ret_encrypt_key);
+        assert(ret_encrypt_key_size);
+
+        /* With OpenSSL 3 the digest is fetched (and has to be freed again), before that it is a static
+         * object looked up by name. */
+#if OPENSSL_VERSION_MAJOR >= 3
+        _cleanup_(EVP_MD_freep) EVP_MD *md = EVP_MD_fetch(NULL, digest_alg, NULL);
+#else
+        const EVP_MD *md = EVP_get_digestbyname(digest_alg);
+#endif
+        if (!md)
+                return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                       "Digest algorithm '%s' not supported.", digest_alg);
+
+        _cleanup_(EVP_PKEY_CTX_freep) EVP_PKEY_CTX *enc_ctx = EVP_PKEY_CTX_new((EVP_PKEY*) pkey, NULL);
+        if (!enc_ctx)
+                return log_openssl_errors("Failed to create new EVP_PKEY_CTX");
+
+        if (EVP_PKEY_encrypt_init(enc_ctx) <= 0)
+                return log_openssl_errors("Failed to initialize EVP_PKEY_CTX");
+
+        if (EVP_PKEY_CTX_set_rsa_padding(enc_ctx, RSA_PKCS1_OAEP_PADDING) <= 0)
+                return log_openssl_errors("Failed to configure RSA-OAEP padding");
+
+        if (EVP_PKEY_CTX_set_rsa_oaep_md(enc_ctx, md) <= 0)
+                return log_openssl_errors("Failed to configure RSA-OAEP MD");
+
+        /* The set0 call takes ownership of the label buffer, hence pass a private copy. */
+        _cleanup_free_ char *label_copy = strdup(label);
+        if (!label_copy)
+                return log_oom_debug();
+
+        size_t label_size = strlen(label_copy) + 1;
+        if (EVP_PKEY_CTX_set0_rsa_oaep_label(enc_ctx, label_copy, label_size) <= 0)
+                return log_openssl_errors("Failed to configure RSA-OAEP label");
+        /* enc_ctx owns the copy from here on, we must not free it */
+        TAKE_PTR(label_copy);
+
+        /* Query the ciphertext size first (NULL output buffer), then encrypt into a suitably sized buffer */
+        size_t encrypted_size = 0;
+        if (EVP_PKEY_encrypt(enc_ctx, NULL, &encrypted_size, decrypted_key, decrypted_key_size) <= 0)
+                return log_openssl_errors("Failed to determine RSA-OAEP encrypted key size");
+
+        _cleanup_free_ void *encrypted = malloc(encrypted_size);
+        if (!encrypted)
+                return log_oom_debug();
+
+        if (EVP_PKEY_encrypt(enc_ctx, encrypted, &encrypted_size, decrypted_key, decrypted_key_size) <= 0)
+                return log_openssl_errors("Failed to RSA-OAEP encrypt");
+
+        *ret_encrypt_key = TAKE_PTR(encrypted);
+        *ret_encrypt_key_size = encrypted_size;
+
+        return 0;
+}
+
+int rsa_pkey_to_suitable_key_size(
+                EVP_PKEY *pkey,
+                size_t *ret_suitable_key_size) {
+
+        assert(pkey);
+        assert(ret_suitable_key_size);
+
+        /* Analyzes the specified public key and verifies that it is RSA. If so, returns a suitable size
+         * (in bytes) for a disk encryption key to encrypt with RSA for use in PKCS#11 security token
+         * schemes. Fails with -EBADMSG for non-RSA keys and with -EIO if the computed size is zero. */
+
+        if (EVP_PKEY_base_id(pkey) != EVP_PKEY_RSA)
+                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "X.509 certificate does not refer to RSA key.");
+
+        int rsa_bits = EVP_PKEY_bits(pkey);
+        log_debug("Bits in RSA key: %i", rsa_bits);
+
+        /* We use PKCS#1 padding for the RSA cleartext, hence let's leave some extra space for it, hence only
+         * generate a random key half the size of the RSA length */
+        size_t half_size = rsa_bits / 8 / 2;
+        if (half_size == 0)
+                return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Uh, RSA key size too short?");
+
+        *ret_suitable_key_size = half_size;
+        return 0;
+}
+
+/* Build an RSA public key object from raw "n" and "e" values, both given in the binary format consumed by
+ * BN_bin2bn(). Note that if "e" is a number (e.g. uint32_t), it must be provided here big-endian, e.g. wrap
+ * it with htobe32(). On success returns 0 and stores the new key in *ret; the caller owns it. */
+int rsa_pkey_from_n_e(const void *n, size_t n_size, const void *e, size_t e_size, EVP_PKEY **ret) {
+        _cleanup_(EVP_PKEY_freep) EVP_PKEY *key = NULL;
+
+        assert(n);
+        assert(e);
+        assert(ret);
+
+        _cleanup_(EVP_PKEY_CTX_freep) EVP_PKEY_CTX *key_ctx = EVP_PKEY_CTX_new_id(EVP_PKEY_RSA, NULL);
+        if (!key_ctx)
+                return log_openssl_errors("Failed to create new EVP_PKEY_CTX");
+
+        _cleanup_(BN_freep) BIGNUM *modulus = BN_bin2bn(n, n_size, NULL);
+        if (!modulus)
+                return log_openssl_errors("Failed to create BIGNUM for RSA n");
+
+        _cleanup_(BN_freep) BIGNUM *exponent = BN_bin2bn(e, e_size, NULL);
+        if (!exponent)
+                return log_openssl_errors("Failed to create BIGNUM for RSA e");
+
+#if OPENSSL_VERSION_MAJOR >= 3
+        /* OpenSSL 3: describe the key via an OSSL_PARAM array and let EVP_PKEY_fromdata() construct it */
+        if (EVP_PKEY_fromdata_init(key_ctx) <= 0)
+                return log_openssl_errors("Failed to initialize EVP_PKEY_CTX");
+
+        _cleanup_(OSSL_PARAM_BLD_freep) OSSL_PARAM_BLD *builder = OSSL_PARAM_BLD_new();
+        if (!builder)
+                return log_openssl_errors("Failed to create new OSSL_PARAM_BLD");
+
+        if (!OSSL_PARAM_BLD_push_BN(builder, OSSL_PKEY_PARAM_RSA_N, modulus))
+                return log_openssl_errors("Failed to set RSA OSSL_PKEY_PARAM_RSA_N");
+
+        if (!OSSL_PARAM_BLD_push_BN(builder, OSSL_PKEY_PARAM_RSA_E, exponent))
+                return log_openssl_errors("Failed to set RSA OSSL_PKEY_PARAM_RSA_E");
+
+        _cleanup_(OSSL_PARAM_freep) OSSL_PARAM *key_params = OSSL_PARAM_BLD_to_param(builder);
+        if (!key_params)
+                return log_openssl_errors("Failed to build RSA OSSL_PARAM");
+
+        if (EVP_PKEY_fromdata(key_ctx, &key, EVP_PKEY_PUBLIC_KEY, key_params) <= 0)
+                return log_openssl_errors("Failed to create RSA EVP_PKEY");
+#else
+        /* Older OpenSSL: fill in an RSA object and wrap it in an EVP_PKEY */
+        _cleanup_(RSA_freep) RSA *rsa = RSA_new();
+        if (!rsa)
+                return log_openssl_errors("Failed to create new RSA");
+
+        if (!RSA_set0_key(rsa, modulus, exponent, NULL))
+                return log_openssl_errors("Failed to set RSA n/e");
+        /* the RSA object owns both numbers now, we must not free them */
+        TAKE_PTR(modulus);
+        TAKE_PTR(exponent);
+
+        key = EVP_PKEY_new();
+        if (!key)
+                return log_openssl_errors("Failed to create new EVP_PKEY");
+
+        if (!EVP_PKEY_assign_RSA(key, rsa))
+                return log_openssl_errors("Failed to assign RSA key");
+        /* the EVP_PKEY owns the RSA object now, we must not free it */
+        TAKE_PTR(rsa);
+#endif
+
+        *ret = TAKE_PTR(key);
+
+        return 0;
+}
+
+/* Get the "n" and "e" values from the pkey. The values are returned in "bin" format, i.e. BN_bn2bin(). On
+ * success returns 0 and passes two newly allocated buffers plus their sizes to the caller, who has to free
+ * them. */
+int rsa_pkey_to_n_e(
+                const EVP_PKEY *pkey,
+                void **ret_n,
+                size_t *ret_n_size,
+                void **ret_e,
+                size_t *ret_e_size) {
+
+        assert(pkey);
+        assert(ret_n);
+        assert(ret_n_size);
+        assert(ret_e);
+        assert(ret_e_size);
+
+#if OPENSSL_VERSION_MAJOR >= 3
+        /* Here the BIGNUMs are copies we own, hence they are freed on return */
+        _cleanup_(BN_freep) BIGNUM *bn_n = NULL;
+        if (!EVP_PKEY_get_bn_param(pkey, OSSL_PKEY_PARAM_RSA_N, &bn_n))
+                return log_openssl_errors("Failed to get RSA n");
+
+        _cleanup_(BN_freep) BIGNUM *bn_e = NULL;
+        if (!EVP_PKEY_get_bn_param(pkey, OSSL_PKEY_PARAM_RSA_E, &bn_e))
+                return log_openssl_errors("Failed to get RSA e");
+#else
+        /* Here the get0 accessors return pointers we do not own, hence nothing is freed */
+        const RSA *rsa = EVP_PKEY_get0_RSA((EVP_PKEY*) pkey);
+        if (!rsa)
+                return log_openssl_errors("Failed to get RSA key from public key");
+
+        const BIGNUM *bn_n = RSA_get0_n(rsa);
+        if (!bn_n)
+                return log_openssl_errors("Failed to get RSA n");
+
+        const BIGNUM *bn_e = RSA_get0_e(rsa);
+        if (!bn_e)
+                return log_openssl_errors("Failed to get RSA e");
+#endif
+
+        size_t n_size = BN_num_bytes(bn_n), e_size = BN_num_bytes(bn_e);
+        _cleanup_free_ void *n = malloc(n_size), *e = malloc(e_size);
+        if (!n || !e)
+                return log_oom_debug();
+
+        /* BN_bn2bin() is what actually fills the buffers, i.e. the checked expression has a side effect we
+         * rely on. Hence use assert_se() rather than assert() here. */
+        assert_se(BN_bn2bin(bn_n, n) == (int) n_size);
+        assert_se(BN_bn2bin(bn_e, e) == (int) e_size);
+
+        *ret_n = TAKE_PTR(n);
+        *ret_n_size = n_size;
+        *ret_e = TAKE_PTR(e);
+        *ret_e_size = e_size;
+
+        return 0;
+}
+
+/* Generate a new RSA key with the specified number of bits. On success returns 0 and stores the new key in
+ * *ret; the caller owns it. */
+int rsa_pkey_new(size_t bits, EVP_PKEY **ret) {
+        assert(ret);
+
+        _cleanup_(EVP_PKEY_CTX_freep) EVP_PKEY_CTX *ctx = EVP_PKEY_CTX_new_id(EVP_PKEY_RSA, NULL);
+        if (!ctx)
+                return log_openssl_errors("Failed to create new EVP_PKEY_CTX");
+
+        if (EVP_PKEY_keygen_init(ctx) <= 0)
+                return log_openssl_errors("Failed to initialize EVP_PKEY_CTX");
+
+        if (EVP_PKEY_CTX_set_rsa_keygen_bits(ctx, (int) bits) <= 0)
+                return log_openssl_errors("Failed to set RSA bits to %zu", bits);
+
+        _cleanup_(EVP_PKEY_freep) EVP_PKEY *pkey = NULL;
+        if (EVP_PKEY_keygen(ctx, &pkey) <= 0)
+                /* This generates an RSA key, hence say so in the error message */
+                return log_openssl_errors("Failed to generate RSA key");
+
+        *ret = TAKE_PTR(pkey);
+
+        return 0;
+}
+
+/* Generate ECC public key from provided curve ID and x/y points. The x and y values are parsed with
+ * BN_bin2bn(). On success returns 0 and stores the new key in *ret. */
+int ecc_pkey_from_curve_x_y(
+ int curve_id,
+ const void *x,
+ size_t x_size,
+ const void *y,
+ size_t y_size,
+ EVP_PKEY **ret) {
+
+ assert(x);
+ assert(y);
+ assert(ret);
+
+ _cleanup_(EVP_PKEY_CTX_freep) EVP_PKEY_CTX *ctx = EVP_PKEY_CTX_new_id(EVP_PKEY_EC, NULL);
+ if (!ctx)
+ return log_openssl_errors("Failed to create new EVP_PKEY_CTX");
+
+ _cleanup_(BN_freep) BIGNUM *bn_x = BN_bin2bn(x, x_size, NULL);
+ if (!bn_x)
+ return log_openssl_errors("Failed to create BIGNUM x");
+
+ _cleanup_(BN_freep) BIGNUM *bn_y = BN_bin2bn(y, y_size, NULL);
+ if (!bn_y)
+ return log_openssl_errors("Failed to create BIGNUM y");
+
+ /* Both code paths below need the point expressed on the curve's group, hence build it first */
+ _cleanup_(EC_GROUP_freep) EC_GROUP *group = EC_GROUP_new_by_curve_name(curve_id);
+ if (!group)
+ return log_openssl_errors("ECC curve id %d not supported", curve_id);
+
+ _cleanup_(EC_POINT_freep) EC_POINT *point = EC_POINT_new(group);
+ if (!point)
+ return log_openssl_errors("Failed to create new EC_POINT");
+
+ if (!EC_POINT_set_affine_coordinates(group, point, bn_x, bn_y, NULL))
+ return log_openssl_errors("Failed to set ECC coordinates");
+
+#if OPENSSL_VERSION_MAJOR >= 3
+ /* OpenSSL 3: describe the key as group name + encoded public point, and let EVP_PKEY_fromdata()
+ * construct it */
+ if (EVP_PKEY_fromdata_init(ctx) <= 0)
+ return log_openssl_errors("Failed to initialize EVP_PKEY_CTX");
+
+ _cleanup_(OSSL_PARAM_BLD_freep) OSSL_PARAM_BLD *bld = OSSL_PARAM_BLD_new();
+ if (!bld)
+ return log_openssl_errors("Failed to create new OSSL_PARAM_BLD");
+
+ if (!OSSL_PARAM_BLD_push_utf8_string(bld, OSSL_PKEY_PARAM_GROUP_NAME, (char*) OSSL_EC_curve_nid2name(curve_id), 0))
+ return log_openssl_errors("Failed to add ECC OSSL_PKEY_PARAM_GROUP_NAME");
+
+ /* The buffer is filled in by EC_POINT_point2buf() (uncompressed point encoding) and released via
+ * OPENSSL_free() */
+ _cleanup_(OPENSSL_freep) void *pbuf = NULL;
+ size_t pbuf_len = 0;
+ pbuf_len = EC_POINT_point2buf(group, point, POINT_CONVERSION_UNCOMPRESSED, (unsigned char**) &pbuf, NULL);
+ if (pbuf_len == 0)
+ return log_openssl_errors("Failed to convert ECC point to buffer");
+
+ if (!OSSL_PARAM_BLD_push_octet_string(bld, OSSL_PKEY_PARAM_PUB_KEY, pbuf, pbuf_len))
+ return log_openssl_errors("Failed to add ECC OSSL_PKEY_PARAM_PUB_KEY");
+
+ _cleanup_(OSSL_PARAM_freep) OSSL_PARAM *params = OSSL_PARAM_BLD_to_param(bld);
+ if (!params)
+ return log_openssl_errors("Failed to build ECC OSSL_PARAM");
+
+ _cleanup_(EVP_PKEY_freep) EVP_PKEY *pkey = NULL;
+ if (EVP_PKEY_fromdata(ctx, &pkey, EVP_PKEY_PUBLIC_KEY, params) <= 0)
+ return log_openssl_errors("Failed to create ECC EVP_PKEY");
+#else
+ /* Older OpenSSL: fill in an EC_KEY with group and public point, then wrap it in an EVP_PKEY */
+ _cleanup_(EC_KEY_freep) EC_KEY *eckey = EC_KEY_new();
+ if (!eckey)
+ return log_openssl_errors("Failed to create new EC_KEY");
+
+ if (!EC_KEY_set_group(eckey, group))
+ return log_openssl_errors("Failed to set ECC group");
+
+ if (!EC_KEY_set_public_key(eckey, point))
+ return log_openssl_errors("Failed to set ECC point");
+
+ _cleanup_(EVP_PKEY_freep) EVP_PKEY *pkey = EVP_PKEY_new();
+ if (!pkey)
+ return log_openssl_errors("Failed to create new EVP_PKEY");
+
+ if (!EVP_PKEY_assign_EC_KEY(pkey, eckey))
+ return log_openssl_errors("Failed to assign ECC key");
+ /* pkey owns this now, don't free */
+ TAKE_PTR(eckey);
+#endif
+
+ /* Hand the finished key to the caller; TAKE_PTR() keeps the cleanup handler from freeing it */
+ *ret = TAKE_PTR(pkey);
+
+ return 0;
+}
+
+/* Get the curve id and the x/y coordinates of the public point from the ECC pkey. The coordinates are
+ * returned in "bin" format, i.e. BN_bn2bin(). All return parameters are optional and may be NULL. The
+ * returned buffers are newly allocated and have to be freed by the caller. */
+int ecc_pkey_to_curve_x_y(
+                const EVP_PKEY *pkey,
+                int *ret_curve_id,
+                void **ret_x,
+                size_t *ret_x_size,
+                void **ret_y,
+                size_t *ret_y_size) {
+
+        _cleanup_(BN_freep) BIGNUM *bn_x = NULL, *bn_y = NULL;
+        int curve_id;
+
+        assert(pkey);
+
+#if OPENSSL_VERSION_MAJOR >= 3
+        /* Query the length of the group name first, then read it into a buffer with room for the NUL */
+        size_t name_size;
+        if (!EVP_PKEY_get_utf8_string_param(pkey, OSSL_PKEY_PARAM_GROUP_NAME, NULL, 0, &name_size))
+                return log_openssl_errors("Failed to get ECC group name size");
+
+        _cleanup_free_ char *name = new(char, name_size + 1);
+        if (!name)
+                return log_oom_debug();
+
+        if (!EVP_PKEY_get_utf8_string_param(pkey, OSSL_PKEY_PARAM_GROUP_NAME, name, name_size + 1, NULL))
+                return log_openssl_errors("Failed to get ECC group name");
+
+        curve_id = OBJ_sn2nid(name);
+        if (curve_id == NID_undef)
+                return log_openssl_errors("Failed to get ECC curve id");
+
+        if (!EVP_PKEY_get_bn_param(pkey, OSSL_PKEY_PARAM_EC_PUB_X, &bn_x))
+                return log_openssl_errors("Failed to get ECC point x");
+
+        if (!EVP_PKEY_get_bn_param(pkey, OSSL_PKEY_PARAM_EC_PUB_Y, &bn_y))
+                return log_openssl_errors("Failed to get ECC point y");
+#else
+        const EC_KEY *eckey = EVP_PKEY_get0_EC_KEY((EVP_PKEY*) pkey);
+        if (!eckey)
+                return log_openssl_errors("Failed to get EC_KEY");
+
+        const EC_GROUP *group = EC_KEY_get0_group(eckey);
+        if (!group)
+                return log_openssl_errors("Failed to get EC_GROUP");
+
+        curve_id = EC_GROUP_get_curve_name(group);
+        if (curve_id == NID_undef)
+                return log_openssl_errors("Failed to get ECC curve id");
+
+        const EC_POINT *point = EC_KEY_get0_public_key(eckey);
+        if (!point)
+                return log_openssl_errors("Failed to get EC_POINT");
+
+        bn_x = BN_new();
+        bn_y = BN_new();
+        if (!bn_x || !bn_y)
+                return log_openssl_errors("Failed to create new BIGNUM");
+
+        if (!EC_POINT_get_affine_coordinates(group, point, bn_x, bn_y, NULL))
+                return log_openssl_errors("Failed to get ECC x/y.");
+#endif
+
+        size_t x_size = BN_num_bytes(bn_x), y_size = BN_num_bytes(bn_y);
+        _cleanup_free_ void *x = malloc(x_size), *y = malloc(y_size);
+        if (!x || !y)
+                return log_oom_debug();
+
+        /* BN_bn2bin() is what actually fills the buffers, i.e. the checked expression has a side effect we
+         * rely on. Hence use assert_se() rather than assert() here. */
+        assert_se(BN_bn2bin(bn_x, x) == (int) x_size);
+        assert_se(BN_bn2bin(bn_y, y) == (int) y_size);
+
+        /* Buffers the caller did not ask for are released by the cleanup handlers */
+        if (ret_curve_id)
+                *ret_curve_id = curve_id;
+        if (ret_x)
+                *ret_x = TAKE_PTR(x);
+        if (ret_x_size)
+                *ret_x_size = x_size;
+        if (ret_y)
+                *ret_y = TAKE_PTR(y);
+        if (ret_y_size)
+                *ret_y_size = y_size;
+
+        return 0;
+}
+
+/* Create a fresh ECC key pair on the curve identified by curve_id. On success returns 0 and stores the new
+ * key in *ret; the caller owns it. */
+int ecc_pkey_new(int curve_id, EVP_PKEY **ret) {
+        _cleanup_(EVP_PKEY_CTX_freep) EVP_PKEY_CTX *keygen_ctx = NULL;
+        _cleanup_(EVP_PKEY_freep) EVP_PKEY *generated = NULL;
+
+        assert(ret);
+
+        keygen_ctx = EVP_PKEY_CTX_new_id(EVP_PKEY_EC, NULL);
+        if (!keygen_ctx)
+                return log_openssl_errors("Failed to create new EVP_PKEY_CTX");
+
+        if (EVP_PKEY_keygen_init(keygen_ctx) <= 0)
+                return log_openssl_errors("Failed to initialize EVP_PKEY_CTX");
+
+        if (EVP_PKEY_CTX_set_ec_paramgen_curve_nid(keygen_ctx, curve_id) <= 0)
+                return log_openssl_errors("Failed to set ECC curve %d", curve_id);
+
+        if (EVP_PKEY_keygen(keygen_ctx, &generated) <= 0)
+                return log_openssl_errors("Failed to generate ECC key");
+
+        *ret = TAKE_PTR(generated);
+
+        return 0;
+}
+
+/* ECDH key agreement: derives the ECC shared secret between our private key and the peer's public key. The
+ * operation is symmetric, i.e. ECDH with Alice's private key and Bob's public (peer) key yields the same
+ * shared secret as ECDH with Bob's private key and Alice's public (peer) key. On success returns 0 and
+ * passes the newly allocated shared secret and its size to the caller; otherwise returns an error. */
+int ecc_ecdh(const EVP_PKEY *private_pkey,
+             const EVP_PKEY *peer_pkey,
+             void **ret_shared_secret,
+             size_t *ret_shared_secret_size) {
+
+        _cleanup_(EVP_PKEY_CTX_freep) EVP_PKEY_CTX *derive_ctx = NULL;
+        _cleanup_free_ void *secret = NULL;
+        size_t secret_size;
+
+        assert(private_pkey);
+        assert(peer_pkey);
+        assert(ret_shared_secret);
+        assert(ret_shared_secret_size);
+
+        derive_ctx = EVP_PKEY_CTX_new((EVP_PKEY*) private_pkey, NULL);
+        if (!derive_ctx)
+                return log_openssl_errors("Failed to create new EVP_PKEY_CTX");
+
+        if (EVP_PKEY_derive_init(derive_ctx) <= 0)
+                return log_openssl_errors("Failed to initialize EVP_PKEY_CTX");
+
+        if (EVP_PKEY_derive_set_peer(derive_ctx, (EVP_PKEY*) peer_pkey) <= 0)
+                return log_openssl_errors("Failed to set ECC derive peer");
+
+        /* With a NULL output buffer only the size of the secret is determined */
+        if (EVP_PKEY_derive(derive_ctx, NULL, &secret_size) <= 0)
+                return log_openssl_errors("Failed to get ECC shared secret size");
+
+        secret = malloc(secret_size);
+        if (!secret)
+                return log_oom_debug();
+
+        if (EVP_PKEY_derive(derive_ctx, (unsigned char*) secret, &secret_size) <= 0)
+                return log_openssl_errors("Failed to derive ECC shared secret");
+
+        *ret_shared_secret = TAKE_PTR(secret);
+        *ret_shared_secret_size = secret_size;
+
+        return 0;
+}
+
+int pubkey_fingerprint(EVP_PKEY *pk, const EVP_MD *md, void **ret, size_t *ret_size) {
+
+        /* Calculates a message digest (algorithm chosen by 'md') of the DER encoded public key. On
+         * success returns 0 and passes the newly allocated digest and its size to the caller. */
+
+        assert(pk);
+        assert(md);
+        assert(ret);
+        assert(ret_size);
+
+        /* First call with a NULL output only tells us how large the DER encoding is */
+        int der_size = i2d_PublicKey(pk, NULL);
+        if (der_size < 0)
+                return log_openssl_errors("Unable to convert public key to DER format");
+
+        _cleanup_free_ void *der = malloc(der_size);
+        if (!der)
+                return log_oom_debug();
+
+        /* i2d_PublicKey() gets a pointer to our pointer, hence give it a scratch copy and keep 'der'
+         * pointing to the start of the buffer */
+        unsigned char *cursor = der;
+        int written = i2d_PublicKey(pk, &cursor);
+        if (written < 0)
+                return log_openssl_errors("Unable to convert public key to DER format");
+
+        _cleanup_(EVP_MD_CTX_freep) EVP_MD_CTX *md_ctx = EVP_MD_CTX_new();
+        if (!md_ctx)
+                return log_openssl_errors("Failed to create new EVP_MD_CTX");
+
+        if (EVP_DigestInit_ex(md_ctx, md, NULL) != 1)
+                return log_openssl_errors("Failed to initialize %s context", EVP_MD_name(md));
+
+        if (EVP_DigestUpdate(md_ctx, der, written) != 1)
+                return log_openssl_errors("Failed to run %s context", EVP_MD_name(md));
+
+        int msz = EVP_MD_size(md);
+        assert(msz > 0);
+
+        _cleanup_free_ void *digest = malloc(msz);
+        if (!digest)
+                return log_oom_debug();
+
+        unsigned umsz = msz;
+        if (EVP_DigestFinal_ex(md_ctx, digest, &umsz) != 1)
+                return log_openssl_errors("Failed to finalize hash context");
+
+        assert(umsz == (unsigned) msz);
+
+        *ret = TAKE_PTR(digest);
+        *ret_size = msz;
+
+        return 0;
+}
+
+/* Digests the data with 'md' and signs it with 'privkey' in one go (EVP_DigestSign()). If size is SIZE_MAX
+ * the data is taken to be a NUL terminated string and its length is determined with strlen(). On success
+ * returns 0 and passes the newly allocated signature and its size to the caller. */
+int digest_and_sign(
+                const EVP_MD *md,
+                EVP_PKEY *privkey,
+                const void *data, size_t size,
+                void **ret, size_t *ret_size) {
+
+        _cleanup_(EVP_MD_CTX_freep) EVP_MD_CTX *sign_ctx = NULL;
+        _cleanup_free_ void *signature = NULL;
+        size_t signature_size;
+
+        assert(privkey);
+        assert(ret);
+        assert(ret_size);
+
+        if (size != 0) {
+                assert(data);
+
+                if (size == SIZE_MAX) /* If SIZE_MAX input is a string whose size we determine automatically */
+                        size = strlen(data);
+        } else
+                data = ""; /* make sure to pass a valid pointer to OpenSSL */
+
+        sign_ctx = EVP_MD_CTX_new();
+        if (!sign_ctx)
+                return log_openssl_errors("Failed to create new EVP_MD_CTX");
+
+        if (EVP_DigestSignInit(sign_ctx, NULL, md, NULL, privkey) != 1)
+                return log_openssl_errors("Failed to initialize signature context");
+
+        /* With a NULL output buffer only the signature size is determined */
+        if (EVP_DigestSign(sign_ctx, NULL, &signature_size, data, size) != 1)
+                return log_openssl_errors("Failed to determine size of signature");
+
+        signature = malloc(signature_size);
+        if (!signature)
+                return log_oom_debug();
+
+        if (EVP_DigestSign(sign_ctx, signature, &signature_size, data, size) != 1)
+                return log_openssl_errors("Failed to sign data");
+
+        *ret = TAKE_PTR(signature);
+        *ret_size = signature_size;
+        return 0;
+}
+
+# if PREFER_OPENSSL
+/* Hashes the first 'len' bytes of 's' with the digest algorithm named by 'md_algorithm' (via
+ * openssl_digest()) and returns the result, formatted by hexmem(), as a newly allocated string in *ret. */
+int string_hashsum(
+                const char *s,
+                size_t len,
+                const char *md_algorithm,
+                char **ret) {
+
+        assert(s || len == 0);
+        assert(md_algorithm);
+        assert(ret);
+
+        _cleanup_free_ void *digest = NULL;
+        size_t digest_size;
+        int r = openssl_digest(md_algorithm, s, len, &digest, &digest_size);
+        if (r < 0)
+                return r;
+
+        char *hex = hexmem(digest, digest_size);
+        if (!hex)
+                return -ENOMEM;
+
+        /* Nothing can fail anymore, hence pass ownership straight to the caller */
+        *ret = hex;
+        return 0;
+}
+# endif
+#endif
+
+/* Calculates the SHA256 digest of the DER encoding of the specified certificate and writes it to 'buffer'.
+ * Returns 0 on success, and -EOPNOTSUPP if built without OpenSSL support. */
+int x509_fingerprint(X509 *cert, uint8_t buffer[static SHA256_DIGEST_SIZE]) {
+#if HAVE_OPENSSL
+        /* Since we pass a pointer to a NULL pointer, i2d_X509() allocates the output buffer for us. That
+         * memory comes from OpenSSL's allocator, hence release it with OPENSSL_free() rather than free(). */
+        _cleanup_(OPENSSL_freep) void *der = NULL;
+        int dersz;
+
+        assert(cert);
+
+        dersz = i2d_X509(cert, (unsigned char**) &der);
+        if (dersz < 0)
+                return log_openssl_errors("Unable to convert PEM certificate to DER format");
+
+        sha256_direct(der, dersz, buffer);
+        return 0;
+#else
+        return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "OpenSSL is not supported, cannot calculate X509 fingerprint: %m");
+#endif
+}
diff --git a/src/shared/openssl-util.h b/src/shared/openssl-util.h
new file mode 100644
index 0000000..e3f34a8
--- /dev/null
+++ b/src/shared/openssl-util.h
@@ -0,0 +1,167 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "iovec-util.h"
+#include "macro.h"
+#include "sha256.h"
+
+#define X509_FINGERPRINT_SIZE SHA256_DIGEST_SIZE
+
+#if HAVE_OPENSSL
+# include <openssl/bio.h>
+# include <openssl/bn.h>
+# include <openssl/crypto.h>
+# include <openssl/err.h>
+# include <openssl/evp.h>
+# include <openssl/opensslv.h>
+# include <openssl/pkcs7.h>
+# include <openssl/ssl.h>
+# include <openssl/x509v3.h>
+# ifndef OPENSSL_VERSION_MAJOR
+/* OPENSSL_VERSION_MAJOR macro was added in OpenSSL 3. Thus, if it doesn't exist, we must be before OpenSSL 3. */
+# define OPENSSL_VERSION_MAJOR 1
+# endif
+# if OPENSSL_VERSION_MAJOR >= 3
+# include <openssl/core_names.h>
+# include <openssl/kdf.h>
+# include <openssl/param_build.h>
+# endif
+
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL_MACRO(void*, OPENSSL_free, NULL);
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(X509_NAME*, X509_NAME_free, NULL);
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(EVP_PKEY_CTX*, EVP_PKEY_CTX_free, NULL);
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(EVP_CIPHER_CTX*, EVP_CIPHER_CTX_free, NULL);
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(EC_POINT*, EC_POINT_free, NULL);
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(EC_GROUP*, EC_GROUP_free, NULL);
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(BIGNUM*, BN_free, NULL);
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(BN_CTX*, BN_CTX_free, NULL);
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(ECDSA_SIG*, ECDSA_SIG_free, NULL);
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(PKCS7*, PKCS7_free, NULL);
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(SSL*, SSL_free, NULL);
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(BIO*, BIO_free, NULL);
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(EVP_MD_CTX*, EVP_MD_CTX_free, NULL);
+#if OPENSSL_VERSION_MAJOR >= 3
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(EVP_CIPHER*, EVP_CIPHER_free, NULL);
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(EVP_KDF*, EVP_KDF_free, NULL);
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(EVP_KDF_CTX*, EVP_KDF_CTX_free, NULL);
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(EVP_MAC*, EVP_MAC_free, NULL);
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(EVP_MAC_CTX*, EVP_MAC_CTX_free, NULL);
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(EVP_MD*, EVP_MD_free, NULL);
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(OSSL_PARAM*, OSSL_PARAM_free, NULL);
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(OSSL_PARAM_BLD*, OSSL_PARAM_BLD_free, NULL);
+#else
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(EC_KEY*, EC_KEY_free, NULL);
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(HMAC_CTX*, HMAC_CTX_free, NULL);
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(RSA*, RSA_free, NULL);
+#endif
+
+/* Cleanup helper: releases a stack of X509 certificates together with all certificates it contains. A NULL
+ * pointer or a pointer to a NULL stack is accepted and ignored. */
+static inline void sk_X509_free_allp(STACK_OF(X509) **sk) {
+        if (sk && *sk)
+                sk_X509_pop_free(*sk, X509_free);
+}
+
+int openssl_pkey_from_pem(const void *pem, size_t pem_size, EVP_PKEY **ret);
+
+int openssl_digest_size(const char *digest_alg, size_t *ret_digest_size);
+
+int openssl_digest_many(const char *digest_alg, const struct iovec data[], size_t n_data, void **ret_digest, size_t *ret_digest_size);
+
+/* Single-buffer convenience wrapper around openssl_digest_many(). */
+static inline int openssl_digest(
+                const char *digest_alg,
+                const void *buf,
+                size_t len,
+                void **ret_digest,
+                size_t *ret_digest_size) {
+
+        struct iovec single = IOVEC_MAKE((void*) buf, len);
+
+        return openssl_digest_many(digest_alg, &single, 1, ret_digest, ret_digest_size);
+}
+
+int openssl_hmac_many(const char *digest_alg, const void *key, size_t key_size, const struct iovec data[], size_t n_data, void **ret_digest, size_t *ret_digest_size);
+
+/* Single-buffer convenience wrapper around openssl_hmac_many(). */
+static inline int openssl_hmac(
+                const char *digest_alg,
+                const void *key,
+                size_t key_size,
+                const void *buf,
+                size_t len,
+                void **ret_digest,
+                size_t *ret_digest_size) {
+
+        struct iovec single = IOVEC_MAKE((void*) buf, len);
+
+        return openssl_hmac_many(digest_alg, key, key_size, &single, 1, ret_digest, ret_digest_size);
+}
+
+int openssl_cipher_many(const char *alg, size_t bits, const char *mode, const void *key, size_t key_size, const void *iv, size_t iv_size, const struct iovec data[], size_t n_data, void **ret, size_t *ret_size);
+
+/* Single-buffer convenience wrapper around openssl_cipher_many(). */
+static inline int openssl_cipher(
+                const char *alg,
+                size_t bits,
+                const char *mode,
+                const void *key,
+                size_t key_size,
+                const void *iv,
+                size_t iv_size,
+                const void *buf,
+                size_t len,
+                void **ret,
+                size_t *ret_size) {
+
+        struct iovec single = IOVEC_MAKE((void*) buf, len);
+
+        return openssl_cipher_many(alg, bits, mode, key, key_size, iv, iv_size, &single, 1, ret, ret_size);
+}
+
+int kdf_ss_derive(const char *digest, const void *key, size_t key_size, const void *salt, size_t salt_size, const void *info, size_t info_size, size_t derive_size, void **ret);
+
+int kdf_kb_hmac_derive(const char *mode, const char *digest, const void *key, size_t key_size, const void *salt, size_t salt_size, const void *info, size_t info_size, const void *seed, size_t seed_size, size_t derive_size, void **ret);
+
+int rsa_encrypt_bytes(EVP_PKEY *pkey, const void *decrypted_key, size_t decrypted_key_size, void **ret_encrypt_key, size_t *ret_encrypt_key_size);
+
+int rsa_oaep_encrypt_bytes(const EVP_PKEY *pkey, const char *digest_alg, const char *label, const void *decrypted_key, size_t decrypted_key_size, void **ret_encrypt_key, size_t *ret_encrypt_key_size);
+
+int rsa_pkey_to_suitable_key_size(EVP_PKEY *pkey, size_t *ret_suitable_key_size);
+
+int rsa_pkey_new(size_t bits, EVP_PKEY **ret);
+
+int rsa_pkey_from_n_e(const void *n, size_t n_size, const void *e, size_t e_size, EVP_PKEY **ret);
+
+int rsa_pkey_to_n_e(const EVP_PKEY *pkey, void **ret_n, size_t *ret_n_size, void **ret_e, size_t *ret_e_size);
+
+int ecc_pkey_from_curve_x_y(int curve_id, const void *x, size_t x_size, const void *y, size_t y_size, EVP_PKEY **ret);
+
+int ecc_pkey_to_curve_x_y(const EVP_PKEY *pkey, int *ret_curve_id, void **ret_x, size_t *ret_x_size, void **ret_y, size_t *ret_y_size);
+
+int ecc_pkey_new(int curve_id, EVP_PKEY **ret);
+
+int ecc_ecdh(const EVP_PKEY *private_pkey, const EVP_PKEY *peer_pkey, void **ret_shared_secret, size_t *ret_shared_secret_size);
+
+int pubkey_fingerprint(EVP_PKEY *pk, const EVP_MD *md, void **ret, size_t *ret_size);
+
+int digest_and_sign(const EVP_MD *md, EVP_PKEY *privkey, const void *data, size_t size, void **ret, size_t *ret_size);
+
+#else
+
+typedef struct X509 X509;
+typedef struct EVP_PKEY EVP_PKEY;
+
+/* Stand-in for OpenSSL's X509_free() in builds without OpenSSL. No X509 object can exist in that case, hence
+ * the only acceptable argument is NULL. */
+static inline void *X509_free(X509 *p) {
+ assert(p == NULL);
+ return NULL;
+}
+
+/* Stand-in for OpenSSL's EVP_PKEY_free() in builds without OpenSSL. No EVP_PKEY object can exist in that
+ * case, hence the only acceptable argument is NULL. */
+static inline void *EVP_PKEY_free(EVP_PKEY *p) {
+ assert(p == NULL);
+ return NULL;
+}
+
+#endif
+
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(X509*, X509_free, NULL);
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(EVP_PKEY*, EVP_PKEY_free, NULL);
+
+int x509_fingerprint(X509 *cert, uint8_t buffer[static X509_FINGERPRINT_SIZE]);
+
+#if PREFER_OPENSSL
+/* The openssl definition */
+typedef const EVP_MD* hash_md_t;
+typedef const EVP_MD* hash_algorithm_t;
+typedef int elliptic_curve_t;
+typedef EVP_MD_CTX* hash_context_t;
+# define OPENSSL_OR_GCRYPT(a, b) (a)
+
+#elif HAVE_GCRYPT
+
+# include <gcrypt.h>
+
+/* The gcrypt definition */
+typedef int hash_md_t;
+typedef const char* hash_algorithm_t;
+typedef const char* elliptic_curve_t;
+typedef gcry_md_hd_t hash_context_t;
+# define OPENSSL_OR_GCRYPT(a, b) (b)
+#endif
+
+#if PREFER_OPENSSL
+int string_hashsum(const char *s, size_t len, const char *md_algorithm, char **ret);
+
+/* Shortcut for string_hashsum() with the "SHA224" digest algorithm. */
+static inline int string_hashsum_sha224(const char *s, size_t len, char **ret) {
+ return string_hashsum(s, len, "SHA224", ret);
+}
+
+/* Shortcut for string_hashsum() with the "SHA256" digest algorithm. */
+static inline int string_hashsum_sha256(const char *s, size_t len, char **ret) {
+ return string_hashsum(s, len, "SHA256", ret);
+}
+#endif
diff --git a/src/shared/output-mode.c b/src/shared/output-mode.c
new file mode 100644
index 0000000..026bf19
--- /dev/null
+++ b/src/shared/output-mode.c
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "output-mode.h"
+#include "string-table.h"
+
+/* Maps an output mode to the JSON formatting flags to use for it. Every mode without a dedicated JSON
+ * format (including the non-JSON ones) maps to JSON_FORMAT_NEWLINE. */
+JsonFormatFlags output_mode_to_json_format_flags(OutputMode m) {
+
+        if (m == OUTPUT_JSON_SSE)
+                return JSON_FORMAT_SSE;
+
+        if (m == OUTPUT_JSON_SEQ)
+                return JSON_FORMAT_SEQ;
+
+        if (m == OUTPUT_JSON_PRETTY)
+                return JSON_FORMAT_PRETTY;
+
+        return JSON_FORMAT_NEWLINE;
+}
+
+/* String names of the output modes, indexed by OutputMode value. */
+static const char *const output_mode_table[_OUTPUT_MODE_MAX] = {
+ [OUTPUT_SHORT] = "short",
+ [OUTPUT_SHORT_FULL] = "short-full",
+ [OUTPUT_SHORT_ISO] = "short-iso",
+ [OUTPUT_SHORT_ISO_PRECISE] = "short-iso-precise",
+ [OUTPUT_SHORT_PRECISE] = "short-precise",
+ [OUTPUT_SHORT_MONOTONIC] = "short-monotonic",
+ [OUTPUT_SHORT_DELTA] = "short-delta",
+ [OUTPUT_SHORT_UNIX] = "short-unix",
+ [OUTPUT_VERBOSE] = "verbose",
+ [OUTPUT_EXPORT] = "export",
+ [OUTPUT_JSON] = "json",
+ [OUTPUT_JSON_PRETTY] = "json-pretty",
+ [OUTPUT_JSON_SSE] = "json-sse",
+ [OUTPUT_JSON_SEQ] = "json-seq",
+ [OUTPUT_CAT] = "cat",
+ [OUTPUT_WITH_UNIT] = "with-unit",
+};
+
+/* NOTE(review): presumably this expands to the output_mode_to_string()/output_mode_from_string() functions
+ * declared in output-mode.h, backed by the table above; confirm against string-table.h. */
+DEFINE_STRING_TABLE_LOOKUP(output_mode, OutputMode);
diff --git a/src/shared/output-mode.h b/src/shared/output-mode.h
new file mode 100644
index 0000000..8683f57
--- /dev/null
+++ b/src/shared/output-mode.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "json.h"
+#include "macro.h"
+
+typedef enum OutputMode {
+ OUTPUT_SHORT,
+ OUTPUT_SHORT_FULL,
+ OUTPUT_SHORT_ISO,
+ OUTPUT_SHORT_ISO_PRECISE,
+ OUTPUT_SHORT_PRECISE,
+ OUTPUT_SHORT_MONOTONIC,
+ OUTPUT_SHORT_DELTA,
+ OUTPUT_SHORT_UNIX,
+ OUTPUT_VERBOSE,
+ OUTPUT_EXPORT,
+ OUTPUT_JSON,
+ OUTPUT_JSON_PRETTY,
+ OUTPUT_JSON_SSE,
+ OUTPUT_JSON_SEQ,
+ OUTPUT_CAT,
+ OUTPUT_WITH_UNIT,
+ _OUTPUT_MODE_MAX,
+ _OUTPUT_MODE_INVALID = -EINVAL,
+} OutputMode;
+
+static inline bool OUTPUT_MODE_IS_JSON(OutputMode m) {
+ return IN_SET(m, OUTPUT_JSON, OUTPUT_JSON_PRETTY, OUTPUT_JSON_SSE, OUTPUT_JSON_SEQ);
+}
+
+/* The output flags definitions are shared by the logs and process tree output. Some apply to both, some only to the
+ * logs output, others only to the process tree output. */
+
+typedef enum OutputFlags {
+ OUTPUT_SHOW_ALL = 1 << 0,
+ OUTPUT_FULL_WIDTH = 1 << 1,
+ OUTPUT_COLOR = 1 << 2,
+
+ /* Specific to log output */
+ OUTPUT_WARN_CUTOFF = 1 << 3,
+ OUTPUT_CATALOG = 1 << 4,
+ OUTPUT_BEGIN_NEWLINE = 1 << 5,
+ OUTPUT_UTC = 1 << 6,
+ OUTPUT_NO_HOSTNAME = 1 << 7,
+ OUTPUT_TRUNCATE_NEWLINE = 1 << 8,
+
+ /* Specific to process tree output */
+ OUTPUT_KERNEL_THREADS = 1 << 9,
+ OUTPUT_CGROUP_XATTRS = 1 << 10,
+ OUTPUT_CGROUP_ID = 1 << 11,
+} OutputFlags;
+
+JsonFormatFlags output_mode_to_json_format_flags(OutputMode m);
+
+const char* output_mode_to_string(OutputMode m) _const_;
+OutputMode output_mode_from_string(const char *s) _pure_;
diff --git a/src/shared/pager.c b/src/shared/pager.c
new file mode 100644
index 0000000..19deefa
--- /dev/null
+++ b/src/shared/pager.c
@@ -0,0 +1,330 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/prctl.h>
+#include <unistd.h>
+
+#include "sd-login.h"
+
+#include "copy.h"
+#include "env-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "io-util.h"
+#include "locale-util.h"
+#include "log.h"
+#include "macro.h"
+#include "pager.h"
+#include "process-util.h"
+#include "rlimit-util.h"
+#include "signal-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "terminal-util.h"
+
+static pid_t pager_pid = 0;
+
+static int stored_stdout = -1;
+static int stored_stderr = -1;
+static bool stdout_redirected = false;
+static bool stderr_redirected = false;
+
+_noreturn_ static void pager_fallback(void) {
+ int r;
+
+ r = copy_bytes(STDIN_FILENO, STDOUT_FILENO, UINT64_MAX, 0);
+ if (r < 0) {
+ log_error_errno(r, "Internal pager failed: %m");
+ _exit(EXIT_FAILURE);
+ }
+
+ _exit(EXIT_SUCCESS);
+}
+
+static int no_quit_on_interrupt(int exe_name_fd, const char *less_opts) {
+ _cleanup_fclose_ FILE *file = NULL;
+ _cleanup_free_ char *line = NULL;
+ int r;
+
+ assert(exe_name_fd >= 0);
+ assert(less_opts);
+
+ /* This takes ownership of exe_name_fd */
+ file = fdopen(exe_name_fd, "r");
+ if (!file) {
+ safe_close(exe_name_fd);
+ return log_error_errno(errno, "Failed to create FILE object: %m");
+ }
+
+ /* Find the last line */
+ for (;;) {
+ _cleanup_free_ char *t = NULL;
+
+ r = read_line(file, LONG_LINE_MAX, &t);
+ if (r < 0)
+ return log_error_errno(r, "Failed to read from socket: %m");
+ if (r == 0)
+ break;
+
+ free_and_replace(line, t);
+ }
+
+ /* We only treat "less" specially.
+ * Return true whenever option K is *not* set. */
+ r = streq_ptr(line, "less") && !strchr(less_opts, 'K');
+
+ log_debug("Pager executable is \"%s\", options \"%s\", quit_on_interrupt: %s",
+ strnull(line), less_opts, yes_no(!r));
+ return r;
+}
+
+void pager_open(PagerFlags flags) {
+ _cleanup_close_pair_ int fd[2] = EBADF_PAIR, exe_name_pipe[2] = EBADF_PAIR;
+ _cleanup_strv_free_ char **pager_args = NULL;
+ _cleanup_free_ char *l = NULL;
+ const char *pager, *less_opts;
+ int r;
+
+ if (flags & PAGER_DISABLE)
+ return;
+
+ if (pager_pid > 0)
+ return;
+
+ if (terminal_is_dumb())
+ return;
+
+ if (!is_main_thread())
+ return (void) log_error_errno(SYNTHETIC_ERRNO(EPERM), "Pager invoked from wrong thread.");
+
+ pager = getenv("SYSTEMD_PAGER");
+ if (!pager)
+ pager = getenv("PAGER");
+
+ if (pager) {
+ pager_args = strv_split(pager, WHITESPACE);
+ if (!pager_args)
+ return (void) log_oom();
+
+ /* If the pager is explicitly turned off, honour it */
+ if (strv_isempty(pager_args) || strv_equal(pager_args, STRV_MAKE("cat")))
+ return;
+ }
+
+ /* Determine and cache number of columns/lines before we spawn the pager so that we get the value from the
+ * actual tty */
+ (void) columns();
+ (void) lines();
+
+ if (pipe2(fd, O_CLOEXEC) < 0)
+ return (void) log_error_errno(errno, "Failed to create pager pipe: %m");
+
+ /* This is a pipe to feed the name of the executed pager binary into the parent */
+ if (pipe2(exe_name_pipe, O_CLOEXEC) < 0)
+ return (void) log_error_errno(errno, "Failed to create exe_name pipe: %m");
+
+ /* Initialize a good set of less options */
+ less_opts = getenv("SYSTEMD_LESS");
+ if (!less_opts)
+ less_opts = "FRSXMK";
+ if (flags & PAGER_JUMP_TO_END) {
+ l = strjoin(less_opts, " +G");
+ if (!l)
+ return (void) log_oom();
+ less_opts = l;
+ }
+
+ /* We set SIGINT as PR_DEATHSIG signal here, to match the "K" parameter we set in $LESS, which enables SIGINT behaviour. */
+ r = safe_fork("(pager)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGINT|FORK_RLIMIT_NOFILE_SAFE|FORK_LOG, &pager_pid);
+ if (r < 0)
+ return;
+ if (r == 0) {
+ const char *less_charset;
+
+ /* In the child start the pager */
+
+ if (dup2(fd[0], STDIN_FILENO) < 0) {
+ log_error_errno(errno, "Failed to duplicate file descriptor to STDIN: %m");
+ _exit(EXIT_FAILURE);
+ }
+
+ safe_close_pair(fd);
+
+ if (setenv("LESS", less_opts, 1) < 0) {
+ log_error_errno(errno, "Failed to set environment variable LESS: %m");
+ _exit(EXIT_FAILURE);
+ }
+
+ /* Initialize a good charset for less. This is particularly important if we output UTF-8
+ * characters. */
+ less_charset = getenv("SYSTEMD_LESSCHARSET");
+ if (!less_charset && is_locale_utf8())
+ less_charset = "utf-8";
+ if (less_charset &&
+ setenv("LESSCHARSET", less_charset, 1) < 0) {
+ log_error_errno(errno, "Failed to set environment variable LESSCHARSET: %m");
+ _exit(EXIT_FAILURE);
+ }
+
+ /* People might invoke us from sudo, don't needlessly allow less to be a way to shell out
+ * privileged stuff. If the user set $SYSTEMD_PAGERSECURE, trust their configuration of the
+ * pager. If they didn't, use secure mode when our euid differs from the UID reported by sd_pid_get_owner_uid() (or that lookup fails). If $SYSTEMD_PAGERSECURE
+ * wasn't explicitly set, and we autodetect the need for secure mode, only use the pager we
+ * know to be good. */
+ int use_secure_mode = getenv_bool_secure("SYSTEMD_PAGERSECURE");
+ bool trust_pager = use_secure_mode >= 0;
+ if (use_secure_mode == -ENXIO) {
+ uid_t uid;
+
+ r = sd_pid_get_owner_uid(0, &uid);
+ if (r < 0)
+ log_debug_errno(r, "sd_pid_get_owner_uid() failed, enabling pager secure mode: %m");
+
+ use_secure_mode = r < 0 || uid != geteuid();
+
+ } else if (use_secure_mode < 0) {
+ log_warning_errno(use_secure_mode, "Unable to parse $SYSTEMD_PAGERSECURE, assuming true: %m");
+ use_secure_mode = true;
+ }
+
+ /* We generally always set variables used by less, even if we end up using a different pager.
+ * They shouldn't hurt in any case, and ideally other pagers would look at them too. */
+ r = set_unset_env("LESSSECURE", use_secure_mode ? "1" : NULL, true);
+ if (r < 0) {
+ log_error_errno(r, "Failed to adjust environment variable LESSSECURE: %m");
+ _exit(EXIT_FAILURE);
+ }
+
+ if (trust_pager && pager_args) { /* The pager config might be set globally, and we cannot
+ * know if the user adjusted it to be appropriate for the
+ * secure mode. Thus, start the pager specified through
+ * envvars only when $SYSTEMD_PAGERSECURE was explicitly set
+ * as well. */
+ r = loop_write(exe_name_pipe[1], pager_args[0], strlen(pager_args[0]) + 1);
+ if (r < 0) {
+ log_error_errno(r, "Failed to write pager name to socket: %m");
+ _exit(EXIT_FAILURE);
+ }
+
+ execvp(pager_args[0], pager_args);
+ log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
+ "Failed to execute '%s', using fallback pagers: %m", pager_args[0]);
+ }
+
+ /* Debian's alternatives command for pagers is called 'pager'. Note that we do not call
+ * sensible-pager here, since that is just a shell script that implements a logic that is
+ * similar to this one anyway, but is Debian-specific. */
+ static const char* pagers[] = { "pager", "less", "more", "(built-in)" };
+
+ for (unsigned i = 0; i < ELEMENTSOF(pagers); i++) {
+ /* Only less (and our trivial fallback) implement secure mode right now. */
+ if (use_secure_mode && !STR_IN_SET(pagers[i], "less", "(built-in)"))
+ continue;
+
+ r = loop_write(exe_name_pipe[1], pagers[i], strlen(pagers[i]) + 1);
+ if (r < 0) {
+ log_error_errno(r, "Failed to write pager name to socket: %m");
+ _exit(EXIT_FAILURE);
+ }
+
+ if (i < ELEMENTSOF(pagers) - 1) {
+ execlp(pagers[i], pagers[i], NULL);
+ log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
+ "Failed to execute '%s', will try '%s' next: %m", pagers[i], pagers[i+1]);
+ } else {
+ /* Close pipe to signal the parent to start sending data */
+ safe_close_pair(exe_name_pipe);
+ pager_fallback();
+ assert_not_reached();
+ }
+ }
+ }
+
+ /* Return in the parent */
+ stored_stdout = fcntl(STDOUT_FILENO, F_DUPFD_CLOEXEC, 3);
+ if (dup2(fd[1], STDOUT_FILENO) < 0) {
+ stored_stdout = safe_close(stored_stdout);
+ return (void) log_error_errno(errno, "Failed to duplicate pager pipe: %m");
+ }
+ stdout_redirected = true;
+
+ stored_stderr = fcntl(STDERR_FILENO, F_DUPFD_CLOEXEC, 3);
+ if (dup2(fd[1], STDERR_FILENO) < 0) {
+ stored_stderr = safe_close(stored_stderr);
+ return (void) log_error_errno(errno, "Failed to duplicate pager pipe: %m");
+ }
+ stderr_redirected = true;
+
+ exe_name_pipe[1] = safe_close(exe_name_pipe[1]);
+
+ r = no_quit_on_interrupt(TAKE_FD(exe_name_pipe[0]), less_opts);
+ if (r > 0)
+ (void) ignore_signals(SIGINT);
+}
+
+void pager_close(void) {
+
+ if (pager_pid <= 0)
+ return;
+
+ /* Inform pager that we are done */
+ (void) fflush(stdout);
+ if (stdout_redirected)
+ if (stored_stdout < 0 || dup2(stored_stdout, STDOUT_FILENO) < 0)
+ (void) close(STDOUT_FILENO);
+ stored_stdout = safe_close(stored_stdout);
+ (void) fflush(stderr);
+ if (stderr_redirected)
+ if (stored_stderr < 0 || dup2(stored_stderr, STDERR_FILENO) < 0)
+ (void) close(STDERR_FILENO);
+ stored_stderr = safe_close(stored_stderr);
+ stdout_redirected = stderr_redirected = false;
+
+ (void) kill(pager_pid, SIGCONT);
+ (void) wait_for_terminate(TAKE_PID(pager_pid), NULL);
+ pager_pid = 0;
+}
+
+bool pager_have(void) {
+ return pager_pid > 0;
+}
+
+int show_man_page(const char *desc, bool null_stdio) {
+ const char *args[4] = { "man", NULL, NULL, NULL };
+ char *e = NULL;
+ pid_t pid;
+ size_t k;
+ int r;
+ /* Runs "man" in a forked child for 'desc' ("page" or "page(section)") and returns the result of waiting for it. */
+ k = strlen(desc);
+ /* An empty 'desc' has no last character to inspect; fall through and pass it to man as is. */
+ if (k > 0 && desc[k-1] == ')')
+ e = strrchr(desc, '(');
+
+ if (e) {
+ char *page = NULL, *section = NULL;
+ /* Split "page(section)" so that man is invoked as "man <section> <page>". */
+ page = strndupa_safe(desc, e - desc);
+ section = strndupa_safe(e + 1, desc + k - e - 2);
+
+ args[1] = section;
+ args[2] = page;
+ } else
+ args[1] = desc;
+
+ r = safe_fork("(man)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM|(null_stdio ? FORK_REARRANGE_STDIO : 0)|FORK_RLIMIT_NOFILE_SAFE|FORK_LOG, &pid);
+ if (r < 0)
+ return r;
+ if (r == 0) {
+ /* Child */
+ execvp(args[0], (char**) args);
+ log_error_errno(errno, "Failed to execute man: %m");
+ _exit(EXIT_FAILURE);
+ }
+
+ return wait_for_terminate_and_check(NULL, pid, 0);
+}
diff --git a/src/shared/pager.h b/src/shared/pager.h
new file mode 100644
index 0000000..9a9d4c5
--- /dev/null
+++ b/src/shared/pager.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "macro.h"
+
+typedef enum PagerFlags {
+ PAGER_DISABLE = 1 << 0,
+ PAGER_JUMP_TO_END = 1 << 1,
+} PagerFlags;
+
+void pager_open(PagerFlags flags);
+void pager_close(void);
+bool pager_have(void) _pure_;
+
+int show_man_page(const char *page, bool null_stdio);
diff --git a/src/shared/pam-util.c b/src/shared/pam-util.c
new file mode 100644
index 0000000..f5814ef
--- /dev/null
+++ b/src/shared/pam-util.c
@@ -0,0 +1,211 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <security/pam_ext.h>
+#include <syslog.h>
+#include <stdlib.h>
+
+#include "alloc-util.h"
+#include "bus-internal.h"
+#include "errno-util.h"
+#include "format-util.h"
+#include "macro.h"
+#include "pam-util.h"
+#include "process-util.h"
+#include "stdio-util.h"
+#include "string-util.h"
+
+int pam_syslog_errno(pam_handle_t *handle, int level, int error, const char *format, ...) {
+ va_list ap;
+ /* Logs via PAM at 'level'. LOCAL_ERRNO() (defined elsewhere) presumably sets errno from 'error' so %m in 'format' expands to it. */
+ LOCAL_ERRNO(error);
+
+ va_start(ap, format);
+ pam_vsyslog(handle, level, format, ap);
+ va_end(ap);
+ /* Callers pass both ENOMEM (see pam_log_oom()) and -ENOMEM; map either to PAM_BUF_ERR, anything else to PAM_SERVICE_ERR. */
+ return IN_SET(error, ENOMEM, -ENOMEM) ? PAM_BUF_ERR : PAM_SERVICE_ERR;
+}
+
+int pam_syslog_pam_error(pam_handle_t *handle, int level, int error, const char *format, ...) {
+ /* This wraps pam_syslog() but will replace @PAMERR@ with a string from pam_strerror().
+ * @PAMERR@ must be at the very end. */
+
+ va_list ap;
+ va_start(ap, format);
+
+ const char *p = endswith(format, "@PAMERR@");
+ if (p) {
+ const char *pamerr = pam_strerror(handle, error);
+ if (strchr(pamerr, '%'))
+ pamerr = "n/a"; /* We cannot have any formatting chars */
+
+ char buf[p - format + strlen(pamerr) + 1];
+ xsprintf(buf, "%.*s%s", (int)(p - format), format, pamerr);
+
+ DISABLE_WARNING_FORMAT_NONLITERAL;
+ pam_vsyslog(handle, level, buf, ap);
+ REENABLE_WARNING;
+ } else
+ pam_vsyslog(handle, level, format, ap);
+
+ va_end(ap);
+
+ return error;
+}
+
+/* A small structure we store inside the PAM session object, that allows us to reuse bus connections but pins
+ * it to the process thoroughly. */
+struct PamBusData {
+ sd_bus *bus;
+ pam_handle_t *pam_handle;
+ char *cache_id;
+};
+
+static PamBusData *pam_bus_data_free(PamBusData *d) {
+ /* The actual destructor */
+ if (!d)
+ return NULL;
+
+ /* NB: PAM sessions usually involve forking off a child process, and thus the PAM context might be
+ * duplicated in the child. This destructor might be called twice: both in the parent and in the
+ * child. sd_bus_flush_close_unref() however is smart enough to be a NOP when invoked in any other
+ * process than the one it was invoked from, hence we don't need to add any extra protection here to
+ * ensure that destruction of the bus connection in the child affects the parent's connection
+ * somehow. */
+ sd_bus_flush_close_unref(d->bus);
+ free(d->cache_id);
+
+ /* Note: we don't destroy pam_handle here, because this object is pinned by the handle, and not vice versa! */
+
+ return mfree(d);
+}
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(PamBusData*, pam_bus_data_free);
+
+static void pam_bus_data_destroy(pam_handle_t *handle, void *data, int error_status) {
+ /* Destructor when called from PAM. Note that error_status is supposed to tell us via PAM_DATA_SILENT
+ * whether we are called in a forked off child of the PAM session or in the original parent. We don't
+ * bother with that however, and instead rely on the PID checks that sd_bus_flush_close_unref() does
+ * internally anyway. That said, we still generate a warning message, since this really shouldn't
+ * happen. */
+
+ if (!data)
+ return;
+
+ PamBusData *d = data;
+ if (FLAGS_SET(error_status, PAM_DATA_SILENT) &&
+ d->bus && bus_origin_changed(d->bus))
+ /* Please adjust test/units/end.sh when updating the log message. */
+ pam_syslog(handle, LOG_DEBUG, "Attempted to close sd-bus after fork whose connection is opened before the fork, this should not happen.");
+
+ pam_bus_data_free(data);
+}
+
+static char* pam_make_bus_cache_id(const char *module_name) {
+ char *id;
+
+ /* We want to cache bus connections between hooks. But we don't want to allow them to be reused in
+ * child processes (because sd-bus doesn't support that). We also don't want them to be reused
+ * between our own PAM modules, because they might be linked against different versions of our
+ * utility functions and share different state. Hence include both a module ID and a PID in the data
+ * field ID. */
+
+ if (asprintf(&id, "system-bus-%s-" PID_FMT, ASSERT_PTR(module_name), getpid_cached()) < 0)
+ return NULL;
+
+ return id;
+}
+
+void pam_bus_data_disconnectp(PamBusData **_d) {
+ PamBusData *d = *ASSERT_PTR(_d);
+ pam_handle_t *handle;
+ int r;
+
+ /* Disconnects the connection explicitly (for use via _cleanup_()) when called */
+
+ if (!d)
+ return;
+
+ handle = ASSERT_PTR(d->pam_handle); /* Keep a reference to the session even after 'd' might be invalidated */
+
+ r = pam_set_data(handle, ASSERT_PTR(d->cache_id), NULL, NULL);
+ if (r != PAM_SUCCESS)
+ pam_syslog_pam_error(handle, LOG_ERR, r, "Failed to release PAM bus data, ignoring: @PAMERR@");
+
+ /* Note, the pam_set_data() call will invalidate 'd', don't access here anymore */
+}
+
+int pam_acquire_bus_connection(
+ pam_handle_t *handle,
+ const char *module_name,
+ sd_bus **ret_bus,
+ PamBusData **ret_pam_bus_data) {
+
+ _cleanup_(pam_bus_data_freep) PamBusData *d = NULL;
+ _cleanup_free_ char *cache_id = NULL;
+ int r;
+
+ assert(handle);
+ assert(module_name);
+ assert(ret_bus);
+
+ cache_id = pam_make_bus_cache_id(module_name);
+ if (!cache_id)
+ return pam_log_oom(handle);
+
+ /* We cache the bus connection so that we can share it between the session and the authentication hooks */
+ r = pam_get_data(handle, cache_id, (const void**) &d);
+ if (r == PAM_SUCCESS && d)
+ goto success;
+ if (!IN_SET(r, PAM_SUCCESS, PAM_NO_MODULE_DATA))
+ return pam_syslog_pam_error(handle, LOG_ERR, r, "Failed to get bus connection: @PAMERR@");
+
+ d = new(PamBusData, 1);
+ if (!d)
+ return pam_log_oom(handle);
+
+ *d = (PamBusData) {
+ .cache_id = TAKE_PTR(cache_id),
+ .pam_handle = handle,
+ };
+
+ r = sd_bus_open_system(&d->bus);
+ if (r < 0)
+ return pam_syslog_errno(handle, LOG_ERR, r, "Failed to connect to system bus: %m");
+
+ r = pam_set_data(handle, d->cache_id, d, pam_bus_data_destroy);
+ if (r != PAM_SUCCESS)
+ return pam_syslog_pam_error(handle, LOG_ERR, r, "Failed to set PAM bus data: @PAMERR@");
+
+success:
+ *ret_bus = sd_bus_ref(d->bus);
+
+ if (ret_pam_bus_data)
+ *ret_pam_bus_data = d;
+
+ TAKE_PTR(d); /* don't auto-destroy anymore, it's installed now */
+
+ return PAM_SUCCESS;
+}
+
+int pam_release_bus_connection(pam_handle_t *handle, const char *module_name) {
+ _cleanup_free_ char *cache_id = NULL;
+ int r;
+ /* Drops the bus data cached for 'module_name' in this process; returns a PAM status code. */
+ assert(module_name);
+
+ cache_id = pam_make_bus_cache_id(module_name);
+ if (!cache_id)
+ return pam_log_oom(handle);
+ /* Replacing the item with NULL presumably makes PAM run the cleanup callback installed with it. */
+ r = pam_set_data(handle, cache_id, NULL, NULL);
+ if (r != PAM_SUCCESS)
+ return pam_syslog_pam_error(handle, LOG_ERR, r, "Failed to release PAM bus data: @PAMERR@");
+
+ return PAM_SUCCESS;
+}
+
+void pam_cleanup_free(pam_handle_t *handle, void *data, int error_status) {
+ /* A generic destructor for pam_set_data() that just frees the specified data */
+ free(data);
+}
diff --git a/src/shared/pam-util.h b/src/shared/pam-util.h
new file mode 100644
index 0000000..5a05fb7
--- /dev/null
+++ b/src/shared/pam-util.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <security/pam_modules.h>
+
+#include "sd-bus.h"
+
+int pam_syslog_errno(pam_handle_t *handle, int level, int error, const char *format, ...) _printf_(4,5);
+
+int pam_syslog_pam_error(pam_handle_t *handle, int level, int error, const char *format, ...) _printf_(4,5);
+
+/* Call pam_syslog() at LOG_DEBUG level if 'debug' is true */
+#define pam_debug_syslog(handle, debug, fmt, ...) ({ \
+ if (debug) \
+ pam_syslog(handle, LOG_DEBUG, fmt, ## __VA_ARGS__); \
+ })
+
+static inline int pam_log_oom(pam_handle_t *handle) {
+ /* This is like log_oom(), but uses PAM logging */
+ return pam_syslog_errno(handle, LOG_ERR, ENOMEM, "Out of memory.");
+}
+
+static inline int pam_bus_log_create_error(pam_handle_t *handle, int r) {
+ /* This is like bus_log_create_error(), but uses PAM logging */
+ return pam_syslog_errno(handle, LOG_ERR, r, "Failed to create bus message: %m");
+}
+
+static inline int pam_bus_log_parse_error(pam_handle_t *handle, int r) {
+ /* This is like bus_log_parse_error(), but uses PAM logging */
+ return pam_syslog_errno(handle, LOG_ERR, r, "Failed to parse bus message: %m");
+}
+
+typedef struct PamBusData PamBusData;
+void pam_bus_data_disconnectp(PamBusData **d);
+
+/* Use a different module name per different PAM module. They are all loaded in the same namespace, and this
+ * helps avoid a clash in the internal data structures of sd-bus. It will be used as key for cache items. */
+int pam_acquire_bus_connection(pam_handle_t *handle, const char *module_name, sd_bus **ret_bus, PamBusData **ret_bus_data);
+int pam_release_bus_connection(pam_handle_t *handle, const char *module_name);
+
+void pam_cleanup_free(pam_handle_t *handle, void *data, int error_status);
diff --git a/src/shared/parse-argument.c b/src/shared/parse-argument.c
new file mode 100644
index 0000000..145bd11
--- /dev/null
+++ b/src/shared/parse-argument.c
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "format-table.h"
+#include "parse-argument.h"
+#include "path-util.h"
+#include "signal-util.h"
+#include "stdio-util.h"
+#include "string-table.h"
+#include "string-util.h"
+
+/* All functions in this file emit warnings. */
+
+int parse_boolean_argument(const char *optname, const char *s, bool *ret) {
+ int r;
+
+ /* Returns the result through *ret and the return value. */
+
+ if (s) {
+ r = parse_boolean(s);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse boolean argument to %s: %s.", optname, s);
+
+ if (ret)
+ *ret = r;
+ return r;
+ } else {
+ /* s may be NULL. This is controlled by getopt_long() parameters. */
+ if (ret)
+ *ret = true;
+ return true;
+ }
+}
+
+int parse_json_argument(const char *s, JsonFormatFlags *ret) {
+ assert(s);
+ assert(ret);
+
+ if (streq(s, "pretty"))
+ *ret = JSON_FORMAT_PRETTY|JSON_FORMAT_COLOR_AUTO;
+ else if (streq(s, "short"))
+ *ret = JSON_FORMAT_NEWLINE;
+ else if (streq(s, "off"))
+ *ret = JSON_FORMAT_OFF;
+ else if (streq(s, "help")) {
+ puts("pretty\n"
+ "short\n"
+ "off");
+ return 0; /* 0 means → we showed a brief help, exit now */
+ } else
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown argument to --json= switch: %s", s);
+
+ return 1; /* 1 means → properly parsed */
+}
+
+int parse_path_argument(const char *path, bool suppress_root, char **arg) {
+ char *p;
+ int r;
+
+ /*
+ * This function is intended to be used in command line parsers, to handle paths that are passed
+ * in. It makes the path absolute, and reduces it to NULL if omitted or root (the latter optionally).
+ *
+ * NOTE THAT THIS WILL FREE THE PREVIOUS ARGUMENT POINTER ON SUCCESS!
+ * Hence, do not pass in uninitialized pointers.
+ */
+
+ if (isempty(path)) {
+ *arg = mfree(*arg);
+ return 0;
+ }
+
+ r = path_make_absolute_cwd(path, &p);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse path \"%s\" and make it absolute: %m", path);
+
+ path_simplify(p);
+ if (suppress_root && empty_or_root(p))
+ p = mfree(p);
+
+ return free_and_replace(*arg, p);
+}
+
+int parse_signal_argument(const char *s, int *ret) {
+ int r;
+
+ assert(s);
+ assert(ret);
+
+ if (streq(s, "help")) {
+ DUMP_STRING_TABLE(signal, int, _NSIG);
+ return 0;
+ }
+
+ if (streq(s, "list")) {
+ _cleanup_(table_unrefp) Table *table = NULL;
+
+ table = table_new("signal", "name");
+ if (!table)
+ return log_oom();
+
+ for (int i = 1; i < _NSIG; i++) {
+ r = table_add_many(
+ table,
+ TABLE_INT, i,
+ TABLE_SIGNAL, i);
+ if (r < 0)
+ return table_log_add_error(r);
+ }
+
+ r = table_print(table, NULL);
+ if (r < 0)
+ return table_log_print_error(r);
+
+ return 0;
+ }
+
+ r = signal_from_string(s);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse signal string \"%s\".", s);
+
+ *ret = r;
+ return 1; /* work to do */
+}
diff --git a/src/shared/parse-argument.h b/src/shared/parse-argument.h
new file mode 100644
index 0000000..adad65e
--- /dev/null
+++ b/src/shared/parse-argument.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "json.h"
+
+int parse_boolean_argument(const char *optname, const char *s, bool *ret);
+int parse_json_argument(const char *s, JsonFormatFlags *ret);
+int parse_path_argument(const char *path, bool suppress_root, char **arg);
+int parse_signal_argument(const char *s, int *ret);
diff --git a/src/shared/parse-helpers.c b/src/shared/parse-helpers.c
new file mode 100644
index 0000000..9664b9c
--- /dev/null
+++ b/src/shared/parse-helpers.c
@@ -0,0 +1,237 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "af-list.h"
+#include "extract-word.h"
+#include "ip-protocol-list.h"
+#include "log.h"
+#include "parse-helpers.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "utf8.h"
+
+int path_simplify_and_warn(
+ char *path,
+ unsigned flag,
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *lvalue) {
+
+ bool fatal = flag & PATH_CHECK_FATAL;
+
+ assert(!FLAGS_SET(flag, PATH_CHECK_ABSOLUTE | PATH_CHECK_RELATIVE));
+
+ if (!utf8_is_valid(path))
+ return log_syntax_invalid_utf8(unit, LOG_ERR, filename, line, path);
+
+ if (flag & (PATH_CHECK_ABSOLUTE | PATH_CHECK_RELATIVE)) {
+ bool absolute;
+
+ absolute = path_is_absolute(path);
+
+ if (!absolute && (flag & PATH_CHECK_ABSOLUTE))
+ return log_syntax(unit, LOG_ERR, filename, line, SYNTHETIC_ERRNO(EINVAL),
+ "%s= path is not absolute%s: %s",
+ lvalue, fatal ? "" : ", ignoring", path);
+
+ if (absolute && (flag & PATH_CHECK_RELATIVE))
+ return log_syntax(unit, LOG_ERR, filename, line, SYNTHETIC_ERRNO(EINVAL),
+ "%s= path is absolute%s: %s",
+ lvalue, fatal ? "" : ", ignoring", path);
+ }
+
+ path_simplify_full(path, flag & PATH_KEEP_TRAILING_SLASH ? PATH_SIMPLIFY_KEEP_TRAILING_SLASH : 0);
+
+ if (!path_is_valid(path))
+ return log_syntax(unit, LOG_ERR, filename, line, SYNTHETIC_ERRNO(EINVAL),
+ "%s= path has invalid length (%zu bytes)%s.",
+ lvalue, strlen(path), fatal ? "" : ", ignoring");
+
+ if (!path_is_normalized(path))
+ return log_syntax(unit, LOG_ERR, filename, line, SYNTHETIC_ERRNO(EINVAL),
+ "%s= path is not normalized%s: %s",
+ lvalue, fatal ? "" : ", ignoring", path);
+
+ return 0;
+}
+
+static int parse_af_token(
+ const char *token,
+ int *family,
+ int *ip_protocol,
+ uint16_t *nr_ports,
+ uint16_t *port_min) {
+
+ int af;
+
+ assert(token);
+ assert(family);
+
+ af = af_from_ipv4_ipv6(token);
+ if (af == AF_UNSPEC)
+ return -EINVAL;
+
+ *family = af;
+ return 0;
+}
+
+static int parse_ip_protocol_token(
+ const char *token,
+ int *family,
+ int *ip_protocol,
+ uint16_t *nr_ports,
+ uint16_t *port_min) {
+
+ int proto;
+
+ assert(token);
+ assert(ip_protocol);
+
+ proto = ip_protocol_from_tcp_udp(token);
+ if (proto < 0)
+ return -EINVAL;
+
+ *ip_protocol = proto;
+ return 0;
+}
+
+static int parse_ip_ports_token(
+ const char *token,
+ int *family,
+ int *ip_protocol,
+ uint16_t *nr_ports,
+ uint16_t *port_min) {
+ /* Port token parser: "any" yields nr_ports = port_min = 0, anything else goes through parse_ip_port_range(). 'family' and 'ip_protocol' are not touched here; they are present to match parse_token_f. */
+ assert(token);
+ assert(nr_ports);
+ assert(port_min);
+
+ if (streq(token, "any"))
+ *nr_ports = *port_min = 0;
+ else {
+ uint16_t mn = 0, mx = 0;
+ int r = parse_ip_port_range(token, &mn, &mx);
+ if (r < 0)
+ return r;
+ /* Store the range as its first port plus the number of ports it covers (both ends inclusive). */
+ *nr_ports = mx - mn + 1;
+ *port_min = mn;
+ }
+
+ return 0;
+}
+
+typedef int (*parse_token_f)(
+ const char *,
+ int *,
+ int *,
+ uint16_t *,
+ uint16_t *);
+
+int parse_socket_bind_item(
+ const char *str,
+ int *address_family,
+ int *ip_protocol,
+ uint16_t *nr_ports,
+ uint16_t *port_min) {
+
+ /* Order of token parsers is important. */
+ const parse_token_f parsers[] = {
+ &parse_af_token,
+ &parse_ip_protocol_token,
+ &parse_ip_ports_token,
+ };
+ parse_token_f const *parser_ptr = parsers;
+ int af = AF_UNSPEC, proto = 0, r;
+ uint16_t nr = 0, mn = 0;
+ const char *p = ASSERT_PTR(str);
+
+ assert(address_family);
+ assert(ip_protocol);
+ assert(nr_ports);
+ assert(port_min);
+
+ if (isempty(p))
+ return -EINVAL;
+
+ for (;;) {
+ _cleanup_free_ char *token = NULL;
+
+ r = extract_first_word(&p, &token, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
+ if (r == 0)
+ break;
+ if (r < 0)
+ return r;
+
+ if (isempty(token))
+ return -EINVAL;
+
+ while (parser_ptr != parsers + ELEMENTSOF(parsers)) {
+ r = (*parser_ptr)(token, &af, &proto, &nr, &mn);
+ if (r == -ENOMEM)
+ return r;
+
+ ++parser_ptr;
+ /* Continue to next token if parsing succeeded,
+ * otherwise apply next parser to the same token.
+ */
+ if (r >= 0)
+ break;
+ }
+ if (parser_ptr == parsers + ELEMENTSOF(parsers))
+ break;
+ }
+
+ /* Failed to parse a token. */
+ if (r < 0)
+ return r;
+
+ /* Parsers applied successfully, but end of the string not reached. */
+ if (p)
+ return -EINVAL;
+
+ *address_family = af;
+ *ip_protocol = proto;
+ *nr_ports = nr;
+ *port_min = mn;
+ return 0;
+}
+
+int config_parse_path_or_ignore(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ _cleanup_free_ char *n = NULL;
+ bool fatal = ltype;
+ char **s = ASSERT_PTR(data);
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ if (isempty(rvalue))
+ goto finalize;
+
+ n = strdup(rvalue);
+ if (!n)
+ return log_oom();
+
+ if (streq(n, "-"))
+ goto finalize;
+
+ r = path_simplify_and_warn(n, PATH_CHECK_ABSOLUTE | (fatal ? PATH_CHECK_FATAL : 0), unit, filename, line, lvalue);
+ if (r < 0)
+ return fatal ? -ENOEXEC : 0;
+
+finalize:
+ return free_and_replace(*s, n);
+}
diff --git a/src/shared/parse-helpers.h b/src/shared/parse-helpers.h
new file mode 100644
index 0000000..3e4ad3c
--- /dev/null
+++ b/src/shared/parse-helpers.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdint.h>
+
+enum {
+ PATH_CHECK_FATAL = 1 << 0, /* If not set, then error message is appended with 'ignoring'. */
+ PATH_CHECK_ABSOLUTE = 1 << 1,
+ PATH_CHECK_RELATIVE = 1 << 2,
+ PATH_KEEP_TRAILING_SLASH = 1 << 3,
+};
+
+int path_simplify_and_warn(
+ char *path,
+ unsigned flag,
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *lvalue);
+
+int parse_socket_bind_item(
+ const char *str,
+ int *address_family,
+ int *ip_protocol,
+ uint16_t *nr_ports,
+ uint16_t *port_min);
+
+int config_parse_path_or_ignore(
+ const char *unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata);
diff --git a/src/shared/password-quality-util-passwdqc.c b/src/shared/password-quality-util-passwdqc.c
new file mode 100644
index 0000000..adfc14d
--- /dev/null
+++ b/src/shared/password-quality-util-passwdqc.c
@@ -0,0 +1,142 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "dlfcn-util.h"
+#include "errno-util.h"
+#include "log.h"
+#include "macro.h"
+#include "memory-util.h"
+#include "password-quality-util.h"
+#include "strv.h"
+
+#if HAVE_PASSWDQC
+
+static void *passwdqc_dl = NULL;
+
+void (*sym_passwdqc_params_reset)(passwdqc_params_t *params);
+int (*sym_passwdqc_params_load)(passwdqc_params_t *params, char **reason, const char *pathname);
+int (*sym_passwdqc_params_parse)(passwdqc_params_t *params, char **reason, int argc, const char *const *argv);
+void (*sym_passwdqc_params_free)(passwdqc_params_t *params);
+const char *(*sym_passwdqc_check)(const passwdqc_params_qc_t *params, const char *newpass, const char *oldpass, const struct passwd *pw);
+char *(*sym_passwdqc_random)(const passwdqc_params_qc_t *params);
+
+ /* Loads "libpasswdqc.so.1" through dlopen_many_sym_or_warn(), keeping the handle in passwdqc_dl and
+ * requesting the six passwdqc_*() entry points listed below (presumably each DLSYM_ARG(x) is stored
+ * into the matching sym_x pointer defined above — confirm in dlfcn-util.h). The result of
+ * dlopen_many_sym_or_warn() is returned unchanged; callers in this file treat negative values as
+ * failure. */
+ int dlopen_passwdqc(void) {
+ return dlopen_many_sym_or_warn(
+ &passwdqc_dl, "libpasswdqc.so.1", LOG_DEBUG,
+ DLSYM_ARG(passwdqc_params_reset),
+ DLSYM_ARG(passwdqc_params_load),
+ DLSYM_ARG(passwdqc_params_parse),
+ DLSYM_ARG(passwdqc_params_free),
+ DLSYM_ARG(passwdqc_check),
+ DLSYM_ARG(passwdqc_random));
+ }
+
+ /* Allocates a passwdqc parameter block, initializes it with passwdqc_params_reset() and then tries to
+ * load /etc/passwdqc.conf into it. On success the block is handed to the caller via *ret and 0 is
+ * returned. A negative value is returned if the library cannot be loaded or an allocation fails. */
+ static int pwqc_allocate_context(passwdqc_params_t **ret) {
+
+ _cleanup_(sym_passwdqc_params_freep) passwdqc_params_t *params = NULL;
+ _cleanup_free_ char *load_reason = NULL;
+ int r;
+
+ assert(ret);
+
+ r = dlopen_passwdqc();
+ if (r < 0)
+ return r;
+
+ /* NOTE(review): the cleanup handler only calls sym_passwdqc_params_free(); confirm whether that
+ * also releases this allocation itself or just the members stored inside it. */
+ params = new0(passwdqc_params_t, 1);
+ if (!params)
+ return log_oom();
+
+ sym_passwdqc_params_reset(params);
+
+ r = sym_passwdqc_params_load(params, &load_reason, "/etc/passwdqc.conf");
+ if (r < 0) {
+ /* A failure that comes without a reason string is treated as out-of-memory. Any other load
+ * failure is only logged at debug level and the parameter block is used as it is. */
+ if (!load_reason)
+ return log_oom();
+ log_debug("Failed to load passwdqc configuration file, ignoring: %s", load_reason);
+ }
+
+ *ret = TAKE_PTR(params);
+ return 0;
+ }
+
+ int suggest_passwords(void) {
+
+         /* Prints N_SUGGESTIONS randomly generated passwords on a single line of standard output.
+          *
+          * Returns 1 if suggestions were printed, 0 if the passwdqc context could not be set up because
+          * support is unavailable (ERRNO_IS_NOT_SUPPORTED()), and a negative value on any other failure.
+          * The suggestion list and the joined string are erased before being freed. */
+
+         _cleanup_(sym_passwdqc_params_freep) passwdqc_params_t *params = NULL;
+         _cleanup_strv_free_erase_ char **suggestions = NULL;
+         _cleanup_(erase_and_freep) char *joined = NULL;
+         int r;
+
+         r = pwqc_allocate_context(&params);
+         if (r < 0 && ERRNO_IS_NOT_SUPPORTED(r))
+                 return 0;
+         if (r < 0)
+                 return log_error_errno(r, "Failed to allocate libpasswdqc context: %m");
+
+         /* One extra slot keeps the array NULL terminated. */
+         suggestions = new0(char*, N_SUGGESTIONS+1);
+         if (!suggestions)
+                 return log_oom();
+
+         for (char **slot = suggestions; slot < suggestions + N_SUGGESTIONS; slot++) {
+                 *slot = sym_passwdqc_random(&params->qc);
+                 if (!*slot)
+                         return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to generate password, ignoring");
+         }
+
+         joined = strv_join(suggestions, " ");
+         if (!joined)
+                 return log_oom();
+
+         printf("Password suggestions: %s\n", joined);
+         return 1;
+ }
+
+ int check_password_quality(
+                 const char *password,
+                 const char *old,
+                 const char *username,
+                 char **ret_error) {
+
+         /* Runs 'password' through passwdqc_check(), optionally taking the previous password and the
+          * user name into account.
+          *
+          * Returns 1 if the password is acceptable and 0 if it is not; in the latter case a copy of the
+          * library's explanation is stored in *ret_error when ret_error is non-NULL (the caller frees
+          * it). Returns a negative value if the context cannot be allocated or memory runs out. */
+
+         _cleanup_(sym_passwdqc_params_freep) passwdqc_params_t *params = NULL;
+         const char *verdict;
+         int r;
+
+         assert(password);
+
+         r = pwqc_allocate_context(&params);
+         if (r < 0)
+                 return log_debug_errno(r, "Failed to allocate libpasswdqc context: %m");
+
+         /* passwdqc_check() could use the remaining account fields to check whether the password is
+          * based on the personal login information, but we cannot provide them. Only the name is
+          * filled in, and the record is passed on only if a user name was given at all. */
+         const struct passwd pw = {
+                 .pw_name = (char *) username,
+                 .pw_passwd = (char *) "",
+                 .pw_gecos = (char *) "",
+                 .pw_dir = (char *) "",
+                 .pw_shell = (char *) ""
+         };
+
+         verdict = sym_passwdqc_check(&params->qc, password, old, username ? &pw : /* pw */ NULL);
+         if (!verdict)
+                 return 1; /* all good */
+
+         if (ret_error) {
+                 char *copy = strdup(verdict);
+                 if (!copy)
+                         return log_oom();
+
+                 *ret_error = copy;
+         }
+
+         return 0; /* all bad */
+ }
+
+#endif
diff --git a/src/shared/password-quality-util-passwdqc.h b/src/shared/password-quality-util-passwdqc.h
new file mode 100644
index 0000000..0d528d2
--- /dev/null
+++ b/src/shared/password-quality-util-passwdqc.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "macro.h"
+
+#if HAVE_PASSWDQC
+#include <passwdqc.h>
+
+extern void (*sym_passwdqc_params_reset)(passwdqc_params_t *params);
+extern int (*sym_passwdqc_params_load)(passwdqc_params_t *params, char **reason, const char *pathname);
+extern int (*sym_passwdqc_params_parse)(passwdqc_params_t *params, char **reason, int argc, const char *const *argv);
+extern void (*sym_passwdqc_params_free)(passwdqc_params_t *params);
+extern const char *(*sym_passwdqc_check)(const passwdqc_params_qc_t *params, const char *newpass, const char *oldpass, const struct passwd *pw);
+extern char *(*sym_passwdqc_random)(const passwdqc_params_qc_t *params);
+
+int dlopen_passwdqc(void);
+
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(passwdqc_params_t*, sym_passwdqc_params_free, NULL);
+
+int suggest_passwords(void);
+int check_password_quality(const char *password, const char *old, const char *username, char **ret_error);
+
+#endif
diff --git a/src/shared/password-quality-util-pwquality.c b/src/shared/password-quality-util-pwquality.c
new file mode 100644
index 0000000..80f7d58
--- /dev/null
+++ b/src/shared/password-quality-util-pwquality.c
@@ -0,0 +1,163 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <unistd.h>
+
+#include "dlfcn-util.h"
+#include "errno-util.h"
+#include "log.h"
+#include "macro.h"
+#include "memory-util.h"
+#include "password-quality-util.h"
+#include "strv.h"
+
+#if HAVE_PWQUALITY
+
+static void *pwquality_dl = NULL;
+
+int (*sym_pwquality_check)(pwquality_settings_t *pwq, const char *password, const char *oldpassword, const char *user, void **auxerror);
+pwquality_settings_t *(*sym_pwquality_default_settings)(void);
+void (*sym_pwquality_free_settings)(pwquality_settings_t *pwq);
+int (*sym_pwquality_generate)(pwquality_settings_t *pwq, int entropy_bits, char **password);
+int (*sym_pwquality_get_str_value)(pwquality_settings_t *pwq, int setting, const char **value);
+int (*sym_pwquality_read_config)(pwquality_settings_t *pwq, const char *cfgfile, void **auxerror);
+int (*sym_pwquality_set_int_value)(pwquality_settings_t *pwq, int setting, int value);
+const char* (*sym_pwquality_strerror)(char *buf, size_t len, int errcode, void *auxerror);
+
+ /* Loads "libpwquality.so.1" through dlopen_many_sym_or_warn(), keeping the handle in pwquality_dl and
+ * requesting the eight pwquality_*() entry points listed below (presumably each DLSYM_ARG(x) is stored
+ * into the matching sym_x pointer defined above — confirm in dlfcn-util.h). The result of
+ * dlopen_many_sym_or_warn() is returned unchanged; callers in this file treat negative values as
+ * failure. */
+ int dlopen_pwquality(void) {
+ return dlopen_many_sym_or_warn(
+ &pwquality_dl, "libpwquality.so.1", LOG_DEBUG,
+ DLSYM_ARG(pwquality_check),
+ DLSYM_ARG(pwquality_default_settings),
+ DLSYM_ARG(pwquality_free_settings),
+ DLSYM_ARG(pwquality_generate),
+ DLSYM_ARG(pwquality_get_str_value),
+ DLSYM_ARG(pwquality_read_config),
+ DLSYM_ARG(pwquality_set_int_value),
+ DLSYM_ARG(pwquality_strerror));
+ }
+
+ static void pwq_maybe_disable_dictionary(pwquality_settings_t *pwq) {
+         /* Turns the dictionary check off (PWQ_SETTING_DICT_CHECK = 0) if the configured dictionary file
+          * does not exist. This is best effort only: every failure on the way is logged at debug level
+          * and otherwise ignored, and the settings are left untouched in that case. */
+
+         char errbuf[PWQ_MAX_ERROR_MESSAGE_LEN];
+         const char *dict;
+         int r;
+
+         assert(pwq);
+
+         r = sym_pwquality_get_str_value(pwq, PWQ_SETTING_DICT_PATH, &dict);
+         if (r < 0) {
+                 log_debug("Failed to read libpwquality dictionary path, ignoring: %s",
+                           sym_pwquality_strerror(errbuf, sizeof(errbuf), r, NULL));
+                 return;
+         }
+
+         if (isempty(dict)) {
+                 log_debug("Weird, no dictionary file configured, ignoring.");
+                 return;
+         }
+
+         if (access(dict, F_OK) < 0) {
+                 if (errno == ENOENT) {
+                         /* The file is definitely missing, hence disable the check. */
+                         r = sym_pwquality_set_int_value(pwq, PWQ_SETTING_DICT_CHECK, 0);
+                         if (r < 0)
+                                 log_debug("Failed to disable libpwquality dictionary check, ignoring: %s",
+                                           sym_pwquality_strerror(errbuf, sizeof(errbuf), r, NULL));
+                 } else
+                         log_debug_errno(errno, "Failed to check if dictionary file %s exists, ignoring: %m", dict);
+         }
+ }
+
+ /* Creates a libpwquality settings object, tries to read the configuration into it and disables the
+ * dictionary check if the dictionary file is missing. On success the object is handed to the caller via
+ * *ret and 0 is returned. Returns the dlopen_pwquality() error if the library cannot be loaded and
+ * -ENOMEM (without logging) if the settings object cannot be created. */
+ static int pwq_allocate_context(pwquality_settings_t **ret) {
+ _cleanup_(sym_pwquality_free_settingsp) pwquality_settings_t *pwq = NULL;
+ char buf[PWQ_MAX_ERROR_MESSAGE_LEN];
+ void *auxerror;
+ int r;
+
+ assert(ret);
+
+ r = dlopen_pwquality();
+ if (r < 0)
+ return r;
+
+ pwq = sym_pwquality_default_settings();
+ if (!pwq)
+ return -ENOMEM;
+
+ /* A NULL cfgfile is passed here (presumably selecting the library's default configuration file —
+ * confirm against pwquality.h). A read failure is not fatal, it is only logged at debug level. */
+ r = sym_pwquality_read_config(pwq, NULL, &auxerror);
+ if (r < 0)
+ log_debug("Failed to read libpwquality configuration, ignoring: %s",
+ sym_pwquality_strerror(buf, sizeof(buf), r, auxerror));
+
+ pwq_maybe_disable_dictionary(pwq);
+
+ *ret = TAKE_PTR(pwq);
+ return 0;
+ }
+
+ int suggest_passwords(void) {
+         /* Prints N_SUGGESTIONS passwords generated by libpwquality (64 bits of entropy requested for
+          * each) on a single line of standard output.
+          *
+          * Returns 1 if suggestions were printed, 0 if the libpwquality context could not be set up
+          * because support is unavailable (ERRNO_IS_NOT_SUPPORTED()), and a negative value on any other
+          * failure. The suggestion list and the joined string are erased before being freed. */
+
+         _cleanup_(sym_pwquality_free_settingsp) pwquality_settings_t *pwq = NULL;
+         _cleanup_strv_free_erase_ char **suggestions = NULL;
+         _cleanup_(erase_and_freep) char *joined = NULL;
+         char errbuf[PWQ_MAX_ERROR_MESSAGE_LEN];
+         int r;
+
+         r = pwq_allocate_context(&pwq);
+         if (r < 0 && ERRNO_IS_NOT_SUPPORTED(r))
+                 return 0;
+         if (r < 0)
+                 return log_error_errno(r, "Failed to allocate libpwquality context: %m");
+
+         /* One extra slot keeps the array NULL terminated. */
+         suggestions = new0(char*, N_SUGGESTIONS+1);
+         if (!suggestions)
+                 return log_oom();
+
+         for (size_t k = 0; k < N_SUGGESTIONS; k++) {
+                 r = sym_pwquality_generate(pwq, 64, &suggestions[k]);
+                 if (r < 0)
+                         return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to generate password, ignoring: %s",
+                                                sym_pwquality_strerror(errbuf, sizeof(errbuf), r, NULL));
+         }
+
+         joined = strv_join(suggestions, " ");
+         if (!joined)
+                 return log_oom();
+
+         printf("Password suggestions: %s\n", joined);
+         return 1;
+ }
+
+ int check_password_quality(const char *password, const char *old, const char *username, char **ret_error) {
+         /* Runs 'password' through pwquality_check(), optionally taking the previous password and the
+          * user name into account.
+          *
+          * Returns 1 if the password is acceptable and 0 if it is not; in the latter case a copy of the
+          * library's error text is stored in *ret_error when ret_error is non-NULL (the caller frees
+          * it). Returns a negative value if the context cannot be allocated, or -ENOMEM if the error
+          * text cannot be duplicated. */
+
+         _cleanup_(sym_pwquality_free_settingsp) pwquality_settings_t *pwq = NULL;
+         char errbuf[PWQ_MAX_ERROR_MESSAGE_LEN];
+         void *auxerror;
+         int r;
+
+         assert(password);
+
+         r = pwq_allocate_context(&pwq);
+         if (r < 0)
+                 return log_debug_errno(r, "Failed to allocate libpwquality context: %m");
+
+         r = sym_pwquality_check(pwq, password, old, username, &auxerror);
+         if (r >= 0)
+                 return 1; /* all good */
+
+         if (ret_error) {
+                 char *msg = strdup(sym_pwquality_strerror(errbuf, sizeof(errbuf), r, auxerror));
+                 if (!msg)
+                         return -ENOMEM;
+
+                 *ret_error = msg;
+         }
+
+         return 0; /* all bad */
+ }
+
+#endif
diff --git a/src/shared/password-quality-util-pwquality.h b/src/shared/password-quality-util-pwquality.h
new file mode 100644
index 0000000..a420b0d
--- /dev/null
+++ b/src/shared/password-quality-util-pwquality.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "macro.h"
+
+#if HAVE_PWQUALITY
+/* pwquality.h uses size_t but doesn't include sys/types.h on its own */
+#include <sys/types.h>
+#include <pwquality.h>
+
+extern int (*sym_pwquality_check)(pwquality_settings_t *pwq, const char *password, const char *oldpassword, const char *user, void **auxerror);
+extern pwquality_settings_t *(*sym_pwquality_default_settings)(void);
+extern void (*sym_pwquality_free_settings)(pwquality_settings_t *pwq);
+extern int (*sym_pwquality_generate)(pwquality_settings_t *pwq, int entropy_bits, char **password);
+extern int (*sym_pwquality_get_str_value)(pwquality_settings_t *pwq, int setting, const char **value);
+extern int (*sym_pwquality_read_config)(pwquality_settings_t *pwq, const char *cfgfile, void **auxerror);
+extern int (*sym_pwquality_set_int_value)(pwquality_settings_t *pwq, int setting, int value);
+extern const char* (*sym_pwquality_strerror)(char *buf, size_t len, int errcode, void *auxerror);
+
+int dlopen_pwquality(void);
+
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(pwquality_settings_t*, sym_pwquality_free_settings, NULL);
+
+int suggest_passwords(void);
+int check_password_quality(const char *password, const char *old, const char *username, char **ret_error);
+
+#endif
diff --git a/src/shared/password-quality-util.h b/src/shared/password-quality-util.h
new file mode 100644
index 0000000..f838ba7
--- /dev/null
+++ b/src/shared/password-quality-util.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#define N_SUGGESTIONS 6
+
+#if HAVE_PASSWDQC
+
+#include "password-quality-util-passwdqc.h"
+
+#elif HAVE_PWQUALITY
+
+#include "password-quality-util-pwquality.h"
+
+#else
+
+ /* Fallback used when neither passwdqc nor pwquality support is compiled in: nothing is printed and 0 is
+ * returned. */
+ static inline int suggest_passwords(void) {
+ return 0;
+ }
+
+ /* Fallback used when neither passwdqc nor pwquality support is compiled in: no check is performed, every
+ * password is reported as acceptable (1) and *ret_error, if requested, is set to NULL. */
+ static inline int check_password_quality(
+ const char *password,
+ const char *old,
+ const char *username,
+ char **ret_error) {
+ if (ret_error)
+ *ret_error = NULL;
+ return 1; /* all good */
+ }
+
+#endif
diff --git a/src/shared/pcre2-util.c b/src/shared/pcre2-util.c
new file mode 100644
index 0000000..578b02d
--- /dev/null
+++ b/src/shared/pcre2-util.c
@@ -0,0 +1,166 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "dlfcn-util.h"
+#include "log.h"
+#include "pcre2-util.h"
+
+#if HAVE_PCRE2
+static void *pcre2_dl = NULL;
+
+pcre2_match_data* (*sym_pcre2_match_data_create)(uint32_t, pcre2_general_context *);
+void (*sym_pcre2_match_data_free)(pcre2_match_data *);
+void (*sym_pcre2_code_free)(pcre2_code *);
+pcre2_code* (*sym_pcre2_compile)(PCRE2_SPTR, PCRE2_SIZE, uint32_t, int *, PCRE2_SIZE *, pcre2_compile_context *);
+int (*sym_pcre2_get_error_message)(int, PCRE2_UCHAR *, PCRE2_SIZE);
+int (*sym_pcre2_match)(const pcre2_code *, PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, uint32_t, pcre2_match_data *, pcre2_match_context *);
+PCRE2_SIZE* (*sym_pcre2_get_ovector_pointer)(pcre2_match_data *);
+
+DEFINE_HASH_OPS_WITH_KEY_DESTRUCTOR(
+ pcre2_code_hash_ops_free,
+ pcre2_code,
+ (void (*)(const pcre2_code *, struct siphash*))trivial_hash_func,
+ (int (*)(const pcre2_code *, const pcre2_code*))trivial_compare_func,
+ sym_pcre2_code_free);
+#else
+const struct hash_ops pcre2_code_hash_ops_free = {};
+#endif
+
+ /* Loads "libpcre2-8.so.0" through dlopen_many_sym_or_warn(), keeping the handle in pcre2_dl and
+ * requesting the seven PCRE2 entry points listed below. The result of dlopen_many_sym_or_warn() is
+ * returned unchanged. If PCRE2 support is not compiled in, an error is logged and the result of
+ * log_error_errno() for a synthetic EOPNOTSUPP is returned. */
+ int dlopen_pcre2(void) {
+ #if HAVE_PCRE2
+ /* So here's something weird: PCRE2 actually renames the symbols exported by the library via C
+ * macros, so that the exported symbols carry a suffix "_8" but when used from C the suffix is
+ * gone. In the argument list below we ignore this mangling. Surprisingly (at least to me), we
+ * actually get away with that. That's because DLSYM_ARG() uses STRINGIFY() to generate a string
+ * version of the symbol name, and that resolves the macro mapping implicitly already, so that the
+ * string actually contains the "_8" suffix already due to that and we don't have to append it
+ * manually anymore. C is weird. 🤯 */
+
+ return dlopen_many_sym_or_warn(
+ &pcre2_dl, "libpcre2-8.so.0", LOG_ERR,
+ DLSYM_ARG(pcre2_match_data_create),
+ DLSYM_ARG(pcre2_match_data_free),
+ DLSYM_ARG(pcre2_code_free),
+ DLSYM_ARG(pcre2_compile),
+ DLSYM_ARG(pcre2_get_error_message),
+ DLSYM_ARG(pcre2_match),
+ DLSYM_ARG(pcre2_get_ovector_pointer));
+ #else
+ return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "PCRE2 support is not compiled in.");
+ #endif
+ }
+
+ /* Compiles the regular expression 'pattern' with PCRE2, loading the library on demand.
+ *
+ * 'case_' selects the case handling: PATTERN_COMPILE_CASE_INSENSITIVE always compiles with
+ * PCRE2_CASELESS, PATTERN_COMPILE_CASE_AUTO does so only if the pattern text itself contains no match
+ * for "[[:upper:]]", and any other value compiles case sensitively.
+ *
+ * Returns 0 on success, storing the compiled pattern in *ret if ret is non-NULL (otherwise it is
+ * freed again). Returns a negative value on failure: the dlopen_pcre2() error, the log_oom() result, or
+ * the logged synthetic EINVAL for a pattern that does not compile. Without PCRE2 support a synthetic
+ * EOPNOTSUPP is logged and returned. */
+ int pattern_compile_and_log(const char *pattern, PatternCompileCase case_, pcre2_code **ret) {
+ #if HAVE_PCRE2
+ PCRE2_SIZE erroroffset;
+ _cleanup_(sym_pcre2_code_freep) pcre2_code *p = NULL;
+ unsigned flags = 0;
+ int errorcode, r;
+
+ assert(pattern);
+
+ r = dlopen_pcre2();
+ if (r < 0)
+ return r;
+
+ if (case_ == PATTERN_COMPILE_CASE_INSENSITIVE)
+ flags = PCRE2_CASELESS;
+ else if (case_ == PATTERN_COMPILE_CASE_AUTO) {
+ _cleanup_(sym_pcre2_match_data_freep) pcre2_match_data *md = NULL;
+ bool has_case;
+ _cleanup_(sym_pcre2_code_freep) pcre2_code *cs = NULL;
+
+ md = sym_pcre2_match_data_create(1, NULL);
+ if (!md)
+ return log_oom();
+
+ /* Recurse once (case sensitively, so this branch is not entered again) to build a helper
+ * pattern that detects upper-case characters. */
+ r = pattern_compile_and_log("[[:upper:]]", PATTERN_COMPILE_CASE_SENSITIVE, &cs);
+ if (r < 0)
+ return r;
+
+ /* The helper pattern is matched against the text of the user's pattern. Any negative result,
+ * i.e. "no match" as well as a matching error, counts as "no upper-case characters". */
+ r = sym_pcre2_match(cs, (PCRE2_SPTR8) pattern, PCRE2_ZERO_TERMINATED, 0, 0, md, NULL);
+ has_case = r >= 0;
+
+ flags = !has_case * PCRE2_CASELESS;
+ }
+
+ log_debug("Doing case %s matching based on %s",
+ flags & PCRE2_CASELESS ? "insensitive" : "sensitive",
+ case_ != PATTERN_COMPILE_CASE_AUTO ? "request" : "pattern casing");
+
+ p = sym_pcre2_compile((PCRE2_SPTR8) pattern,
+ PCRE2_ZERO_TERMINATED, flags, &errorcode, &erroroffset, NULL);
+ if (!p) {
+ unsigned char buf[LINE_MAX];
+
+ /* Translate the PCRE2 error code into text; fall back to a generic string if even that fails. */
+ r = sym_pcre2_get_error_message(errorcode, buf, sizeof buf);
+
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+ "Bad pattern \"%s\": %s", pattern,
+ r < 0 ? "unknown error" : (char *)buf);
+ }
+
+ if (ret)
+ *ret = TAKE_PTR(p);
+
+ return 0;
+ #else
+ return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "PCRE2 support is not compiled in.");
+ #endif
+ }
+
+ /* Matches the first 'size' bytes of 'message' against 'compiled_pattern'.
+ *
+ * Returns true (1) on a match and false (0) if PCRE2 reports PCRE2_ERROR_NOMATCH. On a match, if
+ * ret_ovec is non-NULL, the first two entries of the PCRE2 output vector are copied into ret_ovec[0]
+ * and ret_ovec[1] (presumably the start and end offset of the overall match — confirm against the
+ * PCRE2 documentation). Returns a negative value on failure: the log_oom() result or the logged
+ * synthetic EINVAL for any other PCRE2 error. Without PCRE2 support a synthetic EOPNOTSUPP is logged
+ * and returned. */
+ int pattern_matches_and_log(pcre2_code *compiled_pattern, const char *message, size_t size, size_t *ret_ovec) {
+ #if HAVE_PCRE2
+ _cleanup_(sym_pcre2_match_data_freep) pcre2_match_data *md = NULL;
+ int r;
+
+ assert(compiled_pattern);
+ assert(message);
+ /* pattern_compile_and_log() must be called before this function is called and that function already
+ * dlopens pcre2 so we can assert on it being available here. */
+ assert(pcre2_dl);
+
+ md = sym_pcre2_match_data_create(1, NULL);
+ if (!md)
+ return log_oom();
+
+ r = sym_pcre2_match(compiled_pattern,
+ (const unsigned char *)message,
+ size,
+ 0, /* start at offset 0 in the subject */
+ 0, /* default options */
+ md,
+ NULL);
+ if (r == PCRE2_ERROR_NOMATCH)
+ return false;
+ if (r < 0) {
+ unsigned char buf[LINE_MAX];
+
+ /* 'r' is reused here: first as the PCRE2 error code to translate, then as the result of the
+ * translation itself. */
+ r = sym_pcre2_get_error_message(r, buf, sizeof(buf));
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Pattern matching failed: %s",
+ r < 0 ? "unknown error" : (char*) buf);
+ }
+
+ if (ret_ovec) {
+ ret_ovec[0] = sym_pcre2_get_ovector_pointer(md)[0];
+ ret_ovec[1] = sym_pcre2_get_ovector_pointer(md)[1];
+ }
+
+ return true;
+ #else
+ return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "PCRE2 support is not compiled in.");
+ #endif
+ }
+
+ void *pattern_free(pcre2_code *p) {
+         /* Releases a compiled pattern; NULL is accepted and ignored. Always returns NULL, so that the
+          * result can be assigned straight back to the pointer that was freed. Without PCRE2 support
+          * only NULL may be passed in. */
+ #if HAVE_PCRE2
+         if (p) {
+                 /* A non-NULL pattern can only exist if the library was loaded before. */
+                 assert(pcre2_dl);
+                 sym_pcre2_code_free(p);
+         }
+ #else
+         assert(p == NULL);
+ #endif
+         return NULL;
+ }
diff --git a/src/shared/pcre2-util.h b/src/shared/pcre2-util.h
new file mode 100644
index 0000000..f1e744d
--- /dev/null
+++ b/src/shared/pcre2-util.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "hash-funcs.h"
+#include "macro.h"
+
+#if HAVE_PCRE2
+
+#define PCRE2_CODE_UNIT_WIDTH 8
+#include <pcre2.h>
+
+extern pcre2_match_data* (*sym_pcre2_match_data_create)(uint32_t, pcre2_general_context *);
+extern void (*sym_pcre2_match_data_free)(pcre2_match_data *);
+extern void (*sym_pcre2_code_free)(pcre2_code *);
+extern pcre2_code* (*sym_pcre2_compile)(PCRE2_SPTR, PCRE2_SIZE, uint32_t, int *, PCRE2_SIZE *, pcre2_compile_context *);
+extern int (*sym_pcre2_get_error_message)(int, PCRE2_UCHAR *, PCRE2_SIZE);
+extern int (*sym_pcre2_match)(const pcre2_code *, PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, uint32_t, pcre2_match_data *, pcre2_match_context *);
+extern PCRE2_SIZE* (*sym_pcre2_get_ovector_pointer)(pcre2_match_data *);
+
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(pcre2_match_data*, sym_pcre2_match_data_free, NULL);
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(pcre2_code*, sym_pcre2_code_free, NULL);
+#else
+
+typedef struct {} pcre2_code;
+
+#endif
+
+extern const struct hash_ops pcre2_code_hash_ops_free;
+
+ /* Case handling requested from pattern_compile_and_log(). */
+ typedef enum {
+ PATTERN_COMPILE_CASE_AUTO, /* caseless unless the pattern text matches "[[:upper:]]" */
+ PATTERN_COMPILE_CASE_SENSITIVE, /* compile without PCRE2_CASELESS */
+ PATTERN_COMPILE_CASE_INSENSITIVE, /* compile with PCRE2_CASELESS */
+ _PATTERN_COMPILE_CASE_MAX,
+ _PATTERN_COMPILE_CASE_INVALID = -EINVAL,
+ } PatternCompileCase;
+
+int pattern_compile_and_log(const char *pattern, PatternCompileCase case_, pcre2_code **ret);
+int pattern_matches_and_log(pcre2_code *compiled_pattern, const char *message, size_t size, size_t *ret_ovec);
+void *pattern_free(pcre2_code *p);
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(pcre2_code*, pattern_free);
+
+int dlopen_pcre2(void);
diff --git a/src/shared/pcrextend-util.c b/src/shared/pcrextend-util.c
new file mode 100644
index 0000000..fa066a4
--- /dev/null
+++ b/src/shared/pcrextend-util.c
@@ -0,0 +1,152 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "sd-device.h"
+
+#include "blkid-util.h"
+#include "blockdev-util.h"
+#include "chase.h"
+#include "errno-util.h"
+#include "escape.h"
+#include "fd-util.h"
+#include "mountpoint-util.h"
+#include "pcrextend-util.h"
+#include "strv.h"
+
+ /* Probes block device 'd' with libblkid and builds a colon separated identity string of exactly seven
+ * components: 'prefix' followed by the probed TYPE, UUID, LABEL, PART_ENTRY_UUID, PART_ENTRY_TYPE and
+ * PART_ENTRY_NAME values. Values that cannot be looked up become empty components, and the probed
+ * values are passed through xescape() for ":" so that the separator stays unambiguous.
+ *
+ * On success the newly allocated string is stored in *ret and 0 is returned. Returns -ENOPKG if the
+ * probe result is ambiguous or nothing was found, another negative value on open/probe/allocation
+ * failure, and -EOPNOTSUPP if blkid support is not compiled in. */
+ static int device_get_file_system_word(
+ sd_device *d,
+ const char *prefix,
+ char **ret) {
+
+ #if HAVE_BLKID
+ int r;
+ #endif
+
+ assert(d);
+ assert(prefix);
+ assert(ret);
+
+ #if HAVE_BLKID
+ _cleanup_close_ int block_fd = sd_device_open(d, O_RDONLY|O_CLOEXEC|O_NONBLOCK);
+ if (block_fd < 0)
+ return block_fd;
+
+ _cleanup_(blkid_free_probep) blkid_probe b = blkid_new_probe();
+ if (!b)
+ return -ENOMEM;
+
+ /* errno is cleared first so that errno_or_else() can tell whether the call set it. */
+ errno = 0;
+ r = blkid_probe_set_device(b, block_fd, 0, 0);
+ if (r != 0)
+ return errno_or_else(ENOMEM);
+
+ /* Request both superblock and partition table entry information; failures to configure the probe
+ * are deliberately ignored. */
+ (void) blkid_probe_enable_superblocks(b, 1);
+ (void) blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE|BLKID_SUBLKS_UUID|BLKID_SUBLKS_LABEL);
+ (void) blkid_probe_enable_partitions(b, 1);
+ (void) blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
+
+ errno = 0;
+ r = blkid_do_safeprobe(b);
+ if (r == _BLKID_SAFEPROBE_ERROR)
+ return errno_or_else(EIO);
+ if (IN_SET(r, _BLKID_SAFEPROBE_AMBIGUOUS, _BLKID_SAFEPROBE_NOT_FOUND))
+ return -ENOPKG;
+
+ assert(r == _BLKID_SAFEPROBE_FOUND);
+
+ /* The list starts out with the prefix as its first component. */
+ _cleanup_strv_free_ char **l = strv_new(prefix);
+ if (!l)
+ return -ENOMEM;
+
+ FOREACH_STRING(field, "TYPE", "UUID", "LABEL", "PART_ENTRY_UUID", "PART_ENTRY_TYPE", "PART_ENTRY_NAME") {
+ const char *v = NULL;
+
+ /* A failed lookup leaves 'v' NULL, which strempty() below turns into an empty component. */
+ (void) blkid_probe_lookup_value(b, field, &v, NULL);
+
+ _cleanup_free_ char *escaped = xescape(strempty(v), ":"); /* Avoid ambiguity around ":" */
+ if (!escaped)
+ return -ENOMEM;
+
+ r = strv_consume(&l, TAKE_PTR(escaped));
+ if (r < 0)
+ return r;
+ }
+
+ assert(strv_length(l) == 7); /* We always want 7 components, to avoid ambiguous strings */
+
+ _cleanup_free_ char *word = strv_join(l, ":");
+ if (!word)
+ return -ENOMEM;
+
+ *ret = TAKE_PTR(word);
+ return 0;
+ #else
+ return -EOPNOTSUPP;
+ #endif
+ }
+
+ int pcrextend_file_system_word(const char *path, char **ret_word, char **ret_normalized_path) {
+         /* Generates the identity string for the file system mounted at 'path'. The string starts with
+          * "file-system:" plus the normalized path (with ":" escaped), followed by six further colon
+          * separated components describing the backing block device. If no backing block device can be
+          * determined, those six components are left empty.
+          *
+          * 'path' must refer to a mount point. On success 0 is returned, the string is stored in
+          * *ret_word and, if ret_normalized_path is non-NULL, the normalized path is stored there; the
+          * caller frees both. On failure an error is logged and a negative value is returned. */
+
+         _cleanup_free_ char *normalized_path = NULL, *normalized_escaped = NULL, *prefix = NULL, *word = NULL;
+         _cleanup_(sd_device_unrefp) sd_device *d = NULL;
+         _cleanup_close_ int dfd = -EBADF;
+         int r;
+
+         assert(path);
+         assert(ret_word);
+
+         dfd = chase_and_open(path, NULL, 0, O_DIRECTORY|O_CLOEXEC, &normalized_path);
+         if (dfd < 0)
+                 return log_error_errno(dfd, "Failed to open path '%s': %m", path);
+
+         r = fd_is_mount_point(dfd, NULL, 0);
+         if (r < 0)
+                 return log_error_errno(r, "Failed to determine if path '%s' is mount point: %m", normalized_path);
+         if (r == 0)
+                 /* No "%m" here: the errno is synthetic, and since the path was just opened with
+                  * O_DIRECTORY, appending "Not a directory" would be plain wrong. */
+                 return log_error_errno(SYNTHETIC_ERRNO(ENOTDIR), "Specified path '%s' is not a mount point, refusing.", normalized_path);
+
+         normalized_escaped = xescape(normalized_path, ":"); /* Avoid ambiguity around ":" */
+         if (!normalized_escaped)
+                 return log_oom();
+
+         prefix = strjoin("file-system:", normalized_escaped);
+         if (!prefix)
+                 return log_oom();
+
+         r = block_device_new_from_fd(dfd, BLOCK_DEVICE_LOOKUP_BACKING, &d);
+         if (r < 0) {
+                 log_notice_errno(r, "Unable to determine backing block device of '%s', using generic fallback file system identity string: %m", path);
+
+                 /* Six empty components keep the total at seven, same as in the probed case. */
+                 word = strjoin(prefix, "::::::");
+                 if (!word)
+                         return log_oom();
+         } else {
+                 r = device_get_file_system_word(d, prefix, &word);
+                 if (r < 0)
+                         return log_error_errno(r, "Failed to get file system identifier string for '%s': %m", path);
+         }
+
+         *ret_word = TAKE_PTR(word);
+
+         if (ret_normalized_path)
+                 *ret_normalized_path = TAKE_PTR(normalized_path);
+
+         return 0;
+ }
+
+ int pcrextend_machine_id_word(char **ret) {
+         /* Generates the identity string for the local machine: "machine-id:" followed by the machine ID
+          * as formatted by SD_ID128_TO_STRING(). On success 0 is returned and the newly allocated string
+          * is stored in *ret; on failure an error is logged and a negative value is returned. */
+
+         sd_id128_t machine_id;
+         char *s;
+         int r;
+
+         assert(ret);
+
+         r = sd_id128_get_machine(&machine_id);
+         if (r < 0)
+                 return log_error_errno(r, "Failed to acquire machine ID: %m");
+
+         s = strjoin("machine-id:", SD_ID128_TO_STRING(machine_id));
+         if (!s)
+                 return log_oom();
+
+         *ret = s;
+         return 0;
+ }
diff --git a/src/shared/pcrextend-util.h b/src/shared/pcrextend-util.h
new file mode 100644
index 0000000..7dd612b
--- /dev/null
+++ b/src/shared/pcrextend-util.h
@@ -0,0 +1,5 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+int pcrextend_file_system_word(const char *path, char **ret, char **ret_normalized_path);
+int pcrextend_machine_id_word(char **ret);
diff --git a/src/shared/pe-binary.c b/src/shared/pe-binary.c
new file mode 100644
index 0000000..4c05323
--- /dev/null
+++ b/src/shared/pe-binary.c
@@ -0,0 +1,241 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "log.h"
+#include "pe-binary.h"
+#include "string-util.h"
+
+ bool pe_header_is_64bit(const PeHeader *h) {
+         /* Tells PE32 (false) and PE32+ (true) images apart by the optional header magic. Any other
+          * magic value is a programming error here and trips assert_not_reached(). */
+
+         assert(h);
+
+         switch (le16toh(h->optional.Magic)) {
+
+         case UINT16_C(0x010B): /* PE32 */
+                 return false;
+
+         case UINT16_C(0x020B): /* PE32+ */
+                 return true;
+
+         default:
+                 assert_not_reached();
+         }
+ }
+
+ /* Returns the size in bytes of the PE header including its optional header: the fixed part that
+ * precedes the 'optional' member plus the SizeOfOptionalHeader value announced in the file header. */
+ static size_t pe_header_size(const PeHeader *pe_header) {
+ assert(pe_header);
+
+ return offsetof(PeHeader, optional) + le16toh(pe_header->pe.SizeOfOptionalHeader);
+ }
+
+ const IMAGE_DATA_DIRECTORY *pe_header_get_data_directory(
+                 const PeHeader *h,
+                 size_t i) {
+
+         /* Returns a pointer to the i-th entry of the optional header's DataDirectory array, or NULL if
+          * 'i' is not below the NumberOfRvaAndSizes value stored in the header. */
+
+         assert(h);
+
+         if (i < le32toh(PE_HEADER_OPTIONAL_FIELD(h, NumberOfRvaAndSizes)))
+                 return PE_HEADER_OPTIONAL_FIELD(h, DataDirectory) + i;
+
+         return NULL;
+ }
+
+ const IMAGE_SECTION_HEADER *pe_header_find_section(
+                 const PeHeader *pe_header,
+                 const IMAGE_SECTION_HEADER *sections,
+                 const char *name) {
+
+         /* Looks up the section called 'name' in the section table 'sections', which must hold as many
+          * entries as the file header's NumberOfSections says. A section matches if its Name field
+          * starts with 'name' and consists only of zero bytes after that. Returns the first matching
+          * entry, or NULL if there is none or if 'name' is longer than the Name field. */
+
+         size_t name_len, count;
+
+         assert(pe_header);
+         assert(name);
+         assert(sections || le16toh(pe_header->pe.NumberOfSections) == 0);
+
+         name_len = strlen(name);
+         if (name_len > sizeof(sections[0].Name)) /* Too long? */
+                 return NULL;
+
+         count = le16toh(pe_header->pe.NumberOfSections);
+
+         for (size_t k = 0; k < count; k++) {
+                 const IMAGE_SECTION_HEADER *candidate = sections + k;
+
+                 if (memcmp(candidate->Name, name, name_len) != 0)
+                         continue;
+
+                 /* The remainder of the field must be zero padding, otherwise it's a longer name. */
+                 if (!memeqzero(candidate->Name + name_len, sizeof(candidate->Name) - name_len))
+                         continue;
+
+                 return candidate;
+         }
+
+         return NULL;
+ }
+
+ /* Reads and validates the DOS header and the PE header (including the optional header) of the file
+ * open on 'fd'. All reads use pread(), so the file offset of 'fd' is not used.
+ *
+ * On success 0 is returned and the newly allocated headers are stored in *ret_dos_header and
+ * *ret_pe_header; either pointer may be NULL, in which case that header is freed again. Failures are
+ * logged at debug level: a synthetic EBADMSG is used for short reads and for files that fail
+ * validation, the pread() errno for read errors, and log_oom_debug() for allocation failures. */
+ int pe_load_headers(
+ int fd,
+ IMAGE_DOS_HEADER **ret_dos_header,
+ PeHeader **ret_pe_header) {
+
+ _cleanup_free_ IMAGE_DOS_HEADER *dos_header = NULL;
+ _cleanup_free_ PeHeader *pe_header = NULL;
+ ssize_t n;
+
+ assert(fd >= 0);
+
+ dos_header = new(IMAGE_DOS_HEADER, 1);
+ if (!dos_header)
+ return log_oom_debug();
+
+ /* Step 1: the DOS header sits at the very start of the file. */
+ n = pread(fd,
+ dos_header,
+ sizeof(IMAGE_DOS_HEADER),
+ 0);
+ if (n < 0)
+ return log_debug_errno(errno, "Failed to read DOS header: %m");
+ if ((size_t) n != sizeof(IMAGE_DOS_HEADER))
+ return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "Short read while reading MZ executable header.");
+
+ if (le16toh(dos_header->e_magic) != UINT16_C(0x5A4D))
+ return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "File lacks MZ executable header.");
+
+ pe_header = new(PeHeader, 1);
+ if (!pe_header)
+ return log_oom_debug();
+
+ /* Step 2: read the part of the PE header that precedes the optional header, from the offset the DOS
+ * header points to. */
+ n = pread(fd,
+ pe_header,
+ offsetof(PeHeader, optional),
+ le32toh(dos_header->e_lfanew));
+ if (n < 0)
+ return log_debug_errno(errno, "Failed to read PE executable header: %m");
+ if ((size_t) n != offsetof(PeHeader, optional))
+ return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "Short read while reading PE executable header.");
+
+ if (le32toh(pe_header->signature) != UINT32_C(0x00004550))
+ return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "File lacks PE executable header.");
+
+ if (le16toh(pe_header->pe.SizeOfOptionalHeader) < sizeof_field(PeHeader, optional.Magic))
+ return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "Optional header size too short for magic.");
+
+ /* Step 3: now that the real size of the optional header is known, make the buffer large enough for
+ * it, but never smaller than the PeHeader structure itself. The result goes into a temporary so
+ * that the old buffer is still freed by the cleanup handler if realloc() fails. */
+ PeHeader *pe_header_tmp = realloc(pe_header, MAX(sizeof(PeHeader), pe_header_size(pe_header)));
+ if (!pe_header_tmp)
+ return log_oom_debug();
+ pe_header = pe_header_tmp;
+
+ n = pread(fd,
+ &pe_header->optional,
+ le16toh(pe_header->pe.SizeOfOptionalHeader),
+ le32toh(dos_header->e_lfanew) + offsetof(PeHeader, optional));
+ if (n < 0)
+ return log_debug_errno(errno, "Failed to read PE executable optional header: %m");
+ if ((size_t) n != le16toh(pe_header->pe.SizeOfOptionalHeader))
+ return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "Short read while reading PE executable optional header.");
+
+ if (!IN_SET(le16toh(pe_header->optional.Magic), UINT16_C(0x010B), UINT16_C(0x020B)))
+ return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "Optional header magic invalid.");
+
+ /* Step 4: the announced header size must be exactly what is needed for the fields up to the
+ * DataDirectory array plus the number of array entries the header claims to have. */
+ if (pe_header_size(pe_header) !=
+ PE_HEADER_OPTIONAL_FIELD_OFFSET(pe_header, DataDirectory) +
+ sizeof(IMAGE_DATA_DIRECTORY) * (uint64_t) le32toh(PE_HEADER_OPTIONAL_FIELD(pe_header, NumberOfRvaAndSizes)))
+ return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "Optional header size mismatch.");
+
+ if (ret_dos_header)
+ *ret_dos_header = TAKE_PTR(dos_header);
+ if (ret_pe_header)
+ *ret_pe_header = TAKE_PTR(pe_header);
+
+ return 0;
+ }
+
+ int pe_load_sections(
+                 int fd,
+                 const IMAGE_DOS_HEADER *dos_header,
+                 const PeHeader *pe_header,
+                 IMAGE_SECTION_HEADER **ret_sections) {
+
+         /* Reads the section table of the PE file open on 'fd'. The table is expected right behind the
+          * PE header, i.e. at e_lfanew plus the PE header size, and holds NumberOfSections entries.
+          *
+          * On success 0 is returned and the newly allocated table is stored in *ret_sections (if that
+          * is NULL the table is freed again). Failures are logged at debug level: a synthetic EBADMSG
+          * for a short read, the pread() errno for read errors, log_oom_debug() for allocation
+          * failures. */
+
+         _cleanup_free_ IMAGE_SECTION_HEADER *table = NULL;
+         size_t count, table_size;
+         ssize_t got;
+
+         assert(fd >= 0);
+         assert(dos_header);
+         assert(pe_header);
+
+         count = le16toh(pe_header->pe.NumberOfSections);
+         table_size = sizeof(IMAGE_SECTION_HEADER) * count;
+
+         table = new(IMAGE_SECTION_HEADER, count);
+         if (!table)
+                 return log_oom_debug();
+
+         got = pread(fd,
+                     table,
+                     table_size,
+                     le32toh(dos_header->e_lfanew) + pe_header_size(pe_header));
+         if (got < 0)
+                 return log_debug_errno(errno, "Failed to read section table: %m");
+         if ((size_t) got != table_size)
+                 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "Short read while reading section table.");
+
+         if (ret_sections)
+                 *ret_sections = TAKE_PTR(table);
+
+         return 0;
+ }
+
+ int pe_read_section_data(
+                 int fd,
+                 const PeHeader *pe_header,
+                 const IMAGE_SECTION_HEADER *sections,
+                 const char *name,
+                 size_t max_size,
+                 void **ret,
+                 size_t *ret_size) {
+
+         /* Reads the data of the section called 'name' from the PE file open on 'fd' into a newly
+          * allocated buffer. VirtualSize bytes are read from file offset PointerToRawData, and the
+          * buffer always gets one extra trailing NUL byte.
+          *
+          * If ret_size is non-NULL the data is treated as a binary blob and its size is stored there.
+          * If ret_size is NULL the data is treated as a string: it may not contain a NUL byte that is
+          * followed by anything but further NUL bytes.
+          *
+          * Returns 0 on success, storing the buffer in *ret if ret is non-NULL. Returns -ENXIO if there
+          * is no such section, -E2BIG if the section is larger than 'max_size' (or SSIZE_MAX), -ENOMEM
+          * on allocation failure, the pread() errno on read errors, -EIO on a short read and -EBADMSG
+          * if the string check fails. Nothing is logged here. */
+
+         const IMAGE_SECTION_HEADER *section;
+         _cleanup_free_ void *data = NULL;
+         size_t n;
+         ssize_t ss;
+
+         assert(fd >= 0);
+         assert(pe_header);
+         /* NumberOfSections is a little-endian field, hence convert it before looking at it, the same
+          * way the other functions in this file do. */
+         assert(sections || le16toh(pe_header->pe.NumberOfSections) == 0);
+         assert(name);
+
+         section = pe_header_find_section(pe_header, sections, name);
+         if (!section)
+                 return -ENXIO;
+
+         n = le32toh(section->VirtualSize);
+         if (n > MIN(max_size, (size_t) SSIZE_MAX))
+                 return -E2BIG;
+
+         data = malloc(n+1);
+         if (!data)
+                 return -ENOMEM;
+
+         ss = pread(fd, data, n, le32toh(section->PointerToRawData));
+         if (ss < 0)
+                 return -errno;
+         if ((size_t) ss != n)
+                 return -EIO;
+
+         ((uint8_t*) data)[n] = 0; /* NUL terminate, no matter what */
+
+         if (ret_size)
+                 *ret_size = n;
+         else {
+                 /* Check that there are no embedded NUL bytes if the caller doesn't want to know the size
+                  * (i.e. treats the blob as a string) */
+                 const char *nul;
+
+                 nul = memchr(data, 0, n);
+                 if (nul && !memeqzero(nul, n - (nul - (const char*) data))) /* If there's a NUL it must only be NULs from there on */
+                         return -EBADMSG;
+         }
+         if (ret)
+                 *ret = TAKE_PTR(data);
+
+         return 0;
+ }
+
+ bool pe_is_uki(const PeHeader *pe_header, const IMAGE_SECTION_HEADER *sections) {
+         /* Returns true if the PE image has the EFI application subsystem and contains all three of
+          * the ".osrel", ".linux" and ".initrd" sections. */
+
+         static const char *const required[] = {
+                 ".osrel",
+                 ".linux",
+                 ".initrd",
+         };
+
+         assert(pe_header);
+         assert(sections || le16toh(pe_header->pe.NumberOfSections) == 0);
+
+         if (le16toh(pe_header->optional.Subsystem) != IMAGE_SUBSYSTEM_EFI_APPLICATION)
+                 return false;
+
+         for (size_t k = 0; k < sizeof(required) / sizeof(required[0]); k++)
+                 if (!pe_header_find_section(pe_header, sections, required[k]))
+                         return false;
+
+         return true;
+ }
diff --git a/src/shared/pe-binary.h b/src/shared/pe-binary.h
new file mode 100644
index 0000000..2ef44d7
--- /dev/null
+++ b/src/shared/pe-binary.h
@@ -0,0 +1,144 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <sys/types.h>
+
+#include "sparse-endian.h"
+
+/* When naming things we try to stay close to the official Windows APIs as per:
+ * → https://learn.microsoft.com/en-us/windows/win32/debug/pe-format */
+
+typedef struct _packed_ _IMAGE_DOS_HEADER { /* Packed DOS header record; every field is a little-endian le16_t/le32_t value */
+ le16_t e_magic;
+ le16_t e_cblp;
+ le16_t e_cp;
+ le16_t e_crlc;
+ le16_t e_cparhdr;
+ le16_t e_minalloc;
+ le16_t e_maxalloc;
+ le16_t e_ss;
+ le16_t e_sp;
+ le16_t e_csum;
+ le16_t e_ip;
+ le16_t e_cs;
+ le16_t e_lfarlc;
+ le16_t e_ovno;
+ le16_t e_res[4];
+ le16_t e_oemid;
+ le16_t e_oeminfo;
+ le16_t e_res2[10];
+ le32_t e_lfanew; /* NOTE(review): presumably the file offset of the PE header — confirm against pe_load_headers() */
+} IMAGE_DOS_HEADER;
+
+typedef struct _packed_ _IMAGE_FILE_HEADER { /* Packed file header record, embedded in PeHeader as member 'pe' */
+ le16_t Machine;
+ le16_t NumberOfSections; /* little-endian; a NULL section table is only accepted when this is zero */
+ le32_t TimeDateStamp;
+ le32_t PointerToSymbolTable;
+ le32_t NumberOfSymbols;
+ le16_t SizeOfOptionalHeader;
+ le16_t Characteristics;
+} IMAGE_FILE_HEADER;
+
+typedef struct _packed_ _IMAGE_DATA_DIRECTORY { /* Element type of the pe32_DataDirectory[]/pe32plus_DataDirectory[] arrays */
+ le32_t VirtualAddress;
+ le32_t Size;
+} IMAGE_DATA_DIRECTORY;
+
+typedef struct _packed_ _IMAGE_OPTIONAL_HEADER { /* Packed optional header; the unions overlay the PE32 and PE32+ variants */
+ /* Standard fields */
+ le16_t Magic;
+ uint8_t MajorLinkerVersion;
+ uint8_t MinorLinkerVersion;
+ le32_t SizeOfCode;
+ le32_t SizeOfInitializedData;
+ le32_t SizeOfUninitializedData;
+ le32_t AddressOfEntryPoint;
+ le32_t BaseOfCode;
+
+ /* Here the PE32 and PE32+ headers differ: PE32+ has one 64bit field, PE32 has two 32bit fields */
+ union {
+ struct {
+ le32_t BaseOfData;
+ le32_t pe32_ImageBase;
+ };
+ le64_t pe32plus_ImageBase;
+ };
+
+ /* Additional fields */
+ le32_t SectionAlignment;
+ le32_t FileAlignment;
+ le16_t MajorOperatingSystemVersion;
+ le16_t MinorOperatingSystemVersion;
+ le16_t MajorImageVersion;
+ le16_t MinorImageVersion;
+ le16_t MajorSubsystemVersion;
+ le16_t MinorSubsystemVersion;
+ le32_t Win32VersionValue;
+ le32_t SizeOfImage;
+ le32_t SizeOfHeaders;
+ le32_t CheckSum;
+ le16_t Subsystem; /* compared against IMAGE_SUBSYSTEM_EFI_APPLICATION by pe_is_uki() */
+ le16_t DllCharacteristics;
+
+ /* Here similar: on PE32+ some fields are 64bit that are 32bit on PE32. */
+ union {
+ struct {
+ le32_t pe32_SizeOfStackReserve;
+ le32_t pe32_SizeOfStackCommit;
+ le32_t pe32_SizeOfHeapReserve;
+ le32_t pe32_SizeOfHeapCommit;
+ le32_t pe32_LoaderFlags;
+ le32_t pe32_NumberOfRvaAndSizes;
+ IMAGE_DATA_DIRECTORY pe32_DataDirectory[]; /* flexible array member, length not fixed by this type */
+ };
+ struct {
+ le64_t pe32plus_SizeOfStackReserve;
+ le64_t pe32plus_SizeOfStackCommit;
+ le64_t pe32plus_SizeOfHeapReserve;
+ le64_t pe32plus_SizeOfHeapCommit;
+ le32_t pe32plus_LoaderFlags;
+ le32_t pe32plus_NumberOfRvaAndSizes;
+ IMAGE_DATA_DIRECTORY pe32plus_DataDirectory[]; /* flexible array member, length not fixed by this type */
+ };
+ };
+} IMAGE_OPTIONAL_HEADER;
+
+typedef struct _packed_ PeHeader { /* Packed aggregate: signature, then the file header, then the optional header */
+ le32_t signature;
+ IMAGE_FILE_HEADER pe;
+ IMAGE_OPTIONAL_HEADER optional;
+} PeHeader;
+
+typedef struct _packed_ _IMAGE_SECTION_HEADER { /* Packed section table entry; multi-byte fields are little-endian */
+ uint8_t Name[8];
+ le32_t VirtualSize; /* pe_read_section_data() reads this many bytes of section payload */
+ le32_t VirtualAddress;
+ le32_t SizeOfRawData;
+ le32_t PointerToRawData; /* pe_read_section_data() uses this as the file offset passed to pread() */
+ le32_t PointerToRelocations;
+ le32_t PointerToLinenumbers;
+ le16_t NumberOfRelocations;
+ le16_t NumberOfLinenumbers;
+ le32_t Characteristics;
+} IMAGE_SECTION_HEADER;
+
+#define IMAGE_SUBSYSTEM_EFI_APPLICATION 10 /* Subsystem value that pe_is_uki() requires in the optional header */
+
+bool pe_header_is_64bit(const PeHeader *h); /* decides which variant the two macros below select */
+
+#define PE_HEADER_OPTIONAL_FIELD(h, field) \
+ (pe_header_is_64bit(h) ? (h)->optional.pe32plus_##field : (h)->optional.pe32_##field) /* value of the pe32plus_ or pe32_ variant of 'field' */
+
+#define PE_HEADER_OPTIONAL_FIELD_OFFSET(h, field) \
+ (pe_header_is_64bit(h) ? offsetof(PeHeader, optional.pe32plus_##field) : offsetof(PeHeader, optional.pe32_##field)) /* same selection, yields the offset within PeHeader */
+
+const IMAGE_DATA_DIRECTORY *pe_header_get_data_directory(const PeHeader *h, size_t i);
+const IMAGE_SECTION_HEADER *pe_header_find_section(const PeHeader *pe_header, const IMAGE_SECTION_HEADER *sections, const char *name); /* callers treat a NULL result as "no such section" */
+
+int pe_load_headers(int fd, IMAGE_DOS_HEADER **ret_dos_header, PeHeader **ret_pe_header);
+
+int pe_load_sections(int fd, const IMAGE_DOS_HEADER *dos_header, const PeHeader *pe_header, IMAGE_SECTION_HEADER **ret_sections);
+int pe_read_section_data(int fd, const PeHeader *pe_header, const IMAGE_SECTION_HEADER *sections, const char *name, size_t max_size, void **ret, size_t *ret_size); /* ret and ret_size may each be NULL; with ret_size == NULL the data is validated as a string */
+
+bool pe_is_uki(const PeHeader *pe_header, const IMAGE_SECTION_HEADER *sections); /* true for EFI applications carrying .osrel, .linux and .initrd sections */
diff --git a/src/shared/pkcs11-util.c b/src/shared/pkcs11-util.c
new file mode 100644
index 0000000..6e88dc3
--- /dev/null
+++ b/src/shared/pkcs11-util.c
@@ -0,0 +1,1371 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <fcntl.h>
+
+#include "ask-password-api.h"
+#include "dlfcn-util.h"
+#include "env-util.h"
+#include "escape.h"
+#include "fd-util.h"
+#include "format-table.h"
+#include "io-util.h"
+#include "memory-util.h"
+#if HAVE_OPENSSL
+#include "openssl-util.h"
+#endif
+#include "pkcs11-util.h"
+#include "random-util.h"
+#include "string-util.h"
+#include "strv.h"
+
+/* Shallow syntax check for RFC7512 PKCS#11 URIs: the string must start with "pkcs11:", must have
+ * something after that prefix, and the remainder may only consist of alphanumerical characters and
+ * the characters ".~/-_?;&%=". Returns false for NULL or empty input. */
+bool pkcs11_uri_valid(const char *uri) {
+        const char *rest;
+
+        if (isempty(uri))
+                return false;
+
+        rest = startswith(uri, "pkcs11:");
+
+        return rest &&
+                !isempty(rest) &&
+                in_charset(rest, ALPHANUMERICAL ".~/-_?;&%=");
+}
+
+#if HAVE_P11KIT
+
+static void *p11kit_dl = NULL; /* passed by address to dlopen_many_sym_or_warn() in dlopen_p11kit() */
+
+char *(*sym_p11_kit_module_get_name)(CK_FUNCTION_LIST *module); /* this and the pointers below are listed via DLSYM_ARG() in dlopen_p11kit() */
+void (*sym_p11_kit_modules_finalize_and_release)(CK_FUNCTION_LIST **modules);
+CK_FUNCTION_LIST **(*sym_p11_kit_modules_load_and_initialize)(int flags);
+const char *(*sym_p11_kit_strerror)(CK_RV rv);
+int (*sym_p11_kit_uri_format)(P11KitUri *uri, P11KitUriType uri_type, char **string);
+void (*sym_p11_kit_uri_free)(P11KitUri *uri);
+CK_ATTRIBUTE_PTR (*sym_p11_kit_uri_get_attributes)(P11KitUri *uri, CK_ULONG *n_attrs);
+CK_INFO_PTR (*sym_p11_kit_uri_get_module_info)(P11KitUri *uri);
+CK_SLOT_INFO_PTR (*sym_p11_kit_uri_get_slot_info)(P11KitUri *uri);
+CK_TOKEN_INFO_PTR (*sym_p11_kit_uri_get_token_info)(P11KitUri *uri);
+int (*sym_p11_kit_uri_match_token_info)(const P11KitUri *uri, const CK_TOKEN_INFO *token_info);
+const char *(*sym_p11_kit_uri_message)(int code);
+P11KitUri *(*sym_p11_kit_uri_new)(void);
+int (*sym_p11_kit_uri_parse)(const char *string, P11KitUriType uri_type, P11KitUri *uri);
+
+int dlopen_p11kit(void) { /* Resolves the sym_p11_kit_*() pointers from libp11-kit.so.0; callers treat a negative return as failure */
+ return dlopen_many_sym_or_warn(
+ &p11kit_dl,
+ "libp11-kit.so.0", LOG_DEBUG, /* NOTE(review): LOG_DEBUG is presumably the log level used on failure — confirm in dlfcn-util.h */
+ DLSYM_ARG(p11_kit_module_get_name),
+ DLSYM_ARG(p11_kit_modules_finalize_and_release),
+ DLSYM_ARG(p11_kit_modules_load_and_initialize),
+ DLSYM_ARG(p11_kit_strerror),
+ DLSYM_ARG(p11_kit_uri_format),
+ DLSYM_ARG(p11_kit_uri_free),
+ DLSYM_ARG(p11_kit_uri_get_attributes),
+ DLSYM_ARG(p11_kit_uri_get_module_info),
+ DLSYM_ARG(p11_kit_uri_get_slot_info),
+ DLSYM_ARG(p11_kit_uri_get_token_info),
+ DLSYM_ARG(p11_kit_uri_match_token_info),
+ DLSYM_ARG(p11_kit_uri_message),
+ DLSYM_ARG(p11_kit_uri_new),
+ DLSYM_ARG(p11_kit_uri_parse));
+}
+
+/* Parses the PKCS#11 URI string 'p' into a newly allocated P11KitUri object, which is returned in
+ * '*ret' (the caller owns it). Returns 0 on success, the error of dlopen_p11kit() if the library is
+ * not available, -ENOMEM if the URI object cannot be allocated and -EINVAL if parsing fails. */
+int uri_from_string(const char *p, P11KitUri **ret) {
+        _cleanup_(sym_p11_kit_uri_freep) P11KitUri *parsed = NULL;
+        int r, parse_result;
+
+        assert(p);
+        assert(ret);
+
+        r = dlopen_p11kit();
+        if (r < 0)
+                return r;
+
+        parsed = sym_p11_kit_uri_new();
+        if (!parsed)
+                return -ENOMEM;
+
+        parse_result = sym_p11_kit_uri_parse(p, P11_KIT_URI_FOR_ANY, parsed);
+        if (parse_result != P11_KIT_URI_OK)
+                return -EINVAL;
+
+        *ret = TAKE_PTR(parsed);
+        return 0;
+}
+
+/* Allocates a new P11KitUri object and copies '*info' into its module info record. Returns NULL if
+ * p11-kit cannot be loaded or the allocation fails. The caller owns the returned object. */
+P11KitUri *uri_from_module_info(const CK_INFO *info) {
+        P11KitUri *uri;
+
+        assert(info);
+
+        if (dlopen_p11kit() < 0)
+                return NULL;
+
+        uri = sym_p11_kit_uri_new();
+        if (uri) {
+                CK_INFO *target = sym_p11_kit_uri_get_module_info(uri);
+
+                *target = *info;
+        }
+
+        return uri;
+}
+
+/* Allocates a new P11KitUri object and copies '*slot_info' into its slot info record. Returns NULL
+ * if p11-kit cannot be loaded or the allocation fails. The caller owns the returned object. */
+P11KitUri *uri_from_slot_info(const CK_SLOT_INFO *slot_info) {
+        P11KitUri *uri;
+
+        assert(slot_info);
+
+        if (dlopen_p11kit() < 0)
+                return NULL;
+
+        uri = sym_p11_kit_uri_new();
+        if (uri) {
+                CK_SLOT_INFO *target = sym_p11_kit_uri_get_slot_info(uri);
+
+                *target = *slot_info;
+        }
+
+        return uri;
+}
+
+/* Allocates a new P11KitUri object and copies '*token_info' into its token info record. Returns
+ * NULL if p11-kit cannot be loaded or the allocation fails. The caller owns the returned object. */
+P11KitUri *uri_from_token_info(const CK_TOKEN_INFO *token_info) {
+        P11KitUri *uri;
+
+        assert(token_info);
+
+        if (dlopen_p11kit() < 0)
+                return NULL;
+
+        uri = sym_p11_kit_uri_new();
+        if (uri) {
+                CK_TOKEN_INFO *target = sym_p11_kit_uri_get_token_info(uri);
+
+                *target = *token_info;
+        }
+
+        return uri;
+}
+
+/* Queries the slot list of module 'm' into a newly allocated array.
+ *
+ * First asks C_GetSlotList() for the number of slots, allocates an array of that size and then asks
+ * again for the actual IDs. If the second call reports CKR_BUFFER_TOO_SMALL the whole procedure is
+ * repeated, up to 16 times in total.
+ *
+ * On CKR_OK the array (NULL if there are no slots) is returned in '*ret_slotids' and the caller has
+ * to free it; the count goes to '*ret_n_slotids'. Returns CKR_HOST_MEMORY on allocation failure,
+ * CKR_BUFFER_TOO_SMALL if all attempts were used up, or whatever else C_GetSlotList() failed with. */
+CK_RV pkcs11_get_slot_list_malloc(
+                CK_FUNCTION_LIST *m,
+                CK_SLOT_ID **ret_slotids,
+                CK_ULONG *ret_n_slotids) {
+
+        unsigned attempts_left = 16;
+
+        assert(m);
+        assert(ret_slotids);
+        assert(ret_n_slotids);
+
+        while (attempts_left-- > 0) {
+                _cleanup_free_ CK_SLOT_ID *list = NULL;
+                CK_ULONG n = 0;
+                CK_RV rv;
+
+                /* First pass: only determine how many entries there are */
+                rv = m->C_GetSlotList(0, NULL, &n);
+                if (rv != CKR_OK)
+                        return rv;
+
+                if (n == 0) {
+                        *ret_slotids = NULL;
+                        *ret_n_slotids = 0;
+                        return CKR_OK;
+                }
+
+                list = new(CK_SLOT_ID, n);
+                if (!list)
+                        return CKR_HOST_MEMORY;
+
+                /* Second pass: fetch the IDs themselves */
+                rv = m->C_GetSlotList(0, list, &n);
+                if (rv == CKR_BUFFER_TOO_SMALL)
+                        continue; /* The list apparently grew in the meantime, start over */
+                if (rv != CKR_OK)
+                        return rv;
+
+                *ret_slotids = TAKE_PTR(list);
+                *ret_n_slotids = n;
+                return CKR_OK;
+        }
+
+        return CKR_BUFFER_TOO_SMALL;
+}
+
+/* Returns a newly allocated, NUL terminated copy of the token's label field, or NULL on allocation
+ * failure. The field is fixed-size, not NUL terminated and usually padded with spaces, hence we copy
+ * at most sizeof() bytes and pass the copy through strstrip(). Note that the start of the allocation
+ * is returned, not the return value of strstrip(), so that the result can be passed to free(). */
+char *pkcs11_token_label(const CK_TOKEN_INFO *token_info) {
+        char *copy = strndup((char*) token_info->label, sizeof(token_info->label));
+
+        if (copy)
+                strstrip(copy);
+
+        return copy;
+}
+
+/* Returns a newly allocated, NUL terminated copy of the token's fixed-size manufacturerID field that
+ * was passed through strstrip(), or NULL on allocation failure. The caller has to free() it. */
+char *pkcs11_token_manufacturer_id(const CK_TOKEN_INFO *token_info) {
+        char *copy = strndup((char*) token_info->manufacturerID, sizeof(token_info->manufacturerID));
+
+        if (copy)
+                strstrip(copy);
+
+        return copy;
+}
+
+/* Returns a newly allocated, NUL terminated copy of the token's fixed-size model field that was
+ * passed through strstrip(), or NULL on allocation failure. The caller has to free() it. */
+char *pkcs11_token_model(const CK_TOKEN_INFO *token_info) {
+        char *copy = strndup((char*) token_info->model, sizeof(token_info->model));
+
+        if (copy)
+                strstrip(copy);
+
+        return copy;
+}
+
+/* Attempts a single CKU_USER login on 'session'.
+ *
+ * If the token has CKF_PROTECTED_AUTHENTICATION_PATH set, C_Login() is called without a PIN. If the
+ * token does not have CKF_LOGIN_REQUIRED set, nothing is done. Otherwise 'pin'/'pin_size' is used.
+ *
+ * Returns 0 on success (or if no login is needed), -ENOANO if a PIN is needed but 'pin' is NULL,
+ * -ENOLCK if the token reports CKR_PIN_INCORRECT or CKR_PIN_LEN_RANGE, -EPERM if the PIN is locked
+ * and -EIO on any other C_Login() failure. 'token_label' is only used for log messages. */
+int pkcs11_token_login_by_pin(
+                CK_FUNCTION_LIST *m,
+                CK_SESSION_HANDLE session,
+                const CK_TOKEN_INFO *token_info,
+                const char *token_label,
+                const void *pin,
+                size_t pin_size) {
+
+        CK_RV rv;
+        int r;
+
+        assert(m);
+        assert(token_info);
+
+        r = dlopen_p11kit();
+        if (r < 0)
+                return r;
+
+        if (FLAGS_SET(token_info->flags, CKF_PROTECTED_AUTHENTICATION_PATH)) {
+                /* No PIN is passed on this path */
+                rv = m->C_Login(session, CKU_USER, NULL, 0);
+                if (rv == CKR_OK) {
+                        log_info("Successfully logged into security token '%s' via protected authentication path.", token_label);
+                        return 0;
+                }
+
+                return log_error_errno(SYNTHETIC_ERRNO(EIO),
+                                       "Failed to log into security token '%s': %s", token_label, sym_p11_kit_strerror(rv));
+        }
+
+        if (!FLAGS_SET(token_info->flags, CKF_LOGIN_REQUIRED)) {
+                log_info("No login into security token '%s' required.", token_label);
+                return 0;
+        }
+
+        /* From here on we cannot proceed without a PIN */
+        if (!pin)
+                return -ENOANO;
+
+        rv = m->C_Login(session, CKU_USER, (CK_UTF8CHAR*) pin, pin_size);
+        switch (rv) {
+
+        case CKR_OK:
+                log_info("Successfully logged into security token '%s'.", token_label);
+                return 0;
+
+        case CKR_PIN_LOCKED:
+                return log_error_errno(SYNTHETIC_ERRNO(EPERM),
+                                       "PIN has been locked, please reset PIN of security token '%s'.", token_label);
+
+        case CKR_PIN_INCORRECT:
+        case CKR_PIN_LEN_RANGE:
+                /* The caller may retry with a different PIN in this case */
+                return log_notice_errno(SYNTHETIC_ERRNO(ENOLCK),
+                                        "PIN for token '%s' is incorrect, please try again.",
+                                        token_label);
+
+        default:
+                return log_error_errno(SYNTHETIC_ERRNO(EIO),
+                                       "Failed to log into security token '%s': %s", token_label, sym_p11_kit_strerror(rv));
+        }
+}
+
+/* Logs into the token behind 'session', acquiring a PIN if one is needed.
+ *
+ * First a login without PIN is attempted via pkcs11_token_login_by_pin(). Only if that reports that a
+ * PIN is required (-ENOANO) we go on: up to three rounds are made, each of which takes the PIN from
+ * the $PIN environment variable if it is set, and otherwise queries it via ask_password_auto()
+ * (unless 'headless' is set, in which case we fail with -ENOPKG). After each incorrect PIN the token
+ * info is re-read for 'slotid', so that the next prompt reflects the updated flags.
+ *
+ * 'friendly_name' is used in the prompt text; 'icon_name', 'key_name', 'credential_name', 'until'
+ * and 'ask_password_flags' are passed to ask_password_auto() as-is.
+ *
+ * If 'ret_used_pin' is non-NULL it is set to a newly allocated copy of the PIN that worked, or to
+ * NULL if the login succeeded without a PIN. It is not touched on failure.
+ *
+ * Returns 0 on success, -EPERM after too many attempts, and otherwise the error of the failing step. */
+int pkcs11_token_login(
+                CK_FUNCTION_LIST *m,
+                CK_SESSION_HANDLE session,
+                CK_SLOT_ID slotid,
+                const CK_TOKEN_INFO *token_info,
+                const char *friendly_name,
+                const char *icon_name,
+                const char *key_name,
+                const char *credential_name,
+                usec_t until,
+                AskPasswordFlags ask_password_flags,
+                bool headless,
+                char **ret_used_pin) {
+
+        _cleanup_free_ char *token_uri_string = NULL, *token_uri_escaped = NULL, *id = NULL, *token_label = NULL;
+        _cleanup_(sym_p11_kit_uri_freep) P11KitUri *token_uri = NULL;
+        CK_TOKEN_INFO updated_token_info;
+        int uri_result, r;
+        CK_RV rv;
+
+        assert(m);
+        assert(token_info);
+
+        r = dlopen_p11kit();
+        if (r < 0)
+                return r;
+
+        token_label = pkcs11_token_label(token_info);
+        if (!token_label)
+                return log_oom();
+
+        token_uri = uri_from_token_info(token_info);
+        if (!token_uri)
+                return log_oom();
+
+        /* This formats the URI of the token (not of the slot), hence say so in the message */
+        uri_result = sym_p11_kit_uri_format(token_uri, P11_KIT_URI_FOR_ANY, &token_uri_string);
+        if (uri_result != P11_KIT_URI_OK)
+                return log_warning_errno(SYNTHETIC_ERRNO(EAGAIN), "Failed to format token URI: %s", sym_p11_kit_uri_message(uri_result));
+
+        /* Try without a PIN first: this covers the protected authentication path and tokens that need
+         * no login at all. */
+        r = pkcs11_token_login_by_pin(m, session, token_info, token_label, /* pin= */ NULL, 0);
+        if (r == 0 && ret_used_pin)
+                *ret_used_pin = NULL;
+
+        if (r != -ENOANO) /* pin required */
+                return r;
+
+        token_uri_escaped = cescape(token_uri_string);
+        if (!token_uri_escaped)
+                return log_oom();
+
+        /* Identifier handed to ask_password_auto() */
+        id = strjoin("pkcs11:", token_uri_escaped);
+        if (!id)
+                return log_oom();
+
+        for (unsigned tries = 0; tries < 3; tries++) {
+                _cleanup_strv_free_erase_ char **passwords = NULL;
+                _cleanup_(erase_and_freep) char *envpin = NULL;
+
+                r = getenv_steal_erase("PIN", &envpin);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to acquire PIN from environment: %m");
+                if (r > 0) {
+                        passwords = strv_new(envpin);
+                        if (!passwords)
+                                return log_oom();
+
+                } else if (headless)
+                        return log_error_errno(SYNTHETIC_ERRNO(ENOPKG), "PIN querying disabled via 'headless' option. Use the 'PIN' environment variable.");
+                else {
+                        _cleanup_free_ char *text = NULL;
+
+                        /* Pick the prompt according to what the token tells us about previous failures */
+                        if (FLAGS_SET(token_info->flags, CKF_USER_PIN_FINAL_TRY))
+                                r = asprintf(&text,
+                                             "Please enter correct PIN for security token '%s' in order to unlock %s (final try):",
+                                             token_label, friendly_name);
+                        else if (FLAGS_SET(token_info->flags, CKF_USER_PIN_COUNT_LOW))
+                                r = asprintf(&text,
+                                             "PIN has been entered incorrectly previously, please enter correct PIN for security token '%s' in order to unlock %s:",
+                                             token_label, friendly_name);
+                        else if (tries == 0)
+                                r = asprintf(&text,
+                                             "Please enter PIN for security token '%s' in order to unlock %s:",
+                                             token_label, friendly_name);
+                        else
+                                r = asprintf(&text,
+                                             "Please enter PIN for security token '%s' in order to unlock %s (try #%u):",
+                                             token_label, friendly_name, tries+1);
+                        if (r < 0)
+                                return log_oom();
+
+                        /* We never cache PINs, simply because it's fatal if we use wrong PINs, since usually there are only 3 tries */
+                        r = ask_password_auto(text, icon_name, id, key_name, credential_name, until, ask_password_flags, &passwords);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to query PIN for security token '%s': %m", token_label);
+                }
+
+                STRV_FOREACH(i, passwords) {
+                        r = pkcs11_token_login_by_pin(m, session, token_info, token_label, *i, strlen(*i));
+                        if (r == 0 && ret_used_pin) {
+                                char *c;
+
+                                c = strdup(*i);
+                                if (!c)
+                                        return log_oom();
+
+                                *ret_used_pin = c;
+                        }
+
+                        /* Anything but "incorrect PIN" (success included) ends the procedure */
+                        if (r != -ENOLCK)
+                                return r;
+
+                        /* Refresh the token info, so that we can prompt knowing the new flags if they changed. */
+                        rv = m->C_GetTokenInfo(slotid, &updated_token_info);
+                        if (rv != CKR_OK)
+                                return log_error_errno(SYNTHETIC_ERRNO(EIO),
+                                                       "Failed to acquire updated security token information for slot %lu: %s",
+                                                       slotid, sym_p11_kit_strerror(rv));
+
+                        token_info = &updated_token_info;
+                }
+        }
+
+        return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Too many attempts to log into token '%s'.", token_label);
+}
+
+/* Looks up the single X.509 certificate object on the token that matches 'search_uri'.
+ *
+ * The match attributes contained in the URI are used, but made stricter: unless the URI already
+ * specifies them, CKA_CLASS=CKO_CERTIFICATE and CKA_CERTIFICATE_TYPE=CKC_X_509 are added. If the URI
+ * specifies a different class or certificate type we refuse with -EINVAL.
+ *
+ * Returns 0 and the object handle in '*ret_object' on success, -ENOENT if nothing matches, -ENOTUNIQ
+ * if more than one object matches, and -EIO if any of the C_FindObjects*() calls fails. */
+int pkcs11_token_find_x509_certificate(
+                CK_FUNCTION_LIST *m,
+                CK_SESSION_HANDLE session,
+                P11KitUri *search_uri,
+                CK_OBJECT_HANDLE *ret_object) {
+
+        bool found_class = false, found_certificate_type = false;
+        _cleanup_free_ CK_ATTRIBUTE *attributes_buffer = NULL;
+        CK_ULONG n_attributes, a, n_objects;
+        CK_ATTRIBUTE *attributes = NULL;
+        CK_OBJECT_HANDLE objects[2];
+        CK_RV rv, rv2;
+        int r;
+
+        assert(m);
+        assert(search_uri);
+        assert(ret_object);
+
+        r = dlopen_p11kit();
+        if (r < 0)
+                return r;
+
+        attributes = sym_p11_kit_uri_get_attributes(search_uri, &n_attributes);
+        for (a = 0; a < n_attributes; a++) {
+
+                /* We use the URI's included match attributes, but make them more strict. This allows users
+                 * to specify a token URL instead of an object URL and the right thing should happen if
+                 * there's only one suitable key on the token. */
+
+                switch (attributes[a].type) {
+
+                case CKA_CLASS: {
+                        CK_OBJECT_CLASS c;
+
+                        if (attributes[a].ulValueLen != sizeof(c))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid PKCS#11 CKA_CLASS attribute size.");
+
+                        memcpy(&c, attributes[a].pValue, sizeof(c));
+                        if (c != CKO_CERTIFICATE)
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Selected PKCS#11 object is not an X.509 certificate, refusing.");
+
+                        found_class = true;
+                        break;
+                }
+
+                case CKA_CERTIFICATE_TYPE: {
+                        CK_CERTIFICATE_TYPE t;
+
+                        if (attributes[a].ulValueLen != sizeof(t))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid PKCS#11 CKA_CERTIFICATE_TYPE attribute size.");
+
+                        memcpy(&t, attributes[a].pValue, sizeof(t));
+                        if (t != CKC_X_509)
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Selected PKCS#11 object is not an X.509 certificate, refusing.");
+
+                        found_certificate_type = true;
+                        break;
+                }
+                }
+        }
+
+        if (!found_class || !found_certificate_type) {
+                /* Hmm, let's slightly extend the attribute list we search for */
+
+                attributes_buffer = new(CK_ATTRIBUTE, n_attributes + !found_class + !found_certificate_type);
+                if (!attributes_buffer)
+                        return log_oom();
+
+                memcpy(attributes_buffer, attributes, sizeof(CK_ATTRIBUTE) * n_attributes);
+
+                if (!found_class) {
+                        static const CK_OBJECT_CLASS class = CKO_CERTIFICATE;
+
+                        attributes_buffer[n_attributes++] = (CK_ATTRIBUTE) {
+                                .type = CKA_CLASS,
+                                .pValue = (CK_OBJECT_CLASS*) &class,
+                                .ulValueLen = sizeof(class),
+                        };
+                }
+
+                if (!found_certificate_type) {
+                        static const CK_CERTIFICATE_TYPE type = CKC_X_509;
+
+                        attributes_buffer[n_attributes++] = (CK_ATTRIBUTE) {
+                                .type = CKA_CERTIFICATE_TYPE,
+                                .pValue = (CK_CERTIFICATE_TYPE*) &type,
+                                .ulValueLen = sizeof(type),
+                        };
+                }
+
+                attributes = attributes_buffer;
+        }
+
+        rv = m->C_FindObjectsInit(session, attributes, n_attributes);
+        if (rv != CKR_OK)
+                return log_error_errno(SYNTHETIC_ERRNO(EIO),
+                                       "Failed to initialize object find call: %s", sym_p11_kit_strerror(rv));
+
+        /* Ask for up to two objects, so that we can tell a unique match from an ambiguous one. The
+         * search is always finalized, even if C_FindObjects() failed. */
+        rv = m->C_FindObjects(session, objects, ELEMENTSOF(objects), &n_objects);
+        rv2 = m->C_FindObjectsFinal(session);
+        if (rv != CKR_OK)
+                return log_error_errno(SYNTHETIC_ERRNO(EIO),
+                                       "Failed to find objects: %s", sym_p11_kit_strerror(rv));
+        if (rv2 != CKR_OK) /* Report the result of C_FindObjectsFinal() here, rv is CKR_OK at this point */
+                return log_error_errno(SYNTHETIC_ERRNO(EIO),
+                                       "Failed to finalize object find call: %s", sym_p11_kit_strerror(rv2));
+        if (n_objects == 0)
+                return log_error_errno(SYNTHETIC_ERRNO(ENOENT),
+                                       "Failed to find selected X509 certificate on token.");
+        if (n_objects > 1)
+                return log_error_errno(SYNTHETIC_ERRNO(ENOTUNIQ),
+                                       "Configured URI matches multiple certificates, refusing.");
+
+        *ret_object = objects[0];
+        return 0;
+}
+
+#if HAVE_OPENSSL
+/* Reads the CKA_VALUE attribute of 'object' off the token and parses it as X.509 certificate.
+ *
+ * C_GetAttributeValue() is called twice: first without a buffer to learn the size of the value, then
+ * with a buffer of that size to fetch it. The certificate's subject name is logged at debug level.
+ *
+ * Returns 0 and the parsed certificate in '*ret_cert' (owned by the caller) on success, -EIO if the
+ * token cannot be read or the subject name cannot be formatted, and -EBADMSG if the data cannot be
+ * parsed or carries no subject name. */
+int pkcs11_token_read_x509_certificate(
+                CK_FUNCTION_LIST *m,
+                CK_SESSION_HANDLE session,
+                CK_OBJECT_HANDLE object,
+                X509 **ret_cert) {
+
+        _cleanup_free_ void *buffer = NULL;
+        _cleanup_free_ char *t = NULL;
+        CK_ATTRIBUTE attribute = {
+                .type = CKA_VALUE
+        };
+        CK_RV rv;
+        _cleanup_(X509_freep) X509 *x509 = NULL;
+        X509_NAME *name = NULL;
+        const unsigned char *p;
+        int r;
+
+        assert(m);
+        assert(ret_cert);
+
+        r = dlopen_p11kit();
+        if (r < 0)
+                return r;
+
+        /* With pValue unset this call only fills in ulValueLen */
+        rv = m->C_GetAttributeValue(session, object, &attribute, 1);
+        if (rv != CKR_OK)
+                return log_error_errno(SYNTHETIC_ERRNO(EIO),
+                                       "Failed to read X.509 certificate size off token: %s", sym_p11_kit_strerror(rv));
+
+        buffer = malloc(attribute.ulValueLen);
+        if (!buffer)
+                return log_oom();
+
+        attribute.pValue = buffer;
+
+        rv = m->C_GetAttributeValue(session, object, &attribute, 1);
+        if (rv != CKR_OK)
+                return log_error_errno(SYNTHETIC_ERRNO(EIO),
+                                       "Failed to read X.509 certificate data off token: %s", sym_p11_kit_strerror(rv));
+
+        /* d2i_X509() advances the pointer it is passed, hence hand it a copy */
+        p = attribute.pValue;
+        x509 = d2i_X509(NULL, &p, attribute.ulValueLen);
+        if (!x509)
+                return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Failed to parse X.509 certificate.");
+
+        name = X509_get_subject_name(x509);
+        if (!name)
+                return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Failed to acquire X.509 subject name.");
+
+        t = X509_NAME_oneline(name, NULL, 0);
+        if (!t)
+                return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to format X.509 subject name as string.");
+
+        log_debug("Using X.509 certificate issued for '%s'.", t);
+
+        *ret_cert = TAKE_PTR(x509);
+        return 0;
+}
+#endif
+
+/* Looks up the single RSA private key object suitable for decryption that matches 'search_uri'.
+ *
+ * The match attributes contained in the URI are used, but made stricter: unless the URI already
+ * specifies them, CKA_DECRYPT=true, CKA_CLASS=CKO_PRIVATE_KEY and CKA_KEY_TYPE=CKK_RSA are added. If
+ * the URI specifies conflicting values for any of these we refuse with -EINVAL.
+ *
+ * Returns 0 and the object handle in '*ret_object' on success, -ENOENT if nothing matches, -ENOTUNIQ
+ * if more than one object matches, and -EIO if any of the C_FindObjects*() calls fails. */
+int pkcs11_token_find_private_key(
+                CK_FUNCTION_LIST *m,
+                CK_SESSION_HANDLE session,
+                P11KitUri *search_uri,
+                CK_OBJECT_HANDLE *ret_object) {
+
+        bool found_decrypt = false, found_class = false, found_key_type = false;
+        _cleanup_free_ CK_ATTRIBUTE *attributes_buffer = NULL;
+        CK_ULONG n_attributes, a, n_objects;
+        CK_ATTRIBUTE *attributes = NULL;
+        CK_OBJECT_HANDLE objects[2];
+        CK_RV rv, rv2;
+        int r;
+
+        assert(m);
+        assert(search_uri);
+        assert(ret_object);
+
+        r = dlopen_p11kit();
+        if (r < 0)
+                return r;
+
+        attributes = sym_p11_kit_uri_get_attributes(search_uri, &n_attributes);
+        for (a = 0; a < n_attributes; a++) {
+
+                /* We use the URI's included match attributes, but make them more strict. This allows users
+                 * to specify a token URL instead of an object URL and the right thing should happen if
+                 * there's only one suitable key on the token. */
+
+                switch (attributes[a].type) {
+
+                case CKA_CLASS: {
+                        CK_OBJECT_CLASS c;
+
+                        if (attributes[a].ulValueLen != sizeof(c))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid PKCS#11 CKA_CLASS attribute size.");
+
+                        memcpy(&c, attributes[a].pValue, sizeof(c));
+                        if (c != CKO_PRIVATE_KEY)
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Selected PKCS#11 object is not a private key, refusing.");
+
+                        found_class = true;
+                        break;
+                }
+
+                case CKA_DECRYPT: {
+                        CK_BBOOL b;
+
+                        if (attributes[a].ulValueLen != sizeof(b))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid PKCS#11 CKA_DECRYPT attribute size.");
+
+                        memcpy(&b, attributes[a].pValue, sizeof(b));
+                        if (!b)
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                       "Selected PKCS#11 object is not suitable for decryption, refusing.");
+
+                        found_decrypt = true;
+                        break;
+                }
+
+                case CKA_KEY_TYPE: {
+                        CK_KEY_TYPE t;
+
+                        if (attributes[a].ulValueLen != sizeof(t))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid PKCS#11 CKA_KEY_TYPE attribute size.");
+
+                        memcpy(&t, attributes[a].pValue, sizeof(t));
+                        if (t != CKK_RSA)
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Selected PKCS#11 object is not an RSA key, refusing.");
+
+                        found_key_type = true;
+                        break;
+                }
+                }
+        }
+
+        if (!found_decrypt || !found_class || !found_key_type) {
+                /* Hmm, let's slightly extend the attribute list we search for */
+
+                attributes_buffer = new(CK_ATTRIBUTE, n_attributes + !found_decrypt + !found_class + !found_key_type);
+                if (!attributes_buffer)
+                        return log_oom();
+
+                memcpy(attributes_buffer, attributes, sizeof(CK_ATTRIBUTE) * n_attributes);
+
+                if (!found_decrypt) {
+                        static const CK_BBOOL yes = true;
+
+                        attributes_buffer[n_attributes++] = (CK_ATTRIBUTE) {
+                                .type = CKA_DECRYPT,
+                                .pValue = (CK_BBOOL*) &yes,
+                                .ulValueLen = sizeof(yes),
+                        };
+                }
+
+                if (!found_class) {
+                        static const CK_OBJECT_CLASS class = CKO_PRIVATE_KEY;
+
+                        attributes_buffer[n_attributes++] = (CK_ATTRIBUTE) {
+                                .type = CKA_CLASS,
+                                .pValue = (CK_OBJECT_CLASS*) &class,
+                                .ulValueLen = sizeof(class),
+                        };
+                }
+
+                if (!found_key_type) {
+                        static const CK_KEY_TYPE type = CKK_RSA;
+
+                        attributes_buffer[n_attributes++] = (CK_ATTRIBUTE) {
+                                .type = CKA_KEY_TYPE,
+                                .pValue = (CK_KEY_TYPE*) &type,
+                                .ulValueLen = sizeof(type),
+                        };
+                }
+
+                attributes = attributes_buffer;
+        }
+
+        rv = m->C_FindObjectsInit(session, attributes, n_attributes);
+        if (rv != CKR_OK)
+                return log_error_errno(SYNTHETIC_ERRNO(EIO),
+                                       "Failed to initialize object find call: %s", sym_p11_kit_strerror(rv));
+
+        /* Ask for up to two objects, so that we can tell a unique match from an ambiguous one. The
+         * search is always finalized, even if C_FindObjects() failed. */
+        rv = m->C_FindObjects(session, objects, ELEMENTSOF(objects), &n_objects);
+        rv2 = m->C_FindObjectsFinal(session);
+        if (rv != CKR_OK)
+                return log_error_errno(SYNTHETIC_ERRNO(EIO),
+                                       "Failed to find objects: %s", sym_p11_kit_strerror(rv));
+        if (rv2 != CKR_OK) /* Report the result of C_FindObjectsFinal() here, rv is CKR_OK at this point */
+                return log_error_errno(SYNTHETIC_ERRNO(EIO),
+                                       "Failed to finalize object find call: %s", sym_p11_kit_strerror(rv2));
+        if (n_objects == 0)
+                return log_error_errno(SYNTHETIC_ERRNO(ENOENT),
+                                       "Failed to find selected private key suitable for decryption on token.");
+        if (n_objects > 1)
+                return log_error_errno(SYNTHETIC_ERRNO(ENOTUNIQ),
+                                       "Configured private key URI matches multiple keys, refusing.");
+
+        *ret_object = objects[0];
+        return 0;
+}
+
+/* Decrypts 'encrypted_data' on the token with the key 'object', using the CKM_RSA_PKCS mechanism.
+ *
+ * The output buffer initially has the size of the input. If the token reports CKR_BUFFER_TOO_SMALL
+ * the buffer is erased, reallocated with the size the token asked for, and the call is repeated once.
+ *
+ * Returns 0 on success, with the newly allocated plaintext in '*ret_decrypted_data' and its size in
+ * '*ret_decrypted_data_size'; the caller owns the buffer. Returns -EIO if the token refuses to
+ * initialize or carry out the decryption. */
+int pkcs11_token_decrypt_data(
+                CK_FUNCTION_LIST *m,
+                CK_SESSION_HANDLE session,
+                CK_OBJECT_HANDLE object,
+                const void *encrypted_data,
+                size_t encrypted_data_size,
+                void **ret_decrypted_data,
+                size_t *ret_decrypted_data_size) {
+
+        static const CK_MECHANISM mechanism = {
+                .mechanism = CKM_RSA_PKCS
+        };
+        _cleanup_(erase_and_freep) CK_BYTE *plain = NULL;
+        CK_ULONG plain_size = 0;
+        CK_RV rv;
+        int r;
+
+        assert(m);
+        assert(encrypted_data);
+        assert(encrypted_data_size > 0);
+        assert(ret_decrypted_data);
+        assert(ret_decrypted_data_size);
+
+        r = dlopen_p11kit();
+        if (r < 0)
+                return r;
+
+        rv = m->C_DecryptInit(session, (CK_MECHANISM*) &mechanism, object);
+        if (rv != CKR_OK)
+                return log_error_errno(SYNTHETIC_ERRNO(EIO),
+                                       "Failed to initialize decryption on security token: %s", sym_p11_kit_strerror(rv));
+
+        /* Initial guess for the output size: as large as the input */
+        plain_size = encrypted_data_size;
+        plain = malloc(plain_size);
+        if (!plain)
+                return log_oom();
+
+        rv = m->C_Decrypt(session, (CK_BYTE*) encrypted_data, encrypted_data_size, plain, &plain_size);
+        if (rv == CKR_BUFFER_TOO_SMALL) {
+                /* The token updated plain_size; retry once with a buffer of that size */
+                erase_and_free(plain);
+
+                plain = malloc(plain_size);
+                if (!plain)
+                        return log_oom();
+
+                rv = m->C_Decrypt(session, (CK_BYTE*) encrypted_data, encrypted_data_size, plain, &plain_size);
+        }
+        if (rv != CKR_OK)
+                return log_error_errno(SYNTHETIC_ERRNO(EIO),
+                                       "Failed to decrypt key on security token: %s", sym_p11_kit_strerror(rv));
+
+        log_info("Successfully decrypted key with security token.");
+
+        *ret_decrypted_data = TAKE_PTR(plain);
+        *ret_decrypted_data_size = plain_size;
+        return 0;
+}
+
+/* Reads random_pool_size() bytes from the token's random number generator and passes them to
+ * random_write_entropy(), without crediting entropy.
+ *
+ * Returns 0 on success, -EOPNOTSUPP if the token does not deliver random data, and otherwise the
+ * error of the failing step. Failures other than OOM are only logged at debug level. */
+int pkcs11_token_acquire_rng(
+                CK_FUNCTION_LIST *m,
+                CK_SESSION_HANDLE session) {
+
+        _cleanup_free_ void *pool_data = NULL;
+        size_t n_bytes;
+        CK_RV rv;
+        int r;
+
+        assert(m);
+
+        r = dlopen_p11kit();
+        if (r < 0)
+                return r;
+
+        /* Since we are talking to the device anyway, fetch some data from its RNG and mix it into the
+         * kernel's random pool; this should be cheap. We do not credit entropy for it, as we cannot
+         * judge the quality of the token's RNG. It is still worth doing: whether private keys should
+         * be generated on the token or on the host is debatable, and with the token's data mixed into
+         * the host's pool any key derived from the pool is at least as good as either source alone. */
+
+        n_bytes = random_pool_size();
+
+        pool_data = malloc(n_bytes);
+        if (!pool_data)
+                return log_oom();
+
+        rv = m->C_GenerateRandom(session, pool_data, n_bytes);
+        if (rv != CKR_OK)
+                return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                       "Failed to generate RNG data on security token: %s", sym_p11_kit_strerror(rv));
+
+        r = random_write_entropy(-1, pool_data, n_bytes, false);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to write PKCS#11 acquired random data to /dev/urandom: %m");
+
+        log_debug("Successfully written %zu bytes random data acquired via PKCS#11 to kernel random pool.", n_bytes);
+
+        return 0;
+}
+
+/* Opens a session (CKF_SERIAL_SESSION) on the token in slot 'slotid', invokes 'callback' on it and
+ * closes the session again.
+ *
+ * Returns what the callback returned, or 1 if no callback was passed. Returns -EIO if the session
+ * cannot be opened. A failure to close the session is logged but otherwise ignored. */
+static int token_process(
+                CK_FUNCTION_LIST *m,
+                CK_SLOT_ID slotid,
+                const CK_SLOT_INFO *slot_info,
+                const CK_TOKEN_INFO *token_info,
+                P11KitUri *search_uri,
+                pkcs11_find_token_callback_t callback,
+                void *userdata) {
+
+        _cleanup_free_ char *token_label = NULL;
+        CK_SESSION_HANDLE session;
+        CK_RV rv;
+        int r;
+
+        assert(m);
+        assert(slot_info);
+        assert(token_info);
+
+        token_label = pkcs11_token_label(token_info);
+        if (!token_label)
+                return log_oom();
+
+        rv = m->C_OpenSession(slotid, CKF_SERIAL_SESSION, NULL, NULL, &session);
+        if (rv != CKR_OK)
+                return log_error_errno(SYNTHETIC_ERRNO(EIO),
+                                       "Failed to create session for security token '%s': %s", token_label, sym_p11_kit_strerror(rv));
+
+        /* Without a callback simply report that we found what we were looking for */
+        r = callback ? callback(m, session, slotid, slot_info, token_info, search_uri, userdata) : 1;
+
+        rv = m->C_CloseSession(session);
+        if (rv != CKR_OK)
+                log_warning("Failed to close session on PKCS#11 token, ignoring: %s", sym_p11_kit_strerror(rv));
+
+        return r;
+}
+
+/* Inspects the slot 'slotid' of module 'm' and, if it holds a token matching 'search_uri' (or if no
+ * search URI was specified), hands the token to token_process().
+ *
+ * Returns -EAGAIN if the slot cannot be queried, holds no token, or holds a token that does not
+ * match, so that the caller can move on to the next slot. Otherwise returns what token_process()
+ * returned, or a negative errno on OOM or if p11-kit cannot be loaded. */
+static int slot_process(
+                CK_FUNCTION_LIST *m,
+                CK_SLOT_ID slotid,
+                P11KitUri *search_uri,
+                pkcs11_find_token_callback_t callback,
+                void *userdata) {
+
+        _cleanup_(sym_p11_kit_uri_freep) P11KitUri* slot_uri = NULL, *token_uri = NULL;
+        _cleanup_free_ char *token_uri_string = NULL;
+        CK_TOKEN_INFO token_info;
+        CK_SLOT_INFO slot_info;
+        int uri_result, r;
+        CK_RV rv;
+
+        assert(m);
+
+        r = dlopen_p11kit();
+        if (r < 0)
+                return r;
+
+        /* We return -EAGAIN for all failures we can attribute to a specific slot in some way, so that the
+         * caller might try other slots before giving up. */
+
+        rv = m->C_GetSlotInfo(slotid, &slot_info);
+        if (rv != CKR_OK) {
+                log_warning("Failed to acquire slot info for slot %lu, ignoring slot: %s", slotid, sym_p11_kit_strerror(rv));
+                return -EAGAIN;
+        }
+
+        slot_uri = uri_from_slot_info(&slot_info);
+        if (!slot_uri)
+                return log_oom();
+
+        /* The slot URI string is only needed for the debug message, hence don't bother otherwise */
+        if (DEBUG_LOGGING) {
+                _cleanup_free_ char *slot_uri_string = NULL;
+
+                uri_result = sym_p11_kit_uri_format(slot_uri, P11_KIT_URI_FOR_ANY, &slot_uri_string);
+                if (uri_result != P11_KIT_URI_OK) {
+                        log_warning("Failed to format slot URI, ignoring slot: %s", sym_p11_kit_uri_message(uri_result));
+                        return -EAGAIN;
+                }
+
+                log_debug("Found slot with URI %s", slot_uri_string);
+        }
+
+        rv = m->C_GetTokenInfo(slotid, &token_info);
+        if (rv == CKR_TOKEN_NOT_PRESENT) {
+                return log_debug_errno(SYNTHETIC_ERRNO(EAGAIN),
+                                       "Token not present in slot, ignoring.");
+        } else if (rv != CKR_OK) {
+                log_warning("Failed to acquire token info for slot %lu, ignoring slot: %s", slotid, sym_p11_kit_strerror(rv));
+                return -EAGAIN;
+        }
+
+        token_uri = uri_from_token_info(&token_info);
+        if (!token_uri)
+                return log_oom();
+
+        /* This formats the URI of the token (not of the slot), hence say so in the message */
+        uri_result = sym_p11_kit_uri_format(token_uri, P11_KIT_URI_FOR_ANY, &token_uri_string);
+        if (uri_result != P11_KIT_URI_OK) {
+                log_warning("Failed to format token URI: %s", sym_p11_kit_uri_message(uri_result));
+                return -EAGAIN;
+        }
+
+        if (search_uri && !sym_p11_kit_uri_match_token_info(search_uri, &token_info))
+                return log_debug_errno(SYNTHETIC_ERRNO(EAGAIN),
+                                       "Found non-matching token with URI %s.",
+                                       token_uri_string);
+
+        log_debug("Found matching token with URI %s.", token_uri_string);
+
+        return token_process(
+                        m,
+                        slotid,
+                        &slot_info,
+                        &token_info,
+                        search_uri,
+                        callback,
+                        userdata);
+}
+
+/* Walks through all slots of PKCS#11 module 'm' and passes each to slot_process(), stopping at the
+ * first slot for which that returns something other than -EAGAIN.
+ *
+ * Returns -EAGAIN if the module cannot be queried, has no slots, or none of its slots yielded a
+ * result, so that the caller can move on to the next module. Otherwise returns what slot_process()
+ * returned, or a negative errno on OOM or if p11-kit cannot be loaded. */
+static int module_process(
+                CK_FUNCTION_LIST *m,
+                P11KitUri *search_uri,
+                pkcs11_find_token_callback_t callback,
+                void *userdata) {
+
+        _cleanup_(sym_p11_kit_uri_freep) P11KitUri* module_uri = NULL;
+        _cleanup_free_ char *name = NULL, *module_uri_string = NULL;
+        _cleanup_free_ CK_SLOT_ID *slotids = NULL;
+        CK_ULONG n_slotids = 0;
+        int uri_result, r;
+        CK_INFO info;
+        CK_RV rv;
+
+        assert(m);
+
+        r = dlopen_p11kit();
+        if (r < 0)
+                return r;
+
+        /* Most errors of a module are ignored here, so that a single faulty module does not keep us
+         * from trying the remaining ones. Such per-module failures are reported as -EAGAIN, which lets
+         * the caller continue with the next module. */
+
+        name = sym_p11_kit_module_get_name(m);
+        if (!name)
+                return log_oom();
+
+        log_debug("Trying PKCS#11 module %s.", name);
+
+        rv = m->C_GetInfo(&info);
+        if (rv != CKR_OK) {
+                log_warning("Failed to get info on PKCS#11 module, ignoring module: %s", sym_p11_kit_strerror(rv));
+                return -EAGAIN;
+        }
+
+        module_uri = uri_from_module_info(&info);
+        if (!module_uri)
+                return log_oom();
+
+        uri_result = sym_p11_kit_uri_format(module_uri, P11_KIT_URI_FOR_ANY, &module_uri_string);
+        if (uri_result != P11_KIT_URI_OK) {
+                log_warning("Failed to format module URI, ignoring module: %s", sym_p11_kit_uri_message(uri_result));
+                return -EAGAIN;
+        }
+
+        log_debug("Found module with URI %s", module_uri_string);
+
+        rv = pkcs11_get_slot_list_malloc(m, &slotids, &n_slotids);
+        if (rv != CKR_OK) {
+                log_warning("Failed to get slot list, ignoring module: %s", sym_p11_kit_strerror(rv));
+                return -EAGAIN;
+        }
+        if (n_slotids == 0)
+                return log_debug_errno(SYNTHETIC_ERRNO(EAGAIN),
+                                       "This module has no slots? Ignoring module.");
+
+        /* Keep going for as long as the slots tell us to look elsewhere */
+        r = -EAGAIN;
+        for (size_t i = 0; i < n_slotids && r == -EAGAIN; i++)
+                r = slot_process(
+                                m,
+                                slotids[i],
+                                search_uri,
+                                callback,
+                                userdata);
+
+        return r;
+}
+
+/* Loads all PKCS#11 modules and feeds them to module_process() one after the other, which ends up
+ * invoking the callback for the tokens found. If pkcs11_uri is NULL no search URI is set up.
+ *
+ * Returns the first module result that is not -EAGAIN, or -EAGAIN if nothing was found. All errors are
+ * logged here except -EAGAIN, which the caller is expected to log about. */
+int pkcs11_find_token(
+                const char *pkcs11_uri,
+                pkcs11_find_token_callback_t callback,
+                void *userdata) {
+
+        _cleanup_(sym_p11_kit_modules_finalize_and_releasep) CK_FUNCTION_LIST **modules = NULL;
+        _cleanup_(sym_p11_kit_uri_freep) P11KitUri *search_uri = NULL;
+        int r;
+
+        r = dlopen_p11kit();
+        if (r < 0)
+                return r;
+
+        if (pkcs11_uri) {
+                r = uri_from_string(pkcs11_uri, &search_uri);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to parse PKCS#11 URI '%s': %m", pkcs11_uri);
+        }
+
+        modules = sym_p11_kit_modules_load_and_initialize(0);
+        if (!modules)
+                return log_error_errno(SYNTHETIC_ERRNO(EIO), "Failed to initialize pkcs11 modules");
+
+        /* The module list is NULL-terminated */
+        for (CK_FUNCTION_LIST **i = modules; *i; i++) {
+                r = module_process(*i, search_uri, callback, userdata);
+                if (r == -EAGAIN)
+                        continue;
+
+                return r;
+        }
+
+        return -EAGAIN;
+}
+
+#if HAVE_OPENSSL
+/* State shared between pkcs11_acquire_certificate() and pkcs11_acquire_certificate_callback(). */
+struct pkcs11_acquire_certificate_callback_data {
+ /* Output: PIN used for the login; erased and freed by the release function */
+ char *pin_used;
+ /* Output: certificate read from the token; released with X509_free() */
+ X509 *cert;
+ /* Input: passed through to pkcs11_token_login() */
+ const char *askpw_friendly_name, *askpw_icon_name;
+ AskPasswordFlags askpw_flags;
+ bool headless;
+};
+
+/* Cleanup handler for the callback data: drops the certificate and wipes + frees the stored PIN. */
+static void pkcs11_acquire_certificate_callback_data_release(struct pkcs11_acquire_certificate_callback_data *data) {
+        X509_free(data->cert);
+        erase_and_free(data->pin_used);
+}
+
+/* pkcs11_find_token() callback, invoked for every token matching the search URI: logs into the token,
+ * looks up the X.509 certificate selected by the URI and stores it, together with the PIN used, in the
+ * callback data. Returns 1 on success, negative errno-style value on failure. */
+static int pkcs11_acquire_certificate_callback(
+                CK_FUNCTION_LIST *m,
+                CK_SESSION_HANDLE session,
+                CK_SLOT_ID slot_id,
+                const CK_SLOT_INFO *slot_info,
+                const CK_TOKEN_INFO *token_info,
+                P11KitUri *uri,
+                void *userdata) {
+
+        struct pkcs11_acquire_certificate_callback_data *data = ASSERT_PTR(userdata);
+        _cleanup_(erase_and_freep) char *pin = NULL;
+        CK_OBJECT_HANDLE cert_object;
+        int r;
+
+        assert(m);
+        assert(slot_info);
+        assert(token_info);
+        assert(uri);
+
+        r = pkcs11_token_login(
+                        m,
+                        session,
+                        slot_id,
+                        token_info,
+                        data->askpw_friendly_name,
+                        data->askpw_icon_name,
+                        "pkcs11-pin",
+                        "pkcs11-pin",
+                        UINT64_MAX,
+                        data->askpw_flags,
+                        data->headless,
+                        &pin);
+        if (r < 0)
+                return r;
+
+        r = pkcs11_token_find_x509_certificate(m, session, uri, &cert_object);
+        if (r < 0)
+                return r;
+
+        r = pkcs11_token_read_x509_certificate(m, session, cert_object, &data->cert);
+        if (r < 0)
+                return r;
+
+        /* Pull some random data from the token while we are talking to it anyway, so that the token's
+         * pool contributes to the randomness used for the key generated afterwards. This is best
+         * effort, failures are ignored. */
+        (void) pkcs11_token_acquire_rng(m, session);
+
+        /* Hand the PIN over only after everything else succeeded */
+        data->pin_used = TAKE_PTR(pin);
+        return 1;
+}
+
+/* Locates the token identified by the PKCS#11 URI and reads the X.509 certificate from it.
+ *
+ * On success returns 0, stores the certificate in *ret_cert and, if ret_pin_used is non-NULL, the PIN
+ * that was used in *ret_pin_used; the caller takes ownership of both. Returns -ENXIO if no matching
+ * token was found, or another negative errno-style value on failure. */
+int pkcs11_acquire_certificate(
+                const char *uri,
+                const char *askpw_friendly_name,
+                const char *askpw_icon_name,
+                X509 **ret_cert,
+                char **ret_pin_used) {
+
+        _cleanup_(pkcs11_acquire_certificate_callback_data_release) struct pkcs11_acquire_certificate_callback_data ctx = {
+                .askpw_friendly_name = askpw_friendly_name,
+                .askpw_icon_name = askpw_icon_name,
+        };
+        int r;
+
+        assert(uri);
+        assert(ret_cert);
+
+        r = pkcs11_find_token(uri, pkcs11_acquire_certificate_callback, &ctx);
+        if (r == -EAGAIN)
+                /* The one error pkcs11_find_token() does not log about itself */
+                return log_error_errno(SYNTHETIC_ERRNO(ENXIO),
+                                       "Specified PKCS#11 token with URI '%s' not found.",
+                                       uri);
+        if (r < 0)
+                return r;
+
+        if (ret_pin_used)
+                *ret_pin_used = TAKE_PTR(ctx.pin_used);
+
+        *ret_cert = TAKE_PTR(ctx.cert);
+
+        return 0;
+}
+#endif
+
+/* pkcs11_find_token() callback used by pkcs11_list_tokens(): appends one row (URI, label,
+ * manufacturer, model) to the Table passed as userdata for each hardware slot with a token present.
+ * Returns -EAGAIN both when a slot is skipped and after a row was added, so that enumeration goes on. */
+static int list_callback(
+                CK_FUNCTION_LIST *m,
+                CK_SESSION_HANDLE session,
+                CK_SLOT_ID slot_id,
+                const CK_SLOT_INFO *slot_info,
+                const CK_TOKEN_INFO *token_info,
+                P11KitUri *uri,
+                void *userdata) {
+
+        _cleanup_(sym_p11_kit_uri_freep) P11KitUri *token_uri = NULL;
+        _cleanup_free_ char *uri_string = NULL;
+        _cleanup_free_ char *label = NULL;
+        _cleanup_free_ char *manufacturer_id = NULL;
+        _cleanup_free_ char *model = NULL;
+        Table *table = userdata;
+        int uri_result, r;
+
+        assert(slot_info);
+        assert(token_info);
+
+        r = dlopen_p11kit();
+        if (r < 0)
+                return r;
+
+        /* During enumeration only hardware slots that currently hold a token are of interest. Software
+         * tokens are typically things like the system certificate store, which is usually not what
+         * people want to bind their home directories to. (They can still be specified explicitly.) */
+        if (!FLAGS_SET(slot_info->flags, CKF_HW_SLOT|CKF_TOKEN_PRESENT))
+                return -EAGAIN;
+
+        label = pkcs11_token_label(token_info);
+        if (!label)
+                return log_oom();
+
+        manufacturer_id = pkcs11_token_manufacturer_id(token_info);
+        if (!manufacturer_id)
+                return log_oom();
+
+        model = pkcs11_token_model(token_info);
+        if (!model)
+                return log_oom();
+
+        token_uri = uri_from_token_info(token_info);
+        if (!token_uri)
+                return log_oom();
+
+        uri_result = sym_p11_kit_uri_format(token_uri, P11_KIT_URI_FOR_ANY, &uri_string);
+        if (uri_result != P11_KIT_URI_OK)
+                return log_warning_errno(SYNTHETIC_ERRNO(EAGAIN), "Failed to format slot URI: %s", sym_p11_kit_uri_message(uri_result));
+
+        r = table_add_many(
+                        table,
+                        TABLE_STRING, uri_string,
+                        TABLE_STRING, label,
+                        TABLE_STRING, manufacturer_id,
+                        TABLE_STRING, model);
+        if (r < 0)
+                return table_log_add_error(r);
+
+        return -EAGAIN; /* not an error: tells the enumeration to continue with the next token */
+}
+#endif
+
+/* Prints a table of the PKCS#11 tokens collected by list_callback() to stdout, or an informational
+ * message if there are none. Returns 0 on success, negative errno-style value on failure
+ * (-EOPNOTSUPP if built without p11-kit). */
+int pkcs11_list_tokens(void) {
+#if HAVE_P11KIT
+        _cleanup_(table_unrefp) Table *table = NULL;
+        int r;
+
+        table = table_new("uri", "label", "manufacturer", "model");
+        if (!table)
+                return log_oom();
+
+        /* -EAGAIN is the regular outcome here, since list_callback() never stops the enumeration */
+        r = pkcs11_find_token(NULL, list_callback, table);
+        if (r != -EAGAIN && r < 0)
+                return r;
+
+        if (table_get_rows(table) > 1) {
+                r = table_print(table, stdout);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to show device table: %m");
+
+                return 0;
+        }
+
+        log_info("No suitable PKCS#11 tokens found.");
+        return 0;
+#else
+        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                               "PKCS#11 tokens not supported on this build.");
+#endif
+}
+
+#if HAVE_P11KIT
+/* pkcs11_find_token() callback used by pkcs11_find_token_auto(): stores the formatted URI of a
+ * suitable token (hardware slot with a token present) in the 'char*' that userdata points to.
+ *
+ * Returns 0 once a URI has been stored, -EAGAIN if this slot is skipped or the URI cannot be
+ * formatted, -ENOTUNIQ if a URI was already stored, or another negative errno-style value on
+ * failure. */
+static int auto_callback(
+                CK_FUNCTION_LIST *m,
+                CK_SESSION_HANDLE session,
+                CK_SLOT_ID slot_id,
+                const CK_SLOT_INFO *slot_info,
+                const CK_TOKEN_INFO *token_info,
+                P11KitUri *uri,
+                void *userdata) {
+
+        _cleanup_(sym_p11_kit_uri_freep) P11KitUri *token_uri = NULL;
+        char **t = userdata;
+        int uri_result, r;
+
+        assert(slot_info);
+        assert(token_info);
+
+        r = dlopen_p11kit();
+        if (r < 0)
+                return r;
+
+        /* CKF_HW_SLOT and CKF_TOKEN_PRESENT are slot flags (CK_SLOT_INFO.flags), hence they must be
+         * tested on slot_info, exactly as list_callback() does. The same bit values carry unrelated
+         * meanings in CK_TOKEN_INFO.flags, so testing them there filters on the wrong properties. */
+        if (!FLAGS_SET(slot_info->flags, CKF_HW_SLOT|CKF_TOKEN_PRESENT))
+                return -EAGAIN;
+
+        /* NOTE(review): this callback returns 0 on success, and the enumeration loops in this file
+         * stop on the first result that is not -EAGAIN. So this check apparently only triggers if *t
+         * is already set when the enumeration starts; confirm whether a second suitable token is
+         * supposed to be detected. */
+        if (*t)
+                return log_error_errno(SYNTHETIC_ERRNO(ENOTUNIQ),
+                                       "More than one suitable PKCS#11 token found.");
+
+        token_uri = uri_from_token_info(token_info);
+        if (!token_uri)
+                return log_oom();
+
+        uri_result = sym_p11_kit_uri_format(token_uri, P11_KIT_URI_FOR_ANY, t);
+        if (uri_result != P11_KIT_URI_OK)
+                return log_warning_errno(SYNTHETIC_ERRNO(EAGAIN), "Failed to format slot URI: %s", sym_p11_kit_uri_message(uri_result));
+
+        return 0;
+}
+#endif
+
+/* Picks a PKCS#11 token automatically via auto_callback(), which stores the token's URI in *ret.
+ * Returns 0 on success, -ENODEV if no suitable token was found, -EOPNOTSUPP if built without p11-kit,
+ * or another negative errno-style value on failure. */
+int pkcs11_find_token_auto(char **ret) {
+#if HAVE_P11KIT
+        int r = pkcs11_find_token(NULL, auto_callback, ret);
+
+        if (r >= 0)
+                return 0;
+        if (r != -EAGAIN)
+                return r;
+
+        return log_error_errno(SYNTHETIC_ERRNO(ENODEV), "No suitable PKCS#11 tokens found.");
+#else
+        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                               "PKCS#11 tokens not supported on this build.");
+#endif
+}
+
+#if HAVE_P11KIT
+/* Releases what the callback data owns: the encrypted key only if free_encrypted_key says it is owned
+ * by the structure, and the decrypted key always, wiping it before it is freed. */
+void pkcs11_crypt_device_callback_data_release(pkcs11_crypt_device_callback_data *data) {
+        if (data->free_encrypted_key)
+                free(data->encrypted_key);
+
+        erase_and_free(data->decrypted_key);
+}
+
+/* pkcs11_find_token() callback, invoked for every token matching the search URI: logs into the token,
+ * looks up the private key selected by the URI and uses it to decrypt data->encrypted_key into
+ * data->decrypted_key. Returns 0 on success, negative errno-style value on failure. */
+int pkcs11_crypt_device_callback(
+                CK_FUNCTION_LIST *m,
+                CK_SESSION_HANDLE session,
+                CK_SLOT_ID slot_id,
+                const CK_SLOT_INFO *slot_info,
+                const CK_TOKEN_INFO *token_info,
+                P11KitUri *uri,
+                void *userdata) {
+
+        pkcs11_crypt_device_callback_data *data = ASSERT_PTR(userdata);
+        CK_OBJECT_HANDLE key_object;
+        int r;
+
+        assert(m);
+        assert(slot_info);
+        assert(token_info);
+        assert(uri);
+
+        /* The PIN used is of no interest here, hence NULL for the last parameter */
+        r = pkcs11_token_login(
+                        m,
+                        session,
+                        slot_id,
+                        token_info,
+                        data->friendly_name,
+                        "drive-harddisk",
+                        "pkcs11-pin",
+                        "cryptsetup.pkcs11-pin",
+                        data->until,
+                        data->askpw_flags,
+                        data->headless,
+                        NULL);
+        if (r < 0)
+                return r;
+
+        /* This likely runs during early boot when entropy is scarce, so mix in some random data from
+         * the token if it supports that. We are talking to the token anyway, so this should be cheap.
+         * Best effort, failures are ignored. */
+        (void) pkcs11_token_acquire_rng(m, session);
+
+        r = pkcs11_token_find_private_key(m, session, uri, &key_object);
+        if (r < 0)
+                return r;
+
+        r = pkcs11_token_decrypt_data(
+                        m,
+                        session,
+                        key_object,
+                        data->encrypted_key,
+                        data->encrypted_key_size,
+                        &data->decrypted_key,
+                        &data->decrypted_key_size);
+        if (r < 0)
+                return r;
+
+        return 0;
+}
+#endif
diff --git a/src/shared/pkcs11-util.h b/src/shared/pkcs11-util.h
new file mode 100644
index 0000000..5bc23c1
--- /dev/null
+++ b/src/shared/pkcs11-util.h
@@ -0,0 +1,111 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#if HAVE_P11KIT
+# include <p11-kit/p11-kit.h>
+# include <p11-kit/uri.h>
+#endif
+
+#include "ask-password-api.h"
+#include "macro.h"
+#include "openssl-util.h"
+#include "time-util.h"
+
+bool pkcs11_uri_valid(const char *uri);
+
+#if HAVE_P11KIT
+
+extern char *(*sym_p11_kit_module_get_name)(CK_FUNCTION_LIST *module);
+extern void (*sym_p11_kit_modules_finalize_and_release)(CK_FUNCTION_LIST **modules);
+extern CK_FUNCTION_LIST **(*sym_p11_kit_modules_load_and_initialize)(int flags);
+extern const char *(*sym_p11_kit_strerror)(CK_RV rv);
+extern int (*sym_p11_kit_uri_format)(P11KitUri *uri, P11KitUriType uri_type, char **string);
+extern void (*sym_p11_kit_uri_free)(P11KitUri *uri);
+extern CK_ATTRIBUTE_PTR (*sym_p11_kit_uri_get_attributes)(P11KitUri *uri, CK_ULONG *n_attrs);
+extern CK_INFO_PTR (*sym_p11_kit_uri_get_module_info)(P11KitUri *uri);
+extern CK_SLOT_INFO_PTR (*sym_p11_kit_uri_get_slot_info)(P11KitUri *uri);
+extern CK_TOKEN_INFO_PTR (*sym_p11_kit_uri_get_token_info)(P11KitUri *uri);
+extern int (*sym_p11_kit_uri_match_token_info)(const P11KitUri *uri, const CK_TOKEN_INFO *token_info);
+extern const char *(*sym_p11_kit_uri_message)(int code);
+extern P11KitUri *(*sym_p11_kit_uri_new)(void);
+extern int (*sym_p11_kit_uri_parse)(const char *string, P11KitUriType uri_type, P11KitUri *uri);
+
+int uri_from_string(const char *p, P11KitUri **ret);
+
+P11KitUri *uri_from_module_info(const CK_INFO *info);
+P11KitUri *uri_from_slot_info(const CK_SLOT_INFO *slot_info);
+P11KitUri *uri_from_token_info(const CK_TOKEN_INFO *token_info);
+
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(P11KitUri*, sym_p11_kit_uri_free, NULL);
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(CK_FUNCTION_LIST**, sym_p11_kit_modules_finalize_and_release, NULL);
+
+CK_RV pkcs11_get_slot_list_malloc(CK_FUNCTION_LIST *m, CK_SLOT_ID **ret_slotids, CK_ULONG *ret_n_slotids);
+
+char *pkcs11_token_label(const CK_TOKEN_INFO *token_info);
+char *pkcs11_token_manufacturer_id(const CK_TOKEN_INFO *token_info);
+char *pkcs11_token_model(const CK_TOKEN_INFO *token_info);
+
+int pkcs11_token_login_by_pin(CK_FUNCTION_LIST *m, CK_SESSION_HANDLE session, const CK_TOKEN_INFO *token_info, const char *token_label, const void *pin, size_t pin_size);
+int pkcs11_token_login(CK_FUNCTION_LIST *m, CK_SESSION_HANDLE session, CK_SLOT_ID slotid, const CK_TOKEN_INFO *token_info, const char *friendly_name, const char *icon_name, const char *key_name, const char *credential_name, usec_t until, AskPasswordFlags ask_password_flags, bool headless, char **ret_used_pin);
+
+int pkcs11_token_find_x509_certificate(CK_FUNCTION_LIST *m, CK_SESSION_HANDLE session, P11KitUri *search_uri, CK_OBJECT_HANDLE *ret_object);
+#if HAVE_OPENSSL
+int pkcs11_token_read_x509_certificate(CK_FUNCTION_LIST *m, CK_SESSION_HANDLE session, CK_OBJECT_HANDLE object, X509 **ret_cert);
+#endif
+
+int pkcs11_token_find_private_key(CK_FUNCTION_LIST *m, CK_SESSION_HANDLE session, P11KitUri *search_uri, CK_OBJECT_HANDLE *ret_object);
+int pkcs11_token_decrypt_data(CK_FUNCTION_LIST *m, CK_SESSION_HANDLE session, CK_OBJECT_HANDLE object, const void *encrypted_data, size_t encrypted_data_size, void **ret_decrypted_data, size_t *ret_decrypted_data_size);
+
+int pkcs11_token_acquire_rng(CK_FUNCTION_LIST *m, CK_SESSION_HANDLE session);
+
+typedef int (*pkcs11_find_token_callback_t)(CK_FUNCTION_LIST *m, CK_SESSION_HANDLE session, CK_SLOT_ID slotid, const CK_SLOT_INFO *slot_info, const CK_TOKEN_INFO *token_info, P11KitUri *uri, void *userdata);
+int pkcs11_find_token(const char *pkcs11_uri, pkcs11_find_token_callback_t callback, void *userdata);
+
+#if HAVE_OPENSSL
+int pkcs11_acquire_certificate(const char *uri, const char *askpw_friendly_name, const char *askpw_icon_name, X509 **ret_cert, char **ret_pin_used);
+#endif
+
+/* Userdata for pkcs11_crypt_device_callback(). Release with pkcs11_crypt_device_callback_data_release(). */
+typedef struct {
+ /* Passed to pkcs11_token_login() as friendly name and as 'until' argument */
+ const char *friendly_name;
+ usec_t until;
+ /* Input: the data handed to pkcs11_token_decrypt_data() */
+ void *encrypted_key;
+ size_t encrypted_key_size;
+ /* Output: filled in by pkcs11_token_decrypt_data(); erased and freed on release */
+ void *decrypted_key;
+ size_t decrypted_key_size;
+ /* If true, the release function frees encrypted_key as well */
+ bool free_encrypted_key;
+ /* Passed through to pkcs11_token_login() */
+ bool headless;
+ AskPasswordFlags askpw_flags;
+} pkcs11_crypt_device_callback_data;
+
+void pkcs11_crypt_device_callback_data_release(pkcs11_crypt_device_callback_data *data);
+
+int pkcs11_crypt_device_callback(
+ CK_FUNCTION_LIST *m,
+ CK_SESSION_HANDLE session,
+ CK_SLOT_ID slot_id,
+ const CK_SLOT_INFO *slot_info,
+ const CK_TOKEN_INFO *token_info,
+ P11KitUri *uri,
+ void *userdata);
+
+int dlopen_p11kit(void);
+
+#else
+
+/* Stub used when HAVE_P11KIT is not set: logs an error and returns -EOPNOTSUPP. */
+static inline int dlopen_p11kit(void) {
+ return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "p11kit support is not compiled in.");
+}
+
+#endif
+
+/* Parameter set declared regardless of HAVE_P11KIT. The fields carry the same names and types as the
+ * corresponding members of pkcs11_crypt_device_callback_data.
+ * NOTE(review): presumably used to pass these settings to a PKCS#11 plugin; the user is not visible
+ * here, confirm. */
+typedef struct {
+ const char *friendly_name;
+ usec_t until;
+ bool headless;
+ AskPasswordFlags askpw_flags;
+} systemd_pkcs11_plugin_params;
+
+int pkcs11_list_tokens(void);
+int pkcs11_find_token_auto(char **ret);
diff --git a/src/shared/plymouth-util.c b/src/shared/plymouth-util.c
new file mode 100644
index 0000000..31ab340
--- /dev/null
+++ b/src/shared/plymouth-util.c
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "fd-util.h"
+#include "io-util.h"
+#include "plymouth-util.h"
+#include "socket-util.h"
+
+/* Opens an AF_UNIX stream socket (with SOCK_CLOEXEC plus the caller's additional socket type flags)
+ * and connects it to the "/org/freedesktop/plymouthd" address whose sun_path starts with a NUL byte.
+ * Returns the connected file descriptor, which the caller owns, or a negative errno on failure. */
+int plymouth_connect(int flags) {
+        static const union sockaddr_union sa = {
+                .un.sun_family = AF_UNIX,
+                .un.sun_path = "\0/org/freedesktop/plymouthd",
+        };
+        _cleanup_close_ int fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|flags, 0);
+
+        if (fd < 0)
+                return -errno;
+
+        if (connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0)
+                return -errno;
+
+        /* Success: disarm the cleanup handler and hand the fd to the caller */
+        return TAKE_FD(fd);
+}
+
+/* Connects via plymouth_connect(flags), writes the given buffer with loop_write() and closes the
+ * connection again. Returns the connect error if connecting fails, otherwise the result of
+ * loop_write(). */
+int plymouth_send_raw(const void *raw, size_t size, int flags) {
+        _cleanup_close_ int fd = plymouth_connect(flags);
+
+        if (fd < 0)
+                return fd;
+
+        return loop_write(fd, raw, size);
+}
diff --git a/src/shared/plymouth-util.h b/src/shared/plymouth-util.h
new file mode 100644
index 0000000..04aec70
--- /dev/null
+++ b/src/shared/plymouth-util.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <sys/types.h>
+
+#include "errno-util.h"
+
+int plymouth_connect(int flags);
+int plymouth_send_raw(const void *raw, size_t size, int flags);
+
+/* Returns true if the error code (either sign is accepted for the first two) is EAGAIN or ENOENT, or if
+ * ERRNO_IS_DISCONNECT() matches it. */
+static inline bool ERRNO_IS_NO_PLYMOUTH(int r) {
+ return IN_SET(abs(r), EAGAIN, ENOENT) || ERRNO_IS_DISCONNECT(r);
+}
diff --git a/src/shared/pretty-print.c b/src/shared/pretty-print.c
new file mode 100644
index 0000000..2833063
--- /dev/null
+++ b/src/shared/pretty-print.c
@@ -0,0 +1,421 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/utsname.h>
+#include <errno.h>
+#include <stdio.h>
+
+#include "alloc-util.h"
+#include "conf-files.h"
+#include "constants.h"
+#include "env-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "pager.h"
+#include "path-util.h"
+#include "pretty-print.h"
+#include "string-util.h"
+#include "strv.h"
+#include "terminal-util.h"
+
+/* Renders a "cylon" bar into buffer as a NUL-terminated string: up to three adjacent '*' characters
+ * centered on pos, padded with spaces, and wrapped in ANSI color sequences if log output is colored.
+ * pos ranges over 0…width+1. buflen must be at least CYLON_BUFFER_EXTRA + width + 1. */
+void draw_cylon(char buffer[], size_t buflen, unsigned width, unsigned pos) {
+        char *p = buffer;
+
+        assert(buflen >= CYLON_BUFFER_EXTRA + width + 1);
+        assert(pos <= width+1); /* at 0 and at width+1 the center light is off-screen */
+
+        const bool colored = log_get_show_color();
+
+        /* Light left of the center, preceded by padding */
+        if (pos > 1) {
+                if (pos > 2)
+                        p = mempset(p, ' ', pos-2);
+                if (colored)
+                        p = stpcpy(p, ANSI_RED);
+                *p++ = '*';
+        }
+
+        /* The center light */
+        if (pos > 0 && pos <= width) {
+                if (colored)
+                        p = stpcpy(p, ANSI_HIGHLIGHT_RED);
+                *p++ = '*';
+        }
+
+        if (colored)
+                p = stpcpy(p, ANSI_NORMAL);
+
+        /* Light right of the center, followed by padding */
+        if (pos < width) {
+                if (colored)
+                        p = stpcpy(p, ANSI_RED);
+                *p++ = '*';
+                if (pos < width-1)
+                        p = mempset(p, ' ', width-1-pos);
+                if (colored)
+                        p = stpcpy(p, ANSI_NORMAL);
+        }
+
+        *p = '\0';
+}
+
+/* Tells whether terminal hyperlinks shall be generated. The answer is determined once and cached:
+ * $SYSTEMD_URLIFY decides if it parses as a boolean, otherwise the result of colors_enabled() is used.
+ * Always false if built without ENABLE_URLIFY. */
+bool urlify_enabled(void) {
+#if ENABLE_URLIFY
+        static int cached_urlify_enabled = -1;
+
+        if (cached_urlify_enabled >= 0)
+                return cached_urlify_enabled;
+
+        int val = getenv_bool("SYSTEMD_URLIFY");
+        cached_urlify_enabled = val >= 0 ? val : colors_enabled();
+
+        return cached_urlify_enabled;
+#else
+        return 0;
+#endif
+}
+
+/* Formats a URL plus display text as a clickable terminal hyperlink, see
+ * https://gist.github.com/egmontkob/eb114294efbcd5adb1944c9f3cb5feda for the escape sequence. If text
+ * is empty the URL itself is displayed. If hyperlinks are disabled only the text is copied.
+ * Returns 0 and a newly allocated string in *ret, or -ENOMEM. */
+int terminal_urlify(const char *url, const char *text, char **ret) {
+        const char *label;
+        char *formatted;
+
+        assert(url);
+
+        label = isempty(text) ? url : text;
+
+        formatted = urlify_enabled() ?
+                strjoin("\x1B]8;;", url, "\a", label, "\x1B]8;;\a") :
+                strdup(label);
+        if (!formatted)
+                return -ENOMEM;
+
+        *ret = formatted;
+        return 0;
+}
+
+/* Builds "file://<nodename><path>" for the given path; relative paths are made absolute against the
+ * current working directory first. Returns 0 and a newly allocated string in *ret, or a negative errno
+ * on failure. */
+int file_url_from_path(const char *path, char **ret) {
+        _cleanup_free_ char *made_absolute = NULL;
+        struct utsname uts;
+        char *joined;
+        int r;
+
+        if (uname(&uts) < 0)
+                return -errno;
+
+        if (!path_is_absolute(path)) {
+                r = path_make_absolute_cwd(path, &made_absolute);
+                if (r < 0)
+                        return r;
+
+                path = made_absolute;
+        }
+
+        /* The local hostname is included, as suggested by
+         * https://gist.github.com/egmontkob/eb114294efbcd5adb1944c9f3cb5feda. We deliberately use the
+         * raw nodename the kernel reports instead of gethostname_malloc() or gethostname_strict(),
+         * under the assumption that terminals do not validate the string carefully either. */
+        joined = strjoin("file://", uts.nodename, path);
+        if (!joined)
+                return -ENOMEM;
+
+        *ret = joined;
+        return 0;
+}
+
+/* Like terminal_urlify(), but takes a file system path, which is converted into a file:// URL first.
+ * If text is empty the path itself is displayed. Returns -EINVAL for an empty path, 0 and a newly
+ * allocated string in *ret on success, or another negative errno on failure. */
+int terminal_urlify_path(const char *path, const char *text, char **ret) {
+        _cleanup_free_ char *url = NULL;
+        const char *label;
+        char *plain;
+        int r;
+
+        assert(path);
+
+        if (isempty(path))
+                return -EINVAL;
+
+        label = isempty(text) ? path : text;
+
+        if (urlify_enabled()) {
+                r = file_url_from_path(path, &url);
+                if (r < 0)
+                        return r;
+
+                return terminal_urlify(url, label, ret);
+        }
+
+        /* No hyperlinks: skip the URL construction altogether and return the plain text */
+        plain = strdup(label);
+        if (!plain)
+                return -ENOMEM;
+
+        *ret = plain;
+        return 0;
+}
+
+/* Formats a man page reference as terminal hyperlink: the URL is "man:<page>(<section>)" and the
+ * displayed text is "<page>(<section>) man page". Returns whatever terminal_urlify() returns. */
+int terminal_urlify_man(const char *page, const char *section, char **ret) {
+ const char *url, *text;
+
+ url = strjoina("man:", page, "(", section, ")");
+ text = strjoina(page, "(", section, ") man page");
+
+ return terminal_urlify(url, text, ret);
+}
+
+/* Classification of a configuration file line, as determined by classify_line_type(). */
+typedef enum {
+ LINE_SECTION, /* first non-whitespace character is '[' (only if the format has sections) */
+ LINE_COMMENT, /* first non-whitespace character is '#' or ';', or the line is blank */
+ LINE_NORMAL, /* everything else */
+} LineType;
+
+/* Classifies a line by its first non-whitespace character. '[' starts a section header, but only if
+ * CAT_FORMAT_HAS_SECTIONS is set; '#', ';' and a blank line count as comment; the rest is normal. */
+static LineType classify_line_type(const char *line, CatFlags flags) {
+        const char *first = skip_leading_chars(line, WHITESPACE);
+
+        switch (*first) {
+
+        case '[':
+                if (flags & CAT_FORMAT_HAS_SECTIONS)
+                        return LINE_SECTION;
+                return LINE_NORMAL;
+
+        case '#':
+        case ';':
+        case '\0':
+                return LINE_COMMENT;
+
+        default:
+                return LINE_NORMAL;
+        }
+}
+
+/* Prints one file to stdout: first a highlighted "# <name>" header line (preceded by an empty line if
+ * 'newline' is true), then the file's lines, with section headers and comments colored. With CAT_TLDR
+ * comment lines are dropped and a section header is only printed once a regular line follows it.
+ * Returns 0 on success, a negative errno on failure. Only read errors are logged here. */
+static int cat_file(const char *filename, bool newline, CatFlags flags) {
+ _cleanup_fclose_ FILE *f = NULL;
+ _cleanup_free_ char *urlified = NULL, *section = NULL, *old_section = NULL;
+ int r;
+
+ /* Open the file before printing anything, so that nothing is output for unreadable files */
+ f = fopen(filename, "re");
+ if (!f)
+ return -errno;
+
+ r = terminal_urlify_path(filename, NULL, &urlified);
+ if (r < 0)
+ return r;
+
+ printf("%s%s# %s%s\n",
+ newline ? "\n" : "",
+ ansi_highlight_blue(),
+ urlified,
+ ansi_normal());
+ fflush(stdout);
+
+ for (;;) {
+ _cleanup_free_ char *line = NULL;
+
+ r = read_line(f, LONG_LINE_MAX, &line);
+ if (r < 0)
+ return log_error_errno(r, "Failed to read \"%s\": %m", filename);
+ if (r == 0)
+ break;
+
+ LineType line_type = classify_line_type(line, flags);
+ if (flags & CAT_TLDR) {
+ if (line_type == LINE_SECTION) {
+ /* The start of a section, let's not print it yet. */
+ /* 'section' takes over the line; a previously pending header is dropped */
+ free_and_replace(section, line);
+ continue;
+ }
+
+ if (line_type == LINE_COMMENT)
+ continue;
+
+ /* Before we print the actual line, print the last section header */
+ if (section) {
+ /* Do not print redundant section headers */
+ if (!streq_ptr(section, old_section))
+ printf("%s%s%s\n",
+ ansi_highlight_cyan(),
+ section,
+ ansi_normal());
+
+ /* Remember the header as the current one; 'section' is reset until the next header */
+ free_and_replace(old_section, section);
+ }
+ }
+
+ /* In CAT_TLDR mode only LINE_NORMAL lines get here, so no color is applied then */
+ printf("%s%s%s\n",
+ line_type == LINE_SECTION ? ansi_highlight_cyan() :
+ line_type == LINE_COMMENT ? ansi_highlight_grey() :
+ "",
+ line,
+ line_type != LINE_NORMAL ? ansi_normal() : "");
+ }
+
+ return 0;
+}
+
+/* Prints the main file (if any) followed by all drop-in files, separated by empty lines. Stops at the
+ * first file that cannot be shown and returns its error after logging a warning; returns 0 otherwise. */
+int cat_files(const char *file, char **dropins, CatFlags flags) {
+        /* Whether something was printed already, so that the next file needs a separating newline */
+        bool need_newline = !!file;
+        int r;
+
+        if (file) {
+                r = cat_file(file, /* newline= */ false, flags);
+                if (r < 0)
+                        return log_warning_errno(r, "Failed to cat %s: %m", file);
+        }
+
+        STRV_FOREACH(path, dropins) {
+                r = cat_file(*path, /* newline= */ need_newline, flags);
+                if (r < 0)
+                        return log_warning_errno(r, "Failed to cat %s: %m", *path);
+
+                need_newline = true;
+        }
+
+        return 0;
+}
+
+/* Outputs a separator that turns into plain whitespace when copied from the terminal: one line of
+ * underlined spaces as wide as the terminal, followed by an empty line. Without underline support just
+ * two newlines are written. */
+void print_separator(void) {
+        size_t n;
+
+        if (!underline_enabled()) {
+                fputs("\n\n", stdout);
+                return;
+        }
+
+        n = columns();
+
+        /* Take the stream lock once, so that the cheaper unlocked calls can be used for the rest */
+        flockfile(stdout);
+        fputs_unlocked(ANSI_UNDERLINE, stdout);
+
+        for (size_t i = 0; i < n; i++)
+                fputc_unlocked(' ', stdout);
+
+        fputs_unlocked(ANSI_NORMAL "\n\n", stdout);
+        funlockfile(stdout);
+}
+
+/* Derives from a configuration name where to look for it and what its drop-ins look like.
+ *
+ * name: in/out; "environment.d" is rewritten to "environment".
+ * prefixes: out; NULL-terminated list of directory prefixes, pointing to static storage.
+ * is_collection: out; see below.
+ * extension: out; file name suffix of the drop-in files, ".conf" unless a rule below overrides it.
+ *
+ * Returns 0 on success, or the result of log_oom() if memory allocation fails. */
+static int guess_type(const char **name, char ***prefixes, bool *is_collection, const char **extension) {
+ /* Try to figure out if name is like tmpfiles.d/ or systemd/system-presets/,
+ * i.e. a collection of directories without a main config file.
+ * Incidentally, all those formats don't use sections. So we return a single
+ * is_collection boolean, which also means that the format doesn't use sections.
+ */
+
+ _cleanup_free_ char *n = NULL;
+ bool usr = false, run = false, coll = false;
+ const char *ext = ".conf";
+ /* This is static so that the array doesn't get deallocated when we exit the function */
+ static const char* const std_prefixes[] = { CONF_PATHS(""), NULL };
+ static const char* const usr_prefixes[] = { CONF_PATHS_USR(""), NULL };
+ static const char* const run_prefixes[] = { "/run/", NULL };
+
+ if (path_equal(*name, "environment.d"))
+ /* Special case: we need to include /etc/environment in the search path, even
+ * though the whole concept is called environment.d. */
+ *name = "environment";
+
+ /* Work on a private copy, since trailing slashes are stripped from it below */
+ n = strdup(*name);
+ if (!n)
+ return log_oom();
+
+ /* All systemd-style config files should support the /usr-/etc-/run split and
+ * dropins. Let's add a blanket rule that allows us to support them without keeping
+ * an explicit list. */
+ if (path_startswith(n, "systemd") && endswith(n, ".conf"))
+ usr = true;
+
+ delete_trailing_chars(n, "/");
+
+ if (endswith(n, ".d"))
+ coll = true;
+
+ if (path_equal(n, "environment"))
+ usr = true;
+
+ if (path_equal(n, "udev/hwdb.d"))
+ ext = ".hwdb";
+
+ if (path_equal(n, "udev/rules.d"))
+ ext = ".rules";
+
+ if (path_equal(n, "kernel/install.d"))
+ ext = ".install";
+
+ if (path_equal(n, "systemd/ntp-units.d")) {
+ coll = true;
+ ext = ".list";
+ }
+
+ if (path_equal(n, "systemd/relabel-extra.d")) {
+ coll = run = true;
+ ext = ".relabel";
+ }
+
+ if (PATH_IN_SET(n, "systemd/system-preset", "systemd/user-preset")) {
+ coll = true;
+ ext = ".preset";
+ }
+
+ if (path_equal(n, "systemd/user-preset"))
+ usr = true;
+
+ /* 'usr' takes precedence over 'run'. The cast only drops the const qualifiers of the static arrays. */
+ *prefixes = (char**) (usr ? usr_prefixes : run ? run_prefixes : std_prefixes);
+ *is_collection = coll;
+ *extension = ext;
+ return 0;
+}
+
+/* Shows a configuration file together with its drop-ins, below the optional root directory.
+ *
+ * guess_type() determines the search prefixes, the drop-in suffix and whether 'name' denotes a
+ * collection, i.e. a set of drop-in directories without a main configuration file. For a
+ * non-collection the first existing main file among the prefixes is shown, or a note that there is
+ * none, followed by the drop-ins from "<prefix><name>.d". For a collection only the files in
+ * "<prefix><name>" are shown.
+ *
+ * Returns the result of cat_files() on success, a negative errno-style value on failure. */
+int conf_files_cat(const char *root, const char *name, CatFlags flags) {
+        _cleanup_strv_free_ char **dirs = NULL, **files = NULL;
+        _cleanup_free_ char *path = NULL;
+        char **prefixes = NULL; /* explicit initialization to appease gcc */
+        bool is_collection;
+        const char *extension;
+        int r;
+
+        r = guess_type(&name, &prefixes, &is_collection, &extension);
+        if (r < 0)
+                return r;
+        assert(prefixes);
+        assert(extension);
+
+        STRV_FOREACH(prefix, prefixes) {
+                assert(endswith(*prefix, "/"));
+                r = strv_extendf(&dirs, "%s%s%s", *prefix, name,
+                                 is_collection ? "" : ".d");
+                if (r < 0)
+                        return log_error_errno(r, "Failed to build directory list: %m");
+        }
+
+        if (DEBUG_LOGGING) {
+                log_debug("Looking for configuration in:");
+                if (!is_collection)
+                        STRV_FOREACH(prefix, prefixes)
+                                log_debug(" %s%s%s", strempty(root), *prefix, name);
+
+                STRV_FOREACH(t, dirs)
+                        log_debug(" %s%s/*%s", strempty(root), *t, extension);
+        }
+
+        /* First locate the main config file, if any */
+        if (!is_collection) {
+                STRV_FOREACH(prefix, prefixes) {
+                        path = path_join(root, *prefix, name);
+                        if (!path)
+                                return log_oom();
+                        if (access(path, F_OK) == 0)
+                                break;
+                        /* Not there: reset, so that 'path' is NULL if no prefix matches */
+                        path = mfree(path);
+                }
+
+                if (!path)
+                        printf("%s# Main configuration file %s not found%s\n",
+                               ansi_highlight_magenta(),
+                               name,
+                               ansi_normal());
+        }
+
+        /* Then locate the drop-ins, if any */
+        r = conf_files_list_strv(&files, extension, root, 0, (const char* const*) dirs);
+        if (r < 0)
+                return log_error_errno(r, "Failed to query file list: %m");
+
+        /* Show. As documented in guess_type(), collections are exactly the formats that do not use
+         * sections, hence section handling is enabled for everything that is not a collection. */
+        if (!is_collection)
+                flags |= CAT_FORMAT_HAS_SECTIONS;
+
+        return cat_files(path, files, flags);
+}
diff --git a/src/shared/pretty-print.h b/src/shared/pretty-print.h
new file mode 100644
index 0000000..c17e976
--- /dev/null
+++ b/src/shared/pretty-print.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "glyph-util.h"
+#include "terminal-util.h"
+
+#define CYLON_BUFFER_EXTRA (2*STRLEN(ANSI_RED) + STRLEN(ANSI_HIGHLIGHT_RED) + 2*STRLEN(ANSI_NORMAL))
+
+void draw_cylon(char buffer[], size_t buflen, unsigned width, unsigned pos);
+
+void print_separator(void);
+
+int file_url_from_path(const char *path, char **ret);
+
+bool urlify_enabled(void);
+
+int terminal_urlify(const char *url, const char *text, char **ret);
+int terminal_urlify_path(const char *path, const char *text, char **ret);
+int terminal_urlify_man(const char *page, const char *section, char **ret);
+
+/* Flags controlling how cat_files() and conf_files_cat() show configuration files. */
+typedef enum CatFlags {
+ CAT_CONFIG_OFF = 0,
+ CAT_CONFIG_ON = 1 << 0,
+ CAT_FORMAT_HAS_SECTIONS = 1 << 1, /* Sections are meaningful for this file format */
+ CAT_TLDR = 1 << 2, /* Skip comments and blank lines, print section headers only if a regular line follows */
+} CatFlags;
+
+int cat_files(const char *file, char **dropins, CatFlags flags);
+int conf_files_cat(const char *root, const char *name, CatFlags flags);
+
+#define RED_CROSS_MARK_MAX (STRLEN(ANSI_HIGHLIGHT_RED) + STRLEN("✗") + STRLEN(ANSI_NORMAL) + 1)
+#define GREEN_CHECK_MARK_MAX (STRLEN(ANSI_HIGHLIGHT_GREEN) + STRLEN("✓") + STRLEN(ANSI_NORMAL) + 1)
+
+/* Writes highlight-red color sequence + cross mark glyph + color reset into the caller's buffer and
+ * returns the buffer. Used via RED_CROSS_MARK(), which supplies a buffer of RED_CROSS_MARK_MAX bytes. */
+static inline const char *red_cross_mark_internal(char buffer[static RED_CROSS_MARK_MAX]) {
+        char *end;
+
+        assert(buffer);
+
+        end = stpcpy(buffer, ansi_highlight_red());
+        end = stpcpy(end, special_glyph(SPECIAL_GLYPH_CROSS_MARK));
+        end = stpcpy(end, ansi_normal());
+
+        /* 'end' points to the terminating NUL, which has to lie within the buffer too */
+        assert_se(end < buffer + RED_CROSS_MARK_MAX);
+
+        return buffer;
+}
+
+/* Writes highlight-green color sequence + check mark glyph + color reset into the caller's buffer and
+ * returns the buffer. Used via GREEN_CHECK_MARK(), which supplies a buffer of GREEN_CHECK_MARK_MAX
+ * bytes. */
+static inline const char *green_check_mark_internal(char buffer[static GREEN_CHECK_MARK_MAX]) {
+        char *end;
+
+        assert(buffer);
+
+        end = stpcpy(buffer, ansi_highlight_green());
+        end = stpcpy(end, special_glyph(SPECIAL_GLYPH_CHECK_MARK));
+        end = stpcpy(end, ansi_normal());
+
+        /* 'end' points to the terminating NUL, which has to lie within the buffer too */
+        assert_se(end < buffer + GREEN_CHECK_MARK_MAX);
+
+        return buffer;
+}
+
+#define RED_CROSS_MARK() red_cross_mark_internal((char[RED_CROSS_MARK_MAX]) {})
+#define GREEN_CHECK_MARK() green_check_mark_internal((char[GREEN_CHECK_MARK_MAX]) {})
+
+#define COLOR_MARK_BOOL(b) ((b) ? GREEN_CHECK_MARK() : RED_CROSS_MARK())
diff --git a/src/shared/ptyfwd.c b/src/shared/ptyfwd.c
new file mode 100644
index 0000000..195e603
--- /dev/null
+++ b/src/shared/ptyfwd.c
@@ -0,0 +1,677 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <signal.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/epoll.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <termios.h>
+#include <unistd.h>
+
+#include "sd-event.h"
+
+#include "alloc-util.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "log.h"
+#include "macro.h"
+#include "ptyfwd.h"
+#include "terminal-util.h"
+#include "time-util.h"
+
+/* State of one forwarder that copies bytes between a pty master and an input/output fd pair. */
+struct PTYForward {
+ /* Event loop the sources below are attached to (referenced in pty_forward_new()) */
+ sd_event *event;
+
+ /* input_fd/output_fd are initialized to -EBADF; input_fd stays that way in PTY_FORWARD_READ_ONLY mode.
+ * 'master' is the pty master fd passed to pty_forward_new(). */
+ int input_fd;
+ int output_fd;
+ int master;
+
+ PTYForwardFlags flags;
+
+ /* I/O event sources; each one is dropped (set to NULL) once the respective side hung up */
+ sd_event_source *stdin_event_source;
+ sd_event_source *stdout_event_source;
+ sd_event_source *master_event_source;
+
+ /* Dropped by pty_forward_set_width_height() to stop forwarding window size changes */
+ sd_event_source *sigwinch_event_source;
+
+ /* Terminal attributes as found in pty_forward_new(); only valid if saved_stdin/saved_stdout are set */
+ struct termios saved_stdin_attr;
+ struct termios saved_stdout_attr;
+
+ /* Set if the fd is a re-opened copy owned by us, which pty_forward_disconnect() has to close */
+ bool close_input_fd:1;
+ bool close_output_fd:1;
+
+ bool saved_stdin:1;
+ bool saved_stdout:1;
+
+ /* Readiness flags: set by the event handlers, cleared by shovel() when an operation fails with
+ * EAGAIN. The *_hangup flags are set by shovel() when a side reports EOF or a disconnect. */
+ bool stdin_readable:1;
+ bool stdin_hangup:1;
+ bool stdout_writable:1;
+ bool stdout_hangup:1;
+ bool master_readable:1;
+ bool master_writable:1;
+ bool master_hangup:1;
+
+ /* Set once a read() on the master succeeded; consulted for PTY_FORWARD_IGNORE_INITIAL_VHANGUP */
+ bool read_from_master:1;
+
+ /* done: pty_forward_done() ran already. drain: pty_forward_drain() was called. */
+ bool done:1;
+ bool drain:1;
+
+ /* Last byte successfully written to output_fd, see pty_forward_get_last_char() */
+ bool last_char_set:1;
+ char last_char;
+
+ /* in_buffer: bytes read from input_fd and not yet written to the master.
+ * out_buffer: bytes read from the master and not yet written to output_fd.
+ * The *_full fields count the valid bytes at the start of each buffer. */
+ char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
+ size_t in_buffer_full, out_buffer_full;
+
+ /* Tracking of consecutive ^] key presses, see look_for_escape() */
+ usec_t escape_timestamp;
+ unsigned escape_counter;
+
+ /* Optional completion callback and its argument, see pty_forward_set_handler() */
+ PTYForwardHandler handler;
+ void *userdata;
+};
+
+/* Time window of the escape sequence: look_for_escape() starts counting anew if a ^] arrives more than this
+ * long after the first ^] of the current run. */
+#define ESCAPE_USEC (1*USEC_PER_SEC)
+
+/* Detaches the forwarder from its event loop and undoes what pty_forward_new() did to the input/output fds:
+ * drops all event sources and the event loop reference, restores the saved terminal attributes, switches the
+ * fds back to blocking mode and closes those fds we opened ourselves. The master fd is left alone. A NULL
+ * argument is accepted and ignored. */
+static void pty_forward_disconnect(PTYForward *f) {
+
+ if (!f)
+ return;
+
+ f->stdin_event_source = sd_event_source_unref(f->stdin_event_source);
+ f->stdout_event_source = sd_event_source_unref(f->stdout_event_source);
+
+ f->master_event_source = sd_event_source_unref(f->master_event_source);
+ f->sigwinch_event_source = sd_event_source_unref(f->sigwinch_event_source);
+ f->event = sd_event_unref(f->event);
+
+ if (f->output_fd >= 0) {
+ if (f->saved_stdout)
+ (void) tcsetattr(f->output_fd, TCSANOW, &f->saved_stdout_attr);
+
+ /* STDIN/STDOUT should not be non-blocking normally, so let's reset it */
+ (void) fd_nonblock(f->output_fd, false);
+ if (f->close_output_fd)
+ f->output_fd = safe_close(f->output_fd);
+ }
+
+ if (f->input_fd >= 0) {
+ if (f->saved_stdin)
+ (void) tcsetattr(f->input_fd, TCSANOW, &f->saved_stdin_attr);
+
+ (void) fd_nonblock(f->input_fd, false);
+ if (f->close_input_fd)
+ f->input_fd = safe_close(f->input_fd);
+ }
+
+ /* Make sure a later invocation does not apply the saved attributes a second time */
+ f->saved_stdout = f->saved_stdin = false;
+}
+
+/* Finishes forwarding exactly once: marks the object as done, disconnects it and then reports 'rcode' either
+ * to the registered handler or, if there is none, by asking the event loop to exit. Later invocations are
+ * no-ops that return 0. */
+static int pty_forward_done(PTYForward *f, int rcode) {
+        _cleanup_(sd_event_unrefp) sd_event *loop = NULL;
+
+        assert(f);
+
+        if (f->done)
+                return 0;
+
+        /* pty_forward_disconnect() drops f->event, hence pin the event loop for the sd_event_exit() below */
+        loop = sd_event_ref(f->event);
+
+        f->done = true;
+        pty_forward_disconnect(f);
+
+        if (!f->handler)
+                return sd_event_exit(loop, rcode < 0 ? EXIT_FAILURE : rcode);
+
+        return f->handler(f, rcode, f->userdata);
+}
+
+/* Scans the 'n' bytes at 'buffer' for the escape sequence, i.e. three ^] (0x1D) bytes in a row, with no other
+ * byte in between and within ESCAPE_USEC of the first one. The counting state is kept in 'f', so the sequence
+ * may be spread over several invocations. Returns true as soon as the third ^] is seen. */
+static bool look_for_escape(PTYForward *f, const char *buffer, size_t n) {
+        assert(f);
+        assert(buffer);
+        assert(n > 0);
+
+        for (size_t i = 0; i < n; i++) {
+                usec_t t;
+
+                if (buffer[i] != 0x1D) {
+                        /* Any other byte interrupts the run */
+                        f->escape_timestamp = 0;
+                        f->escape_counter = 0;
+                        continue;
+                }
+
+                /* Got a ^] */
+                t = now(CLOCK_MONOTONIC);
+
+                if (f->escape_counter == 0 || t > f->escape_timestamp + ESCAPE_USEC) {
+                        /* First ^] of a run, or the previous run timed out: start counting anew */
+                        f->escape_timestamp = t;
+                        f->escape_counter = 1;
+                        continue;
+                }
+
+                f->escape_counter++;
+                if (f->escape_counter >= 3)
+                        return true;
+        }
+
+        return false;
+}
+
+/* Returns true if an EIO on the master shall currently be treated like EAGAIN: either unconditionally
+ * (PTY_FORWARD_IGNORE_VHANGUP), or as long as nothing has been read from the master yet
+ * (PTY_FORWARD_IGNORE_INITIAL_VHANGUP). */
+static bool ignore_vhangup(PTYForward *f) {
+        bool always, initially;
+
+        assert(f);
+
+        always = (f->flags & PTY_FORWARD_IGNORE_VHANGUP) != 0;
+        initially = (f->flags & PTY_FORWARD_IGNORE_INITIAL_VHANGUP) != 0 && !f->read_from_master;
+
+        return always || initially;
+}
+
+/* Returns true if nothing is left to forward from the master: the output buffer is empty, the master is not
+ * marked readable, and the kernel reports neither pending input (TIOCINQ) nor pending output (TIOCOUTQ) on
+ * the master. A failing ioctl is only logged and does not count as pending data. */
+static bool drained(PTYForward *f) {
+        int pending = 0;
+
+        assert(f);
+
+        if (f->out_buffer_full > 0 || f->master_readable)
+                return false;
+
+        if (ioctl(f->master, TIOCINQ, &pending) < 0)
+                log_debug_errno(errno, "TIOCINQ failed on master: %m");
+        else if (pending > 0)
+                return false;
+
+        if (ioctl(f->master, TIOCOUTQ, &pending) < 0)
+                log_debug_errno(errno, "TIOCOUTQ failed on master: %m");
+        else if (pending > 0)
+                return false;
+
+        return true;
+}
+
+/* Core transfer loop. Moves as much data as currently possible in both directions (input fd → in_buffer →
+ * master, and master → out_buffer → output fd), guided by the readable/writable flags the event handlers set.
+ * A flag is cleared again as soon as the respective operation fails with EAGAIN, which is what eventually
+ * terminates the loop. Calls pty_forward_done() and returns its result when the escape sequence is seen, on
+ * unexpected I/O errors, when a side hung up and nothing more can be flushed, or when draining was requested
+ * and is complete. Returns 0 otherwise. */
+static int shovel(PTYForward *f) {
+ ssize_t k;
+
+ assert(f);
+
+ /* Keep going as long as at least one of the four transfer steps can make progress */
+ while ((f->stdin_readable && f->in_buffer_full <= 0) ||
+ (f->master_writable && f->in_buffer_full > 0) ||
+ (f->master_readable && f->out_buffer_full <= 0) ||
+ (f->stdout_writable && f->out_buffer_full > 0)) {
+
+ /* Step 1: input fd → in_buffer */
+ if (f->stdin_readable && f->in_buffer_full < LINE_MAX) {
+
+ k = read(f->input_fd, f->in_buffer + f->in_buffer_full, LINE_MAX - f->in_buffer_full);
+ if (k < 0) {
+
+ if (errno == EAGAIN)
+ f->stdin_readable = false;
+ else if (errno == EIO || ERRNO_IS_DISCONNECT(errno)) {
+ f->stdin_readable = false;
+ f->stdin_hangup = true;
+
+ f->stdin_event_source = sd_event_source_unref(f->stdin_event_source);
+ } else {
+ log_error_errno(errno, "read(): %m");
+ return pty_forward_done(f, -errno);
+ }
+ } else if (k == 0) {
+ /* EOF on stdin */
+ f->stdin_readable = false;
+ f->stdin_hangup = true;
+
+ f->stdin_event_source = sd_event_source_unref(f->stdin_event_source);
+ } else {
+ /* Check if ^] has been pressed three times within one second. If we get this we quit
+ * immediately. */
+ if (look_for_escape(f, f->in_buffer + f->in_buffer_full, k))
+ return pty_forward_done(f, -ECANCELED);
+
+ f->in_buffer_full += (size_t) k;
+ }
+ }
+
+ /* Step 2: in_buffer → master */
+ if (f->master_writable && f->in_buffer_full > 0) {
+
+ k = write(f->master, f->in_buffer, f->in_buffer_full);
+ if (k < 0) {
+
+ if (IN_SET(errno, EAGAIN, EIO))
+ f->master_writable = false;
+ else if (IN_SET(errno, EPIPE, ECONNRESET)) {
+ f->master_writable = f->master_readable = false;
+ f->master_hangup = true;
+
+ f->master_event_source = sd_event_source_unref(f->master_event_source);
+ } else {
+ log_error_errno(errno, "write(): %m");
+ return pty_forward_done(f, -errno);
+ }
+ } else {
+ /* Partial write: move the unwritten remainder to the front of the buffer */
+ assert(f->in_buffer_full >= (size_t) k);
+ memmove(f->in_buffer, f->in_buffer + k, f->in_buffer_full - k);
+ f->in_buffer_full -= k;
+ }
+ }
+
+ /* Step 3: master → out_buffer */
+ if (f->master_readable && f->out_buffer_full < LINE_MAX) {
+
+ k = read(f->master, f->out_buffer + f->out_buffer_full, LINE_MAX - f->out_buffer_full);
+ if (k < 0) {
+
+ /* Note that EIO on the master device
+ * might be caused by vhangup() or
+ * temporary closing of everything on
+ * the other side, we treat it like
+ * EAGAIN here and try again, unless
+ * ignore_vhangup is off. */
+
+ if (errno == EAGAIN || (errno == EIO && ignore_vhangup(f)))
+ f->master_readable = false;
+ else if (IN_SET(errno, EPIPE, ECONNRESET, EIO)) {
+ f->master_readable = f->master_writable = false;
+ f->master_hangup = true;
+
+ f->master_event_source = sd_event_source_unref(f->master_event_source);
+ } else {
+ log_error_errno(errno, "read(): %m");
+ return pty_forward_done(f, -errno);
+ }
+ } else {
+ f->read_from_master = true;
+ f->out_buffer_full += (size_t) k;
+ }
+ }
+
+ /* Step 4: out_buffer → output fd */
+ if (f->stdout_writable && f->out_buffer_full > 0) {
+
+ k = write(f->output_fd, f->out_buffer, f->out_buffer_full);
+ if (k < 0) {
+
+ if (errno == EAGAIN)
+ f->stdout_writable = false;
+ else if (errno == EIO || ERRNO_IS_DISCONNECT(errno)) {
+ f->stdout_writable = false;
+ f->stdout_hangup = true;
+ f->stdout_event_source = sd_event_source_unref(f->stdout_event_source);
+ } else {
+ log_error_errno(errno, "write(): %m");
+ return pty_forward_done(f, -errno);
+ }
+
+ } else {
+
+ /* Remember the last byte that made it out, for pty_forward_get_last_char() */
+ if (k > 0) {
+ f->last_char = f->out_buffer[k-1];
+ f->last_char_set = true;
+ }
+
+ assert(f->out_buffer_full >= (size_t) k);
+ memmove(f->out_buffer, f->out_buffer + k, f->out_buffer_full - k);
+ f->out_buffer_full -= k;
+ }
+ }
+ }
+
+ if (f->stdin_hangup || f->stdout_hangup || f->master_hangup) {
+ /* Exit the loop if any side hung up and if there's
+ * nothing more to write or nothing we could write. */
+
+ if ((f->out_buffer_full <= 0 || f->stdout_hangup) &&
+ (f->in_buffer_full <= 0 || f->master_hangup))
+ return pty_forward_done(f, 0);
+ }
+
+ /* If we were asked to drain, and there's nothing more to handle from the master, then call the callback
+ * too. */
+ if (f->drain && drained(f))
+ return pty_forward_done(f, 0);
+
+ return 0;
+}
+
+/* I/O event callback for the pty master: translates the epoll events into the master_readable and
+ * master_writable flags (EPOLLHUP sets both) and then runs the transfer loop. */
+static int on_master_event(sd_event_source *e, int fd, uint32_t revents, void *userdata) {
+        PTYForward *fwd = ASSERT_PTR(userdata);
+        const bool hup = revents & EPOLLHUP;
+
+        assert(e);
+        assert(e == fwd->master_event_source);
+        assert(fd >= 0);
+        assert(fd == fwd->master);
+
+        if (hup || (revents & EPOLLIN))
+                fwd->master_readable = true;
+
+        if (hup || (revents & EPOLLOUT))
+                fwd->master_writable = true;
+
+        return shovel(fwd);
+}
+
+/* I/O event callback for the input fd: marks it readable on EPOLLIN or EPOLLHUP and then runs the transfer
+ * loop. */
+static int on_stdin_event(sd_event_source *e, int fd, uint32_t revents, void *userdata) {
+        PTYForward *fwd = ASSERT_PTR(userdata);
+        const uint32_t interesting = EPOLLIN|EPOLLHUP;
+
+        assert(e);
+        assert(e == fwd->stdin_event_source);
+        assert(fd >= 0);
+        assert(fd == fwd->input_fd);
+
+        if ((revents & interesting) != 0)
+                fwd->stdin_readable = true;
+
+        return shovel(fwd);
+}
+
+/* I/O event callback for the output fd: marks it writable on EPOLLOUT or EPOLLHUP and then runs the transfer
+ * loop. */
+static int on_stdout_event(sd_event_source *e, int fd, uint32_t revents, void *userdata) {
+        PTYForward *fwd = ASSERT_PTR(userdata);
+        const uint32_t interesting = EPOLLOUT|EPOLLHUP;
+
+        assert(e);
+        assert(e == fwd->stdout_event_source);
+        assert(fd >= 0);
+        assert(fd == fwd->output_fd);
+
+        if ((revents & interesting) != 0)
+                fwd->stdout_writable = true;
+
+        return shovel(fwd);
+}
+
+/* SIGWINCH callback: copies the current window size of the output fd to the pty master. Failures of either
+ * ioctl are ignored; always returns 0. */
+static int on_sigwinch_event(sd_event_source *e, const struct signalfd_siginfo *si, void *userdata) {
+        PTYForward *fwd = ASSERT_PTR(userdata);
+        struct winsize size;
+
+        assert(e);
+        assert(e == fwd->sigwinch_event_source);
+
+        /* Without a size to forward there is nothing to do */
+        if (ioctl(fwd->output_fd, TIOCGWINSZ, &size) < 0)
+                return 0;
+
+        (void) ioctl(fwd->master, TIOCSWINSZ, &size);
+        return 0;
+}
+
+/* Allocates a forwarder for the pty master fd 'master' and attaches it to 'event' (or to the default event
+ * loop if 'event' is NULL). With PTY_FORWARD_READ_ONLY only master → stdout is forwarded. Otherwise stdin is
+ * forwarded to the master as well, and the terminal settings of stdin/stdout are changed after saving the
+ * previous ones, so that pty_forward_disconnect() can restore them. The master is switched to non-blocking
+ * mode and is assigned the window size of the output fd. Returns 0 and stores the new object in *ret on
+ * success, a negative errno-style value on failure (in which case the partially set up object is released
+ * again through the cleanup handler). */
+int pty_forward_new(
+ sd_event *event,
+ int master,
+ PTYForwardFlags flags,
+ PTYForward **ret) {
+
+ _cleanup_(pty_forward_freep) PTYForward *f = NULL;
+ struct winsize ws;
+ int r;
+
+ f = new(PTYForward, 1);
+ if (!f)
+ return -ENOMEM;
+
+ /* Invalidate all fds right away, so that the cleanup handler never touches fds we did not set up */
+ *f = (struct PTYForward) {
+ .flags = flags,
+ .master = -EBADF,
+ .input_fd = -EBADF,
+ .output_fd = -EBADF,
+ };
+
+ if (event)
+ f->event = sd_event_ref(event);
+ else {
+ r = sd_event_default(&f->event);
+ if (r < 0)
+ return r;
+ }
+
+ if (FLAGS_SET(flags, PTY_FORWARD_READ_ONLY))
+ f->output_fd = STDOUT_FILENO;
+ else {
+ /* If we shall be invoked in interactive mode, let's switch on non-blocking mode, so that we
+ * never end up starving one direction while we block on the other. However, let's be careful
+ * here and not turn on O_NONBLOCK for stdin/stdout directly, but on re-opened copies of
+ * them. This has two advantages: when we are killed abruptly the stdin/stdout fds won't be
+ * left in O_NONBLOCK state for the next process using them. In addition, if some process
+ * running in the background wants to continue writing to our stdout it can do so without
+ * being confused by O_NONBLOCK. */
+
+ f->input_fd = fd_reopen(STDIN_FILENO, O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
+ if (f->input_fd < 0) {
+ /* Handle failures gracefully, after all certain fd types cannot be reopened
+ * (sockets, …) */
+ log_debug_errno(f->input_fd, "Failed to reopen stdin, using original fd: %m");
+
+ r = fd_nonblock(STDIN_FILENO, true);
+ if (r < 0)
+ return r;
+
+ f->input_fd = STDIN_FILENO;
+ } else
+ f->close_input_fd = true;
+
+ f->output_fd = fd_reopen(STDOUT_FILENO, O_WRONLY|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
+ if (f->output_fd < 0) {
+ log_debug_errno(f->output_fd, "Failed to reopen stdout, using original fd: %m");
+
+ r = fd_nonblock(STDOUT_FILENO, true);
+ if (r < 0)
+ return r;
+
+ f->output_fd = STDOUT_FILENO;
+ } else
+ f->close_output_fd = true;
+ }
+
+ r = fd_nonblock(master, true);
+ if (r < 0)
+ return r;
+
+ f->master = master;
+
+ if (ioctl(f->output_fd, TIOCGWINSZ, &ws) < 0)
+ /* If we can't get the resolution from the output fd, then use our internal, regular width/height,
+ * i.e. something derived from $COLUMNS and $LINES if set. */
+ ws = (struct winsize) {
+ .ws_row = lines(),
+ .ws_col = columns(),
+ };
+
+ (void) ioctl(master, TIOCSWINSZ, &ws);
+
+ if (!(flags & PTY_FORWARD_READ_ONLY)) {
+ assert(f->input_fd >= 0);
+
+ /* Raw mode for the input side, but with the original output flags (c_oflag) retained */
+ if (tcgetattr(f->input_fd, &f->saved_stdin_attr) >= 0) {
+ struct termios raw_stdin_attr;
+
+ f->saved_stdin = true;
+
+ raw_stdin_attr = f->saved_stdin_attr;
+ cfmakeraw(&raw_stdin_attr);
+ raw_stdin_attr.c_oflag = f->saved_stdin_attr.c_oflag;
+ tcsetattr(f->input_fd, TCSANOW, &raw_stdin_attr);
+ }
+
+ /* Raw mode for the output side, but with the original input and local flags (c_iflag,
+ * c_lflag) retained */
+ if (tcgetattr(f->output_fd, &f->saved_stdout_attr) >= 0) {
+ struct termios raw_stdout_attr;
+
+ f->saved_stdout = true;
+
+ raw_stdout_attr = f->saved_stdout_attr;
+ cfmakeraw(&raw_stdout_attr);
+ raw_stdout_attr.c_iflag = f->saved_stdout_attr.c_iflag;
+ raw_stdout_attr.c_lflag = f->saved_stdout_attr.c_lflag;
+ tcsetattr(f->output_fd, TCSANOW, &raw_stdout_attr);
+ }
+
+ /* -EPERM is tolerated here: we then simply run without an event source for stdin */
+ r = sd_event_add_io(f->event, &f->stdin_event_source, f->input_fd, EPOLLIN|EPOLLET, on_stdin_event, f);
+ if (r < 0 && r != -EPERM)
+ return r;
+
+ if (r >= 0)
+ (void) sd_event_source_set_description(f->stdin_event_source, "ptyfwd-stdin");
+ }
+
+ r = sd_event_add_io(f->event, &f->stdout_event_source, f->output_fd, EPOLLOUT|EPOLLET, on_stdout_event, f);
+ if (r == -EPERM)
+ /* stdout without epoll support. Likely redirected to regular file. */
+ f->stdout_writable = true;
+ else if (r < 0)
+ return r;
+ else
+ (void) sd_event_source_set_description(f->stdout_event_source, "ptyfwd-stdout");
+
+ r = sd_event_add_io(f->event, &f->master_event_source, master, EPOLLIN|EPOLLOUT|EPOLLET, on_master_event, f);
+ if (r < 0)
+ return r;
+
+ (void) sd_event_source_set_description(f->master_event_source, "ptyfwd-master");
+
+ r = sd_event_add_signal(f->event, &f->sigwinch_event_source, SIGWINCH, on_sigwinch_event, f);
+ if (r < 0)
+ return r;
+
+ (void) sd_event_source_set_description(f->sigwinch_event_source, "ptyfwd-sigwinch");
+
+ /* Success: hand ownership to the caller and disarm the cleanup handler */
+ *ret = TAKE_PTR(f);
+
+ return 0;
+}
+
+/* Disconnects the forwarder (restoring terminal state, see pty_forward_disconnect()) and releases its memory.
+ * Accepts NULL. Always returns NULL, so that callers can reset their pointer in the same statement. */
+PTYForward *pty_forward_free(PTYForward *f) {
+        pty_forward_disconnect(f);
+        free(f);
+
+        return NULL;
+}
+
+/* Stores the last byte that was written to the output fd in *ch and returns 0. Returns -ENXIO if nothing has
+ * been written so far, in which case *ch is left untouched. */
+int pty_forward_get_last_char(PTYForward *f, char *ch) {
+        assert(f);
+        assert(ch);
+
+        if (f->last_char_set) {
+                *ch = f->last_char;
+                return 0;
+        }
+
+        return -ENXIO;
+}
+
+/* Turns PTY_FORWARD_IGNORE_VHANGUP on or off. If hangups are no longer ignored afterwards, the master is
+ * polled right away, since a hangup might be pending already. Returns 0 on success or if nothing changed, and
+ * the negative result of shovel() on failure. */
+int pty_forward_set_ignore_vhangup(PTYForward *f, bool b) {
+        bool current;
+        int r;
+
+        assert(f);
+
+        current = !!(f->flags & PTY_FORWARD_IGNORE_VHANGUP);
+        if (current == b)
+                return 0;
+
+        SET_FLAG(f->flags, PTY_FORWARD_IGNORE_VHANGUP, b);
+
+        if (ignore_vhangup(f))
+                return 0;
+
+        /* We shall now react to vhangup()s? Let's check immediately if we might be in one */
+        f->master_readable = true;
+        r = shovel(f);
+
+        return r < 0 ? r : 0;
+}
+
+/* Reports whether PTY_FORWARD_IGNORE_VHANGUP is currently set. PTY_FORWARD_IGNORE_INITIAL_VHANGUP is not
+ * taken into account here. */
+bool pty_forward_get_ignore_vhangup(PTYForward *f) {
+        assert(f);
+
+        return (f->flags & PTY_FORWARD_IGNORE_VHANGUP) != 0;
+}
+
+/* Returns true once forwarding has finished, i.e. after pty_forward_done() marked the object as done. */
+bool pty_forward_is_done(PTYForward *f) {
+        bool finished;
+
+        assert(f);
+
+        finished = f->done;
+        return finished;
+}
+
+/* Installs (or, with cb == NULL, removes) the callback that is invoked when forwarding finishes. Without a
+ * callback the event loop is asked to exit instead, see pty_forward_done(). */
+void pty_forward_set_handler(PTYForward *f, PTYForwardHandler cb, void *userdata) {
+        assert(f);
+
+        f->userdata = userdata;
+        f->handler = cb;
+}
+
+/* Starts draining the forwarder:
+ *
+ * - From now on shovel() finishes forwarding (calling the handler, if one is set) as soon as no unprocessed
+ *   bytes from the pty are left.
+ *
+ * - The return value tells whether that is the case already: true if nothing is pending, false otherwise.
+ */
+bool pty_forward_drain(PTYForward *f) {
+        bool nothing_pending;
+
+        assert(f);
+
+        f->drain = true;
+
+        nothing_pending = drained(f);
+        return nothing_pending;
+}
+
+/* Applies 'priority' to all event sources of the forwarder that currently exist. Returns 0 on success, or
+ * the first negative error reported by sd_event_source_set_priority().
+ *
+ * Any of the sources may legitimately be missing, and missing ones are skipped:
+ *
+ * - stdin: in PTY_FORWARD_READ_ONLY mode, or if sd_event_add_io() failed with -EPERM
+ * - stdout: if sd_event_add_io() failed with -EPERM (e.g. stdout redirected to a regular file)
+ * - SIGWINCH: after pty_forward_set_width_height() pinned the window size
+ * - any I/O source: after the respective side hung up
+ *
+ * Previously only stdin was guarded, so the function failed in the other cases because a NULL source was
+ * passed on. */
+int pty_forward_set_priority(PTYForward *f, int64_t priority) {
+        int r;
+
+        assert(f);
+
+        sd_event_source *const sources[] = {
+                f->stdin_event_source,
+                f->stdout_event_source,
+                f->master_event_source,
+                f->sigwinch_event_source,
+        };
+
+        for (size_t i = 0; i < sizeof(sources) / sizeof(sources[0]); i++) {
+                if (!sources[i])
+                        continue;
+
+                r = sd_event_source_set_priority(sources[i], priority);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+/* Sets a fixed window size on the pty master and stops forwarding SIGWINCH from then on. UINT_MAX for 'width'
+ * or 'height' means "keep the current value"; if both are UINT_MAX this is a no-op that returns 0 and leaves
+ * SIGWINCH forwarding in place. Returns -ERANGE for a dimension that is 0 or larger than USHRT_MAX, and
+ * -errno if one of the window size ioctls fails. */
+int pty_forward_set_width_height(PTYForward *f, unsigned width, unsigned height) {
+        struct winsize ws;
+        bool keep_width, keep_height;
+
+        assert(f);
+
+        keep_width = width == UINT_MAX;
+        keep_height = height == UINT_MAX;
+
+        if (keep_width && keep_height)
+                return 0; /* noop */
+
+        if (!keep_width && (width == 0 || width > USHRT_MAX))
+                return -ERANGE;
+
+        if (!keep_height && (height == 0 || height > USHRT_MAX))
+                return -ERANGE;
+
+        if (!keep_width && !keep_height)
+                /* Both dimensions given: no need to query the current size */
+                ws = (struct winsize) {
+                        .ws_row = height,
+                        .ws_col = width,
+                };
+        else {
+                /* Only one dimension given: start from the current size and override that one */
+                if (ioctl(f->master, TIOCGWINSZ, &ws) < 0)
+                        return -errno;
+
+                if (!keep_width)
+                        ws.ws_col = width;
+                if (!keep_height)
+                        ws.ws_row = height;
+        }
+
+        if (ioctl(f->master, TIOCSWINSZ, &ws) < 0)
+                return -errno;
+
+        /* Make sure we ignore SIGWINCH window size events from now on */
+        f->sigwinch_event_source = sd_event_source_unref(f->sigwinch_event_source);
+
+        return 0;
+}
diff --git a/src/shared/ptyfwd.h b/src/shared/ptyfwd.h
new file mode 100644
index 0000000..f0ae6e9
--- /dev/null
+++ b/src/shared/ptyfwd.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "sd-event.h"
+
+#include "macro.h"
+
+/* Opaque forwarder object */
+typedef struct PTYForward PTYForward;
+
+typedef enum PTYForwardFlags {
+ /* Only forward from the pty master to stdout, do not read from stdin */
+ PTY_FORWARD_READ_ONLY = 1,
+
+ /* Continue reading after hangup? */
+ PTY_FORWARD_IGNORE_VHANGUP = 2,
+
+ /* Continue reading after hangup but only if we never read anything else? */
+ PTY_FORWARD_IGNORE_INITIAL_VHANGUP = 4,
+} PTYForwardFlags;
+
+/* Completion callback, invoked once when forwarding finishes. 'rcode' is 0 on regular completion, -ECANCELED
+ * if the escape sequence (^] three times) was typed, or another negative errno-style value on I/O failure. */
+typedef int (*PTYForwardHandler)(PTYForward *f, int rcode, void *userdata);
+
+/* Creates a forwarder for the pty master fd 'master'; 'event' may be NULL to use the default event loop */
+int pty_forward_new(sd_event *event, int master, PTYForwardFlags flags, PTYForward **f);
+/* Disconnects and frees the forwarder; accepts NULL and always returns NULL */
+PTYForward *pty_forward_free(PTYForward *f);
+
+/* Returns the last byte written to the output in *ch, or -ENXIO if nothing was written yet */
+int pty_forward_get_last_char(PTYForward *f, char *ch);
+
+/* Setter/getter for the PTY_FORWARD_IGNORE_VHANGUP flag */
+int pty_forward_set_ignore_vhangup(PTYForward *f, bool ignore_vhangup);
+bool pty_forward_get_ignore_vhangup(PTYForward *f);
+
+/* True once forwarding has finished */
+bool pty_forward_is_done(PTYForward *f);
+
+/* Without a handler, the event loop is asked to exit when forwarding finishes */
+void pty_forward_set_handler(PTYForward *f, PTYForwardHandler handler, void *userdata);
+
+/* Requests completion as soon as no data from the pty is pending anymore; returns true if that is the case
+ * already */
+bool pty_forward_drain(PTYForward *f);
+
+/* Sets the event source priority of the forwarder's event sources */
+int pty_forward_set_priority(PTYForward *f, int64_t priority);
+
+/* Pins the window size of the pty; UINT_MAX leaves the respective dimension unchanged */
+int pty_forward_set_width_height(PTYForward *f, unsigned width, unsigned height);
+
+/* Provides pty_forward_freep() for use with _cleanup_() */
+DEFINE_TRIVIAL_CLEANUP_FUNC(PTYForward*, pty_forward_free);
diff --git a/src/shared/qrcode-util.c b/src/shared/qrcode-util.c
new file mode 100644
index 0000000..b0dd90a
--- /dev/null
+++ b/src/shared/qrcode-util.c
@@ -0,0 +1,221 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "qrcode-util.h"
+
+#if HAVE_QRENCODE
+#include <qrencode.h>
+
+#include "dlfcn-util.h"
+#include "locale-util.h"
+#include "log.h"
+#include "strv.h"
+#include "terminal-util.h"
+
+/* Escape sequence and block glyphs used to draw the QR code */
+#define ANSI_WHITE_ON_BLACK "\033[40;37;1m"
+#define UNICODE_FULL_BLOCK u8"█"
+#define UNICODE_LOWER_HALF_BLOCK u8"▄"
+#define UNICODE_UPPER_HALF_BLOCK u8"▀"
+
+/* Handle of the dynamically loaded libqrencode, filled in by dlopen_qrencode() */
+static void *qrcode_dl = NULL;
+
+/* libqrencode entry points, resolved by dlopen_qrencode(); NULL until then */
+static QRcode* (*sym_QRcode_encodeString)(const char *string, int version, QRecLevel level, QRencodeMode hint, int casesensitive) = NULL;
+static void (*sym_QRcode_free)(QRcode *qrcode) = NULL;
+
+/* Loads libqrencode and resolves the symbols we need, trying the known sonames in order (.so.4 first, then
+ * .so.3). Returns the non-negative result of the first attempt that works, otherwise the negative error of
+ * the last attempt. */
+int dlopen_qrencode(void) {
+        int r;
+
+        FOREACH_STRING(soname, "libqrencode.so.4", "libqrencode.so.3") {
+                r = dlopen_many_sym_or_warn(
+                                &qrcode_dl, soname, LOG_DEBUG,
+                                DLSYM_ARG(QRcode_encodeString),
+                                DLSYM_ARG(QRcode_free));
+                if (r >= 0)
+                        return r;
+        }
+
+        return r;
+}
+
+/* Prints the horizontal border of the QR code: two text lines of full blocks, each 4 + width + 4 glyphs wide.
+ * If both 'row' and 'column' differ from UINT_MAX the terminal cursor is positioned at (row, column) first,
+ * and at (row + 1, column) after each line; positioning failures are logged and otherwise ignored. If no fd
+ * can be obtained from 'output' in that mode, nothing is printed. */
+static void print_border(FILE *output, unsigned width, unsigned row, unsigned column) {
+        const bool positioned = row != UINT_MAX && column != UINT_MAX;
+        int fd = -1;
+
+        assert(output);
+        assert(width);
+
+        if (positioned) {
+                int r;
+
+                fd = fileno(output);
+                if (fd < 0) {
+                        (void) log_debug_errno(errno, "Failed to get file descriptor from the file stream: %m");
+                        return;
+                }
+
+                r = set_terminal_cursor_position(fd, row, column);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to move terminal cursor position, ignoring: %m");
+        }
+
+        /* Four rows of border, i.e. two text lines */
+        for (unsigned line = 0; line < 2; line++) {
+                fputs(ANSI_WHITE_ON_BLACK, output);
+
+                for (unsigned x = 0; x < 4 + width + 4; x++)
+                        fputs(UNICODE_FULL_BLOCK, output);
+
+                fputs(ANSI_NORMAL "\n", output);
+
+                if (positioned) {
+                        int r;
+
+                        r = set_terminal_cursor_position(fd, row + 1, column);
+                        if (r < 0)
+                                log_warning_errno(r, "Failed to move terminal cursor position, ignoring: %m");
+                }
+        }
+}
+
+/* Renders 'qr' to 'output' (stdout if NULL). Two QR code rows are packed into one text line by means of
+ * half-block glyphs, and the code is surrounded by a border of full blocks: 4 glyphs left and right, plus
+ * what print_border() emits above and below. If both 'row' and 'column' differ from UINT_MAX,
+ * set_terminal_cursor_position() is additionally invoked for each line; otherwise the lines are simply
+ * printed one after the other. 'output' is flushed before returning, except on the early return taken when
+ * fileno() fails. */
+static void write_qrcode(FILE *output, QRcode *qr, unsigned int row, unsigned int column) {
+ assert(qr);
+
+ if (!output)
+ output = stdout;
+
+ print_border(output, qr->width, row, column);
+
+ if (row != UINT_MAX && column != UINT_MAX) {
+ /* After printing two rows of top border, we need to move the cursor down two rows before starting to print the actual QR code */
+ int r, fd, move_down = 2;
+ fd = fileno(output);
+ if (fd < 0)
+ return (void)log_debug_errno(errno, "Failed to get file descriptor from the file stream: %m");
+
+ r = set_terminal_cursor_position(fd, row + move_down, column);
+ if (r < 0)
+ log_warning_errno(r, "Failed to move terminal cursor position, ignoring: %m");
+
+ for (unsigned y = 0; y < (unsigned) qr->width; y += 2) {
+ /* row1/row2: the two QR code rows that share this text line */
+ const uint8_t *row1 = qr->data + qr->width * y;
+ const uint8_t *row2 = row1 + qr->width;
+
+ fputs(ANSI_WHITE_ON_BLACK, output);
+
+ for (unsigned x = 0; x < 4; x++)
+ fputs(UNICODE_FULL_BLOCK, output);
+
+ for (unsigned x = 0; x < (unsigned) qr->width; x++) {
+ bool a, b;
+
+ /* Bit 0 of each module byte decides; row2 is not read when y is the last row
+ * (odd width) */
+ a = row1[x] & 1;
+ b = (y+1) < (unsigned) qr->width ? (row2[x] & 1) : false;
+
+ if (a && b)
+ fputc(' ', output);
+ else if (a)
+ fputs(UNICODE_LOWER_HALF_BLOCK, output);
+ else if (b)
+ fputs(UNICODE_UPPER_HALF_BLOCK, output);
+ else
+ fputs(UNICODE_FULL_BLOCK, output);
+ }
+
+ for (unsigned x = 0; x < 4; x++)
+ fputs(UNICODE_FULL_BLOCK, output);
+ r = set_terminal_cursor_position(fd, row + move_down, column);
+ if (r < 0)
+ log_warning_errno(r, "Failed to move terminal cursor position, ignoring: %m");
+ move_down += 1;
+ fputs(ANSI_NORMAL "\n", output);
+ }
+
+ /* Bottom border, placed below the lines printed above */
+ print_border(output, qr->width, row + move_down, column);
+ } else {
+
+ for (unsigned y = 0; y < (unsigned) qr->width; y += 2) {
+ /* row1/row2: the two QR code rows that share this text line */
+ const uint8_t *row1 = qr->data + qr->width * y;
+ const uint8_t *row2 = row1 + qr->width;
+
+ fputs(ANSI_WHITE_ON_BLACK, output);
+ for (unsigned x = 0; x < 4; x++)
+ fputs(UNICODE_FULL_BLOCK, output);
+
+ for (unsigned x = 0; x < (unsigned) qr->width; x++) {
+ bool a, b;
+
+ /* Bit 0 of each module byte decides; row2 is not read when y is the last row
+ * (odd width) */
+ a = row1[x] & 1;
+ b = (y+1) < (unsigned) qr->width ? (row2[x] & 1) : false;
+
+ if (a && b)
+ fputc(' ', output);
+ else if (a)
+ fputs(UNICODE_LOWER_HALF_BLOCK, output);
+ else if (b)
+ fputs(UNICODE_UPPER_HALF_BLOCK, output);
+ else
+ fputs(UNICODE_FULL_BLOCK, output);
+ }
+
+ for (unsigned x = 0; x < 4; x++)
+ fputs(UNICODE_FULL_BLOCK, output);
+ fputs(ANSI_NORMAL "\n", output);
+ }
+
+ /* Bottom border; row/column carry UINT_MAX in this branch, hence no cursor positioning */
+ print_border(output, qr->width, row, column);
+ }
+
+ fflush(output);
+}
+
+/* Encodes 'string' as QR code and prints it to 'out' (stdout if NULL), preceded by "<header>:" if 'header' is
+ * non-NULL.
+ *
+ * If both 'row' and 'column' differ from UINT_MAX the code is placed at that terminal position, after
+ * adjusting the position so that the code fits into a terminal of tty_width × tty_height cells. Otherwise it
+ * is printed at the current position and the tty_* parameters are not used.
+ *
+ * Returns 0 on success, -EOPNOTSUPP if the locale is not UTF-8 or colors are disabled, -ENOMEM if encoding
+ * fails, the error of dlopen_qrencode() if libqrencode cannot be loaded, or a negative errno-style value if
+ * no file descriptor can be obtained from 'out' in positioned mode. */
+int print_qrcode_full(FILE *out, const char *header, const char *string, unsigned row, unsigned column, unsigned tty_width, unsigned tty_height) {
+        QRcode* qr;
+        int r;
+
+        /* write_qrcode() falls back to stdout for a NULL stream; apply the same default here, since we use
+         * the stream ourselves as well (fileno(), fprintf(), fputc()). */
+        if (!out)
+                out = stdout;
+
+        /* If this is not a UTF-8 system or ANSI colors aren't supported/disabled don't print any QR
+         * codes */
+        if (!is_locale_utf8() || !colors_enabled())
+                return -EOPNOTSUPP;
+
+        r = dlopen_qrencode();
+        if (r < 0)
+                return r;
+
+        qr = sym_QRcode_encodeString(string, 0, QR_ECLEVEL_L, QR_MODE_8, 1);
+        if (!qr)
+                return -ENOMEM;
+
+        if (row != UINT_MAX && column != UINT_MAX) {
+                int fd;
+                unsigned qr_code_width, qr_code_height;
+
+                fd = fileno(out);
+                if (fd < 0) {
+                        /* Log first (this consumes errno), then release the QR code, which used to be
+                         * leaked on this path. */
+                        r = log_debug_errno(errno, "Failed to get file descriptor from the file stream: %m");
+                        sym_QRcode_free(qr);
+                        return r;
+                }
+
+                /* The code itself plus 4 border modules on each side */
+                qr_code_width = qr_code_height = qr->width + 8;
+
+                if (column + qr_code_width > tty_width)
+                        column = tty_width - qr_code_width;
+
+                /* Terminal characters are twice as high as they are wide so it's qr_code_height / 2,
+                 * our QR code prints an extra new line, so we have -1 as well */
+                if (row + qr_code_height > tty_height)
+                        row = tty_height - (qr_code_height / 2 ) - 1;
+
+                if (header) {
+                        r = set_terminal_cursor_position(fd, row - 2, tty_width - qr_code_width - 2);
+                        if (r < 0)
+                                log_warning_errno(r, "Failed to move terminal cursor position, ignoring: %m");
+
+                        fprintf(out, "%s:\n\n", header);
+                }
+        } else
+                if (header)
+                        fprintf(out, "\n%s:\n\n", header);
+
+        write_qrcode(out, qr, row, column);
+
+        fputc('\n', out);
+
+        sym_QRcode_free(qr);
+        return 0;
+}
+#endif
diff --git a/src/shared/qrcode-util.h b/src/shared/qrcode-util.h
new file mode 100644
index 0000000..ee58294
--- /dev/null
+++ b/src/shared/qrcode-util.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#pragma once
+#include <stdio.h>
+#include <errno.h>
+#include <limits.h>
+
+#if HAVE_QRENCODE
+/* Loads libqrencode at runtime; returns a negative errno-style value if that fails */
+int dlopen_qrencode(void);
+
+/* Prints 'string' as QR code to 'out'. If both 'row' and 'column' differ from UINT_MAX the code is placed at
+ * that terminal position, fitted into a terminal of tty_width × tty_height cells. */
+int print_qrcode_full(FILE *out, const char *header, const char *string, unsigned row, unsigned column, unsigned tty_width, unsigned tty_height);
+/* Same, but without positioning: the code is printed at the current cursor position */
+static inline int print_qrcode(FILE *out, const char *header, const char *string) {
+ return print_qrcode_full(out, header, string, UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX);
+}
+#else
+/* Built without libqrencode support: both calls report -EOPNOTSUPP and print nothing */
+static inline int print_qrcode_full(FILE *out, const char *header, const char *string, unsigned row, unsigned column, unsigned tty_width, unsigned tty_height) {
+ return -EOPNOTSUPP;
+}
+static inline int print_qrcode(FILE *out, const char *header, const char *string) {
+ return -EOPNOTSUPP;
+}
+#endif
diff --git a/src/shared/quota-util.c b/src/shared/quota-util.c
new file mode 100644
index 0000000..4d014f8
--- /dev/null
+++ b/src/shared/quota-util.c
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/quota.h>
+#include <sys/stat.h>
+
+#include "alloc-util.h"
+#include "blockdev-util.h"
+#include "device-util.h"
+#include "quota-util.h"
+
+/* Like quotactl(), but identifies the block device by its dev_t instead of the path of its device node, and
+ * takes the address as void* rather than caddr_t. Returns 0 on success, -errno if quotactl() fails, or the
+ * error of devname_from_devnum() if no device node path can be determined. */
+int quotactl_devnum(int cmd, dev_t devnum, int id, void *addr) {
+        _cleanup_free_ char *node_path = NULL;
+        int r;
+
+        r = devname_from_devnum(S_IFBLK, devnum, &node_path);
+        if (r < 0)
+                return r;
+
+        return quotactl(cmd, node_path, id, addr) < 0 ? -errno : 0;
+}
+
+/* Like quotactl(), but takes the path of some file system object and operates on the file system backing it,
+ * i.e. 'path' is supposed to be a regular file system object rather than a block device node. Returns -ENODEV
+ * if get_block_device() reports no backing block device (device number 0). */
+int quotactl_path(int cmd, const char *path, int id, void *addr) {
+        dev_t backing;
+        int r;
+
+        r = get_block_device(path, &backing);
+        if (r < 0)
+                return r;
+
+        /* Doesn't have a block device? */
+        return backing != 0 ? quotactl_devnum(cmd, backing, id, addr) : -ENODEV;
+}
diff --git a/src/shared/quota-util.h b/src/shared/quota-util.h
new file mode 100644
index 0000000..14a390e
--- /dev/null
+++ b/src/shared/quota-util.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <inttypes.h>
+#include <sys/quota.h>
+#include <sys/types.h>
+
+/* Wrapper around the QCMD() macro of linux/quota.h that hides some undefined behaviour. The value of a typical
+ * quota command such as QCMD(Q_GETQUOTA, USRQUOTA) cannot be represented on platforms where "int" is 32-bit,
+ * as it is larger than INT_MAX. Yikes, because that is basically every platform Linux supports. Let's add a
+ * wrapper that explicitly takes its arguments as unsigned 32-bit, and then converts the shift result
+ * explicitly to int, acknowledging the undefined behaviour of the kernel headers. This doesn't remove the
+ * undefined behaviour, but it stops ubsan from complaining about it. */
+static inline int QCMD_FIXED(uint32_t cmd, uint32_t type) {
+ return (int) QCMD(cmd, type);
+}
+
+int quotactl_devnum(int cmd, dev_t devnum, int id, void *addr);
+int quotactl_path(int cmd, const char *path, int id, void *addr);
diff --git a/src/shared/reboot-util.c b/src/shared/reboot-util.c
new file mode 100644
index 0000000..62ff697
--- /dev/null
+++ b/src/shared/reboot-util.c
@@ -0,0 +1,196 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <stdint.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#if HAVE_XENCTRL
+#define __XEN_INTERFACE_VERSION__ 0x00040900
+#include <xen/xen.h>
+#include <xen/kexec.h>
+#include <xen/sys/privcmd.h>
+#endif
+
+#include "alloc-util.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "log.h"
+#include "proc-cmdline.h"
+#include "raw-reboot.h"
+#include "reboot-util.h"
+#include "string-util.h"
+#include "umask-util.h"
+#include "virt.h"
+
+/* Updates /run/systemd/reboot-param. A non-empty 'parameter' is written to the file (atomically, under umask
+ * 0022). An empty or NULL 'parameter' removes the file, unless 'keep' is true, in which case the file is left
+ * untouched. A file that is already absent is not an error. Returns 0 on success, otherwise a negative
+ * errno-style value after logging a warning. */
+int update_reboot_parameter_and_warn(const char *parameter, bool keep) {
+        int r;
+
+        if (!isempty(parameter)) {
+                WITH_UMASK(0022) {
+                        r = write_string_file("/run/systemd/reboot-param", parameter,
+                                              WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_ATOMIC);
+                        if (r < 0)
+                                return log_warning_errno(r, "Failed to write reboot parameter file: %m");
+                }
+
+                return 0;
+        }
+
+        /* Nothing to write. Shall we leave an existing file alone? */
+        if (keep)
+                return 0;
+
+        if (unlink("/run/systemd/reboot-param") < 0 && errno != ENOENT)
+                return log_warning_errno(errno, "Failed to unlink reboot parameter file: %m");
+
+        return 0;
+}
+
+int read_reboot_parameter(char **parameter) {
+ int r;
+
+ assert(parameter);
+
+ r = read_one_line_file("/run/systemd/reboot-param", parameter);
+ if (r < 0 && r != -ENOENT)
+ return log_debug_errno(r, "Failed to read /run/systemd/reboot-param: %m");
+
+ return 0;
+}
+
+int reboot_with_parameter(RebootFlags flags) {
+ int r;
+
+ /* Reboots the system with a parameter that is read from /run/systemd/reboot-param. Returns 0 if
+ * REBOOT_DRY_RUN was set and the actual reboot operation was hence skipped. If REBOOT_FALLBACK is
+ * set and the reboot with parameter doesn't work out a fallback to classic reboot() is attempted. If
+ * REBOOT_FALLBACK is not set, 0 is returned instead, which should be considered indication for the
+ * caller to fall back to reboot() on its own, or somehow else deal with this. If REBOOT_LOG is
+ * specified will log about what it is going to do, as well as all errors. */
+
+ /* The parameter is only honoured if detect_container() returns 0; any other result (including
+ * errors) skips straight to the fallback logic below. */
+ if (detect_container() == 0) {
+ _cleanup_free_ char *parameter = NULL;
+
+ r = read_one_line_file("/run/systemd/reboot-param", &parameter);
+ if (r < 0 && r != -ENOENT)
+ log_full_errno(flags & REBOOT_LOG ? LOG_WARNING : LOG_DEBUG, r,
+ "Failed to read reboot parameter file, ignoring: %m");
+
+ if (!isempty(parameter)) {
+ log_full(flags & REBOOT_LOG ? LOG_INFO : LOG_DEBUG,
+ "Rebooting with argument '%s'.", parameter);
+
+ if (flags & REBOOT_DRY_RUN)
+ return 0;
+
+ (void) raw_reboot(LINUX_REBOOT_CMD_RESTART2, parameter);
+
+ /* Only reached if raw_reboot() returned, i.e. the reboot did not happen */
+ log_full_errno(flags & REBOOT_LOG ? LOG_WARNING : LOG_DEBUG, errno,
+ "Failed to reboot with parameter, retrying without: %m");
+ }
+ }
+
+ if (!(flags & REBOOT_FALLBACK))
+ return 0;
+
+ log_full(flags & REBOOT_LOG ? LOG_INFO : LOG_DEBUG, "Rebooting.");
+
+ if (flags & REBOOT_DRY_RUN)
+ return 0;
+
+ (void) reboot(RB_AUTOBOOT);
+
+ /* Only reached if reboot() returned, i.e. failed */
+ return log_full_errno(flags & REBOOT_LOG ? LOG_ERR : LOG_DEBUG, errno, "Failed to reboot: %m");
+}
+
+/* Returns the value of the systemd.restore_state= kernel command line switch. Defaults to true if the switch
+ * is not specified or cannot be parsed. The result is determined once and then served from a static cache. */
+bool shall_restore_state(void) {
+        static int cached = -1;
+
+        if (cached < 0) {
+                bool restore = true; /* If nothing specified or the check fails, then defaults to true. */
+                int r;
+
+                r = proc_cmdline_get_bool("systemd.restore_state", PROC_CMDLINE_TRUE_WHEN_MISSING, &restore);
+                if (r < 0)
+                        log_debug_errno(r, "Failed to parse systemd.restore_state= kernel command line option, ignoring: %m");
+
+                cached = restore;
+        }
+
+        return cached;
+}
+
+/* Asks the Xen hypervisor for the kexec status by issuing a KEXEC_CMD_kexec_status hypercall through
+ * /dev/xen/privcmd. Returns the non-negative result of the hypercall ioctl on success (kexec_loaded()
+ * interprets it as boolean), -EOPNOTSUPP if built without Xen support or if /proc/xen does not exist, or
+ * another negative errno-style value on failure. */
+static int xen_kexec_loaded(void) {
+#if HAVE_XENCTRL
+ _cleanup_close_ int privcmd_fd = -EBADF, buf_fd = -EBADF;
+ xen_kexec_status_t *buffer;
+ size_t size;
+ int r;
+
+ if (access("/proc/xen", F_OK) < 0) {
+ if (errno == ENOENT)
+ return -EOPNOTSUPP;
+ return log_debug_errno(errno, "Unable to test whether /proc/xen exists: %m");
+ }
+
+ /* The argument buffer is a single page, hence the structure has to fit into one */
+ size = page_size();
+ if (sizeof(xen_kexec_status_t) > size)
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "page_size is too small for hypercall");
+
+ privcmd_fd = open("/dev/xen/privcmd", O_RDWR|O_CLOEXEC);
+ if (privcmd_fd < 0)
+ return log_debug_errno(errno, "Cannot access /dev/xen/privcmd: %m");
+
+ buf_fd = open("/dev/xen/hypercall", O_RDWR|O_CLOEXEC);
+ if (buf_fd < 0)
+ return log_debug_errno(errno, "Cannot access /dev/xen/hypercall: %m");
+
+ /* The hypercall argument lives in memory mapped from /dev/xen/hypercall */
+ buffer = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, buf_fd, 0);
+ if (buffer == MAP_FAILED)
+ return log_debug_errno(errno, "Cannot allocate buffer for hypercall: %m");
+
+ *buffer = (xen_kexec_status_t) {
+ .type = KEXEC_TYPE_DEFAULT,
+ };
+
+ privcmd_hypercall_t call = {
+ .op = __HYPERVISOR_kexec_op,
+ .arg = {
+ KEXEC_CMD_kexec_status,
+ PTR_TO_UINT64(buffer),
+ },
+ };
+
+ r = RET_NERRNO(ioctl(privcmd_fd, IOCTL_PRIVCMD_HYPERCALL, &call));
+ if (r < 0)
+ log_debug_errno(r, "kexec_status failed: %m");
+
+ /* Unmap on both the success and the failure path; the fds are closed by _cleanup_close_ */
+ munmap(buffer, size);
+
+ return r;
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
+/* Returns true if a kexec kernel is loaded. Xen is asked first; if that yields no answer, the first line of
+ * /sys/kernel/kexec_loaded is checked for a leading '1'. Any failure to read the file counts as "not
+ * loaded", and is logged at debug level unless the file simply does not exist. */
+bool kexec_loaded(void) {
+        _cleanup_free_ char *line = NULL;
+        int r;
+
+        r = xen_kexec_loaded();
+        if (r >= 0)
+                return r;
+
+        r = read_one_line_file("/sys/kernel/kexec_loaded", &line);
+        if (r >= 0)
+                return line[0] == '1';
+
+        if (r != -ENOENT)
+                log_debug_errno(r, "Unable to read /sys/kernel/kexec_loaded, ignoring: %m");
+
+        return false;
+}
diff --git a/src/shared/reboot-util.h b/src/shared/reboot-util.h
new file mode 100644
index 0000000..ccd15c7
--- /dev/null
+++ b/src/shared/reboot-util.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+int update_reboot_parameter_and_warn(const char *parameter, bool keep);
+
+typedef enum RebootFlags {
+ REBOOT_LOG = 1 << 0, /* log about what we are going to do and all errors */
+ REBOOT_DRY_RUN = 1 << 1, /* return 0 right before actually doing the reboot */
+ REBOOT_FALLBACK = 1 << 2, /* fall back to plain reboot() if argument-based reboot doesn't work, isn't configured or doesn't apply otherwise */
+} RebootFlags;
+
+int read_reboot_parameter(char **parameter);
+int reboot_with_parameter(RebootFlags flags);
+
+bool shall_restore_state(void);
+
+bool kexec_loaded(void);
diff --git a/src/shared/recovery-key.c b/src/shared/recovery-key.c
new file mode 100644
index 0000000..6a2f4d0
--- /dev/null
+++ b/src/shared/recovery-key.c
@@ -0,0 +1,109 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "memory-util.h"
+#include "random-util.h"
+#include "recovery-key.h"
+
+/* The 16 lowercase modhex digits. The array index is the 4-bit value a digit stands for, i.e.
+ * make_recovery_key() looks up each nibble here and decode_modhex_char() returns the index. */
+const char modhex_alphabet[16] = {
+ 'c', 'b', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'n', 'r', 't', 'u', 'v'
+};
+
+int decode_modhex_char(char x) {
+        size_t idx = 0;
+
+        /* Maps a single modhex digit to its value, i.e. its index in modhex_alphabet[]. Both the lowercase
+         * digit and the character 32 below it (its uppercase form) are accepted. Returns -EINVAL for
+         * anything else. */
+
+        while (idx < ELEMENTSOF(modhex_alphabet)) {
+                char lower = modhex_alphabet[idx];
+
+                if (x == lower || x == lower - 32)
+                        return idx;
+
+                idx++;
+        }
+
+        return -EINVAL;
+}
+
+int normalize_recovery_key(const char *password, char **ret) {
+        _cleanup_(erase_and_freep) char *mangled = NULL;
+        bool with_dashes;
+        size_t l, out = 0;
+
+        /* Converts a recovery key typed by the user into the canonical form: lowercase modhex digits with a
+         * dash after every 8 digits. The input may be given either with or without the dashes, and in
+         * either case. Returns -EINVAL if the length or any character doesn't fit, -ENOMEM on allocation
+         * failure, 0 on success. The buffer is erased before it is freed on the error paths. */
+
+        assert(password);
+        assert(ret);
+
+        l = strlen(password);
+        if (!IN_SET(l,
+                    RECOVERY_KEY_MODHEX_RAW_LENGTH*2,          /* syntax without dashes */
+                    RECOVERY_KEY_MODHEX_FORMATTED_LENGTH-1))   /* syntax with dashes */
+                return -EINVAL;
+
+        /* Only two lengths pass the check above, hence anything that is not dash-less has dashes */
+        with_dashes = l != RECOVERY_KEY_MODHEX_RAW_LENGTH*2;
+        if (with_dashes)
+                assert(l == RECOVERY_KEY_MODHEX_FORMATTED_LENGTH-1);
+
+        mangled = new(char, RECOVERY_KEY_MODHEX_FORMATTED_LENGTH);
+        if (!mangled)
+                return -ENOMEM;
+
+        for (size_t byte = 0; byte < RECOVERY_KEY_MODHEX_RAW_LENGTH; byte++) {
+                size_t in = byte * 2; /* offset of this byte's first digit in the input */
+                int hi, lo;
+
+                if (with_dashes) {
+                        /* One dash precedes every full group of four bytes */
+                        in += byte / 4;
+
+                        /* At the start of each group but the first, the previous char must be a dash */
+                        if (byte > 0 && byte % 4 == 0 && password[in-1] != '-')
+                                return -EINVAL;
+                }
+
+                hi = decode_modhex_char(password[in]);
+                if (hi < 0)
+                        return -EINVAL;
+
+                lo = decode_modhex_char(password[in+1]);
+                if (lo < 0)
+                        return -EINVAL;
+
+                /* Re-encode through the table, which yields the lowercase spelling */
+                mangled[out++] = modhex_alphabet[hi];
+                mangled[out++] = modhex_alphabet[lo];
+
+                if (byte % 4 == 3)
+                        mangled[out++] = '-';
+        }
+
+        /* The last group's dash landed in the final slot; turn it into the terminating NUL */
+        mangled[RECOVERY_KEY_MODHEX_FORMATTED_LENGTH-1] = 0;
+
+        *ret = TAKE_PTR(mangled);
+        return 0;
+}
+
+int make_recovery_key(char **ret) {
+        _cleanup_(erase_and_freep) char *text = NULL;
+        _cleanup_(erase_and_freep) uint8_t *raw = NULL;
+        size_t pos = 0;
+        int r;
+
+        /* Generates RECOVERY_KEY_MODHEX_RAW_LENGTH random bytes and returns them as a newly allocated,
+         * NUL-terminated modhex string with a dash between every group of 8 digits. Both the raw bytes
+         * and, on failure, the formatted string are erased before being freed. */
+
+        assert(ret);
+
+        raw = new(uint8_t, RECOVERY_KEY_MODHEX_RAW_LENGTH);
+        if (!raw)
+                return -ENOMEM;
+
+        r = crypto_random_bytes(raw, RECOVERY_KEY_MODHEX_RAW_LENGTH);
+        if (r < 0)
+                return r;
+
+        text = new(char, RECOVERY_KEY_MODHEX_FORMATTED_LENGTH);
+        if (!text)
+                return -ENOMEM;
+
+        for (size_t i = 0; i < RECOVERY_KEY_MODHEX_RAW_LENGTH; i++) {
+                uint8_t byte = raw[i];
+
+                /* High nibble first, then low nibble */
+                text[pos++] = modhex_alphabet[byte >> 4];
+                text[pos++] = modhex_alphabet[byte & 0xF];
+
+                /* Four bytes, i.e. eight digits, complete a group */
+                if ((i + 1) % 4 == 0)
+                        text[pos++] = '-';
+        }
+
+        /* The buffer is exactly full now and ends in a dash, which becomes the NUL terminator */
+        assert(pos == RECOVERY_KEY_MODHEX_FORMATTED_LENGTH);
+        assert(text[RECOVERY_KEY_MODHEX_FORMATTED_LENGTH-1] == '-');
+        text[RECOVERY_KEY_MODHEX_FORMATTED_LENGTH-1] = 0;
+
+        *ret = TAKE_PTR(text);
+        return 0;
+}
diff --git a/src/shared/recovery-key.h b/src/shared/recovery-key.h
new file mode 100644
index 0000000..68e8051
--- /dev/null
+++ b/src/shared/recovery-key.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+/* 256 bit keys = 32 bytes */
+#define RECOVERY_KEY_MODHEX_RAW_LENGTH 32
+
+/* Formatted as sequences of 64 modhex characters, with dashes inserted after multiples of 8 chars (incl. trailing NUL) */
+#define RECOVERY_KEY_MODHEX_FORMATTED_LENGTH (RECOVERY_KEY_MODHEX_RAW_LENGTH*2/8*9)
+
+int make_recovery_key(char **ret);
+
+extern const char modhex_alphabet[16];
+
+int decode_modhex_char(char x);
+
+int normalize_recovery_key(const char *password, char **ret);
diff --git a/src/shared/resize-fs.c b/src/shared/resize-fs.c
new file mode 100644
index 0000000..178aefa
--- /dev/null
+++ b/src/shared/resize-fs.c
@@ -0,0 +1,126 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <linux/btrfs.h>
+#include <linux/magic.h>
+#include <sys/ioctl.h>
+#include <sys/vfs.h>
+
+#include "blockdev-util.h"
+#include "fs-util.h"
+#include "missing_fs.h"
+#include "missing_magic.h"
+#include "missing_xfs.h"
+#include "resize-fs.h"
+#include "stat-util.h"
+
+int resize_fs(int fd, uint64_t sz, uint64_t *ret_size) {
+        struct statfs sfs;
+
+        /* Resizes the ext4, btrfs or xfs file system that fd refers to, to sz bytes, rounded down to a
+         * multiple of the block size. If ret_size is non-NULL the size actually requested from the kernel
+         * is stored there. Returns -ERANGE if sz is zero, UINT64_MAX or below the per-file-system minimum,
+         * -EOPNOTSUPP for any other file system type, and -errno if a system call fails. */
+
+        assert(fd >= 0);
+
+        if (sz <= 0 || sz == UINT64_MAX)
+                return -ERANGE;
+
+        if (fstatfs(fd, &sfs) < 0)
+                return -errno;
+
+        if (is_fs_type(&sfs, EXT4_SUPER_MAGIC)) {
+                uint64_t n_blocks;
+
+                if (sz < EXT4_MINIMAL_SIZE)
+                        return -ERANGE;
+
+                /* The ext4 ioctl takes the new size as a block count */
+                n_blocks = sz / sfs.f_bsize;
+
+                if (ioctl(fd, EXT4_IOC_RESIZE_FS, &n_blocks) < 0)
+                        return -errno;
+
+                if (ret_size)
+                        *ret_size = n_blocks * sfs.f_bsize;
+
+                return 0;
+        }
+
+        if (is_fs_type(&sfs, BTRFS_SUPER_MAGIC)) {
+                struct btrfs_ioctl_vol_args args = {};
+
+                /* 256M is the minimum size enforced by the btrfs kernel code when resizing (which is
+                 * strange btw, as mkfs.btrfs is fine creating file systems > 109M). The kernel would
+                 * return EINVAL in that case, hence catch it beforehand and report a more explanatory
+                 * error. */
+                if (sz < BTRFS_MINIMAL_SIZE)
+                        return -ERANGE;
+
+                sz -= sz % sfs.f_bsize;
+
+                /* The btrfs ioctl takes the new size in bytes, formatted as a decimal string */
+                xsprintf(args.name, "%" PRIu64, sz);
+
+                if (ioctl(fd, BTRFS_IOC_RESIZE, &args) < 0)
+                        return -errno;
+
+                if (ret_size)
+                        *ret_size = sz;
+
+                return 0;
+        }
+
+        if (is_fs_type(&sfs, XFS_SB_MAGIC)) {
+                xfs_fsop_geom_t geo;
+                xfs_growfs_data_t grow;
+
+                if (sz < XFS_MINIMAL_SIZE)
+                        return -ERANGE;
+
+                /* Read the current geometry for the block size, and to carry imaxpct over unchanged */
+                if (ioctl(fd, XFS_IOC_FSGEOMETRY, &geo) < 0)
+                        return -errno;
+
+                grow = (xfs_growfs_data_t) {
+                        .imaxpct = geo.imaxpct,
+                        .newblocks = sz / geo.blocksize,
+                };
+
+                if (ioctl(fd, XFS_IOC_FSGROWFSDATA, &grow) < 0)
+                        return -errno;
+
+                if (ret_size)
+                        *ret_size = grow.newblocks * geo.blocksize;
+
+                return 0;
+        }
+
+        return -EOPNOTSUPP;
+}
+
+uint64_t minimal_size_by_fs_magic(statfs_f_type_t magic) {
+
+        /* Returns the smallest size resize_fs() accepts for the file system type identified by its statfs
+         * magic, or UINT64_MAX if the type is not one of ext4, xfs, btrfs. */
+
+        if (magic == (statfs_f_type_t) EXT4_SUPER_MAGIC)
+                return EXT4_MINIMAL_SIZE;
+
+        if (magic == (statfs_f_type_t) XFS_SB_MAGIC)
+                return XFS_MINIMAL_SIZE;
+
+        if (magic == (statfs_f_type_t) BTRFS_SUPER_MAGIC)
+                return BTRFS_MINIMAL_SIZE;
+
+        return UINT64_MAX;
+}
+
+uint64_t minimal_size_by_fs_name(const char *name) {
+        uint64_t minimum = UINT64_MAX; /* the answer for every name we don't know */
+
+        /* Same as minimal_size_by_fs_magic(), but keyed by the file system type name. */
+
+        if (streq_ptr(name, "ext4"))
+                minimum = EXT4_MINIMAL_SIZE;
+        else if (streq_ptr(name, "xfs"))
+                minimum = XFS_MINIMAL_SIZE;
+        else if (streq_ptr(name, "btrfs"))
+                minimum = BTRFS_MINIMAL_SIZE;
+
+        return minimum;
+}
+
+/* Tells whether the file system with the given statfs magic can be both shrunk and grown while mounted.
+ * btrfs is the only type this returns true for. */
+bool fs_can_online_shrink_and_grow(statfs_f_type_t magic) {
+        const statfs_f_type_t btrfs = (statfs_f_type_t) BTRFS_SUPER_MAGIC;
+
+        return magic == btrfs;
+}
diff --git a/src/shared/resize-fs.h b/src/shared/resize-fs.h
new file mode 100644
index 0000000..b40943c
--- /dev/null
+++ b/src/shared/resize-fs.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <inttypes.h>
+
+#include "stat-util.h"
+
+int resize_fs(int fd, uint64_t sz, uint64_t *ret_size);
+
+#define BTRFS_MINIMAL_SIZE (256U*1024U*1024U)
+#define XFS_MINIMAL_SIZE (300U*1024U*1024U)
+#define EXT4_MINIMAL_SIZE (1024U*1024U)
+
+uint64_t minimal_size_by_fs_magic(statfs_f_type_t magic);
+uint64_t minimal_size_by_fs_name(const char *str);
+
+bool fs_can_online_shrink_and_grow(statfs_f_type_t magic);
diff --git a/src/shared/resolve-util.c b/src/shared/resolve-util.c
new file mode 100644
index 0000000..820f9bb
--- /dev/null
+++ b/src/shared/resolve-util.c
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "conf-parser.h"
+#include "resolve-util.h"
+#include "string-table.h"
+
+DEFINE_CONFIG_PARSE_ENUM(config_parse_resolve_support, resolve_support, ResolveSupport, "Failed to parse resolve support setting");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_dnssec_mode, dnssec_mode, DnssecMode, "Failed to parse DNSSEC mode setting");
+DEFINE_CONFIG_PARSE_ENUM(config_parse_dns_over_tls_mode, dns_over_tls_mode, DnsOverTlsMode, "Failed to parse DNS-over-TLS mode setting");
+
+static const char* const resolve_support_table[_RESOLVE_SUPPORT_MAX] = {
+ [RESOLVE_SUPPORT_NO] = "no",
+ [RESOLVE_SUPPORT_YES] = "yes",
+ [RESOLVE_SUPPORT_RESOLVE] = "resolve",
+};
+DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(resolve_support, ResolveSupport, RESOLVE_SUPPORT_YES);
+
+static const char* const dnssec_mode_table[_DNSSEC_MODE_MAX] = {
+ [DNSSEC_NO] = "no",
+ [DNSSEC_ALLOW_DOWNGRADE] = "allow-downgrade",
+ [DNSSEC_YES] = "yes",
+};
+DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(dnssec_mode, DnssecMode, DNSSEC_YES);
+
+static const char* const dns_over_tls_mode_table[_DNS_OVER_TLS_MODE_MAX] = {
+ [DNS_OVER_TLS_NO] = "no",
+ [DNS_OVER_TLS_OPPORTUNISTIC] = "opportunistic",
+ [DNS_OVER_TLS_YES] = "yes",
+};
+DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(dns_over_tls_mode, DnsOverTlsMode, DNS_OVER_TLS_YES);
+
+bool dns_server_address_valid(int family, const union in_addr_union *sa) {
+
+        /* Decides whether an address may be used as DNS server: the all-zero address is refused, and so
+         * are 127.0.0.53 and 127.0.0.54, since those are our own DNS stub listeners. */
+
+        if (!in_addr_is_set(family, sa))
+                return false;
+
+        /* The stub constants are in host byte order, hence convert before comparing */
+        return !(family == AF_INET &&
+                 IN_SET(be32toh(sa->in.s_addr), INADDR_DNS_STUB, INADDR_DNS_PROXY_STUB));
+}
+
+DEFINE_CONFIG_PARSE_ENUM(config_parse_dns_cache_mode, dns_cache_mode, DnsCacheMode, "Failed to parse DNS cache mode setting")
+
+static const char* const dns_cache_mode_table[_DNS_CACHE_MODE_MAX] = {
+ [DNS_CACHE_MODE_YES] = "yes",
+ [DNS_CACHE_MODE_NO] = "no",
+ [DNS_CACHE_MODE_NO_NEGATIVE] = "no-negative",
+};
+DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(dns_cache_mode, DnsCacheMode, DNS_CACHE_MODE_YES);
diff --git a/src/shared/resolve-util.h b/src/shared/resolve-util.h
new file mode 100644
index 0000000..2d210f9
--- /dev/null
+++ b/src/shared/resolve-util.h
@@ -0,0 +1,99 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "conf-parser.h"
+#include "in-addr-util.h"
+#include "macro.h"
+
+/* 127.0.0.53 in native endian (The IP address we listen on with the full DNS stub, i.e. that does LLMNR/mDNS, and stuff) */
+#define INADDR_DNS_STUB ((in_addr_t) 0x7f000035U)
+
+/* 127.0.0.54 in native endian (The IP address we listen on where we only implement "proxy" mode) */
+#define INADDR_DNS_PROXY_STUB ((in_addr_t) 0x7f000036U)
+
+/* 127.0.0.2 is an address we always map to the local hostname. This is different from 127.0.0.1 which maps to "localhost" */
+#define INADDR_LOCALADDRESS ((in_addr_t) 0x7f000002U)
+
+typedef enum DnsCacheMode DnsCacheMode;
+
+enum DnsCacheMode {
+ DNS_CACHE_MODE_NO,
+ DNS_CACHE_MODE_YES,
+ DNS_CACHE_MODE_NO_NEGATIVE,
+ _DNS_CACHE_MODE_MAX,
+ _DNS_CACHE_MODE_INVALID = -EINVAL,
+};
+
+typedef enum ResolveSupport ResolveSupport;
+typedef enum DnssecMode DnssecMode;
+typedef enum DnsOverTlsMode DnsOverTlsMode;
+
+/* Do not change the order, see link_get_llmnr_support() or link_get_mdns_support(). */
+enum ResolveSupport {
+ RESOLVE_SUPPORT_NO,
+ RESOLVE_SUPPORT_RESOLVE,
+ RESOLVE_SUPPORT_YES,
+ _RESOLVE_SUPPORT_MAX,
+ _RESOLVE_SUPPORT_INVALID = -EINVAL,
+};
+
+enum DnssecMode {
+ /* No DNSSEC validation is done */
+ DNSSEC_NO,
+
+ /* Validate locally, if the server knows DO, but if not,
+ * don't. Don't trust the AD bit. If the server doesn't do
+ * DNSSEC properly, downgrade to non-DNSSEC operation. Of
+ * course, we then are vulnerable to a downgrade attack, but
+ * that's life and what is configured. */
+ DNSSEC_ALLOW_DOWNGRADE,
+
+ /* Insist on DNSSEC server support, and rather fail than downgrading. */
+ DNSSEC_YES,
+
+ _DNSSEC_MODE_MAX,
+ _DNSSEC_MODE_INVALID = -EINVAL,
+};
+
+enum DnsOverTlsMode {
+ /* No connection is made for DNS-over-TLS */
+ DNS_OVER_TLS_NO,
+
+ /* Try to connect using DNS-over-TLS, but if connection fails,
+ * fall back to using an unencrypted connection */
+ DNS_OVER_TLS_OPPORTUNISTIC,
+
+ /* Enforce DNS-over-TLS and require valid server certificates */
+ DNS_OVER_TLS_YES,
+
+ _DNS_OVER_TLS_MODE_MAX,
+ _DNS_OVER_TLS_MODE_INVALID = -EINVAL,
+};
+
+CONFIG_PARSER_PROTOTYPE(config_parse_resolve_support);
+CONFIG_PARSER_PROTOTYPE(config_parse_dnssec_mode);
+CONFIG_PARSER_PROTOTYPE(config_parse_dns_over_tls_mode);
+CONFIG_PARSER_PROTOTYPE(config_parse_dns_cache_mode);
+
+const char* resolve_support_to_string(ResolveSupport p) _const_;
+ResolveSupport resolve_support_from_string(const char *s) _pure_;
+
+const char* dnssec_mode_to_string(DnssecMode p) _const_;
+DnssecMode dnssec_mode_from_string(const char *s) _pure_;
+
+const char* dns_over_tls_mode_to_string(DnsOverTlsMode p) _const_;
+DnsOverTlsMode dns_over_tls_mode_from_string(const char *s) _pure_;
+
+bool dns_server_address_valid(int family, const union in_addr_union *sa);
+
+const char* dns_cache_mode_to_string(DnsCacheMode p) _const_;
+DnsCacheMode dns_cache_mode_from_string(const char *s) _pure_;
+
+/* A resolv.conf file containing the DNS server and domain data we learnt from uplink, i.e. the full uplink data */
+#define PRIVATE_UPLINK_RESOLV_CONF "/run/systemd/resolve/resolv.conf"
+
+/* A resolv.conf file containing the domain data we learnt from uplink, but our own DNS server address. */
+#define PRIVATE_STUB_RESOLV_CONF "/run/systemd/resolve/stub-resolv.conf"
+
+/* A static resolv.conf file containing no domains, but only our own DNS server address */
+#define PRIVATE_STATIC_RESOLV_CONF LIBEXECDIR "/resolv.conf"
diff --git a/src/shared/rm-rf.c b/src/shared/rm-rf.c
new file mode 100644
index 0000000..4664215
--- /dev/null
+++ b/src/shared/rm-rf.c
@@ -0,0 +1,519 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "btrfs-util.h"
+#include "cgroup-util.h"
+#include "dirent-util.h"
+#include "fd-util.h"
+#include "fs-util.h"
+#include "log.h"
+#include "macro.h"
+#include "mountpoint-util.h"
+#include "path-util.h"
+#include "rm-rf.h"
+#include "stat-util.h"
+#include "string-util.h"
+
+/* A file system counts as "physical" unless it is a temporary one (tmpfs/ramfs) or cgroupfs. cgroupfs is
+ * grouped with tmpfs because it behaves similarly for our purposes: arbitrary directory hierarchies can be
+ * created in it, so rm_rf() is a legitimate way to remove them again. */
+static bool is_physical_fs(const struct statfs *sfs) {
+        return !(is_temporary_fs(sfs) || is_cgroup_fs(sfs));
+}
+
+static int patch_dirfd_mode(
+                int dfd,
+                bool refuse_already_set,
+                mode_t *ret_old_mode) {
+
+        struct stat info;
+        int k;
+
+        /* Makes sure the directory dfd refers to has all of the owner's rwx bits (0700) set, and stores
+         * its previous mode in *ret_old_mode.
+         *
+         * Returns 1 if the mode had to be changed, 0 if the bits were already set and refuse_already_set
+         * is false. Fails with -EACCES if the bits were already set and refuse_already_set is true, or if
+         * the directory is not owned by our effective UID; with -ENOTDIR if dfd is not a directory. */
+
+        assert(dfd >= 0);
+        assert(ret_old_mode);
+
+        if (fstat(dfd, &info) < 0)
+                return -errno;
+        if (!S_ISDIR(info.st_mode))
+                return -ENOTDIR;
+
+        if (!FLAGS_SET(info.st_mode, 0700)) {
+                /* Bits are missing. Changing the mode only works if the UID matches ours. */
+                if (info.st_uid != geteuid())
+                        return -EACCES;
+
+                k = fchmod_opath(dfd, (info.st_mode | 0700) & 07777);
+                if (k < 0)
+                        return k;
+
+                *ret_old_mode = info.st_mode;
+                return 1;
+        }
+
+        /* Bits are already set, hence the mode cannot be what caused the caller's error. Hand the
+         * original error back if the caller asked for that. */
+        if (refuse_already_set)
+                return -EACCES;
+
+        *ret_old_mode = info.st_mode;
+        return 0;
+}
+
+int unlinkat_harder(int dfd, const char *filename, int unlink_flags, RemoveFlags remove_flags) {
+        mode_t saved_mode;
+        int k;
+
+        /* A more persistent unlinkat(): if the plain call fails with EACCES and REMOVE_CHMOD is set, the
+         * owner rwx bits are added to the directory dfd and the removal is tried once more. Useful when
+         * running unprivileged on trees where the w bit is missing somewhere. */
+
+        if (unlinkat(dfd, filename, unlink_flags) >= 0)
+                return 0;
+
+        if (errno != EACCES)
+                return -errno;
+        if (!FLAGS_SET(remove_flags, REMOVE_CHMOD))
+                return -EACCES;
+
+        k = patch_dirfd_mode(dfd, /* refuse_already_set = */ true, &saved_mode);
+        if (k < 0)
+                return k;
+
+        if (unlinkat(dfd, filename, unlink_flags) < 0) {
+                k = -errno;
+
+                /* Didn't help, hence put the original access mode back (best effort) */
+                (void) fchmod(dfd, saved_mode & 07777);
+                return k;
+        }
+
+        /* By default the relaxed mode stays in place after a success: it will be needed for the other
+         * entries too, and the whole directory is about to be destroyed anyway. Only restore on request. */
+        if (!FLAGS_SET(remove_flags, REMOVE_CHMOD_RESTORE))
+                return 0;
+
+        if (fchmod(dfd, saved_mode & 07777) < 0)
+                return -errno;
+
+        return 0;
+}
+
+int fstatat_harder(int dfd,
+                   const char *filename,
+                   struct stat *ret,
+                   int fstatat_flags,
+                   RemoveFlags remove_flags) {
+
+        mode_t saved_mode;
+        int k;
+
+        /* The fstatat() counterpart of unlinkat_harder(): on EACCES, and only if REMOVE_CHMOD is set, the
+         * owner rwx bits are added to the directory dfd and the call is repeated once. */
+
+        if (fstatat(dfd, filename, ret, fstatat_flags) >= 0)
+                return 0;
+
+        if (errno != EACCES)
+                return -errno;
+        if (!FLAGS_SET(remove_flags, REMOVE_CHMOD))
+                return -EACCES;
+
+        k = patch_dirfd_mode(dfd, /* refuse_already_set = */ true, &saved_mode);
+        if (k < 0)
+                return k;
+
+        if (fstatat(dfd, filename, ret, fstatat_flags) < 0) {
+                k = -errno;
+
+                /* Still failing, hence undo the mode change (best effort) */
+                (void) fchmod(dfd, saved_mode & 07777);
+                return k;
+        }
+
+        /* The relaxed mode is kept unless the caller asked for it to be restored */
+        if (!FLAGS_SET(remove_flags, REMOVE_CHMOD_RESTORE))
+                return 0;
+
+        if (fchmod(dfd, saved_mode & 07777) < 0)
+                return -errno;
+
+        return 0;
+}
+
+/* Opens 'path' relative to 'dfd' and returns the new file descriptor, or a negative errno-style value.
+ * If 'ret_old_mode' is non-NULL it receives the mode the inode had before this function touched it.
+ *
+ * When a directory is opened for real (O_DIRECTORY set, O_PATH not set) and REMOVE_CHMOD is requested,
+ * the directory is first opened with O_PATH, its owner rwx bits are added if missing, and only then is it
+ * reopened with the requested flags. In every other case this is a plain openat(). */
+static int openat_harder(int dfd, const char *path, int open_flags, RemoveFlags remove_flags, mode_t *ret_old_mode) {
+ _cleanup_close_ int pfd = -EBADF, fd = -EBADF;
+ bool chmod_done = false;
+ mode_t old_mode;
+ int r;
+
+ assert(dfd >= 0 || dfd == AT_FDCWD);
+ assert(path);
+
+ /* Unlike unlinkat_harder() and fstatat_harder(), this chmods the specified path itself, not the
+ * directory referenced by dfd. */
+
+ /* Simple case: no mode patching is wanted or applicable, hence open directly */
+ if (FLAGS_SET(open_flags, O_PATH) ||
+ !FLAGS_SET(open_flags, O_DIRECTORY) ||
+ !FLAGS_SET(remove_flags, REMOVE_CHMOD)) {
+
+ fd = RET_NERRNO(openat(dfd, path, open_flags));
+ if (fd < 0)
+ return fd;
+
+ if (ret_old_mode) {
+ struct stat st;
+
+ if (fstat(fd, &st) < 0)
+ return -errno;
+
+ *ret_old_mode = st.st_mode;
+ }
+
+ return TAKE_FD(fd);
+ }
+
+ /* Get an O_PATH reference first, keeping only the flags that matter for the lookup */
+ pfd = RET_NERRNO(openat(dfd, path, (open_flags & (O_CLOEXEC|O_DIRECTORY|O_NOFOLLOW)) | O_PATH));
+ if (pfd < 0)
+ return pfd;
+
+ /* REMOVE_CHMOD is always set when we get here (see the condition above), hence old_mode is always
+ * initialized below */
+ if (FLAGS_SET(remove_flags, REMOVE_CHMOD)) {
+ r = patch_dirfd_mode(pfd, /* refuse_already_set = */ false, &old_mode);
+ if (r < 0)
+ return r;
+
+ /* patch_dirfd_mode() returns > 0 only if it actually changed the mode */
+ chmod_done = r;
+ }
+
+ /* Convert the O_PATH fd into a regular one, with O_NOFOLLOW masked out of the flags */
+ fd = fd_reopen(pfd, open_flags & ~O_NOFOLLOW);
+ if (fd < 0) {
+ /* Roll back our mode change, but only if we were the ones who made it */
+ if (chmod_done)
+ (void) fchmod_opath(pfd, old_mode & 07777);
+ return fd;
+ }
+
+ if (ret_old_mode)
+ *ret_old_mode = old_mode;
+
+ return TAKE_FD(fd);
+}
+
+static int rm_rf_children_impl(
+ int fd,
+ RemoveFlags flags,
+ const struct stat *root_dev,
+ mode_t old_mode);
+
+/* Removes the single directory entry 'fname' inside the directory 'fd'.
+ *
+ * 'is_dir' is > 0 if the entry is known to be a directory, 0 if it is known not to be one, and < 0 if
+ * that is unknown and has to be determined here. If 'root_dev' is non-NULL, subdirectories on a different
+ * device than root_dev->st_dev are left alone. If 'allow_recursion' is false, directories are not
+ * descended into; -EISDIR is returned instead so that the caller can do that itself.
+ *
+ * Returns 1 if the entry was removed, 0 if it was deliberately skipped, negative errno-style on error. */
+static int rm_rf_inner_child(
+ int fd,
+ const char *fname,
+ int is_dir,
+ RemoveFlags flags,
+ const struct stat *root_dev,
+ bool allow_recursion) {
+
+ struct stat st;
+ int r, q = 0;
+
+ assert(fd >= 0);
+ assert(fname);
+
+ /* Only stat the entry if the result is needed: the type is unknown, or 'st' is consulted further
+ * down (device comparison, subvolume check). Note that the inner 'root_dev' test is redundant with
+ * the one on the line before it. 'st' stays uninitialized if this is skipped. */
+ if (is_dir < 0 ||
+ root_dev ||
+ (is_dir > 0 && (root_dev || (flags & REMOVE_SUBVOLUME)))) {
+
+ r = fstatat_harder(fd, fname, &st, AT_SYMLINK_NOFOLLOW, flags);
+ if (r < 0)
+ return r;
+
+ is_dir = S_ISDIR(st.st_mode);
+ }
+
+ if (is_dir) {
+ /* If root_dev is set, remove subdirectories only if device is same */
+ if (root_dev && st.st_dev != root_dev->st_dev)
+ return 0;
+
+ /* Stop at mount points */
+ r = fd_is_mount_point(fd, fname, 0);
+ if (r < 0)
+ return r;
+ if (r > 0)
+ return 0;
+
+ if ((flags & REMOVE_SUBVOLUME) && btrfs_might_be_subvol(&st)) {
+ /* This could be a subvolume, try to remove it */
+
+ r = btrfs_subvol_remove_at(fd, fname, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA);
+ if (r < 0) {
+ if (!IN_SET(r, -ENOTTY, -EINVAL))
+ return r;
+
+ /* ENOTTY, then it wasn't a btrfs subvolume, continue below. */
+ } else
+ /* It was a subvolume, done. */
+ return 1;
+ }
+
+ if (!allow_recursion)
+ return -EISDIR;
+
+ mode_t old_mode;
+ int subdir_fd = openat_harder(fd, fname,
+ O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW|O_NOATIME,
+ flags, &old_mode);
+ if (subdir_fd < 0)
+ return subdir_fd;
+
+ /* We pass REMOVE_PHYSICAL here, to avoid doing the fstatfs() to check the file system type
+ * again for each directory */
+ q = rm_rf_children_impl(subdir_fd, flags | REMOVE_PHYSICAL, root_dev, old_mode);
+
+ } else if (flags & REMOVE_ONLY_DIRECTORIES)
+ return 0;
+
+ /* Remove the entry itself. This is attempted even if emptying the directory failed (q < 0); an
+ * error from the removal takes precedence over the one from the recursion. */
+ r = unlinkat_harder(fd, fname, is_dir ? AT_REMOVEDIR : 0, flags);
+ if (r < 0)
+ return r;
+ if (q < 0)
+ return q;
+ return 1;
+}
+
+/* One saved level of the directory walk in rm_rf_children_impl(): the state of a directory that is put
+ * aside while one of its subdirectories is being emptied. An entry with .dir == NULL marks the end of the
+ * array, see free_todo_entries(). */
+typedef struct TodoEntry {
+ DIR *dir; /* A directory that we were operating on. */
+ char *dirname; /* The filename of that directory itself. */
+ mode_t old_mode; /* The original file mode. */
+} TodoEntry;
+
+static void free_todo_entries(TodoEntry **todos) {
+        TodoEntry *cur = *todos;
+
+        /* Releases every entry up to, but not including, the first one whose .dir is NULL, then frees
+         * the array itself and resets the caller's pointer. A NULL array is fine. */
+
+        while (cur && cur->dir) {
+                closedir(cur->dir);
+                free(cur->dirname);
+                cur++;
+        }
+
+        freep(todos);
+}
+
+/* Removes everything inside the directory 'fd' refers to, but not the directory itself.
+ *
+ * Ownership of 'fd' passes to this function: it is closed in all cases, including on failure, exactly as
+ * rm_rf_children_impl() does. If 'root_dev' is non-NULL, subdirectories on other devices are skipped.
+ * Returns 0 on success, or the first error encountered as negative errno-style value. */
+int rm_rf_children(
+                int fd,
+                RemoveFlags flags,
+                const struct stat *root_dev) {
+
+        struct stat st;
+
+        assert(fd >= 0);
+
+        /* The current mode is needed in case REMOVE_CHMOD_RESTORE is requested */
+        if (fstat(fd, &st) < 0) {
+                int r = -errno; /* save, before closing can clobber it */
+
+                /* Don't leak the fd: the caller cannot tell this failure apart from one that happens
+                 * inside rm_rf_children_impl(), where the fd is already gone. */
+                safe_close(fd);
+                return r;
+        }
+
+        return rm_rf_children_impl(fd, flags, root_dev, st.st_mode);
+}
+
+/* Removes all entries below the directory 'fd', descending into subdirectories without recursing on the C
+ * stack: whenever a subdirectory has to be entered, the state of the current directory is pushed onto the
+ * 'todos' array and the walk continues in the subdirectory; once that is empty, the parent's state is
+ * popped again and the (now empty) subdirectory is removed.
+ *
+ * 'old_mode' is the original mode of the directory 'fd' refers to; it is only used for restoring the mode
+ * if REMOVE_CHMOD_RESTORE is set. Returns 0 or the first error encountered. */
+static int rm_rf_children_impl(
+ int fd,
+ RemoveFlags flags,
+ const struct stat *root_dev,
+ mode_t old_mode) {
+
+ _cleanup_(free_todo_entries) TodoEntry *todos = NULL;
+ size_t n_todo = 0;
+ _cleanup_free_ char *dirname = NULL; /* Set when we are recursing and want to delete ourselves */
+ int ret = 0, r;
+
+ /* Return the first error we run into, but nevertheless try to go on.
+ * The passed fd is closed in all cases, including on failure. */
+
+ for (;;) { /* This loop corresponds to the directory nesting level. */
+ _cleanup_closedir_ DIR *d = NULL;
+
+ if (n_todo > 0) {
+ /* We know that we are in recursion here, because n_todo is set.
+ * We need to remove the inner directory we were operating on. */
+ assert(dirname);
+ r = unlinkat_harder(dirfd(todos[n_todo-1].dir), dirname, AT_REMOVEDIR, flags);
+ if (r < 0 && r != -ENOENT) {
+ if (ret == 0)
+ ret = r;
+
+ /* The directory survived, hence give it back its original mode if requested */
+ if (FLAGS_SET(flags, REMOVE_CHMOD_RESTORE))
+ (void) fchmodat(dirfd(todos[n_todo-1].dir), dirname, old_mode & 07777, 0);
+ }
+ dirname = mfree(dirname);
+
+ /* And now let's back out one level up */
+ n_todo --;
+ d = TAKE_PTR(todos[n_todo].dir);
+ dirname = TAKE_PTR(todos[n_todo].dirname);
+ old_mode = todos[n_todo].old_mode;
+
+ assert(d);
+ fd = dirfd(d); /* Retrieve the file descriptor from the DIR object */
+ assert(fd >= 0);
+ } else {
+ /* Entered for the top-level directory, and via goto for each subdirectory we descend into */
+ next_fd:
+ assert(fd >= 0);
+ d = fdopendir(fd);
+ if (!d) {
+ safe_close(fd);
+ return -errno;
+ }
+ fd = dirfd(d); /* We donated the fd to fdopendir(). Let's make sure we have
+ * the right descriptor even if it were to internally invalidate the
+ * one we passed. */
+
+ if (!(flags & REMOVE_PHYSICAL)) {
+ struct statfs sfs;
+
+ if (fstatfs(fd, &sfs) < 0)
+ return -errno;
+
+ if (is_physical_fs(&sfs)) {
+ /* We refuse to clean physical file systems with this call, unless
+ * explicitly requested. This is extra paranoia just to be sure we
+ * never ever remove non-state data. */
+
+ _cleanup_free_ char *path = NULL;
+
+ (void) fd_get_path(fd, &path);
+ return log_error_errno(SYNTHETIC_ERRNO(EPERM),
+ "Attempted to remove disk file system under \"%s\", and we can't allow that.",
+ strna(path));
+ }
+ }
+ }
+
+ FOREACH_DIRENT_ALL(de, d, return -errno) {
+ int is_dir;
+
+ if (dot_or_dot_dot(de->d_name))
+ continue;
+
+ /* -1 means "unknown", in which case rm_rf_inner_child() will stat the entry */
+ is_dir = de->d_type == DT_UNKNOWN ? -1 : de->d_type == DT_DIR;
+
+ /* Recursion is disallowed here: directories come back as -EISDIR and are handled below */
+ r = rm_rf_inner_child(fd, de->d_name, is_dir, flags, root_dev, false);
+ if (r == -EISDIR) {
+ /* Push the current working state onto the todo list */
+
+ /* Room for the new entry plus one zeroed entry after it, which terminates the
+ * array for free_todo_entries() */
+ if (!GREEDY_REALLOC0(todos, n_todo + 2))
+ return log_oom();
+
+ _cleanup_free_ char *newdirname = strdup(de->d_name);
+ if (!newdirname)
+ return log_oom();
+
+ mode_t mode;
+ int newfd = openat_harder(fd, de->d_name,
+ O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW|O_NOATIME,
+ flags, &mode);
+ if (newfd >= 0) {
+ todos[n_todo++] = (TodoEntry) {
+ .dir = TAKE_PTR(d),
+ .dirname = TAKE_PTR(dirname),
+ .old_mode = old_mode
+ };
+
+ /* From here on the subdirectory is the current directory */
+ fd = newfd;
+ dirname = TAKE_PTR(newdirname);
+ old_mode = mode;
+
+ goto next_fd;
+
+ } else if (newfd != -ENOENT && ret == 0)
+ ret = newfd;
+
+ } else if (r < 0 && r != -ENOENT && ret == 0)
+ ret = r;
+ }
+
+ if (FLAGS_SET(flags, REMOVE_SYNCFS) && syncfs(fd) < 0 && ret >= 0)
+ ret = -errno;
+
+ /* Nothing left on the stack means the top-level directory is done */
+ if (n_todo == 0) {
+ if (FLAGS_SET(flags, REMOVE_CHMOD_RESTORE) &&
+ fchmod(fd, old_mode & 07777) < 0 && ret >= 0)
+ ret = -errno;
+
+ break;
+ }
+ }
+
+ return ret;
+}
+
+/* Recursively removes the contents of 'path' (interpreted relative to 'dir_fd', which may be AT_FDCWD),
+ * and with REMOVE_ROOT also 'path' itself.
+ *
+ * Returns 0 on success. Fails with -EINVAL if REMOVE_ONLY_DIRECTORIES and REMOVE_SUBVOLUME are combined,
+ * and with -EPERM if asked to remove the root file system, or anything on a physical file system
+ * without REMOVE_PHYSICAL. An error from emptying the directory takes precedence over an error from
+ * removing 'path' itself. */
+int rm_rf_at(int dir_fd, const char *path, RemoveFlags flags) {
+ mode_t old_mode;
+ int fd, r, q = 0;
+
+ assert(dir_fd >= 0 || dir_fd == AT_FDCWD);
+ assert(path);
+
+ /* For now, don't support dropping subvols when also only dropping directories, since we can't do
+ * this race-freely. */
+ if (FLAGS_SET(flags, REMOVE_ONLY_DIRECTORIES|REMOVE_SUBVOLUME))
+ return -EINVAL;
+
+ /* We refuse to clean the root file system with this call. This is extra paranoia to never cause a
+ * really seriously broken system. */
+ if (path_is_root_at(dir_fd, path) > 0)
+ return log_error_errno(SYNTHETIC_ERRNO(EPERM),
+ "Attempted to remove entire root file system, and we can't allow that.");
+
+ /* This shortcut is only taken if all three flags are set */
+ if (FLAGS_SET(flags, REMOVE_SUBVOLUME | REMOVE_ROOT | REMOVE_PHYSICAL)) {
+ /* Try to remove as subvolume first */
+ r = btrfs_subvol_remove_at(dir_fd, path, BTRFS_REMOVE_RECURSIVE|BTRFS_REMOVE_QUOTA);
+ if (r >= 0)
+ return r;
+
+ if (FLAGS_SET(flags, REMOVE_MISSING_OK) && r == -ENOENT)
+ return 0;
+
+ if (!IN_SET(r, -ENOTTY, -EINVAL, -ENOTDIR))
+ return r;
+
+ /* Not btrfs or not a subvolume */
+ }
+
+ fd = openat_harder(dir_fd, path, O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW|O_NOATIME, flags, &old_mode);
+ if (fd >= 0) {
+ /* We have a dir */
+ r = rm_rf_children_impl(fd, flags, NULL, old_mode);
+
+ /* Try to remove the directory itself even if emptying it reported an error */
+ if (FLAGS_SET(flags, REMOVE_ROOT))
+ q = RET_NERRNO(unlinkat(dir_fd, path, AT_REMOVEDIR));
+ } else {
+ r = fd;
+ if (FLAGS_SET(flags, REMOVE_MISSING_OK) && r == -ENOENT)
+ return 0;
+
+ /* Anything but "could not be opened as a directory without following symlinks" is fatal */
+ if (!IN_SET(r, -ENOTDIR, -ELOOP))
+ return r;
+
+ /* It is not a directory: there are no children, and it only goes away if the caller asked for
+ * 'path' itself to be removed and did not restrict us to directories */
+ if (FLAGS_SET(flags, REMOVE_ONLY_DIRECTORIES) || !FLAGS_SET(flags, REMOVE_ROOT))
+ return 0;
+
+ if (!FLAGS_SET(flags, REMOVE_PHYSICAL)) {
+ struct statfs s;
+
+ r = xstatfsat(dir_fd, path, &s);
+ if (r < 0)
+ return r;
+ if (is_physical_fs(&s))
+ return log_error_errno(SYNTHETIC_ERRNO(EPERM),
+ "Attempted to remove files from a disk file system under \"%s\", refusing.",
+ path);
+ }
+
+ r = 0;
+ q = RET_NERRNO(unlinkat(dir_fd, path, 0));
+ }
+
+ if (r < 0)
+ return r;
+ /* The final removal hitting ENOENT is not an error if REMOVE_MISSING_OK is set */
+ if (q < 0 && (q != -ENOENT || !FLAGS_SET(flags, REMOVE_MISSING_OK)))
+ return q;
+ return 0;
+}
+
+int rm_rf_child(int fd, const char *name, RemoveFlags flags) {
+
+        /* Removes exactly one child, 'name', of the directory 'fd' refers to, recursively if the child is
+         * a directory. 'name' has to be a plain, valid file name. */
+
+        if (fd < 0)
+                return -EBADF;
+
+        if (!filename_is_valid(name))
+                return -EINVAL;
+
+        /* REMOVE_ROOT and REMOVE_MISSING_OK make no sense here, since 'fd' itself is never removed. The
+         * combination of REMOVE_ONLY_DIRECTORIES and REMOVE_SUBVOLUME is refused as well. */
+        if ((flags & (REMOVE_ROOT|REMOVE_MISSING_OK)) ||
+            FLAGS_SET(flags, REMOVE_ONLY_DIRECTORIES|REMOVE_SUBVOLUME))
+                return -EINVAL;
+
+        return rm_rf_inner_child(fd, name, /* is_dir = */ -1, flags, /* root_dev = */ NULL, /* allow_recursion = */ true);
+}
diff --git a/src/shared/rm-rf.h b/src/shared/rm-rf.h
new file mode 100644
index 0000000..6e52bbb
--- /dev/null
+++ b/src/shared/rm-rf.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <fcntl.h>
+#include <sys/stat.h>
+
+#include "alloc-util.h"
+#include "errno-util.h"
+
+typedef enum RemoveFlags {
+ REMOVE_ONLY_DIRECTORIES = 1 << 0, /* Only remove empty directories, no files */
+ REMOVE_ROOT = 1 << 1, /* Remove the specified directory itself too, not just the contents of it */
+ REMOVE_PHYSICAL = 1 << 2, /* If not set, only removes files on tmpfs, never physical file systems */
+ REMOVE_SUBVOLUME = 1 << 3, /* Drop btrfs subvolumes in the tree too */
+ REMOVE_MISSING_OK = 1 << 4, /* If the top-level directory is missing, ignore the ENOENT for it */
+ REMOVE_CHMOD = 1 << 5, /* chmod() for write access if we cannot delete or access something */
+ REMOVE_CHMOD_RESTORE = 1 << 6, /* Restore the old mode before returning */
+ REMOVE_SYNCFS = 1 << 7, /* syncfs() the root of the specified directory after removing everything in it */
+} RemoveFlags;
+
+int unlinkat_harder(int dfd, const char *filename, int unlink_flags, RemoveFlags remove_flags);
+int fstatat_harder(int dfd,
+ const char *filename,
+ struct stat *ret,
+ int fstatat_flags,
+ RemoveFlags remove_flags);
+
+/* Note: directory file descriptors passed to the functions below must be
+ * positioned at the beginning. If the fd was already used for reading, rewind it. */
+int rm_rf_children(int fd, RemoveFlags flags, const struct stat *root_dev);
+int rm_rf_child(int fd, const char *name, RemoveFlags flags);
+int rm_rf_at(int dir_fd, const char *path, RemoveFlags flags);
+/* Shortcut for rm_rf_at() that passes AT_FDCWD as the directory file descriptor. */
+static inline int rm_rf(const char *path, RemoveFlags flags) {
+ return rm_rf_at(AT_FDCWD, path, flags);
+}
+
+/* Intended for use with _cleanup_(): removes the directory tree 'p' names (the directory itself included,
+ * physical file systems allowed, a missing path tolerated), then frees the string. errno is left
+ * untouched, the result of the removal is ignored, and NULL is always returned. */
+static inline char *rm_rf_physical_and_free(char *p) {
+        PROTECT_ERRNO;
+
+        if (p)
+                (void) rm_rf(p, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_MISSING_OK|REMOVE_CHMOD);
+
+        return mfree(p);
+}
+DEFINE_TRIVIAL_CLEANUP_FUNC(char*, rm_rf_physical_and_free);
+
+/* Like rm_rf_physical_and_free(), but REMOVE_SUBVOLUME is passed too, so that btrfs subvolumes get
+ * dropped as well. */
+static inline char *rm_rf_subvolume_and_free(char *p) {
+        PROTECT_ERRNO;
+
+        if (p)
+                (void) rm_rf(p, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME|REMOVE_MISSING_OK|REMOVE_CHMOD);
+
+        return mfree(p);
+}
+DEFINE_TRIVIAL_CLEANUP_FUNC(char*, rm_rf_subvolume_and_free);
diff --git a/src/shared/seccomp-util.c b/src/shared/seccomp-util.c
new file mode 100644
index 0000000..00a8ced
--- /dev/null
+++ b/src/shared/seccomp-util.c
@@ -0,0 +1,2499 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/seccomp.h>
+#include <stddef.h>
+#include <sys/mman.h>
+#include <sys/prctl.h>
+#include <sys/shm.h>
+#include <sys/stat.h>
+
+/* include missing_syscall_def.h earlier to make __SNR_foo mapped to __NR_foo. */
+#include "missing_syscall_def.h"
+#include <seccomp.h>
+
+#include "af-list.h"
+#include "alloc-util.h"
+#include "env-util.h"
+#include "errno-list.h"
+#include "macro.h"
+#include "namespace-util.h"
+#include "nsflags.h"
+#include "nulstr-util.h"
+#include "process-util.h"
+#include "seccomp-util.h"
+#include "set.h"
+#include "string-util.h"
+#include "strv.h"
+
+/* This array will be modified at runtime as seccomp_restrict_archs is called.
+ *
+ * Exactly one of the preprocessor branches below is selected for the build target and lists the
+ * seccomp architectures for it; the list is always terminated by SECCOMP_LOCAL_ARCH_END. If no
+ * branch matches, the array consists of the terminator only.
+ *
+ * NOTE(review): the PARISC and RISCV64 branches check that the SCMP_ARCH_* constant is defined,
+ * the loongarch branch does not — confirm this matches the minimum supported libseccomp. */
+uint32_t seccomp_local_archs[] = {
+
+ /* Note: always list the native arch we are compiled as last, so that users can deny-list seccomp(), but our own calls to it still succeed */
+
+#if defined(__x86_64__) && defined(__ILP32__)
+ SCMP_ARCH_X86,
+ SCMP_ARCH_X86_64,
+ SCMP_ARCH_X32, /* native */
+#elif defined(__x86_64__) && !defined(__ILP32__)
+ SCMP_ARCH_X86,
+ SCMP_ARCH_X32,
+ SCMP_ARCH_X86_64, /* native */
+#elif defined(__i386__)
+ SCMP_ARCH_X86,
+#elif defined(__aarch64__)
+ SCMP_ARCH_ARM,
+ SCMP_ARCH_AARCH64, /* native */
+#elif defined(__arm__)
+ SCMP_ARCH_ARM,
+#elif defined(__loongarch_lp64)
+ SCMP_ARCH_LOONGARCH64,
+#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
+ SCMP_ARCH_MIPSEL,
+ SCMP_ARCH_MIPS, /* native */
+#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
+ SCMP_ARCH_MIPS,
+ SCMP_ARCH_MIPSEL, /* native */
+#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
+ SCMP_ARCH_MIPSEL,
+ SCMP_ARCH_MIPS,
+ SCMP_ARCH_MIPSEL64N32,
+ SCMP_ARCH_MIPS64N32,
+ SCMP_ARCH_MIPSEL64,
+ SCMP_ARCH_MIPS64, /* native */
+#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
+ SCMP_ARCH_MIPS,
+ SCMP_ARCH_MIPSEL,
+ SCMP_ARCH_MIPS64N32,
+ SCMP_ARCH_MIPSEL64N32,
+ SCMP_ARCH_MIPS64,
+ SCMP_ARCH_MIPSEL64, /* native */
+#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
+ SCMP_ARCH_MIPSEL,
+ SCMP_ARCH_MIPS,
+ SCMP_ARCH_MIPSEL64,
+ SCMP_ARCH_MIPS64,
+ SCMP_ARCH_MIPSEL64N32,
+ SCMP_ARCH_MIPS64N32, /* native */
+#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
+ SCMP_ARCH_MIPS,
+ SCMP_ARCH_MIPSEL,
+ SCMP_ARCH_MIPS64,
+ SCMP_ARCH_MIPSEL64,
+ SCMP_ARCH_MIPS64N32,
+ SCMP_ARCH_MIPSEL64N32, /* native */
+#elif defined(__hppa64__) && defined(SCMP_ARCH_PARISC) && defined(SCMP_ARCH_PARISC64)
+ SCMP_ARCH_PARISC,
+ SCMP_ARCH_PARISC64, /* native */
+#elif defined(__hppa__) && defined(SCMP_ARCH_PARISC)
+ SCMP_ARCH_PARISC,
+#elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
+ SCMP_ARCH_PPC,
+ SCMP_ARCH_PPC64LE,
+ SCMP_ARCH_PPC64, /* native */
+#elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
+ SCMP_ARCH_PPC,
+ SCMP_ARCH_PPC64,
+ SCMP_ARCH_PPC64LE, /* native */
+#elif defined(__powerpc__)
+ SCMP_ARCH_PPC,
+#elif defined(__riscv) && __riscv_xlen == 64 && defined(SCMP_ARCH_RISCV64)
+ SCMP_ARCH_RISCV64,
+#elif defined(__s390x__)
+ SCMP_ARCH_S390,
+ SCMP_ARCH_S390X, /* native */
+#elif defined(__s390__)
+ SCMP_ARCH_S390,
+#endif
+ SECCOMP_LOCAL_ARCH_END
+ };
+
+const char* seccomp_arch_to_string(uint32_t c) {
+        /* Translates a libseccomp architecture token into its textual name. Returns a pointer to a
+         * constant string, or NULL if the token is not one of the architectures listed here.
+         *
+         * The table keeps the order used in <seccomp.h>. Names used here should be the same as those
+         * used for ConditionArchitecture=, except for "subarchitectures" like x32. */
+
+        static const struct {
+                uint32_t arch;
+                const char *name;
+        } arch_names[] = {
+                { SCMP_ARCH_NATIVE,      "native"        },
+                { SCMP_ARCH_X86,         "x86"           },
+                { SCMP_ARCH_X86_64,      "x86-64"        },
+                { SCMP_ARCH_X32,         "x32"           },
+                { SCMP_ARCH_ARM,         "arm"           },
+                { SCMP_ARCH_AARCH64,     "arm64"         },
+#ifdef SCMP_ARCH_LOONGARCH64
+                { SCMP_ARCH_LOONGARCH64, "loongarch64"   },
+#endif
+                { SCMP_ARCH_MIPS,        "mips"          },
+                { SCMP_ARCH_MIPS64,      "mips64"        },
+                { SCMP_ARCH_MIPS64N32,   "mips64-n32"    },
+                { SCMP_ARCH_MIPSEL,      "mips-le"       },
+                { SCMP_ARCH_MIPSEL64,    "mips64-le"     },
+                { SCMP_ARCH_MIPSEL64N32, "mips64-le-n32" },
+#ifdef SCMP_ARCH_PARISC
+                { SCMP_ARCH_PARISC,      "parisc"        },
+#endif
+#ifdef SCMP_ARCH_PARISC64
+                { SCMP_ARCH_PARISC64,    "parisc64"      },
+#endif
+                { SCMP_ARCH_PPC,         "ppc"           },
+                { SCMP_ARCH_PPC64,       "ppc64"         },
+                { SCMP_ARCH_PPC64LE,     "ppc64-le"      },
+#ifdef SCMP_ARCH_RISCV64
+                { SCMP_ARCH_RISCV64,     "riscv64"       },
+#endif
+                { SCMP_ARCH_S390,        "s390"          },
+                { SCMP_ARCH_S390X,       "s390x"         },
+        };
+
+        for (size_t i = 0; i < sizeof(arch_names) / sizeof(arch_names[0]); i++)
+                if (arch_names[i].arch == c)
+                        return arch_names[i].name;
+
+        return NULL;
+}
+
+int seccomp_arch_from_string(const char *n, uint32_t *ret) {
+        /* Inverse of seccomp_arch_to_string(): looks up the architecture name n and stores the
+         * matching libseccomp token in *ret. Returns 0 on success, -EINVAL if n is NULL or not a
+         * known name (in which case *ret is left untouched). */
+
+        static const struct {
+                const char *name;
+                uint32_t arch;
+        } known_archs[] = {
+                { "native",        SCMP_ARCH_NATIVE      },
+                { "x86",           SCMP_ARCH_X86         },
+                { "x86-64",        SCMP_ARCH_X86_64      },
+                { "x32",           SCMP_ARCH_X32         },
+                { "arm",           SCMP_ARCH_ARM         },
+                { "arm64",         SCMP_ARCH_AARCH64     },
+#ifdef SCMP_ARCH_LOONGARCH64
+                { "loongarch64",   SCMP_ARCH_LOONGARCH64 },
+#endif
+                { "mips",          SCMP_ARCH_MIPS        },
+                { "mips64",        SCMP_ARCH_MIPS64      },
+                { "mips64-n32",    SCMP_ARCH_MIPS64N32   },
+                { "mips-le",       SCMP_ARCH_MIPSEL      },
+                { "mips64-le",     SCMP_ARCH_MIPSEL64    },
+                { "mips64-le-n32", SCMP_ARCH_MIPSEL64N32 },
+#ifdef SCMP_ARCH_PARISC
+                { "parisc",        SCMP_ARCH_PARISC      },
+#endif
+#ifdef SCMP_ARCH_PARISC64
+                { "parisc64",      SCMP_ARCH_PARISC64    },
+#endif
+                { "ppc",           SCMP_ARCH_PPC         },
+                { "ppc64",         SCMP_ARCH_PPC64       },
+                { "ppc64-le",      SCMP_ARCH_PPC64LE     },
+#ifdef SCMP_ARCH_RISCV64
+                { "riscv64",       SCMP_ARCH_RISCV64     },
+#endif
+                { "s390",          SCMP_ARCH_S390        },
+                { "s390x",         SCMP_ARCH_S390X       },
+        };
+
+        if (!n)
+                return -EINVAL;
+
+        assert(ret);
+
+        for (size_t i = 0; i < sizeof(known_archs) / sizeof(known_archs[0]); i++)
+                if (streq(n, known_archs[i].name)) {
+                        *ret = known_archs[i].arch;
+                        return 0;
+                }
+
+        return -EINVAL;
+}
+
+int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
+        _cleanup_(seccomp_releasep) scmp_filter_ctx ctx = NULL;
+        bool foreign_arch;
+        int r;
+
+        /* Works like seccomp_init(), except that the resulting filter context covers exactly one
+         * architecture (arch) and leaves all others alone. The automatic no-new-privileges handling
+         * is switched off as well.
+         *
+         * On success the new context is stored in *ret and 0 is returned; on failure a negative
+         * errno-style value is returned and the partially set up context is released. */
+
+        ctx = seccomp_init(default_action);
+        if (!ctx)
+                return -ENOMEM;
+
+        foreign_arch = arch != SCMP_ARCH_NATIVE && arch != seccomp_arch_native();
+        if (!foreign_arch) {
+                /* Nothing to swap, the freshly initialized context already targets the native arch. */
+                assert(seccomp_arch_exist(ctx, SCMP_ARCH_NATIVE) >= 0);
+                assert(seccomp_arch_exist(ctx, seccomp_arch_native()) >= 0);
+        } else {
+                /* Replace the native architecture by the requested one. */
+                r = seccomp_arch_remove(ctx, seccomp_arch_native());
+                if (r < 0)
+                        return r;
+
+                r = seccomp_arch_add(ctx, arch);
+                if (r < 0)
+                        return r;
+
+                assert(seccomp_arch_exist(ctx, arch) >= 0);
+                assert(seccomp_arch_exist(ctx, SCMP_ARCH_NATIVE) == -EEXIST);
+                assert(seccomp_arch_exist(ctx, seccomp_arch_native()) == -EEXIST);
+        }
+
+        /* System calls issued for an architecture this filter does not cover are let through. */
+        r = seccomp_attr_set(ctx, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
+        if (r < 0)
+                return r;
+
+        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
+        if (r < 0)
+                return r;
+
+#if SCMP_VER_MAJOR >= 3 || (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 4)
+        if (getenv_bool("SYSTEMD_LOG_SECCOMP") > 0) {
+                /* Optional debugging aid, failure to enable it is not fatal. */
+                r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_LOG, 1);
+                if (r < 0)
+                        log_debug_errno(r, "Failed to enable seccomp event logging: %m");
+        }
+#endif
+
+        *ret = TAKE_PTR(ctx);
+        return 0;
+}
+
+/* Reports whether querying the seccomp mode via prctl(PR_GET_SECCOMP) succeeds. */
+static bool is_basic_seccomp_available(void) {
+        int mode;
+
+        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
+        return mode >= 0;
+}
+
+/* Probes for seccomp filter mode by requesting it with a NULL filter program: the probe counts
+ * as positive only if the call fails and the failure is EFAULT. */
+static bool is_seccomp_filter_available(void) {
+        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
+                return false;
+
+        return errno == EFAULT;
+}
+
+/* Returns whether seccomp filtering may be used. The answer is computed once and cached in a
+ * function-local static for all later calls.
+ *
+ * $SYSTEMD_SECCOMP acts as an override: if it parses as false, seccomp is reported as unavailable
+ * without probing. If it is unset, true, or unparsable, both probes (basic seccomp and filter
+ * mode) must succeed. */
+bool is_seccomp_available(void) {
+        /* Negative: not determined yet. Otherwise the cached boolean result. */
+        static int cached_enabled = -1;
+
+        if (cached_enabled < 0) {
+                int b;
+
+                b = getenv_bool_secure("SYSTEMD_SECCOMP");
+                if (b != 0) {
+                        /* Include %m so that the error passed to log_debug_errno() actually shows up
+                         * in the message. */
+                        if (b < 0 && b != -ENXIO) /* ENXIO: env var unset */
+                                log_debug_errno(b, "Failed to parse $SYSTEMD_SECCOMP value, ignoring: %m");
+
+                        cached_enabled =
+                                is_basic_seccomp_available() &&
+                                is_seccomp_filter_available();
+                } else
+                        cached_enabled = false;
+        }
+
+        return cached_enabled;
+}
+
+/* Table of the predefined system call groups, indexed by the SYSCALL_FILTER_SET_* constants.
+ *
+ * Each entry has a name (always starting with "@"), a one-line help text and a value. The value is
+ * a list of members encoded as consecutive NUL-terminated strings. A member that itself starts with
+ * "@" refers to another group of this table rather than to a single system call.
+ *
+ * The members of "@known" other than "@obsolete" are pulled in from syscall-list.h. */
+const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
+ [SYSCALL_FILTER_SET_DEFAULT] = {
+ .name = "@default",
+ .help = "System calls that are always permitted",
+ .value =
+ "arch_prctl\0" /* Used during platform-specific initialization by ld-linux.so. */
+ "brk\0"
+ "cacheflush\0"
+ "clock_getres\0"
+ "clock_getres_time64\0"
+ "clock_gettime\0"
+ "clock_gettime64\0"
+ "clock_nanosleep\0"
+ "clock_nanosleep_time64\0"
+ "execve\0"
+ "exit\0"
+ "exit_group\0"
+ "futex\0"
+ "futex_time64\0"
+ "futex_waitv\0"
+ "get_robust_list\0"
+ "get_thread_area\0"
+ "getegid\0"
+ "getegid32\0"
+ "geteuid\0"
+ "geteuid32\0"
+ "getgid\0"
+ "getgid32\0"
+ "getgroups\0"
+ "getgroups32\0"
+ "getpgid\0"
+ "getpgrp\0"
+ "getpid\0"
+ "getppid\0"
+ "getrandom\0"
+ "getresgid\0"
+ "getresgid32\0"
+ "getresuid\0"
+ "getresuid32\0"
+ "getrlimit\0" /* make sure processes can query stack size and such */
+ "getsid\0"
+ "gettid\0"
+ "gettimeofday\0"
+ "getuid\0"
+ "getuid32\0"
+ "membarrier\0"
+ "mmap\0"
+ "mmap2\0"
+ "mprotect\0"
+ "munmap\0"
+ "nanosleep\0"
+ "pause\0"
+ "prlimit64\0"
+ "restart_syscall\0"
+ "riscv_flush_icache\0"
+ "riscv_hwprobe\0"
+ "rseq\0"
+ "rt_sigreturn\0"
+ "sched_getaffinity\0"
+ "sched_yield\0"
+ "set_robust_list\0"
+ "set_thread_area\0"
+ "set_tid_address\0"
+ "set_tls\0"
+ "sigreturn\0"
+ "time\0"
+ "ugetrlimit\0"
+ },
+ [SYSCALL_FILTER_SET_AIO] = {
+ .name = "@aio",
+ .help = "Asynchronous IO",
+ .value =
+ "io_cancel\0"
+ "io_destroy\0"
+ "io_getevents\0"
+ "io_pgetevents\0"
+ "io_pgetevents_time64\0"
+ "io_setup\0"
+ "io_submit\0"
+ "io_uring_enter\0"
+ "io_uring_register\0"
+ "io_uring_setup\0"
+ },
+ [SYSCALL_FILTER_SET_BASIC_IO] = {
+ .name = "@basic-io",
+ .help = "Basic IO",
+ .value =
+ "_llseek\0"
+ "close\0"
+ "close_range\0"
+ "dup\0"
+ "dup2\0"
+ "dup3\0"
+ "lseek\0"
+ "pread64\0"
+ "preadv\0"
+ "preadv2\0"
+ "pwrite64\0"
+ "pwritev\0"
+ "pwritev2\0"
+ "read\0"
+ "readv\0"
+ "write\0"
+ "writev\0"
+ },
+ [SYSCALL_FILTER_SET_CHOWN] = {
+ .name = "@chown",
+ .help = "Change ownership of files and directories",
+ .value =
+ "chown\0"
+ "chown32\0"
+ "fchown\0"
+ "fchown32\0"
+ "fchownat\0"
+ "lchown\0"
+ "lchown32\0"
+ },
+ [SYSCALL_FILTER_SET_CLOCK] = {
+ .name = "@clock",
+ .help = "Change the system time",
+ .value =
+ "adjtimex\0"
+ "clock_adjtime\0"
+ "clock_adjtime64\0"
+ "clock_settime\0"
+ "clock_settime64\0"
+ "settimeofday\0"
+ },
+ [SYSCALL_FILTER_SET_CPU_EMULATION] = {
+ .name = "@cpu-emulation",
+ .help = "System calls for CPU emulation functionality",
+ .value =
+ "modify_ldt\0"
+ "subpage_prot\0"
+ "switch_endian\0"
+ "vm86\0"
+ "vm86old\0"
+ },
+ [SYSCALL_FILTER_SET_DEBUG] = {
+ .name = "@debug",
+ .help = "Debugging, performance monitoring and tracing functionality",
+ .value =
+ "lookup_dcookie\0"
+ "perf_event_open\0"
+ "pidfd_getfd\0"
+ "ptrace\0"
+ "rtas\0"
+ "s390_runtime_instr\0"
+ "sys_debug_setcontext\0"
+ },
+ [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
+ .name = "@file-system",
+ .help = "File system operations",
+ .value =
+ "access\0"
+ "chdir\0"
+ "chmod\0"
+ "close\0"
+ "creat\0"
+ "faccessat\0"
+ "faccessat2\0"
+ "fallocate\0"
+ "fchdir\0"
+ "fchmod\0"
+ "fchmodat\0"
+ "fchmodat2\0"
+ "fcntl\0"
+ "fcntl64\0"
+ "fgetxattr\0"
+ "flistxattr\0"
+ "fremovexattr\0"
+ "fsetxattr\0"
+ "fstat\0"
+ "fstat64\0"
+ "fstatat64\0"
+ "fstatfs\0"
+ "fstatfs64\0"
+ "ftruncate\0"
+ "ftruncate64\0"
+ "futimesat\0"
+ "getcwd\0"
+ "getdents\0"
+ "getdents64\0"
+ "getxattr\0"
+ "inotify_add_watch\0"
+ "inotify_init\0"
+ "inotify_init1\0"
+ "inotify_rm_watch\0"
+ "lgetxattr\0"
+ "link\0"
+ "linkat\0"
+ "listxattr\0"
+ "llistxattr\0"
+ "lremovexattr\0"
+ "lsetxattr\0"
+ "lstat\0"
+ "lstat64\0"
+ "mkdir\0"
+ "mkdirat\0"
+ "mknod\0"
+ "mknodat\0"
+ "newfstatat\0"
+ "oldfstat\0"
+ "oldlstat\0"
+ "oldstat\0"
+ "open\0"
+ "openat\0"
+ "openat2\0"
+ "readlink\0"
+ "readlinkat\0"
+ "removexattr\0"
+ "rename\0"
+ "renameat\0"
+ "renameat2\0"
+ "rmdir\0"
+ "setxattr\0"
+ "stat\0"
+ "stat64\0"
+ "statfs\0"
+ "statfs64\0"
+ "statx\0"
+ "symlink\0"
+ "symlinkat\0"
+ "truncate\0"
+ "truncate64\0"
+ "unlink\0"
+ "unlinkat\0"
+ "utime\0"
+ "utimensat\0"
+ "utimensat_time64\0"
+ "utimes\0"
+ },
+ [SYSCALL_FILTER_SET_IO_EVENT] = {
+ .name = "@io-event",
+ .help = "Event loop system calls",
+ .value =
+ "_newselect\0"
+ "epoll_create\0"
+ "epoll_create1\0"
+ "epoll_ctl\0"
+ "epoll_ctl_old\0"
+ "epoll_pwait\0"
+ "epoll_pwait2\0"
+ "epoll_wait\0"
+ "epoll_wait_old\0"
+ "eventfd\0"
+ "eventfd2\0"
+ "poll\0"
+ "ppoll\0"
+ "ppoll_time64\0"
+ "pselect6\0"
+ "pselect6_time64\0"
+ "select\0"
+ },
+ [SYSCALL_FILTER_SET_IPC] = {
+ .name = "@ipc",
+ .help = "SysV IPC, POSIX Message Queues or other IPC",
+ .value =
+ "ipc\0"
+ "memfd_create\0"
+ "mq_getsetattr\0"
+ "mq_notify\0"
+ "mq_open\0"
+ "mq_timedreceive\0"
+ "mq_timedreceive_time64\0"
+ "mq_timedsend\0"
+ "mq_timedsend_time64\0"
+ "mq_unlink\0"
+ "msgctl\0"
+ "msgget\0"
+ "msgrcv\0"
+ "msgsnd\0"
+ "pipe\0"
+ "pipe2\0"
+ "process_madvise\0"
+ "process_vm_readv\0"
+ "process_vm_writev\0"
+ "semctl\0"
+ "semget\0"
+ "semop\0"
+ "semtimedop\0"
+ "semtimedop_time64\0"
+ "shmat\0"
+ "shmctl\0"
+ "shmdt\0"
+ "shmget\0"
+ },
+ [SYSCALL_FILTER_SET_KEYRING] = {
+ .name = "@keyring",
+ .help = "Kernel keyring access",
+ .value =
+ "add_key\0"
+ "keyctl\0"
+ "request_key\0"
+ },
+ [SYSCALL_FILTER_SET_MEMLOCK] = {
+ .name = "@memlock",
+ .help = "Memory locking control",
+ .value =
+ "mlock\0"
+ "mlock2\0"
+ "mlockall\0"
+ "munlock\0"
+ "munlockall\0"
+ },
+ [SYSCALL_FILTER_SET_MODULE] = {
+ .name = "@module",
+ .help = "Loading and unloading of kernel modules",
+ .value =
+ "delete_module\0"
+ "finit_module\0"
+ "init_module\0"
+ },
+ [SYSCALL_FILTER_SET_MOUNT] = {
+ .name = "@mount",
+ .help = "Mounting and unmounting of file systems",
+ .value =
+ "chroot\0"
+ "fsconfig\0"
+ "fsmount\0"
+ "fsopen\0"
+ "fspick\0"
+ "mount\0"
+ "mount_setattr\0"
+ "move_mount\0"
+ "open_tree\0"
+ "pivot_root\0"
+ "umount\0"
+ "umount2\0"
+ },
+ [SYSCALL_FILTER_SET_NETWORK_IO] = {
+ .name = "@network-io",
+ .help = "Network or Unix socket IO, should not be needed if not network facing",
+ .value =
+ "accept\0"
+ "accept4\0"
+ "bind\0"
+ "connect\0"
+ "getpeername\0"
+ "getsockname\0"
+ "getsockopt\0"
+ "listen\0"
+ "recv\0"
+ "recvfrom\0"
+ "recvmmsg\0"
+ "recvmmsg_time64\0"
+ "recvmsg\0"
+ "send\0"
+ "sendmmsg\0"
+ "sendmsg\0"
+ "sendto\0"
+ "setsockopt\0"
+ "shutdown\0"
+ "socket\0"
+ "socketcall\0"
+ "socketpair\0"
+ },
+ [SYSCALL_FILTER_SET_OBSOLETE] = {
+ /* some unknown even to libseccomp */
+ .name = "@obsolete",
+ .help = "Unusual, obsolete or unimplemented system calls",
+ .value =
+ "_sysctl\0"
+ "afs_syscall\0"
+ "bdflush\0"
+ "break\0"
+ "create_module\0"
+ "ftime\0"
+ "get_kernel_syms\0"
+ "getpmsg\0"
+ "gtty\0"
+ "idle\0"
+ "lock\0"
+ "mpx\0"
+ "prof\0"
+ "profil\0"
+ "putpmsg\0"
+ "query_module\0"
+ "security\0"
+ "sgetmask\0"
+ "ssetmask\0"
+ "stime\0"
+ "stty\0"
+ "sysfs\0"
+ "tuxcall\0"
+ "ulimit\0"
+ "uselib\0"
+ "ustat\0"
+ "vserver\0"
+ },
+ [SYSCALL_FILTER_SET_PKEY] = {
+ .name = "@pkey",
+ .help = "System calls used for memory protection keys",
+ .value =
+ "pkey_alloc\0"
+ "pkey_free\0"
+ "pkey_mprotect\0"
+ },
+ [SYSCALL_FILTER_SET_PRIVILEGED] = {
+ .name = "@privileged",
+ .help = "All system calls which need super-user capabilities",
+ .value =
+ "@chown\0"
+ "@clock\0"
+ "@module\0"
+ "@raw-io\0"
+ "@reboot\0"
+ "@swap\0"
+ "_sysctl\0"
+ "acct\0"
+ "bpf\0"
+ "capset\0"
+ "chroot\0"
+ "fanotify_init\0"
+ "fanotify_mark\0"
+ "nfsservctl\0"
+ "open_by_handle_at\0"
+ "pivot_root\0"
+ "quotactl\0"
+ "quotactl_fd\0"
+ "setdomainname\0"
+ "setfsuid\0"
+ "setfsuid32\0"
+ "setgroups\0"
+ "setgroups32\0"
+ "sethostname\0"
+ "setresuid\0"
+ "setresuid32\0"
+ "setreuid\0"
+ "setreuid32\0"
+ "setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
+ "setuid32\0"
+ "vhangup\0"
+ },
+ [SYSCALL_FILTER_SET_PROCESS] = {
+ .name = "@process",
+ .help = "Process control, execution, namespacing operations",
+ .value =
+ "capget\0" /* Able to query arbitrary processes */
+ "clone\0"
+ /* ia64 as the only architecture has clone2, a replacement for clone, but ia64 doesn't
+ * implement seccomp, so we don't need to list it at all. C.f.
+ * acce2f71779c54086962fefce3833d886c655f62 in the kernel. */
+ "clone3\0"
+ "execveat\0"
+ "fork\0"
+ "getrusage\0"
+ "kill\0"
+ "pidfd_open\0"
+ "pidfd_send_signal\0"
+ "prctl\0"
+ "rt_sigqueueinfo\0"
+ "rt_tgsigqueueinfo\0"
+ "setns\0"
+ "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
+ "tgkill\0"
+ "times\0"
+ "tkill\0"
+ "unshare\0"
+ "vfork\0"
+ "wait4\0"
+ "waitid\0"
+ "waitpid\0"
+ },
+ [SYSCALL_FILTER_SET_RAW_IO] = {
+ .name = "@raw-io",
+ .help = "Raw I/O port access",
+ .value =
+ "ioperm\0"
+ "iopl\0"
+ "pciconfig_iobase\0"
+ "pciconfig_read\0"
+ "pciconfig_write\0"
+ "s390_pci_mmio_read\0"
+ "s390_pci_mmio_write\0"
+ },
+ [SYSCALL_FILTER_SET_REBOOT] = {
+ .name = "@reboot",
+ .help = "Reboot and reboot preparation/kexec",
+ .value =
+ "kexec_file_load\0"
+ "kexec_load\0"
+ "reboot\0"
+ },
+ [SYSCALL_FILTER_SET_RESOURCES] = {
+ .name = "@resources",
+ .help = "Alter resource settings",
+ .value =
+ "ioprio_set\0"
+ "mbind\0"
+ "migrate_pages\0"
+ "move_pages\0"
+ "nice\0"
+ "sched_setaffinity\0"
+ "sched_setattr\0"
+ "sched_setparam\0"
+ "sched_setscheduler\0"
+ "set_mempolicy\0"
+ "set_mempolicy_home_node\0"
+ "setpriority\0"
+ "setrlimit\0"
+ },
+ [SYSCALL_FILTER_SET_SANDBOX] = {
+ .name = "@sandbox",
+ .help = "Sandbox functionality",
+ .value =
+ "landlock_add_rule\0"
+ "landlock_create_ruleset\0"
+ "landlock_restrict_self\0"
+ "seccomp\0"
+ },
+ [SYSCALL_FILTER_SET_SETUID] = {
+ .name = "@setuid",
+ .help = "Operations for changing user/group credentials",
+ .value =
+ "setgid\0"
+ "setgid32\0"
+ "setgroups\0"
+ "setgroups32\0"
+ "setregid\0"
+ "setregid32\0"
+ "setresgid\0"
+ "setresgid32\0"
+ "setresuid\0"
+ "setresuid32\0"
+ "setreuid\0"
+ "setreuid32\0"
+ "setuid\0"
+ "setuid32\0"
+ },
+ [SYSCALL_FILTER_SET_SIGNAL] = {
+ .name = "@signal",
+ .help = "Process signal handling",
+ .value =
+ "rt_sigaction\0"
+ "rt_sigpending\0"
+ "rt_sigprocmask\0"
+ "rt_sigsuspend\0"
+ "rt_sigtimedwait\0"
+ "rt_sigtimedwait_time64\0"
+ "sigaction\0"
+ "sigaltstack\0"
+ "signal\0"
+ "signalfd\0"
+ "signalfd4\0"
+ "sigpending\0"
+ "sigprocmask\0"
+ "sigsuspend\0"
+ },
+ [SYSCALL_FILTER_SET_SWAP] = {
+ .name = "@swap",
+ .help = "Enable/disable swap devices",
+ .value =
+ "swapoff\0"
+ "swapon\0"
+ },
+ [SYSCALL_FILTER_SET_SYNC] = {
+ .name = "@sync",
+ .help = "Synchronize files and memory to storage",
+ .value =
+ "fdatasync\0"
+ "fsync\0"
+ "msync\0"
+ "sync\0"
+ "sync_file_range\0"
+ "sync_file_range2\0"
+ "syncfs\0"
+ },
+ [SYSCALL_FILTER_SET_SYSTEM_SERVICE] = {
+ .name = "@system-service",
+ .help = "General system service operations",
+ .value =
+ "@aio\0"
+ "@basic-io\0"
+ "@chown\0"
+ "@default\0"
+ "@file-system\0"
+ "@io-event\0"
+ "@ipc\0"
+ "@keyring\0"
+ "@memlock\0"
+ "@network-io\0"
+ "@process\0"
+ "@resources\0"
+ "@setuid\0"
+ "@signal\0"
+ "@sync\0"
+ "@timer\0"
+ "arm_fadvise64_64\0"
+ "capget\0"
+ "capset\0"
+ "copy_file_range\0"
+ "fadvise64\0"
+ "fadvise64_64\0"
+ "flock\0"
+ "get_mempolicy\0"
+ "getcpu\0"
+ "getpriority\0"
+ "ioctl\0"
+ "ioprio_get\0"
+ "kcmp\0"
+ "madvise\0"
+ "mremap\0"
+ "name_to_handle_at\0"
+ "oldolduname\0"
+ "olduname\0"
+ "personality\0"
+ "readahead\0"
+ "readdir\0"
+ "remap_file_pages\0"
+ "sched_get_priority_max\0"
+ "sched_get_priority_min\0"
+ "sched_getattr\0"
+ "sched_getparam\0"
+ "sched_getscheduler\0"
+ "sched_rr_get_interval\0"
+ "sched_rr_get_interval_time64\0"
+ "sched_yield\0"
+ "sendfile\0"
+ "sendfile64\0"
+ "setfsgid\0"
+ "setfsgid32\0"
+ "setfsuid\0"
+ "setfsuid32\0"
+ "setpgid\0"
+ "setsid\0"
+ "splice\0"
+ "sysinfo\0"
+ "tee\0"
+ "umask\0"
+ "uname\0"
+ "userfaultfd\0"
+ "vmsplice\0"
+ },
+ [SYSCALL_FILTER_SET_TIMER] = {
+ .name = "@timer",
+ .help = "Schedule operations by time",
+ .value =
+ "alarm\0"
+ "getitimer\0"
+ "setitimer\0"
+ "timer_create\0"
+ "timer_delete\0"
+ "timer_getoverrun\0"
+ "timer_gettime\0"
+ "timer_gettime64\0"
+ "timer_settime\0"
+ "timer_settime64\0"
+ "timerfd_create\0"
+ "timerfd_gettime\0"
+ "timerfd_gettime64\0"
+ "timerfd_settime\0"
+ "timerfd_settime64\0"
+ "times\0"
+ },
+ [SYSCALL_FILTER_SET_KNOWN] = {
+ .name = "@known",
+ .help = "All known syscalls declared in the kernel",
+ .value =
+ "@obsolete\0"
+#include "syscall-list.h"
+ },
+};
+
+/* Looks up a predefined system call group by its name (including the leading "@"). Returns a
+ * pointer into syscall_filter_sets[], or NULL if name is NULL, empty, lacks the "@" prefix, or
+ * matches no entry. */
+const SyscallFilterSet *syscall_filter_set_find(const char *name) {
+        const SyscallFilterSet *end = syscall_filter_sets + _SYSCALL_FILTER_SET_MAX;
+
+        if (isempty(name) || name[0] != '@')
+                return NULL;
+
+        for (const SyscallFilterSet *candidate = syscall_filter_sets; candidate < end; candidate++)
+                if (streq(candidate->name, name))
+                        return candidate;
+
+        return NULL;
+}
+
+/* Forward declaration: add_syscall_filter_set() and seccomp_add_syscall_filter_item() call each
+ * other (a group may contain other groups), so one of them has to be declared up front. */
+static int add_syscall_filter_set(
+ scmp_filter_ctx seccomp,
+ const SyscallFilterSet *set,
+ uint32_t action,
+ char **exclude,
+ bool log_missing,
+ char ***added);
+
+int seccomp_add_syscall_filter_item(
+                scmp_filter_ctx *seccomp,
+                const char *name,
+                uint32_t action,
+                char **exclude,
+                bool log_missing,
+                char ***added) {
+
+        int nr, r;
+
+        assert(seccomp);
+        assert(name);
+
+        /* Adds a rule with the given action for one item, which is either a single system call name
+         * or, if it starts with "@", a whole group that is expanded recursively. Items listed in
+         * exclude are skipped.
+         *
+         * Every system call that is handled gets appended to the *added strv. added must be either
+         * NULL or point to a valid, pre-initialized, possibly empty strv.
+         *
+         * Returns 0 on success (including the cases where the item was skipped or ignored), or a
+         * negative errno-style value on failure. */
+
+        if (strv_contains(exclude, name))
+                return 0;
+
+        if (name[0] == '@') {
+                const SyscallFilterSet *group = syscall_filter_set_find(name);
+
+                if (group)
+                        return add_syscall_filter_set(seccomp, group, action, exclude, log_missing, added);
+
+                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Filter set %s is not known!",
+                                       name);
+        }
+
+        nr = seccomp_syscall_resolve_name(name);
+        if (nr == __NR_SCMP_ERROR) {
+                if (log_missing)
+                        log_debug("System call %s is not known, ignoring.", name);
+                return 0;
+        }
+
+        r = seccomp_rule_add_exact(seccomp, action, nr, 0);
+        if (r < 0) {
+                /* A system call that does not exist on this architecture is not an error. */
+                bool unavailable = r == -EDOM;
+
+                if (log_missing || !unavailable)
+                        log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
+                                        name, nr, unavailable ? ", ignoring" : "");
+                if (!unavailable)
+                        return r;
+        }
+
+        if (!added)
+                return 0;
+
+        r = strv_extend(added, name);
+        return r < 0 ? r : 0;
+}
+
+/* Adds every member of a system call group to the filter context by handing each one to
+ * seccomp_add_syscall_filter_item(). Stops at, and returns, the first negative result; returns 0
+ * if all members were processed.
+ *
+ * Handled system calls are appended to the *added strv, which needs to be initialized. */
+static int add_syscall_filter_set(
+                scmp_filter_ctx seccomp,
+                const SyscallFilterSet *set,
+                uint32_t action,
+                char **exclude,
+                bool log_missing,
+                char ***added) {
+
+        assert(seccomp);
+        assert(set);
+
+        NULSTR_FOREACH(member, set->value) {
+                int r;
+
+                r = seccomp_add_syscall_filter_item(seccomp, member, action, exclude, log_missing, added);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+/* Picks the default action that is actually installed in the filter context.
+ *
+ * SCMP_ACT_ALLOW (and SCMP_ACT_LOG, where libseccomp provides it) are passed through unchanged.
+ * For any other requested default, i.e. when the filter is an allow-list with a critical default
+ * action, ENOSYS is installed as the default instead; it then only applies to syscalls which are
+ * not in the @known set. */
+static uint32_t override_default_action(uint32_t default_action) {
+        switch (default_action) {
+
+        case SCMP_ACT_ALLOW:
+#ifdef SCMP_ACT_LOG
+        case SCMP_ACT_LOG:
+#endif
+                return default_action;
+
+        default:
+                return SCMP_ACT_ERRNO(ENOSYS);
+        }
+}
+
+int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing) {
+        uint32_t arch, installed_default;
+        int r;
+
+        assert(set);
+
+        /* The one-stop solution: for each local architecture, allocate a seccomp object, add the
+         * members of set with the given action, and load the result into the kernel.
+         *
+         * Returns 0 on success. Failures while building a filter are returned as negative
+         * errno-style values; a failure to load the filter is only returned if it is classified as
+         * fatal, otherwise it is logged and the architecture is skipped. */
+
+        installed_default = override_default_action(default_action);
+
+        SECCOMP_FOREACH_LOCAL_ARCH(arch) {
+                _cleanup_(seccomp_releasep) scmp_filter_ctx ctx = NULL;
+                _cleanup_strv_free_ char **handled = NULL;
+
+                log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
+
+                r = seccomp_init_for_arch(&ctx, arch, installed_default);
+                if (r < 0)
+                        return r;
+
+                r = add_syscall_filter_set(ctx, set, action, NULL, log_missing, &handled);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to add filter set: %m");
+
+                if (default_action != installed_default) {
+                        /* The default was replaced, hence apply the requested default action
+                         * explicitly to every known system call not covered by the set. */
+                        NULSTR_FOREACH(name, syscall_filter_sets[SYSCALL_FILTER_SET_KNOWN].value) {
+                                int nr = seccomp_syscall_resolve_name(name);
+
+                                /* Skip what cannot be resolved and what was already handled above */
+                                if (nr < 0 || strv_contains(handled, name))
+                                        continue;
+
+                                r = seccomp_rule_add_exact(ctx, default_action, nr, 0);
+                                if (r >= 0 || r == -EDOM) /* EDOM means that the syscall is not available for arch */
+                                        continue;
+
+                                return log_debug_errno(r, "Failed to add rule for system call %s() / %d: %m",
+                                                       name, nr);
+                        }
+                }
+
+#if (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 5) || SCMP_VER_MAJOR > 2
+                /* We have a large filter here, so let's turn on the binary tree mode if possible. */
+                r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_OPTIMIZE, 2);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to set SCMP_FLTATR_CTL_OPTIMIZE, ignoring: %m");
+#endif
+
+                r = seccomp_load(ctx);
+                if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
+                        return r;
+                if (r < 0)
+                        log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m",
+                                        seccomp_arch_to_string(arch));
+        }
+
+        return 0;
+}
+
+int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* filter, uint32_t action, bool log_missing) {
+        uint32_t arch, installed_default;
+        int r;
+
+        /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Hashmap* of syscalls, instead
+         * of a SyscallFilterSet* table. Keys are system call numbers plus one, values are the
+         * per-syscall error number (or SECCOMP_ERROR_NUMBER_KILL). */
+
+        if (hashmap_isempty(filter) && default_action == SCMP_ACT_ALLOW)
+                return 0;
+
+        installed_default = override_default_action(default_action);
+
+        SECCOMP_FOREACH_LOCAL_ARCH(arch) {
+                _cleanup_(seccomp_releasep) scmp_filter_ctx ctx = NULL;
+                void *key, *value;
+
+                log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
+
+                r = seccomp_init_for_arch(&ctx, arch, installed_default);
+                if (r < 0)
+                        return r;
+
+                HASHMAP_FOREACH_KEY(value, key, filter) {
+                        int nr = PTR_TO_INT(key) - 1;
+                        int error = PTR_TO_INT(value);
+                        uint32_t rule_action = action;
+
+                        /* A per-syscall kill request wins, then logging, then an explicit errno. */
+                        if (error == SECCOMP_ERROR_NUMBER_KILL)
+                                rule_action = scmp_act_kill_process();
+#ifdef SCMP_ACT_LOG
+                        else if (action == SCMP_ACT_LOG)
+                                rule_action = SCMP_ACT_LOG;
+#endif
+                        else if (error >= 0)
+                                rule_action = SCMP_ACT_ERRNO(error);
+
+                        r = seccomp_rule_add_exact(ctx, rule_action, nr, 0);
+                        if (r < 0) {
+                                /* If the system call is not known on this architecture, then that's
+                                 * fine, let's ignore it */
+                                _cleanup_free_ char *syscall_name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, nr);
+                                bool unavailable = r == -EDOM;
+
+                                if (log_missing || !unavailable)
+                                        log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
+                                                        strna(syscall_name), nr, unavailable ? ", ignoring" : "");
+                                if (!unavailable)
+                                        return r;
+                        }
+                }
+
+                if (default_action != installed_default) {
+                        /* The default was replaced, hence apply the requested default action
+                         * explicitly to every known system call not present in the filter. */
+                        NULSTR_FOREACH(name, syscall_filter_sets[SYSCALL_FILTER_SET_KNOWN].value) {
+                                int nr = seccomp_syscall_resolve_name(name);
+
+                                /* Skip what cannot be resolved and what was already handled above */
+                                if (nr < 0 || hashmap_contains(filter, INT_TO_PTR(nr + 1)))
+                                        continue;
+
+                                r = seccomp_rule_add_exact(ctx, default_action, nr, 0);
+                                if (r >= 0 || r == -EDOM) /* EDOM means that the syscall is not available for arch */
+                                        continue;
+
+                                return log_debug_errno(r, "Failed to add rule for system call %s() / %d: %m",
+                                                       name, nr);
+                        }
+                }
+
+#if (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 5) || SCMP_VER_MAJOR > 2
+                /* We have a large filter here, so let's turn on the binary tree mode if possible. */
+                r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_OPTIMIZE, 2);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to set SCMP_FLTATR_CTL_OPTIMIZE, ignoring: %m");
+#endif
+
+                r = seccomp_load(ctx);
+                if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
+                        return r;
+                if (r < 0)
+                        log_debug_errno(r, "Failed to install system call filter for architecture %s, skipping: %m",
+                                        seccomp_arch_to_string(arch));
+        }
+
+        return 0;
+}
+
+/* Applies one configuration item to the filter hashmap. name is either a single system call or,
+ * with a leading "@", a group whose members are processed recursively. Depending on flags the
+ * system call is stored in the map (key: syscall number plus one, value: errno_num) or removed
+ * from it.
+ *
+ * unit, filename and line are only used for log messages. Returns 0 on success or if an unknown
+ * name was ignored in permissive mode, a negative errno-style value otherwise. */
+int seccomp_parse_syscall_filter(
+                const char *name,
+                int errno_num,
+                Hashmap *filter,
+                SeccompParseFlags flags,
+                const char *unit,
+                const char *filename,
+                unsigned line) {
+
+        bool store;
+        int id, r;
+
+        assert(name);
+        assert(filter);
+
+        /* An explicit error number is only accepted together with SECCOMP_PARSE_INVERT. */
+        if (!FLAGS_SET(flags, SECCOMP_PARSE_INVERT) && errno_num >= 0)
+                return -EINVAL;
+
+        if (name[0] == '@') {
+                const SyscallFilterSet *group = syscall_filter_set_find(name);
+
+                if (!group) {
+                        if (!FLAGS_SET(flags, SECCOMP_PARSE_PERMISSIVE))
+                                return -EINVAL;
+
+                        log_syntax(unit, FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
+                                   "Unknown system call group, ignoring: %s", name);
+                        return 0;
+                }
+
+                NULSTR_FOREACH(member, group->value) {
+                        /* Recurse for each member of the group. Logging is downgraded here (i.e. the
+                         * SECCOMP_PARSE_LOG flag is taken away) since any issues in the group table
+                         * are our own problem, not a problem in user configuration data and we
+                         * shouldn't pretend otherwise by complaining about them. */
+                        r = seccomp_parse_syscall_filter(member, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
+                        if (r < 0)
+                                return r;
+                }
+
+                return 0;
+        }
+
+        id = seccomp_syscall_resolve_name(name);
+        if (id == __NR_SCMP_ERROR) {
+                if (!FLAGS_SET(flags, SECCOMP_PARSE_PERMISSIVE))
+                        return -EINVAL;
+
+                log_syntax(unit, FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
+                           "System call %s is not known, ignoring.", name);
+                return 0;
+        }
+
+        /* If we previously wanted to forbid a syscall and now we want to allow it, then remove it
+         * from the list. The entries in allow-list with non-negative error value will be handled
+         * with SCMP_ACT_ERRNO() instead of the default action. */
+        store = !FLAGS_SET(flags, SECCOMP_PARSE_INVERT) == FLAGS_SET(flags, SECCOMP_PARSE_ALLOW_LIST) ||
+                (FLAGS_SET(flags, SECCOMP_PARSE_INVERT | SECCOMP_PARSE_ALLOW_LIST) && errno_num >= 0);
+        if (!store) {
+                (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
+                return 0;
+        }
+
+        r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
+        if (r == -ENOMEM)
+                return FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? log_oom() : -ENOMEM;
+        if (r == -EEXIST)
+                /* Already present: overwrite the stored error number. */
+                assert_se(hashmap_update(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)) == 0);
+        else if (r < 0)
+                return r;
+
+        return 0;
+}
+
+/* Installs, for each local architecture, a seccomp filter (default action: allow) that makes unshare(),
+ * clone() and setns() fail with EPERM when called with a namespace flag that is not set in 'retain', and
+ * that makes clone3() fail with ENOSYS.
+ *
+ * Returns 0 immediately if 'retain' has every bit of NAMESPACE_FLAGS_ALL set. Returns a negative errno if
+ * seccomp_init_for_arch() fails or if seccomp_load() fails with an error classified as fatal by
+ * ERRNO_IS_NEG_SECCOMP_FATAL(). All other failures are only logged at debug level: a failing clone3() rule
+ * is ignored, any other failing rule causes the architecture to be skipped without loading a filter. */
+int seccomp_restrict_namespaces(unsigned long retain) {
+ uint32_t arch;
+ int r;
+
+ if (DEBUG_LOGGING) {
+ _cleanup_free_ char *s = NULL;
+
+ (void) namespace_flags_to_string(retain, &s);
+ log_debug("Restricting namespace to: %s.", strna(s));
+ }
+
+ /* NOOP? */
+ if (FLAGS_SET(retain, NAMESPACE_FLAGS_ALL))
+ return 0;
+
+ SECCOMP_FOREACH_LOCAL_ARCH(arch) {
+ _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
+
+ log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
+
+ r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
+ if (r < 0)
+ return r;
+
+ /* We cannot filter on individual flags to clone3(), and we need to disable the
+ * syscall altogether. ENOSYS is used instead of EPERM, so that glibc and other
+ * users shall fall back to clone(), as if on an older kernel.
+ *
+ * C.f. https://github.com/flatpak/flatpak/commit/a10f52a7565c549612c92b8e736a6698a53db330,
+ * https://github.com/moby/moby/issues/42680. */
+
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(ENOSYS),
+ SCMP_SYS(clone3),
+ 0);
+ if (r < 0)
+ log_debug_errno(r, "Failed to add clone3() rule for architecture %s, ignoring: %m",
+ seccomp_arch_to_string(arch));
+
+ if ((retain & NAMESPACE_FLAGS_ALL) == 0)
+ /* If every single kind of namespace shall be prohibited, then let's block the whole
+ * setns() syscall altogether. */
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(setns),
+ 0);
+ else
+ /* Otherwise, block only the invocations with the appropriate flags in the loop
+ * below, but also the special invocation with a zero flags argument, right here. */
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(setns),
+ 1,
+ SCMP_A1(SCMP_CMP_EQ, 0));
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m",
+ seccomp_arch_to_string(arch));
+ continue;
+ }
+
+ /* One pass per namespace type listed in namespace_info[]: retained types are left alone, all
+ * others get unshare()/clone()/setns() rules matching their clone flag. */
+ for (unsigned i = 0; namespace_info[i].proc_name; i++) {
+ unsigned long f;
+
+ f = namespace_info[i].clone_flag;
+ if (FLAGS_SET(retain, f)) {
+ log_debug("Permitting %s.", namespace_info[i].proc_name);
+ continue;
+ }
+
+ log_trace("Blocking %s.", namespace_info[i].proc_name);
+
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(unshare),
+ 1,
+ SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m",
+ seccomp_arch_to_string(arch));
+ break;
+ }
+
+ /* On s390/s390x the first two parameters to clone are switched */
+ if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(clone),
+ 1,
+ SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
+ else
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(clone),
+ 1,
+ SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m",
+ seccomp_arch_to_string(arch));
+ break;
+ }
+
+ /* When nothing is retained, setns() was already blocked unconditionally above, hence
+ * per-flag setns() rules are only needed in the other case. */
+ if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(setns),
+ 1,
+ SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m",
+ seccomp_arch_to_string(arch));
+ break;
+ }
+ }
+ }
+ /* The loop above leaves r < 0 only when it bailed out via 'break' on a failed rule; in that
+ * case no (partial) filter is loaded for this architecture. */
+ if (r < 0)
+ continue;
+
+ r = seccomp_load(seccomp);
+ if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
+ return r;
+ if (r < 0)
+ log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m",
+ seccomp_arch_to_string(arch));
+ }
+
+ return 0;
+}
+
+/* Blocks the _sysctl() system call with EPERM on every local architecture that has it. Returns 0, or a
+ * negative errno if filter initialization fails or loading fails with a fatal seccomp error; other
+ * failures are logged at debug level and the architecture is skipped. */
+int seccomp_protect_sysctl(void) {
+        uint32_t arch;
+
+        SECCOMP_FOREACH_LOCAL_ARCH(arch) {
+                _cleanup_(seccomp_releasep) scmp_filter_ctx ctx = NULL;
+                int rc;
+
+                log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
+
+                switch (arch) {
+
+                case SCMP_ARCH_AARCH64:
+#ifdef SCMP_ARCH_LOONGARCH64
+                case SCMP_ARCH_LOONGARCH64:
+#endif
+#ifdef SCMP_ARCH_RISCV64
+                case SCMP_ARCH_RISCV64:
+#endif
+                case SCMP_ARCH_X32:
+                        /* No _sysctl syscall on these, nothing to block. */
+                        continue;
+
+                default:
+                        break;
+                }
+
+                rc = seccomp_init_for_arch(&ctx, arch, SCMP_ACT_ALLOW);
+                if (rc < 0)
+                        return rc;
+
+                rc = seccomp_rule_add_exact(ctx, SCMP_ACT_ERRNO(EPERM), SCMP_SYS(_sysctl), 0);
+                if (rc < 0) {
+                        log_debug_errno(rc, "Failed to add _sysctl() rule for architecture %s, skipping: %m",
+                                        seccomp_arch_to_string(arch));
+                        continue;
+                }
+
+                rc = seccomp_load(ctx);
+                if (rc >= 0)
+                        continue;
+
+                /* Fatal errors abort the whole operation, anything else is merely reported. */
+                if (ERRNO_IS_NEG_SECCOMP_FATAL(rc))
+                        return rc;
+
+                log_debug_errno(rc, "Failed to install sysctl protection rules for architecture %s, skipping: %m",
+                                seccomp_arch_to_string(arch));
+        }
+
+        return 0;
+}
+
+/* Blocks the syslog() system call with EPERM on every local architecture.
+ *
+ * Returns 0 on success. Returns a negative errno if seccomp_init_for_arch() fails, or if seccomp_load()
+ * fails with an error that ERRNO_IS_NEG_SECCOMP_FATAL() classifies as fatal. Failure to add the rule and
+ * non-fatal load failures are logged at debug level and the architecture is skipped. */
+int seccomp_protect_syslog(void) {
+        uint32_t arch;
+        int r;
+
+        SECCOMP_FOREACH_LOCAL_ARCH(arch) {
+                _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
+
+                r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
+                if (r < 0)
+                        return r;
+
+                r = seccomp_rule_add_exact(
+                                seccomp,
+                                SCMP_ACT_ERRNO(EPERM),
+                                SCMP_SYS(syslog),
+                                0);
+                if (r < 0) {
+                        /* Same "…, skipping: %m" wording as all the other filters in this file, so that the
+                         * error text is separated from the message. */
+                        log_debug_errno(r, "Failed to add syslog() rule for architecture %s, skipping: %m",
+                                        seccomp_arch_to_string(arch));
+                        continue;
+                }
+
+                r = seccomp_load(seccomp);
+                if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
+                        return r;
+                if (r < 0)
+                        log_debug_errno(r, "Failed to install syslog protection rules for architecture %s, skipping: %m",
+                                        seccomp_arch_to_string(arch));
+        }
+
+        return 0;
+}
+
+/* Restricts the address families that may be passed as first argument to socket(), on every local
+ * architecture that is marked as supported in the switch below (the comments there state that these are the
+ * ones not using socketcall()); all other architectures are silently skipped.
+ *
+ * If 'allow_list' is true, socket() fails with EAFNOSUPPORT for every family not contained in
+ * 'address_families'; otherwise it fails with EAFNOSUPPORT for every family that is contained in the set.
+ * The set stores the families as INT_TO_PTR() values.
+ *
+ * Returns 0, or a negative errno if seccomp_init_for_arch() fails or seccomp_load() fails with an error
+ * that ERRNO_IS_NEG_SECCOMP_FATAL() classifies as fatal. Rule-addition and non-fatal load failures are
+ * logged at debug level and the architecture is skipped. */
+int seccomp_restrict_address_families(Set *address_families, bool allow_list) {
+ uint32_t arch;
+ int r;
+
+ SECCOMP_FOREACH_LOCAL_ARCH(arch) {
+ _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
+ bool supported;
+
+ log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
+
+ switch (arch) {
+
+ case SCMP_ARCH_X86_64:
+ case SCMP_ARCH_X32:
+ case SCMP_ARCH_ARM:
+ case SCMP_ARCH_AARCH64:
+#ifdef SCMP_ARCH_LOONGARCH64
+ case SCMP_ARCH_LOONGARCH64:
+#endif
+ case SCMP_ARCH_MIPSEL64N32:
+ case SCMP_ARCH_MIPS64N32:
+ case SCMP_ARCH_MIPSEL64:
+ case SCMP_ARCH_MIPS64:
+#ifdef SCMP_ARCH_RISCV64
+ case SCMP_ARCH_RISCV64:
+#endif
+ /* These we know we support (i.e. are the ones that do not use socketcall()) */
+ supported = true;
+ break;
+
+ case SCMP_ARCH_S390:
+ case SCMP_ARCH_S390X:
+ case SCMP_ARCH_X86:
+ case SCMP_ARCH_MIPSEL:
+ case SCMP_ARCH_MIPS:
+#ifdef SCMP_ARCH_PARISC
+ case SCMP_ARCH_PARISC:
+#endif
+#ifdef SCMP_ARCH_PARISC64
+ case SCMP_ARCH_PARISC64:
+#endif
+ case SCMP_ARCH_PPC:
+ case SCMP_ARCH_PPC64:
+ case SCMP_ARCH_PPC64LE:
+ default:
+ /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
+ * don't know */
+ supported = false;
+ break;
+ }
+
+ if (!supported)
+ continue;
+
+ r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
+ if (r < 0)
+ return r;
+
+ if (allow_list) {
+ int first = 0, last = 0;
+ void *afp;
+
+ /* If this is an allow list, we first block the address families that are out of
+ * range and then everything that is not in the set. First, we find the lowest and
+ * highest address family in the set. */
+
+ SET_FOREACH(afp, address_families) {
+ int af = PTR_TO_INT(afp);
+
+ /* Entries outside of the 1…af_max()-1 range do not take part in the range
+ * computation. */
+ if (af <= 0 || af >= af_max())
+ continue;
+
+ if (first == 0 || af < first)
+ first = af;
+
+ if (last == 0 || af > last)
+ last = af;
+ }
+
+ assert((first == 0) == (last == 0));
+
+ if (first == 0) {
+
+ /* No entries in the valid range, block everything */
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EAFNOSUPPORT),
+ SCMP_SYS(socket),
+ 0);
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m",
+ seccomp_arch_to_string(arch));
+ continue;
+ }
+
+ } else {
+
+ /* Block everything below the first entry */
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EAFNOSUPPORT),
+ SCMP_SYS(socket),
+ 1,
+ SCMP_A0(SCMP_CMP_LT, first));
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m",
+ seccomp_arch_to_string(arch));
+ continue;
+ }
+
+ /* Block everything above the last entry */
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EAFNOSUPPORT),
+ SCMP_SYS(socket),
+ 1,
+ SCMP_A0(SCMP_CMP_GT, last));
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m",
+ seccomp_arch_to_string(arch));
+ continue;
+ }
+
+ /* Block every address family that is not in the set. Note that this loop walks the
+ * whole 1…af_max()-1 range rather than just first…last, hence it also adds rules
+ * for values already covered by the two range rules above. */
+ for (int af = 1; af < af_max(); af++) {
+
+ if (set_contains(address_families, INT_TO_PTR(af)))
+ continue;
+
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EAFNOSUPPORT),
+ SCMP_SYS(socket),
+ 1,
+ SCMP_A0(SCMP_CMP_EQ, af));
+ if (r < 0)
+ break;
+ }
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m",
+ seccomp_arch_to_string(arch));
+ continue;
+ }
+ }
+
+ } else {
+ void *af;
+
+ /* If this is a deny list, then generate one rule for each address family that are
+ * then combined in OR checks. */
+
+ SET_FOREACH(af, address_families) {
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EAFNOSUPPORT),
+ SCMP_SYS(socket),
+ 1,
+ SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
+ if (r < 0)
+ break;
+ }
+ /* If the set is empty, r still holds the non-negative result of seccomp_init_for_arch()
+ * here, and a filter without any rules is loaded below. */
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m",
+ seccomp_arch_to_string(arch));
+ continue;
+ }
+ }
+
+ r = seccomp_load(seccomp);
+ if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
+ return r;
+ if (r < 0)
+ log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m",
+ seccomp_arch_to_string(arch));
+ }
+
+ return 0;
+}
+
+/* Makes sched_setscheduler() fail with 'error_code' (which must be > 0) for every scheduling policy that is
+ * not listed in permitted_policies[] below, on every local architecture. The policy is matched against the
+ * second system call argument (SCMP_A1).
+ *
+ * Returns 0, or a negative errno if seccomp_init_for_arch() fails or seccomp_load() fails with an error
+ * that ERRNO_IS_NEG_SECCOMP_FATAL() classifies as fatal; other failures are only logged at debug level. */
+int seccomp_restrict_realtime_full(int error_code) {
+ static const int permitted_policies[] = {
+ SCHED_OTHER,
+ SCHED_BATCH,
+ SCHED_IDLE,
+ };
+
+ int r, max_policy = 0;
+ uint32_t arch;
+ unsigned i;
+
+ assert(error_code > 0);
+
+ /* Determine the highest policy constant we want to allow */
+ for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
+ if (permitted_policies[i] > max_policy)
+ max_policy = permitted_policies[i];
+
+ SECCOMP_FOREACH_LOCAL_ARCH(arch) {
+ _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
+ int p;
+
+ log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
+
+ r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
+ if (r < 0)
+ return r;
+
+ /* Go through all policies with lower values than that, and block them -- unless they appear in the
+ * allow list. */
+ for (p = 0; p < max_policy; p++) {
+ bool good = false;
+
+ /* Check if this is in the allow list. */
+ for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
+ if (permitted_policies[i] == p) {
+ good = true;
+ break;
+ }
+
+ if (good)
+ continue;
+
+ /* Deny this policy */
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(error_code),
+ SCMP_SYS(sched_setscheduler),
+ 1,
+ SCMP_A1(SCMP_CMP_EQ, p));
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m",
+ seccomp_arch_to_string(arch));
+ /* NOTE(review): this 'continue' advances to the next policy value, not to the
+ * next architecture, i.e. the filter is still completed and loaded below, just
+ * without the rule for policy p. Confirm that this is intended. */
+ continue;
+ }
+ }
+
+ /* Deny-list all other policies, i.e. the ones with higher values. Note that all comparisons
+ * are unsigned here, hence no need to check for < 0 values. */
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(error_code),
+ SCMP_SYS(sched_setscheduler),
+ 1,
+ SCMP_A1(SCMP_CMP_GT, max_policy));
+ if (r < 0) {
+ log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m",
+ seccomp_arch_to_string(arch));
+ continue;
+ }
+
+ r = seccomp_load(seccomp);
+ if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
+ return r;
+ if (r < 0)
+ log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m",
+ seccomp_arch_to_string(arch));
+ }
+
+ return 0;
+}
+
+/* Adds a rule to 'seccomp' that makes system call 'nr' fail with EPERM, restricted by the 'arg_cnt'
+ * argument comparisons given in 'arg'. On failure the system call name is resolved for 'arch' and the
+ * error is logged at debug level. Returns the result of seccomp_rule_add_exact(). */
+static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
+                                      uint32_t arch,
+                                      int nr,
+                                      unsigned arg_cnt,
+                                      const struct scmp_arg_cmp arg) {
+        _cleanup_free_ char *syscall_name = NULL;
+        int rc;
+
+        rc = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
+        if (rc >= 0)
+                return rc;
+
+        /* Only look up the name when there actually is something to report. */
+        syscall_name = seccomp_syscall_resolve_num_arch(arch, nr);
+        log_debug_errno(rc, "Failed to add %s() rule for architecture %s, skipping: %m",
+                        strna(syscall_name),
+                        seccomp_arch_to_string(arch));
+
+        return rc;
+}
+
+/* Build-time sanity check: on the architectures listed in the #if below, SCMP_SYS() must resolve shmget(),
+ * shmat() and shmdt() to positive system call numbers. seccomp_memory_deny_write_execute() uses
+ * SCMP_SYS(shmat) directly for the ARM, x86-64, x32, AArch64, LoongArch64 and RISC-V 64 cases. */
+#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || defined(__loongarch_lp64) || (defined(__riscv) && __riscv_xlen == 64)
+assert_cc(SCMP_SYS(shmget) > 0);
+assert_cc(SCMP_SYS(shmat) > 0);
+assert_cc(SCMP_SYS(shmdt) > 0);
+#endif
+
+/* Installs filters that refuse (with EPERM) requests for memory that is executable in a way that allows
+ * code to be written and then run:
+ *
+ *   → mmap()/mmap2() with both PROT_EXEC and PROT_WRITE set
+ *   → mprotect() and pkey_mprotect() with PROT_EXEC set
+ *   → shmat() with SHM_EXEC set (only where shmat() is a system call of its own)
+ *
+ * Architectures for which the switch below does not name the mmap() system call to filter are skipped.
+ *
+ * Returns the number of architectures for which a filter was actually loaded (possibly 0), or a negative
+ * errno if seccomp_init_for_arch() fails or seccomp_load() fails with an error classified as fatal by
+ * ERRNO_IS_NEG_SECCOMP_FATAL(). Failed rules and non-fatal load failures are logged at debug level and
+ * the architecture is skipped, without being counted. */
+int seccomp_memory_deny_write_execute(void) {
+        uint32_t arch;
+        unsigned loaded = 0;
+
+        SECCOMP_FOREACH_LOCAL_ARCH(arch) {
+                _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
+                int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0, r;
+
+                log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
+
+                switch (arch) {
+
+                /* Note that on some architectures shmat() isn't available, and the call is multiplexed through ipc().
+                 * We ignore that here, which means there's still a way to get writable/executable
+                 * memory, if an IPC key is mapped like this. That's a pity, but no total loss.
+                 *
+                 * Also, PARISC isn't here right now because it still needs executable memory, but work is in progress
+                 * on that front (kernel work done in 5.18).
+                 */
+
+                case SCMP_ARCH_X86:
+                case SCMP_ARCH_S390:
+                        filter_syscall = SCMP_SYS(mmap2);
+                        block_syscall = SCMP_SYS(mmap);
+                        /* shmat multiplexed, see above */
+                        break;
+
+                case SCMP_ARCH_PPC:
+                case SCMP_ARCH_PPC64:
+                case SCMP_ARCH_PPC64LE:
+                case SCMP_ARCH_S390X:
+                        filter_syscall = SCMP_SYS(mmap);
+                        /* shmat multiplexed, see above */
+                        break;
+
+                case SCMP_ARCH_ARM:
+                        filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
+                        shmat_syscall = SCMP_SYS(shmat);
+                        break;
+
+                case SCMP_ARCH_X86_64:
+                case SCMP_ARCH_X32:
+                case SCMP_ARCH_AARCH64:
+#ifdef SCMP_ARCH_LOONGARCH64
+                case SCMP_ARCH_LOONGARCH64:
+#endif
+#ifdef SCMP_ARCH_RISCV64
+                case SCMP_ARCH_RISCV64:
+#endif
+                        filter_syscall = SCMP_SYS(mmap); /* amd64, x32, arm64, loongarch64 and riscv64 have only mmap */
+                        shmat_syscall = SCMP_SYS(shmat);
+                        break;
+
+                /* Please add more definitions here, if you port systemd to other architectures! */
+
+#if !defined(__i386__) && !defined(__x86_64__) && !defined(__hppa__) && !defined(__hppa64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__) && !(defined(__riscv) && __riscv_xlen == 64) && !defined(__loongarch_lp64)
+#warning "Consider adding the right mmap() syscall definitions here!"
+#endif
+                }
+
+                /* Can't filter mmap() on this arch, then skip it */
+                if (filter_syscall == 0)
+                        continue;
+
+                r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
+                if (r < 0)
+                        return r;
+
+                r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
+                                               1,
+                                               SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
+                if (r < 0)
+                        continue;
+
+                /* Where a second mmap() variant exists that we do not filter by argument, refuse it entirely. */
+                if (block_syscall != 0) {
+                        r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
+                        if (r < 0)
+                                continue;
+                }
+
+                r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
+                                               1,
+                                               SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
+                if (r < 0)
+                        continue;
+
+                r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
+                                               1,
+                                               SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
+                if (r < 0)
+                        continue;
+
+                if (shmat_syscall > 0) {
+                        r = add_seccomp_syscall_filter(seccomp, arch, shmat_syscall,
+                                                       1,
+                                                       SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
+                        if (r < 0)
+                                continue;
+                }
+
+                r = seccomp_load(seccomp);
+                if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
+                        return r;
+                if (r < 0) {
+                        log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m",
+                                        seccomp_arch_to_string(arch));
+                        /* The filter did not make it into the kernel, hence it must not be counted as
+                         * loaded, otherwise the return value and the message below would claim protection
+                         * that is not in effect. */
+                        continue;
+                }
+
+                loaded++;
+        }
+
+        if (loaded == 0)
+                log_debug("Failed to install any seccomp rules for MemoryDenyWriteExecute=.");
+
+        return loaded;
+}
+
+/* Restricts the system call architectures usable by the calling process to the native one plus those
+ * contained in 'archs' (stored as UINT32_TO_PTR(arch + 1)).
+ *
+ * Every entry of the global seccomp_local_archs[] table that gets blocked is overwritten with
+ * SECCOMP_LOCAL_ARCH_BLOCKED, so that SECCOMP_FOREACH_LOCAL_ARCH() skips it from then on. Note that this
+ * marking happens before the filter is loaded.
+ *
+ * Returns 0 (also if nothing new had to be blocked, or if loading failed non-fatally), -ENOMEM if the
+ * filter context cannot be allocated, or a negative errno from seccomp_arch_add(), seccomp_attr_set() or a
+ * fatal seccomp_load() failure. */
+int seccomp_restrict_archs(Set *archs) {
+ _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
+ int r;
+ bool blocked_new = false;
+
+ /* This installs a filter with no rules, but that restricts the system call architectures to the specified
+ * list.
+ *
+ * There are some qualifications. However the most important use is to stop processes from bypassing
+ * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
+ * in a non-native architecture. There are no holes in this use case, at least so far. */
+
+ /* Note libseccomp includes our "native" (current) architecture in the filter by default.
+ * We do not remove it. For example, our callers expect to be able to call execve() afterwards
+ * to run a program with the restrictions applied. */
+ seccomp = seccomp_init(SCMP_ACT_ALLOW);
+ if (!seccomp)
+ return -ENOMEM;
+
+ for (unsigned i = 0; seccomp_local_archs[i] != SECCOMP_LOCAL_ARCH_END; ++i) {
+ uint32_t arch = seccomp_local_archs[i];
+
+ /* See above comment, our "native" architecture is never blocked. */
+ if (arch == seccomp_arch_native())
+ continue;
+
+ /* That architecture might have already been blocked by a previous call to seccomp_restrict_archs. */
+ if (arch == SECCOMP_LOCAL_ARCH_BLOCKED)
+ continue;
+
+ bool block = !set_contains(archs, UINT32_TO_PTR(arch + 1));
+
+ /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since
+ * x32 syscalls should basically match x86-64 for everything except the pointer type.
+ * The important thing is that you can block the old 32-bit x86 syscalls.
+ * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
+ if (block && arch == SCMP_ARCH_X86_64 && seccomp_arch_native() == SCMP_ARCH_X32)
+ block = !set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1));
+
+ if (block) {
+ seccomp_local_archs[i] = SECCOMP_LOCAL_ARCH_BLOCKED;
+ blocked_new = true;
+ } else {
+ /* -EEXIST is tolerated here, i.e. the architecture being part of the filter already is
+ * not an error. */
+ r = seccomp_arch_add(seccomp, arch);
+ if (r < 0 && r != -EEXIST)
+ return r;
+ }
+ }
+
+ /* All architectures that will be blocked by the seccomp program were
+ * already blocked. */
+ if (!blocked_new)
+ return 0;
+
+ r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
+ if (r < 0)
+ return r;
+
+ r = seccomp_load(seccomp);
+ if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
+ return r;
+ if (r < 0)
+ log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
+
+ return 0;
+}
+
+/* Converts the string list 'l' of architecture names into a newly allocated Set, returned in '*ret_archs',
+ * whose members are the architecture identifiers plus one, stored via UINT32_TO_PTR(). Returns 0 on
+ * success, -EINVAL if a name is not recognized by seccomp_arch_from_string(), and -ENOMEM if the set
+ * cannot be extended. '*ret_archs' is only written on success. */
+int parse_syscall_archs(char **l, Set **ret_archs) {
+        _cleanup_set_free_ Set *collected = NULL;
+
+        assert(l);
+        assert(ret_archs);
+
+        STRV_FOREACH(name, l) {
+                uint32_t arch_id;
+
+                if (seccomp_arch_from_string(*name, &arch_id) < 0)
+                        return -EINVAL;
+
+                /* Stored with an offset of one, matching the lookups done in seccomp_restrict_archs(). */
+                if (set_ensure_put(&collected, NULL, UINT32_TO_PTR(arch_id + 1)) < 0)
+                        return -ENOMEM;
+        }
+
+        *ret_archs = TAKE_PTR(collected);
+        return 0;
+}
+
+/* Walks the NUL-separated entry list of 'set' and, for each system call named there, either inserts it into
+ * 'filter' (if 'add' is true, keyed by syscall number plus one, with value -1) or removes it from 'filter'
+ * (if 'add' is false). Entries starting with '@' are names of other sets and are processed recursively.
+ * System calls unknown to seccomp_syscall_resolve_name() are logged at debug level and skipped.
+ *
+ * Returns 0 on success, -ENXIO if a referenced set cannot be found, or the negative result of hashmap_put()
+ * or of the recursive invocation. */
+int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
+        assert(set);
+
+        NULSTR_FOREACH(entry, set->value) {
+                int nr, rc;
+
+                if (entry[0] == '@') {
+                        const SyscallFilterSet *nested = syscall_filter_set_find(entry);
+
+                        if (!nested)
+                                return -ENXIO;
+
+                        rc = seccomp_filter_set_add(filter, add, nested);
+                        if (rc < 0)
+                                return rc;
+
+                        continue;
+                }
+
+                nr = seccomp_syscall_resolve_name(entry);
+                if (nr == __NR_SCMP_ERROR) {
+                        log_debug("System call %s is not known, ignoring.", entry);
+                        continue;
+                }
+
+                if (!add) {
+                        (void) hashmap_remove(filter, INT_TO_PTR(nr + 1));
+                        continue;
+                }
+
+                rc = hashmap_put(filter, INT_TO_PTR(nr + 1), INT_TO_PTR(-1));
+                if (rc < 0)
+                        return rc;
+        }
+
+        return 0;
+}
+
+/* Locks the execution domain: on every local architecture, personality() fails with EPERM unless its first
+ * argument equals 'personality'.
+ *
+ * Returns -EINVAL if 'personality' is PERSONALITY_INVALID or larger. Otherwise returns 0, or a negative
+ * errno if seccomp_init_for_arch() fails or seccomp_load() fails with an error that
+ * ERRNO_IS_NEG_SECCOMP_FATAL() classifies as fatal. Failure to add the rule and non-fatal load failures
+ * are logged at debug level and the architecture is skipped. */
+int seccomp_lock_personality(unsigned long personality) {
+        uint32_t arch;
+        int r;
+
+        if (personality >= PERSONALITY_INVALID)
+                return -EINVAL;
+
+        SECCOMP_FOREACH_LOCAL_ARCH(arch) {
+                _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
+
+                r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
+                if (r < 0)
+                        return r;
+
+                r = seccomp_rule_add_exact(
+                                seccomp,
+                                SCMP_ACT_ERRNO(EPERM),
+                                SCMP_SYS(personality),
+                                1,
+                                SCMP_A0(SCMP_CMP_NE, personality));
+                if (r < 0) {
+                        /* Name the rule that actually failed; this is the personality() rule, not a
+                         * scheduler rule. */
+                        log_debug_errno(r, "Failed to add personality() rule for architecture %s, skipping: %m",
+                                        seccomp_arch_to_string(arch));
+                        continue;
+                }
+
+                r = seccomp_load(seccomp);
+                if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
+                        return r;
+                if (r < 0)
+                        log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m",
+                                        seccomp_arch_to_string(arch));
+        }
+
+        return 0;
+}
+
+/* Makes sethostname() and setdomainname() fail with EPERM on every local architecture. Returns 0, or a
+ * negative errno if filter initialization fails or loading fails with a fatal seccomp error; other
+ * failures are logged at debug level and the architecture is skipped. */
+int seccomp_protect_hostname(void) {
+        uint32_t arch;
+
+        SECCOMP_FOREACH_LOCAL_ARCH(arch) {
+                _cleanup_(seccomp_releasep) scmp_filter_ctx ctx = NULL;
+                int rc;
+
+                rc = seccomp_init_for_arch(&ctx, arch, SCMP_ACT_ALLOW);
+                if (rc < 0)
+                        return rc;
+
+                rc = seccomp_rule_add_exact(ctx, SCMP_ACT_ERRNO(EPERM), SCMP_SYS(sethostname), 0);
+                if (rc < 0) {
+                        log_debug_errno(rc, "Failed to add sethostname() rule for architecture %s, skipping: %m",
+                                        seccomp_arch_to_string(arch));
+                        continue;
+                }
+
+                rc = seccomp_rule_add_exact(ctx, SCMP_ACT_ERRNO(EPERM), SCMP_SYS(setdomainname), 0);
+                if (rc < 0) {
+                        log_debug_errno(rc, "Failed to add setdomainname() rule for architecture %s, skipping: %m",
+                                        seccomp_arch_to_string(arch));
+                        continue;
+                }
+
+                rc = seccomp_load(ctx);
+                if (rc >= 0)
+                        continue;
+
+                /* Fatal errors abort the whole operation, anything else is merely reported. */
+                if (ERRNO_IS_NEG_SECCOMP_FATAL(rc))
+                        return rc;
+
+                log_debug_errno(rc, "Failed to apply hostname restrictions for architecture %s, skipping: %m",
+                                seccomp_arch_to_string(arch));
+        }
+
+        return 0;
+}
+
+/* Adds rules to 'seccomp' that make the file creation and mode changing system calls listed below fail with
+ * EPERM whenever all bits of 'm' are set in their mode argument. Each rule is attempted independently; a
+ * failure is logged at debug level and the remaining rules are still tried. */
+static int seccomp_restrict_sxid(scmp_filter_ctx seccomp, mode_t m) {
+ /* Checks the mode_t parameter of the following system calls:
+ *
+ * → chmod() + fchmod() + fchmodat() + fchmodat2()
+ * → open() + creat() + openat()
+ * → mkdir() + mkdirat()
+ * → mknod() + mknodat()
+ *
+ * Returns error if *everything* failed, and 0 otherwise.
+ */
+ int r;
+ bool any = false;
+
+ /* chmod(): the mode is matched against argument 1. */
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(chmod),
+ 1,
+ SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
+ if (r < 0)
+ log_debug_errno(r, "Failed to add filter for chmod: %m");
+ else
+ any = true;
+
+ /* fchmod(): the mode is matched against argument 1. */
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(fchmod),
+ 1,
+ SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
+ if (r < 0)
+ log_debug_errno(r, "Failed to add filter for fchmod: %m");
+ else
+ any = true;
+
+ /* fchmodat(): the mode is matched against argument 2. */
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(fchmodat),
+ 1,
+ SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
+ if (r < 0)
+ log_debug_errno(r, "Failed to add filter for fchmodat: %m");
+ else
+ any = true;
+
+#if defined(__SNR_fchmodat2)
+ /* fchmodat2(): same argument position as fchmodat(). */
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(fchmodat2),
+ 1,
+ SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
+#else
+ /* It looks like this libseccomp does not know about fchmodat2().
+ * Pretend the fchmodat2() system call is not supported at all,
+ * regardless of the kernel version. */
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(ENOSYS),
+ __NR_fchmodat2,
+ 0);
+#endif
+ if (r < 0)
+ log_debug_errno(r, "Failed to add filter for fchmodat2: %m");
+ else
+ any = true;
+
+ /* mkdir(): the mode is matched against argument 1. */
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(mkdir),
+ 1,
+ SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
+ if (r < 0)
+ log_debug_errno(r, "Failed to add filter for mkdir: %m");
+ else
+ any = true;
+
+ /* mkdirat(): the mode is matched against argument 2. */
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(mkdirat),
+ 1,
+ SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
+ if (r < 0)
+ log_debug_errno(r, "Failed to add filter for mkdirat: %m");
+ else
+ any = true;
+
+ /* mknod(): the mode is matched against argument 1. */
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(mknod),
+ 1,
+ SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
+ if (r < 0)
+ log_debug_errno(r, "Failed to add filter for mknod: %m");
+ else
+ any = true;
+
+ /* mknodat(): the mode is matched against argument 2. */
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(mknodat),
+ 1,
+ SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
+ if (r < 0)
+ log_debug_errno(r, "Failed to add filter for mknodat: %m");
+ else
+ any = true;
+
+ /* open(): only matches when O_CREAT is set in argument 1 and the mode bits are set in argument 2. */
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(open),
+ 2,
+ SCMP_A1(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
+ SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
+ if (r < 0)
+ log_debug_errno(r, "Failed to add filter for open: %m");
+ else
+ any = true;
+
+ /* openat(): like open(), but with both arguments shifted by one position. */
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(openat),
+ 2,
+ SCMP_A2(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
+ SCMP_A3(SCMP_CMP_MASKED_EQ, m, m));
+ if (r < 0)
+ log_debug_errno(r, "Failed to add filter for openat: %m");
+ else
+ any = true;
+
+#if defined(__SNR_openat2)
+ /* The new openat2() system call can't be filtered sensibly, since it moves the flags parameter into
+ * an indirect structure. Let's block it entirely for now. That should be a reasonably OK thing to do
+ * for now, since openat2() is very new and code generally needs fallback logic anyway to be
+ * compatible with kernels that are not absolutely recent. We would normally return EPERM for a
+ * policy check, but this isn't strictly a policy check. Instead, we return ENOSYS to force programs
+ * to call open() or openat() instead. We can properly enforce policy for those functions. */
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(ENOSYS),
+ SCMP_SYS(openat2),
+ 0);
+ if (r < 0)
+ log_debug_errno(r, "Failed to add filter for openat2: %m");
+ else
+ any = true;
+#endif
+
+ /* creat(): the mode is matched against argument 1. */
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(creat),
+ 1,
+ SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
+ if (r < 0)
+ log_debug_errno(r, "Failed to add filter for creat: %m");
+ else
+ any = true;
+
+ /* If not a single rule could be added, r is the (negative) result of the creat() rule, i.e. of the
+ * last attempt. */
+ return any ? 0 : r;
+}
+
+/* Installs, on every local architecture, the seccomp_restrict_sxid() rules for both S_ISUID and S_ISGID.
+ * An architecture is skipped only if both rule sets failed entirely. Returns 0, or a negative errno if
+ * filter initialization fails or loading fails with a fatal seccomp error. */
+int seccomp_restrict_suid_sgid(void) {
+        uint32_t arch;
+
+        SECCOMP_FOREACH_LOCAL_ARCH(arch) {
+                _cleanup_(seccomp_releasep) scmp_filter_ctx ctx = NULL;
+                int suid_rc, sgid_rc, rc;
+
+                rc = seccomp_init_for_arch(&ctx, arch, SCMP_ACT_ALLOW);
+                if (rc < 0)
+                        return rc;
+
+                suid_rc = seccomp_restrict_sxid(ctx, S_ISUID);
+                if (suid_rc < 0)
+                        log_debug_errno(suid_rc, "Failed to add suid rule for architecture %s, ignoring: %m",
+                                        seccomp_arch_to_string(arch));
+
+                sgid_rc = seccomp_restrict_sxid(ctx, S_ISGID);
+                if (sgid_rc < 0)
+                        log_debug_errno(sgid_rc, "Failed to add sgid rule for architecture %s, ignoring: %m",
+                                        seccomp_arch_to_string(arch));
+
+                /* Nothing at all could be added, hence there is nothing worth loading. */
+                if (suid_rc < 0 && sgid_rc < 0)
+                        continue;
+
+                rc = seccomp_load(ctx);
+                if (rc >= 0)
+                        continue;
+
+                if (ERRNO_IS_NEG_SECCOMP_FATAL(rc))
+                        return rc;
+
+                log_debug_errno(rc, "Failed to apply suid/sgid restrictions for architecture %s, skipping: %m",
+                                seccomp_arch_to_string(arch));
+        }
+
+        return 0;
+}
+
+/* Picks the seccomp "kill" action to use: SCMP_ACT_KILL_PROCESS if the libseccomp headers define it and
+ * seccomp_api_get() reports API level 3 or newer, and SCMP_ACT_KILL otherwise. */
+uint32_t scmp_act_kill_process(void) {
+
+        /* SCMP_ACT_KILL is the same as SCMP_ACT_KILL_THREAD. We never actually want to use it, as its
+         * semantics are nuts (killing arbitrary threads of a program is just a bad idea), but on old
+         * kernels/old libseccomp it is all we have, and at least for single-threaded apps it does the
+         * right thing. */
+        uint32_t action = SCMP_ACT_KILL;
+
+#ifdef SCMP_ACT_KILL_PROCESS
+        if (seccomp_api_get() >= 3)
+                action = SCMP_ACT_KILL_PROCESS;
+#endif
+
+        return action;
+}
+
+/* Splits a "syscall:errno" specification such as "uname:EILSEQ" or "@sync:255" at the first colon.
+ *
+ * On success returns 0, stores a newly allocated copy of the part before the colon in '*name' and the
+ * parsed value of the part after it in '*error'. If there is no colon the whole string is the name and
+ * '*error' is set to -1. Whether the name refers to an existing system call is not checked here.
+ *
+ * Returns the negative result of seccomp_parse_errno_or_action() if the part after the colon cannot be
+ * parsed, -ENOMEM on allocation failure and -EINVAL if the name is empty. The output parameters are only
+ * written on success. */
+int parse_syscall_and_errno(const char *in, char **name, int *error) {
+        _cleanup_free_ char *syscall = NULL;
+        const char *colon;
+        int code = -1;
+
+        assert(in);
+        assert(name);
+        assert(error);
+
+        colon = strchr(in, ':');
+        if (!colon)
+                syscall = strdup(in);
+        else {
+                /* Validate the suffix first, so that nothing is allocated for invalid input. */
+                code = seccomp_parse_errno_or_action(colon + 1);
+                if (code < 0)
+                        return code;
+
+                syscall = strndup(in, colon - in);
+        }
+
+        if (!syscall)
+                return -ENOMEM;
+
+        if (isempty(syscall))
+                return -EINVAL;
+
+        *error = code;
+        *name = TAKE_PTR(syscall);
+
+        return 0;
+}
+
+/* Adds rules to 'seccomp' that make open() and openat() fail with EINVAL when all bits of 'flag' (O_SYNC or
+ * so) are set in their flags argument, in the hope that the caller then retries without the flag. Where
+ * libseccomp knows openat2(), that call is made to fail with ENOSYS altogether.
+ *
+ * Returns 0 if at least one rule could be added, otherwise the negative result of the last attempt. */
+static int block_open_flag(scmp_filter_ctx seccomp, int flag) {
+        bool installed = false;
+        int rc;
+
+        /* open(): the flags are argument 1. */
+        rc = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EINVAL), SCMP_SYS(open), 1,
+                                    SCMP_A1(SCMP_CMP_MASKED_EQ, flag, flag));
+        if (rc >= 0)
+                installed = true;
+        else
+                log_debug_errno(rc, "Failed to add filter for open: %m");
+
+        /* openat(): the flags are argument 2. */
+        rc = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EINVAL), SCMP_SYS(openat), 1,
+                                    SCMP_A2(SCMP_CMP_MASKED_EQ, flag, flag));
+        if (rc >= 0)
+                installed = true;
+        else
+                log_debug_errno(rc, "Failed to add filter for openat: %m");
+
+#if defined(__SNR_openat2)
+        /* openat2() passes its flags in an indirect structure and hence cannot be filtered by argument. */
+        rc = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(ENOSYS), SCMP_SYS(openat2), 0);
+        if (rc >= 0)
+                installed = true;
+        else
+                log_debug_errno(rc, "Failed to add filter for openat2: %m");
+#endif
+
+        if (installed)
+                return 0;
+
+        return rc;
+}
+
+/* Turns the system calls of the @sync filter set into no-ops that report success, and additionally makes
+ * open()/openat() with O_SYNC or O_DSYNC fail (see block_open_flag()), on every local architecture.
+ *
+ * This is mostly identical to SystemCallFilter=~@sync:0, but simpler to use and separately manageable.
+ *
+ * Returns 0, or a negative errno if filter initialization fails or loading fails with a fatal seccomp
+ * error. Failures to add individual rules are logged at debug level and otherwise ignored. */
+int seccomp_suppress_sync(void) {
+        uint32_t arch;
+
+        SECCOMP_FOREACH_LOCAL_ARCH(arch) {
+                _cleanup_(seccomp_releasep) scmp_filter_ctx ctx = NULL;
+                int rc;
+
+                rc = seccomp_init_for_arch(&ctx, arch, SCMP_ACT_ALLOW);
+                if (rc < 0)
+                        return rc;
+
+                NULSTR_FOREACH(syscall_name, syscall_filter_sets[SYSCALL_FILTER_SET_SYNC].value) {
+                        int nr = seccomp_syscall_resolve_name(syscall_name);
+
+                        if (nr == __NR_SCMP_ERROR) {
+                                log_debug("System call %s is not known, ignoring.", syscall_name);
+                                continue;
+                        }
+
+                        /* Error number 0 means success → we want this to be a NOP after all */
+                        rc = seccomp_rule_add_exact(ctx, SCMP_ACT_ERRNO(0), nr, 0);
+                        if (rc < 0)
+                                log_debug_errno(rc, "Failed to add filter for system call %s, ignoring: %m", syscall_name);
+                }
+
+                (void) block_open_flag(ctx, O_SYNC);
+#if O_DSYNC != O_SYNC
+                (void) block_open_flag(ctx, O_DSYNC);
+#endif
+
+                rc = seccomp_load(ctx);
+                if (rc >= 0)
+                        continue;
+
+                if (ERRNO_IS_NEG_SECCOMP_FATAL(rc))
+                        return rc;
+
+                log_debug_errno(rc, "Failed to apply sync() suppression for architecture %s, skipping: %m",
+                                seccomp_arch_to_string(arch));
+        }
+
+        return 0;
+}
diff --git a/src/shared/seccomp-util.h b/src/shared/seccomp-util.h
new file mode 100644
index 0000000..7583357
--- /dev/null
+++ b/src/shared/seccomp-util.h
@@ -0,0 +1,180 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#if HAVE_SECCOMP
+#include <seccomp.h>
+#endif
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "errno-list.h"
+#include "errno-util.h"
+#include "parse-util.h"
+#include "set.h"
+#include "string-util.h"
+
+#if HAVE_SECCOMP
+
+const char* seccomp_arch_to_string(uint32_t c);
+int seccomp_arch_from_string(const char *n, uint32_t *ret);
+
+int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action);
+
+bool is_seccomp_available(void);
+
+/* A named group of system calls; the predefined groups live in syscall_filter_sets[]. */
+typedef struct SyscallFilterSet {
+ /* Name of the set. Presumably the '@'-prefixed string that syscall_filter_set_find() matches against --
+ * TODO confirm against its implementation. */
+ const char *name;
+ /* NOTE(review): presumably a human-readable description of the set; its use is not visible here. */
+ const char *help;
+ /* NUL-separated list of entries, walked with NULSTR_FOREACH(). Entries starting with '@' name other
+ * sets that are included, all other entries are system call names. */
+ const char *value;
+} SyscallFilterSet;
+
+/* Indexes into syscall_filter_sets[], one per predefined system call group. _SYSCALL_FILTER_SET_MAX is the
+ * number of groups. */
+enum {
+ /* Please leave DEFAULT first and KNOWN last, but sort the rest alphabetically */
+ SYSCALL_FILTER_SET_DEFAULT,
+ SYSCALL_FILTER_SET_AIO,
+ SYSCALL_FILTER_SET_BASIC_IO,
+ SYSCALL_FILTER_SET_CHOWN,
+ SYSCALL_FILTER_SET_CLOCK,
+ SYSCALL_FILTER_SET_CPU_EMULATION,
+ SYSCALL_FILTER_SET_DEBUG,
+ SYSCALL_FILTER_SET_FILE_SYSTEM,
+ SYSCALL_FILTER_SET_IO_EVENT,
+ SYSCALL_FILTER_SET_IPC,
+ SYSCALL_FILTER_SET_KEYRING,
+ SYSCALL_FILTER_SET_MEMLOCK,
+ SYSCALL_FILTER_SET_MODULE,
+ SYSCALL_FILTER_SET_MOUNT,
+ SYSCALL_FILTER_SET_NETWORK_IO,
+ SYSCALL_FILTER_SET_OBSOLETE,
+ SYSCALL_FILTER_SET_PKEY,
+ SYSCALL_FILTER_SET_PRIVILEGED,
+ SYSCALL_FILTER_SET_PROCESS,
+ SYSCALL_FILTER_SET_RAW_IO,
+ SYSCALL_FILTER_SET_REBOOT,
+ SYSCALL_FILTER_SET_RESOURCES,
+ SYSCALL_FILTER_SET_SANDBOX,
+ SYSCALL_FILTER_SET_SETUID,
+ SYSCALL_FILTER_SET_SIGNAL,
+ SYSCALL_FILTER_SET_SWAP,
+ SYSCALL_FILTER_SET_SYNC,
+ SYSCALL_FILTER_SET_SYSTEM_SERVICE,
+ SYSCALL_FILTER_SET_TIMER,
+ SYSCALL_FILTER_SET_KNOWN,
+ _SYSCALL_FILTER_SET_MAX,
+};
+
+/* Enforce the ordering rule stated at the top of the enum at build time. */
+assert_cc(SYSCALL_FILTER_SET_DEFAULT == 0);
+assert_cc(SYSCALL_FILTER_SET_KNOWN == _SYSCALL_FILTER_SET_MAX-1);
+
+extern const SyscallFilterSet syscall_filter_sets[];
+
+const SyscallFilterSet *syscall_filter_set_find(const char *name);
+
+int seccomp_filter_set_add(Hashmap *s, bool b, const SyscallFilterSet *set);
+
+int seccomp_add_syscall_filter_item(
+ scmp_filter_ctx *ctx,
+ const char *name,
+ uint32_t action,
+ char **exclude,
+ bool log_missing,
+ char ***added);
+
+int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing);
+int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* set, uint32_t action, bool log_missing);
+
+/* Bit flags accepted by seccomp_parse_syscall_filter(). */
+typedef enum SeccompParseFlags {
+ /* NOTE(review): together with ALLOW_LIST this decides whether a parsed system call is added to or
+ * removed from the filter hashmap; presumably set for "~"-prefixed (negated) lists -- confirm. */
+ SECCOMP_PARSE_INVERT = 1 << 0,
+ /* The filter being built is an allow list rather than a deny list. */
+ SECCOMP_PARSE_ALLOW_LIST = 1 << 1,
+ /* Log problems (e.g. out-of-memory via log_oom()) instead of only returning an error code. */
+ SECCOMP_PARSE_LOG = 1 << 2,
+ /* NOTE(review): meaning not visible from this part of the file -- confirm at the use site. */
+ SECCOMP_PARSE_PERMISSIVE = 1 << 3,
+} SeccompParseFlags;
+
+int seccomp_parse_syscall_filter(
+ const char *name,
+ int errno_num,
+ Hashmap *filter,
+ SeccompParseFlags flags,
+ const char *unit,
+ const char *filename, unsigned line);
+
+int seccomp_restrict_archs(Set *archs);
+int seccomp_restrict_namespaces(unsigned long retain);
+int seccomp_protect_sysctl(void);
+int seccomp_protect_syslog(void);
+int seccomp_restrict_address_families(Set *address_families, bool allow_list);
+int seccomp_restrict_realtime_full(int error_code); /* This is mostly for testing code. */
+/* Regular entry point: same as seccomp_restrict_realtime_full(), with EPERM as the error code reported for
+ * blocked calls. */
+static inline int seccomp_restrict_realtime(void) {
+ return seccomp_restrict_realtime_full(EPERM);
+}
+int seccomp_memory_deny_write_execute(void);
+int seccomp_lock_personality(unsigned long personality);
+int seccomp_protect_hostname(void);
+int seccomp_restrict_suid_sgid(void);
+
+/* Table of the architectures filters are installed for, terminated by SECCOMP_LOCAL_ARCH_END. Entries are
+ * overwritten with SECCOMP_LOCAL_ARCH_BLOCKED by seccomp_restrict_archs(). */
+extern uint32_t seccomp_local_archs[];
+
+#define SECCOMP_LOCAL_ARCH_END UINT32_MAX
+
+/* Note: 0 is safe to use here because although SCMP_ARCH_NATIVE is 0, it would
+ * never be in the seccomp_local_archs array anyway so we can use it as a
+ * marker. */
+#define SECCOMP_LOCAL_ARCH_BLOCKED 0
+
+/* Runs the statement that follows once for each entry of seccomp_local_archs[] that is not marked as
+ * blocked, with the entry assigned to 'arch' (an lvalue supplied by the caller). This expands to a for()
+ * loop followed by an if(), hence 'continue' and 'break' in the body act on this loop. */
+#define SECCOMP_FOREACH_LOCAL_ARCH(arch) \
+ for (unsigned _i = ({ (arch) = seccomp_local_archs[0]; 0; }); \
+ (arch) != SECCOMP_LOCAL_ARCH_END; \
+ (arch) = seccomp_local_archs[++_i]) \
+ if ((arch) != SECCOMP_LOCAL_ARCH_BLOCKED)
+
+/* Returns true if the negative errno-style value 'r' is one of the errors after which the callers in
+ * seccomp-util.c give up and propagate the error, instead of just logging it and going on:
+ *
+ * EPERM: NOTE(review): treated as fatal as well, but not explained by the original comment -- presumably
+ * the operation was refused for lack of privileges; confirm
+ * EACCES: does not have the CAP_SYS_ADMIN or no_new_privs == 1
+ * ENOMEM: out of memory, failed to allocate space for a libseccomp structure, or would exceed a defined constant
+ * EFAULT: addresses passed as args (by libseccomp) are invalid */
+static inline bool ERRNO_IS_NEG_SECCOMP_FATAL(intmax_t r) {
+ return IN_SET(r,
+ -EPERM,
+ -EACCES,
+ -ENOMEM,
+ -EFAULT);
+}
+_DEFINE_ABS_WRAPPER(SECCOMP_FATAL);
+
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(scmp_filter_ctx, seccomp_release, NULL);
+
+int parse_syscall_archs(char **l, Set **ret_archs);
+
+uint32_t scmp_act_kill_process(void);
+
+int parse_syscall_and_errno(const char *in, char **name, int *error);
+
+int seccomp_suppress_sync(void);
+
+#else
+
+/* Stub for builds without seccomp support (HAVE_SECCOMP is false): seccomp is never available. */
+static inline bool is_seccomp_available(void) {
+ return false;
+}
+
+#endif
+
+/* Special value that may be used in places where syscall filters otherwise expect an errno number. It is
+ * spelled "kill" in its string form (see the helpers below) and is replaced with a real seccomp action
+ * when the filter is built. */
+enum {
+ SECCOMP_ERROR_NUMBER_KILL = INT_MAX - 1,
+};
+
+/* Returns true if 'n' is either the special "kill" marker or accepted by errno_is_valid(). */
+static inline bool seccomp_errno_or_action_is_valid(int n) {
+        if (n == SECCOMP_ERROR_NUMBER_KILL)
+                return true;
+
+        return errno_is_valid(n);
+}
+
+/* Parses 'p' as the literal "kill" (yielding SECCOMP_ERROR_NUMBER_KILL), and otherwise hands it to
+ * parse_errno() and returns that function's result. */
+static inline int seccomp_parse_errno_or_action(const char *p) {
+        return streq_ptr(p, "kill") ? SECCOMP_ERROR_NUMBER_KILL : parse_errno(p);
+}
+
+/* Inverse of seccomp_parse_errno_or_action(): returns "kill" for the special marker, and the result of
+ * errno_to_name() for everything else. */
+static inline const char *seccomp_errno_or_action_to_string(int num) {
+        return num == SECCOMP_ERROR_NUMBER_KILL ? "kill" : errno_to_name(num);
+}
diff --git a/src/shared/securebits-util.c b/src/shared/securebits-util.c
new file mode 100644
index 0000000..c867807
--- /dev/null
+++ b/src/shared/securebits-util.c
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <stdio.h>
+
+#include "alloc-util.h"
+#include "extract-word.h"
+#include "securebits-util.h"
+#include "string-util.h"
+
+/* Formats the secure bits set in 'i' as a space-separated list of names and stores the newly allocated
+ * string in *s (an empty string if none of the known bits is set). Returns 0 on success, -ENOMEM if the
+ * allocation fails. */
+int secure_bits_to_string_alloc(int i, char **s) {
+        _cleanup_free_ char *joined = NULL;
+        size_t n;
+
+        assert(s);
+
+        /* Every name carries a trailing space so that the pieces can simply be concatenated. */
+        const char *keep_caps = (i & (1 << SECURE_KEEP_CAPS)) ? "keep-caps " : "";
+        const char *keep_caps_locked = (i & (1 << SECURE_KEEP_CAPS_LOCKED)) ? "keep-caps-locked " : "";
+        const char *no_fixup = (i & (1 << SECURE_NO_SETUID_FIXUP)) ? "no-setuid-fixup " : "";
+        const char *no_fixup_locked = (i & (1 << SECURE_NO_SETUID_FIXUP_LOCKED)) ? "no-setuid-fixup-locked " : "";
+        const char *noroot = (i & (1 << SECURE_NOROOT)) ? "noroot " : "";
+        const char *noroot_locked = (i & (1 << SECURE_NOROOT_LOCKED)) ? "noroot-locked " : "";
+
+        if (asprintf(&joined, "%s%s%s%s%s%s",
+                     keep_caps,
+                     keep_caps_locked,
+                     no_fixup,
+                     no_fixup_locked,
+                     noroot,
+                     noroot_locked) < 0)
+                return -ENOMEM;
+
+        /* Drop the space left behind by the last name, if there was any name at all. */
+        n = strlen(joined);
+        if (n > 0)
+                joined[n - 1] = '\0';
+
+        *s = TAKE_PTR(joined);
+        return 0;
+}
+
+/* Parses a whitespace-separated list of secure bit names into a bit mask. Unknown words are silently
+ * skipped. Returns the mask collected so far when extraction stops (end of string or a non-ENOMEM
+ * extraction error), or -ENOMEM. */
+int secure_bits_from_string(const char *s) {
+        static const struct {
+                const char *name;
+                int bit;
+        } known[] = {
+                { "keep-caps",              SECURE_KEEP_CAPS              },
+                { "keep-caps-locked",       SECURE_KEEP_CAPS_LOCKED       },
+                { "no-setuid-fixup",        SECURE_NO_SETUID_FIXUP        },
+                { "no-setuid-fixup-locked", SECURE_NO_SETUID_FIXUP_LOCKED },
+                { "noroot",                 SECURE_NOROOT                 },
+                { "noroot-locked",          SECURE_NOROOT_LOCKED          },
+        };
+        const char *cursor = s;
+        int mask = 0;
+
+        for (;;) {
+                _cleanup_free_ char *word = NULL;
+                int k;
+
+                k = extract_first_word(&cursor, &word, NULL, EXTRACT_UNQUOTE);
+                if (k == -ENOMEM)
+                        return k;
+                if (k <= 0)
+                        break;
+
+                for (size_t idx = 0; idx < sizeof(known) / sizeof(known[0]); idx++)
+                        if (streq(word, known[idx].name)) {
+                                mask |= 1 << known[idx].bit;
+                                break;
+                        }
+        }
+
+        return mask;
+}
diff --git a/src/shared/securebits-util.h b/src/shared/securebits-util.h
new file mode 100644
index 0000000..caf8e6d
--- /dev/null
+++ b/src/shared/securebits-util.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "missing_securebits.h"
+
+int secure_bits_to_string_alloc(int i, char **s);
+int secure_bits_from_string(const char *s);
+
+/* Returns true if 'i' has no bits set outside of SECURE_ALL_BITS | SECURE_ALL_LOCKS. */
+static inline bool secure_bits_is_valid(int i) {
+        return (i & ~(SECURE_ALL_BITS | SECURE_ALL_LOCKS)) == 0;
+}
+
+/* Like secure_bits_to_string_alloc(), but refuses masks that fail secure_bits_is_valid() with -EINVAL. */
+static inline int secure_bits_to_string_alloc_with_check(int n, char **s) {
+        return secure_bits_is_valid(n) ? secure_bits_to_string_alloc(n, s) : -EINVAL;
+}
diff --git a/src/shared/selinux-util.c b/src/shared/selinux-util.c
new file mode 100644
index 0000000..2fef29c
--- /dev/null
+++ b/src/shared/selinux-util.c
@@ -0,0 +1,762 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stddef.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/un.h>
+#include <syslog.h>
+
+#if HAVE_SELINUX
+#include <selinux/avc.h>
+#include <selinux/context.h>
+#include <selinux/label.h>
+#include <selinux/selinux.h>
+#endif
+
+#include "alloc-util.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "label.h"
+#include "log.h"
+#include "macro.h"
+#include "mallinfo-util.h"
+#include "path-util.h"
+#include "selinux-util.h"
+#include "stdio-util.h"
+#include "time-util.h"
+
+#if HAVE_SELINUX
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(context_t, context_free, NULL);
+#define _cleanup_context_free_ _cleanup_(context_freep)
+
+/* Initialization state of this module, kept in the file-scope variable 'initialized'. */
+typedef enum Initialized {
+ UNINITIALIZED, /* start value of 'initialized' */
+ INITIALIZED, /* set by selinux_init() once it completed successfully */
+ LAZY_INITIALIZED, /* set by mac_selinux_init_lazy(); lets selinux_init(force=false) do the real work later */
+} Initialized;
+
+static int mac_selinux_reload(int seqno);
+
+static int cached_use = -1;
+static Initialized initialized = UNINITIALIZED;
+static int last_policyload = 0;
+static struct selabel_handle *label_hnd = NULL;
+static bool have_status_page = false;
+
+/* Logs at LOG_ERR if mac_selinux_enforcing() reports true, and at LOG_WARNING otherwise. */
+#define log_enforcing(...) \
+ log_full(mac_selinux_enforcing() ? LOG_ERR : LOG_WARNING, __VA_ARGS__)
+
+/* Like log_enforcing(), but takes an error value. The expression evaluates to the result of the logging
+ * call (or -ERRNO_VALUE(error) if the message is filtered by the max log level) when enforcing, and to 0
+ * when not enforcing. Callers that return this value hence only fail in enforcing mode. */
+#define log_enforcing_errno(error, ...) \
+ ({ \
+ bool _enforcing = mac_selinux_enforcing(); \
+ int _level = _enforcing ? LOG_ERR : LOG_WARNING; \
+ int _e = (error); \
+ \
+ int _r = (log_get_max_level() >= LOG_PRI(_level)) \
+ ? log_internal(_level, _e, PROJECT_FILE, __LINE__, __func__, __VA_ARGS__) \
+ : -ERRNO_VALUE(_e); \
+ _enforcing ? _r : 0; \
+ })
+
+/* 'pre' hook of the LabelOps table registered in selinux_init(): forwards its arguments unchanged to
+ * mac_selinux_create_file_prepare_at(). */
+static int mac_selinux_label_pre(int dir_fd, const char *path, mode_t mode) {
+ return mac_selinux_create_file_prepare_at(dir_fd, path, mode);
+}
+
+/* 'post' hook of the LabelOps table registered in selinux_init(): calls mac_selinux_create_file_clear().
+ * Both arguments are unused; always returns 0. */
+static int mac_selinux_label_post(int dir_fd, const char *path) {
+ mac_selinux_create_file_clear();
+ return 0;
+}
+#endif
+
+/* Reports whether SELinux is enabled. The answer of is_selinux_enabled() is cached in 'cached_use' until
+ * mac_selinux_retest() resets it. Always false when built without SELinux. */
+bool mac_selinux_use(void) {
+#if HAVE_SELINUX
+        if (!_unlikely_(cached_use < 0))
+                return cached_use;
+
+        cached_use = is_selinux_enabled() > 0;
+        log_trace("SELinux enabled state cached to: %s", enabled_disabled(cached_use));
+
+        return cached_use;
+#else
+        return false;
+#endif
+}
+
+/* Reports whether the enforcing query returned anything other than 0. Always false when built without
+ * SELinux. */
+bool mac_selinux_enforcing(void) {
+#if HAVE_SELINUX
+        /* If the SELinux status page has been successfully opened, retrieve the enforcing
+         * status over it to avoid system calls in security_getenforce(). */
+        int enforce = have_status_page ? selinux_status_getenforce() : security_getenforce();
+
+        return enforce != 0;
+#else
+        return false;
+#endif
+}
+
+/* Drops the cached result of mac_selinux_use(), so that the next call queries is_selinux_enabled() again.
+ * No-op when built without SELinux. */
+void mac_selinux_retest(void) {
+#if HAVE_SELINUX
+ cached_use = -1;
+#endif
+}
+
+#if HAVE_SELINUX
+/* (Re)loads the SELinux file-context labeling database via selabel_open() and stores the new handle in
+ * 'label_hnd', closing a previously stored handle only after the new one was opened. With debug logging
+ * enabled the load time (and, with HAVE_GENERIC_MALLINFO, the heap growth) is logged.
+ *
+ * Returns 0 on success. If selabel_open() fails, returns the value of log_enforcing_errno() and leaves
+ * 'label_hnd' as it was. */
+static int open_label_db(void) {
+ struct selabel_handle *hnd;
+ /* Avoid maybe-uninitialized false positives */
+ usec_t before_timestamp = USEC_INFINITY, after_timestamp = USEC_INFINITY;
+# if HAVE_GENERIC_MALLINFO
+ generic_mallinfo before_mallinfo = {};
+# endif
+
+ if (DEBUG_LOGGING) {
+# if HAVE_GENERIC_MALLINFO
+ before_mallinfo = generic_mallinfo_get();
+# endif
+ before_timestamp = now(CLOCK_MONOTONIC);
+ }
+
+ hnd = selabel_open(SELABEL_CTX_FILE, NULL, 0);
+ if (!hnd)
+ return log_enforcing_errno(errno, "Failed to initialize SELinux labeling handle: %m");
+
+ if (DEBUG_LOGGING) {
+ after_timestamp = now(CLOCK_MONOTONIC);
+# if HAVE_GENERIC_MALLINFO
+ generic_mallinfo after_mallinfo = generic_mallinfo_get();
+ size_t l = LESS_BY((size_t) after_mallinfo.uordblks, (size_t) before_mallinfo.uordblks);
+ log_debug("Successfully loaded SELinux database in %s, size on heap is %zuK.",
+ FORMAT_TIMESPAN(after_timestamp - before_timestamp, 0),
+ DIV_ROUND_UP(l, 1024));
+# else
+ log_debug("Successfully loaded SELinux database in %s.",
+ FORMAT_TIMESPAN(after_timestamp - before_timestamp, 0));
+# endif
+ }
+
+ /* release memory after measurement */
+ if (label_hnd)
+ selabel_close(label_hnd);
+ label_hnd = TAKE_PTR(hnd);
+
+ return 0;
+}
+#endif
+
+/* Initializes the SELinux support of this module: opens the SELinux status page (with netlink fallback),
+ * loads the labeling database and registers the label operations.
+ *
+ * force: true for an explicit initialization request (see mac_selinux_init()); false for internal calls,
+ *        which only do real work if mac_selinux_init_lazy() was called before.
+ *
+ * Returns 0 if SELinux is not in use, not compiled in, or if opening the status page failed while not
+ * enforcing; 1 if the module is initialized or initialization was deliberately skipped; a negative errno
+ * on failure. */
+static int selinux_init(bool force) {
+#if HAVE_SELINUX
+        static const LabelOps label_ops = {
+                .pre = mac_selinux_label_pre,
+                .post = mac_selinux_label_post,
+        };
+        int r;
+
+        if (!mac_selinux_use())
+                return 0;
+
+        if (initialized == INITIALIZED)
+                return 1;
+
+        /* Internal call from this module? Unless we were explicitly configured to allow lazy initialization
+         * bail out immediately. Pretend all is good, we do not want callers to abort here, for example at
+         * early boot when the policy is being initialised. */
+        if (!force && initialized != LAZY_INITIALIZED)
+                return 1;
+
+        r = selinux_status_open(/* netlink fallback */ 1);
+        if (r < 0) {
+                if (!ERRNO_IS_PRIVILEGE(errno))
+                        return log_enforcing_errno(errno, "Failed to open SELinux status page: %m");
+                log_warning_errno(errno, "selinux_status_open() with netlink fallback failed, not checking for policy reloads: %m");
+        } else if (r == 1)
+                log_warning("selinux_status_open() failed to open the status page, using the netlink fallback.");
+        else
+                have_status_page = true;
+
+        r = open_label_db();
+        if (r < 0) {
+                /* The status page is closed here, hence also clear the flag, so that
+                 * mac_selinux_enforcing() stops querying it and falls back to security_getenforce(). */
+                selinux_status_close();
+                have_status_page = false;
+                return r;
+        }
+
+        r = label_ops_set(&label_ops);
+        if (r < 0)
+                return r;
+
+        /* Save the current policyload sequence number, so mac_selinux_maybe_reload() does not trigger on
+         * first call without any actual change. */
+        last_policyload = selinux_status_policyload();
+
+        initialized = INITIALIZED;
+        return 1;
+#else
+        return 0;
+#endif
+}
+
+/* Explicit initialization entry point: runs selinux_init() with force=true and returns its result. */
+int mac_selinux_init(void) {
+ return selinux_init(/* force= */ true);
+}
+
+/* Marks the module as lazily initializable: if nothing was initialized yet, the state is switched to
+ * LAZY_INITIALIZED, which permits later selinux_init(force=false) calls to perform the initialization.
+ * Does no other work itself and always returns 0. */
+int mac_selinux_init_lazy(void) {
+#if HAVE_SELINUX
+ if (initialized == UNINITIALIZED)
+ initialized = LAZY_INITIALIZED; /* We'll be back later */
+#endif
+
+ return 0;
+}
+
+/* Compares the current policyload sequence number with the one seen last and, if they differ, reloads
+ * the labeling database via mac_selinux_reload(). Does nothing while the module state is UNINITIALIZED or
+ * when built without SELinux. */
+void mac_selinux_maybe_reload(void) {
+#if HAVE_SELINUX
+        int seqno;
+
+        if (initialized == UNINITIALIZED)
+                return;
+
+        /* Do not use selinux_status_updated(3), cause since libselinux 3.2 selinux_check_access(3),
+         * called in core and user instances, does also use it under the hood.
+         * That can cause changes to be consumed by selinux_check_access(3) and not being visible here.
+         * Also do not use selinux callbacks, selinux_set_callback(3), cause they are only automatically
+         * invoked since libselinux 3.2 by selinux_status_updated(3).
+         * Relevant libselinux commit: https://github.com/SELinuxProject/selinux/commit/05bdc03130d741e53e1fb45a958d0a2c184be503
+         * Debian Bullseye is going to ship libselinux 3.1, so stay compatible for backports. */
+        seqno = selinux_status_policyload();
+        if (seqno < 0) {
+                log_debug_errno(errno, "Failed to get SELinux policyload from status page: %m");
+                return;
+        }
+
+        if (seqno == last_policyload)
+                return;
+
+        mac_selinux_reload(seqno);
+        last_policyload = seqno;
+#endif
+}
+
+/* Releases everything selinux_init() set up: closes the labeling handle and the status page and resets
+ * the module state to UNINITIALIZED. No-op when built without SELinux. */
+void mac_selinux_finish(void) {
+
+#if HAVE_SELINUX
+        if (label_hnd) {
+                selabel_close(label_hnd);
+                label_hnd = NULL;
+        }
+
+        selinux_status_close();
+        have_status_page = false;
+
+        /* 'initialized' is an Initialized enum, not a boolean, hence reset it with the proper enumerator. */
+        initialized = UNINITIALIZED;
+#endif
+}
+
+#if HAVE_SELINUX
+/* Reloads the labeling database after a policy change. 'seqno' is only used for the debug message. A
+ * failure of open_label_db() is deliberately ignored; always returns 0. */
+static int mac_selinux_reload(int seqno) {
+ log_debug("SELinux reload %d", seqno);
+
+ (void) open_label_db();
+
+ return 0;
+}
+#endif
+
+#if HAVE_SELINUX
+/* Looks up the security context the policy assigns to 'label_path' (using the file type of 'fd') and
+ * applies it to the inode referenced by 'fd' through its /proc fd path.
+ *
+ * fd: the inode to relabel
+ * label_path: absolute path used for the policy lookup and in log messages
+ * flags: LABEL_IGNORE_EROFS suppresses failures caused by a read-only file system
+ *
+ * Returns 0 on success and whenever there is nothing to do (no labeling handle, no context defined for the
+ * path, labels unsupported, label already correct), -errno if fstat() fails, and otherwise the value of
+ * log_enforcing_errno(). */
+static int selinux_fix_fd(
+ int fd,
+ const char *label_path,
+ LabelFixFlags flags) {
+
+ _cleanup_freecon_ char* fcon = NULL;
+ struct stat st;
+ int r;
+
+ assert(fd >= 0);
+ assert(label_path);
+ assert(path_is_absolute(label_path));
+
+ if (fstat(fd, &st) < 0)
+ return -errno;
+
+ /* Check for policy reload so 'label_hnd' is kept up-to-date by callbacks */
+ mac_selinux_maybe_reload();
+ if (!label_hnd)
+ return 0;
+
+ if (selabel_lookup_raw(label_hnd, &fcon, label_path, st.st_mode) < 0) {
+ /* If there's no label to set, then exit without warning */
+ if (errno == ENOENT)
+ return 0;
+
+ return log_enforcing_errno(errno, "Unable to lookup intended SELinux security context of %s: %m", label_path);
+ }
+
+ if (setfilecon_raw(FORMAT_PROC_FD_PATH(fd), fcon) < 0) {
+ _cleanup_freecon_ char *oldcon = NULL;
+
+ /* Save errno right away, the calls below may overwrite it */
+ r = -errno;
+
+ /* If the FS doesn't support labels, then exit without warning */
+ if (ERRNO_IS_NOT_SUPPORTED(r))
+ return 0;
+
+ /* If the FS is read-only and we were told to ignore failures caused by that, suppress error */
+ if (r == -EROFS && (flags & LABEL_IGNORE_EROFS))
+ return 0;
+
+ /* If the old label is identical to the new one, suppress any kind of error */
+ if (getfilecon_raw(FORMAT_PROC_FD_PATH(fd), &oldcon) >= 0 && streq_ptr(fcon, oldcon))
+ return 0;
+
+ return log_enforcing_errno(r, "Unable to fix SELinux security context of %s: %m", label_path);
+ }
+
+ return 0;
+}
+#endif
+
+/* Relabels an inode according to the SELinux policy.
+ *
+ * atfd: directory fd (or AT_FDCWD) 'inode_path' is relative to; if 'inode_path' is NULL, the inode itself
+ * inode_path: path of the inode to relabel, opened with O_NOFOLLOW|O_PATH; may be NULL if atfd >= 0
+ * label_path: path to look up in the policy; if NULL, an absolute 'inode_path' is used, and otherwise the
+ *             path obtained via fd_get_path() from the inode fd
+ * flags: LABEL_IGNORE_ENOENT makes a missing inode a success; also passed on to selinux_fix_fd()
+ *
+ * Returns the result of selinux_init() if that is <= 0, 0 if no labeling handle is available or when built
+ * without SELinux, a negative errno on failure, and otherwise the result of selinux_fix_fd(). */
+int mac_selinux_fix_full(
+ int atfd,
+ const char *inode_path,
+ const char *label_path,
+ LabelFixFlags flags) {
+
+ assert(atfd >= 0 || atfd == AT_FDCWD);
+ assert(atfd >= 0 || inode_path);
+
+#if HAVE_SELINUX
+ _cleanup_close_ int opened_fd = -EBADF;
+ _cleanup_free_ char *p = NULL;
+ int inode_fd, r;
+
+ r = selinux_init(/* force= */ false);
+ if (r <= 0)
+ return r;
+
+ if (!label_hnd)
+ return 0;
+
+ if (inode_path) {
+ opened_fd = openat(atfd, inode_path, O_NOFOLLOW|O_CLOEXEC|O_PATH);
+ if (opened_fd < 0) {
+ if ((flags & LABEL_IGNORE_ENOENT) && errno == ENOENT)
+ return 0;
+
+ return -errno;
+ }
+
+ inode_fd = opened_fd;
+ } else
+ inode_fd = atfd;
+
+ if (!label_path) {
+ if (path_is_absolute(inode_path))
+ label_path = inode_path;
+ else {
+ r = fd_get_path(inode_fd, &p);
+ if (r < 0)
+ return r;
+
+ label_path = p;
+ }
+ }
+
+ return selinux_fix_fd(inode_fd, label_path, flags);
+#else
+ return 0;
+#endif
+}
+
+/* Sets the SELinux security context 'label' on 'path' via setfilecon(). Returns the result of
+ * selinux_init() if that is <= 0, 0 on success or when built without SELinux, and otherwise the value of
+ * log_enforcing_errno(). */
+int mac_selinux_apply(const char *path, const char *label) {
+
+        assert(path);
+
+#if HAVE_SELINUX
+        int k = selinux_init(/* force= */ false);
+        if (k <= 0)
+                return k;
+
+        assert(label);
+
+        if (setfilecon(path, label) >= 0)
+                return 0;
+
+        return log_enforcing_errno(errno, "Failed to set SELinux security context %s on path %s: %m", label, path);
+#else
+        return 0;
+#endif
+}
+
+/* Like mac_selinux_apply(), but operates on the inode behind 'fd' through its /proc fd path. 'path' is
+ * only used for the log message and may be NULL. */
+int mac_selinux_apply_fd(int fd, const char *path, const char *label) {
+
+        assert(fd >= 0);
+
+#if HAVE_SELINUX
+        int k = selinux_init(/* force= */ false);
+        if (k <= 0)
+                return k;
+
+        assert(label);
+
+        if (setfilecon(FORMAT_PROC_FD_PATH(fd), label) >= 0)
+                return 0;
+
+        return log_enforcing_errno(errno, "Failed to set SELinux security context %s on path %s: %m", label, strna(path));
+#else
+        return 0;
+#endif
+}
+
+/* Computes the context a "process" object created by us from the executable 'exe' would get, by feeding
+ * our own context and the file context of 'exe' to security_compute_create_raw(). The result is stored in
+ * *label.
+ *
+ * Returns 0 on success, -EOPNOTSUPP if SELinux is not in use/compiled in or a context is unavailable,
+ * -ENOSYS if the "process" security class is unknown, or another negative errno. */
+int mac_selinux_get_create_label_from_exe(const char *exe, char **label) {
+#if HAVE_SELINUX
+ _cleanup_freecon_ char *mycon = NULL, *fcon = NULL;
+ security_class_t sclass;
+ int r;
+
+ assert(exe);
+ assert(label);
+
+ r = selinux_init(/* force= */ false);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return -EOPNOTSUPP;
+
+ if (getcon_raw(&mycon) < 0)
+ return -errno;
+ if (!mycon)
+ return -EOPNOTSUPP;
+
+ if (getfilecon_raw(exe, &fcon) < 0)
+ return -errno;
+ if (!fcon)
+ return -EOPNOTSUPP;
+
+ sclass = string_to_security_class("process");
+ if (sclass == 0)
+ return -ENOSYS;
+
+ return RET_NERRNO(security_compute_create_raw(mycon, fcon, sclass, label));
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
+/* Stores our own raw SELinux context in *ret. Returns 0 on success, -EOPNOTSUPP if SELinux is not in
+ * use/compiled in or no context is available, or another negative errno. */
+int mac_selinux_get_our_label(char **ret) {
+        assert(ret);
+
+#if HAVE_SELINUX
+        _cleanup_freecon_ char *context = NULL;
+        int k;
+
+        k = selinux_init(/* force= */ false);
+        if (k <= 0)
+                return k < 0 ? k : -EOPNOTSUPP;
+
+        if (getcon_raw(&context) < 0)
+                return -errno;
+        if (!context)
+                return -EOPNOTSUPP;
+
+        *ret = TAKE_PTR(context);
+        return 0;
+#else
+        return -EOPNOTSUPP;
+#endif
+}
+
+/* Computes the context for a child process serving the peer of 'socket_fd': our own context is combined
+ * with the range of the peer's context, and the result is passed, together with the file context of
+ * 'exe', to security_compute_create_raw() for the "process" class. The result is stored in *ret_label.
+ *
+ * Returns 0 on success, -EOPNOTSUPP if SELinux is not in use/compiled in or a context is unavailable,
+ * -ENOSYS if the "process" security class is unknown, or another negative errno. */
+int mac_selinux_get_child_mls_label(int socket_fd, const char *exe, const char *exec_label, char **ret_label) {
+#if HAVE_SELINUX
+ _cleanup_freecon_ char *mycon = NULL, *peercon = NULL, *fcon = NULL;
+ _cleanup_context_free_ context_t pcon = NULL, bcon = NULL;
+ const char *range = NULL, *bcon_str = NULL;
+ security_class_t sclass;
+ int r;
+
+ assert(socket_fd >= 0);
+ assert(exe);
+ assert(ret_label);
+
+ r = selinux_init(/* force= */ false);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ return -EOPNOTSUPP;
+
+ if (getcon_raw(&mycon) < 0)
+ return -errno;
+ if (!mycon)
+ return -EOPNOTSUPP;
+
+ if (getpeercon_raw(socket_fd, &peercon) < 0)
+ return -errno;
+ if (!peercon)
+ return -EOPNOTSUPP;
+
+ /* NOTE(review): if 'exec_label' is non-NULL, 'fcon' stays NULL and is passed as such to
+ * security_compute_create_raw() below; 'exec_label' itself is not used anywhere else in this function.
+ * Confirm that this is intended. */
+ if (!exec_label) { /* If there is no context set for next exec let's use context of target executable */
+ if (getfilecon_raw(exe, &fcon) < 0)
+ return -errno;
+ if (!fcon)
+ return -EOPNOTSUPP;
+ }
+
+ bcon = context_new(mycon);
+ if (!bcon)
+ return -ENOMEM;
+
+ pcon = context_new(peercon);
+ if (!pcon)
+ return -ENOMEM;
+
+ range = context_range_get(pcon);
+ if (!range)
+ return -errno;
+
+ /* Replace the range of our own context with the one of the peer */
+ if (context_range_set(bcon, range) != 0)
+ return -errno;
+
+ bcon_str = context_str(bcon);
+ if (!bcon_str)
+ return -ENOMEM;
+
+ sclass = string_to_security_class("process");
+ if (sclass == 0)
+ return -ENOSYS;
+
+ return RET_NERRNO(security_compute_create_raw(bcon_str, fcon, sclass, ret_label));
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
+/* Releases a label with freecon(). When built without SELinux the label is asserted to be NULL. Always
+ * returns NULL, so that the result can be assigned back to the freed variable. */
+char* mac_selinux_free(char *label) {
+
+#if HAVE_SELINUX
+ freecon(label);
+#else
+ assert(!label);
+#endif
+
+ return NULL;
+}
+
+#if HAVE_SELINUX
+/* Looks up the context the policy assigns to the absolute path 'abspath' for file mode 'mode' and
+ * installs it as file creation context via setfscreatecon_raw().
+ *
+ * Returns the result of selinux_init() if that is <= 0, 0 on success or if there is no labeling handle
+ * or no context for the path, and otherwise the value of log_enforcing_errno(). */
+static int selinux_create_file_prepare_abspath(const char *abspath, mode_t mode) {
+ _cleanup_freecon_ char *filecon = NULL;
+ int r;
+
+ assert(abspath);
+ assert(path_is_absolute(abspath));
+
+ r = selinux_init(/* force= */ false);
+ if (r <= 0)
+ return r;
+
+ /* Check for policy reload so 'label_hnd' is kept up-to-date by callbacks */
+ mac_selinux_maybe_reload();
+ if (!label_hnd)
+ return 0;
+
+ r = selabel_lookup_raw(label_hnd, &filecon, abspath, mode);
+ if (r < 0) {
+ /* No context specified by the policy? Proceed without setting it. */
+ if (errno == ENOENT)
+ return 0;
+
+ return log_enforcing_errno(errno, "Failed to determine SELinux security context for %s: %m", abspath);
+ }
+
+ if (setfscreatecon_raw(filecon) < 0)
+ return log_enforcing_errno(errno, "Failed to set SELinux security context %s for %s: %m", filecon, abspath);
+
+ return 0;
+}
+#endif
+
+/* Prepares the file creation context for a file about to be created at 'path' (relative to 'dir_fd')
+ * with mode 'mode'. An empty or relative 'path' is first made absolute, using the current working
+ * directory for AT_FDCWD and fd_get_path() otherwise; an empty 'path' refers to the directory itself.
+ *
+ * Returns the result of selinux_init() if that is <= 0, 0 if no labeling handle is available or when
+ * built without SELinux, a negative errno if the path cannot be determined, and otherwise the result of
+ * selinux_create_file_prepare_abspath(). */
+int mac_selinux_create_file_prepare_at(
+ int dir_fd,
+ const char *path,
+ mode_t mode) {
+
+#if HAVE_SELINUX
+ _cleanup_free_ char *abspath = NULL;
+ int r;
+
+ assert(dir_fd >= 0 || dir_fd == AT_FDCWD);
+
+ r = selinux_init(/* force= */ false);
+ if (r <= 0)
+ return r;
+
+ if (!label_hnd)
+ return 0;
+
+ if (isempty(path) || !path_is_absolute(path)) {
+ if (dir_fd == AT_FDCWD)
+ r = safe_getcwd(&abspath);
+ else
+ r = fd_get_path(dir_fd, &abspath);
+ if (r < 0)
+ return r;
+
+ if (!isempty(path) && !path_extend(&abspath, path))
+ return -ENOMEM;
+
+ path = abspath;
+ }
+
+ return selinux_create_file_prepare_abspath(path, mode);
+#else
+ return 0;
+#endif
+}
+
+/* Installs the explicitly specified context 'label' as file creation context. 'path' is only used for the
+ * log message and may be NULL. A NULL 'label' is a no-op. Returns the result of selinux_init() if that is
+ * <= 0, 0 on success or when built without SELinux, and otherwise the value of log_enforcing_errno(). */
+int mac_selinux_create_file_prepare_label(const char *path, const char *label) {
+#if HAVE_SELINUX
+        int k;
+
+        if (!label)
+                return 0;
+
+        k = selinux_init(/* force= */ false);
+        if (k <= 0)
+                return k;
+
+        if (setfscreatecon_raw(label) >= 0)
+                return 0;
+
+        return log_enforcing_errno(errno, "Failed to set specified SELinux security context '%s' for '%s': %m", label, strna(path));
+#else
+        return 0;
+#endif
+}
+
+/* Resets the file creation context by calling setfscreatecon_raw(NULL), whose result is ignored. Does
+ * nothing if selinux_init() returns <= 0 or when built without SELinux. errno is protected via
+ * PROTECT_ERRNO. */
+void mac_selinux_create_file_clear(void) {
+
+#if HAVE_SELINUX
+ PROTECT_ERRNO;
+
+ if (selinux_init(/* force= */ false) <= 0)
+ return;
+
+ setfscreatecon_raw(NULL);
+#endif
+}
+
+/* Installs 'label' as socket creation context. Returns the result of selinux_init() if that is <= 0, 0 on
+ * success or when built without SELinux, and otherwise the value of log_enforcing_errno().
+ *
+ * NOTE(review): this uses setsockcreatecon(), while mac_selinux_create_socket_clear() uses the _raw
+ * variant — confirm the difference is intended. */
+int mac_selinux_create_socket_prepare(const char *label) {
+
+#if HAVE_SELINUX
+ int r;
+
+ assert(label);
+
+ r = selinux_init(/* force= */ false);
+ if (r <= 0)
+ return r;
+
+ if (setsockcreatecon(label) < 0)
+ return log_enforcing_errno(errno, "Failed to set SELinux security context %s for sockets: %m", label);
+#endif
+
+ return 0;
+}
+
+/* Resets the socket creation context by calling setsockcreatecon_raw(NULL), whose result is ignored. Does
+ * nothing if selinux_init() returns <= 0 or when built without SELinux. errno is protected via
+ * PROTECT_ERRNO. */
+void mac_selinux_create_socket_clear(void) {
+
+#if HAVE_SELINUX
+ PROTECT_ERRNO;
+
+ if (selinux_init(/* force= */ false) <= 0)
+ return;
+
+ setsockcreatecon_raw(NULL);
+#endif
+}
+
+/* bind() wrapper that, for AF_UNIX sockets bound to a file system path, temporarily sets the file
+ * creation context to the one the policy assigns to that path (looked up as S_IFSOCK), so that the socket
+ * inode gets labeled on creation. In every other case (SELinux unavailable, no labeling handle, non-AF_UNIX,
+ * anonymous or abstract socket, overly long path, no context in the policy) it is a plain bind().
+ *
+ * Returns 0 on success or a negative errno. Lookup/set failures are only fatal if log_enforcing_errno()
+ * returns a negative value; otherwise bind() is still attempted. */
+int mac_selinux_bind(int fd, const struct sockaddr *addr, socklen_t addrlen) {
+
+ /* Binds a socket and label its file system object according to the SELinux policy */
+
+#if HAVE_SELINUX
+ _cleanup_freecon_ char *fcon = NULL;
+ const struct sockaddr_un *un;
+ bool context_changed = false;
+ size_t sz;
+ char *path;
+ int r;
+
+ assert(fd >= 0);
+ assert(addr);
+ assert(addrlen >= sizeof(sa_family_t));
+
+ if (selinux_init(/* force= */ false) <= 0)
+ goto skipped;
+
+ if (!label_hnd)
+ goto skipped;
+
+ /* Filter out non-local sockets */
+ if (addr->sa_family != AF_UNIX)
+ goto skipped;
+
+ /* Filter out anonymous sockets */
+ if (addrlen < offsetof(struct sockaddr_un, sun_path) + 1)
+ goto skipped;
+
+ /* Filter out abstract namespace sockets */
+ un = (const struct sockaddr_un*) addr;
+ if (un->sun_path[0] == 0)
+ goto skipped;
+
+ /* The path need not be NUL terminated within 'addrlen', hence copy it out with an explicit length */
+ sz = addrlen - offsetof(struct sockaddr_un, sun_path);
+ if (sz > PATH_MAX)
+ goto skipped;
+ path = strndupa_safe(un->sun_path, sz);
+
+ /* Check for policy reload so 'label_hnd' is kept up-to-date by callbacks */
+ mac_selinux_maybe_reload();
+ if (!label_hnd)
+ goto skipped;
+
+ if (path_is_absolute(path))
+ r = selabel_lookup_raw(label_hnd, &fcon, path, S_IFSOCK);
+ else {
+ _cleanup_free_ char *newpath = NULL;
+
+ r = path_make_absolute_cwd(path, &newpath);
+ if (r < 0)
+ return r;
+
+ r = selabel_lookup_raw(label_hnd, &fcon, newpath, S_IFSOCK);
+ }
+
+ if (r < 0) {
+ /* No context specified by the policy? Proceed without setting it */
+ if (errno == ENOENT)
+ goto skipped;
+
+ r = log_enforcing_errno(errno, "Failed to determine SELinux security context for %s: %m", path);
+ if (r < 0)
+ return r;
+ } else {
+ if (setfscreatecon_raw(fcon) < 0) {
+ r = log_enforcing_errno(errno, "Failed to set SELinux security context %s for %s: %m", fcon, path);
+ if (r < 0)
+ return r;
+ } else
+ context_changed = true;
+ }
+
+ r = RET_NERRNO(bind(fd, addr, addrlen));
+
+ /* Only reset the creation context if we actually changed it above */
+ if (context_changed)
+ (void) setfscreatecon_raw(NULL);
+
+ return r;
+
+skipped:
+#endif
+ return RET_NERRNO(bind(fd, addr, addrlen));
+}
diff --git a/src/shared/selinux-util.h b/src/shared/selinux-util.h
new file mode 100644
index 0000000..97ab5eb
--- /dev/null
+++ b/src/shared/selinux-util.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <fcntl.h>
+#include <stdbool.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+
+#include "macro.h"
+#include "label-util.h"
+
+#if HAVE_SELINUX
+#include <selinux/selinux.h>
+
+DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(char*, freecon, NULL);
+#define _cleanup_freecon_ _cleanup_(freeconp)
+#endif
+
+bool mac_selinux_use(void);
+void mac_selinux_retest(void);
+bool mac_selinux_enforcing(void);
+
+int mac_selinux_init(void);
+int mac_selinux_init_lazy(void);
+void mac_selinux_maybe_reload(void);
+void mac_selinux_finish(void);
+
+int mac_selinux_fix_full(int atfd, const char *inode_path, const char *label_path, LabelFixFlags flags);
+
+int mac_selinux_apply(const char *path, const char *label);
+int mac_selinux_apply_fd(int fd, const char *path, const char *label);
+
+int mac_selinux_get_create_label_from_exe(const char *exe, char **label);
+int mac_selinux_get_our_label(char **label);
+int mac_selinux_get_child_mls_label(int socket_fd, const char *exe, const char *exec_label, char **label);
+char* mac_selinux_free(char *label);
+
+int mac_selinux_create_file_prepare_at(int dirfd, const char *path, mode_t mode);
+/* Convenience wrapper around mac_selinux_create_file_prepare_at() that passes AT_FDCWD as directory fd. */
+static inline int mac_selinux_create_file_prepare(const char *path, mode_t mode) {
+ return mac_selinux_create_file_prepare_at(AT_FDCWD, path, mode);
+}
+int mac_selinux_create_file_prepare_label(const char *path, const char *label);
+void mac_selinux_create_file_clear(void);
+
+int mac_selinux_create_socket_prepare(const char *label);
+void mac_selinux_create_socket_clear(void);
+
+int mac_selinux_bind(int fd, const struct sockaddr *addr, socklen_t addrlen);
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(char*, mac_selinux_free);
diff --git a/src/shared/serialize.c b/src/shared/serialize.c
new file mode 100644
index 0000000..483cbc7
--- /dev/null
+++ b/src/shared/serialize.c
@@ -0,0 +1,552 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <fcntl.h>
+
+#include "alloc-util.h"
+#include "env-util.h"
+#include "escape.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "hexdecoct.h"
+#include "memfd-util.h"
+#include "missing_mman.h"
+#include "missing_syscall.h"
+#include "parse-util.h"
+#include "process-util.h"
+#include "serialize.h"
+#include "strv.h"
+#include "tmpfile-util.h"
+
+/* Writes one "key=value" line to 'f'. Returns 0 (writing nothing) if 'value' is NULL, 1 if the line was
+ * written, and a negative errno-style value if the line would be too long. */
+int serialize_item(FILE *f, const char *key, const char *value) {
+        size_t line_size;
+
+        assert(f);
+        assert(key);
+
+        if (!value)
+                return 0;
+
+        /* Make sure that anything we serialize we can also read back again with read_line() with a maximum line size
+         * of LONG_LINE_MAX. This is a safety net only. All code calling us should filter this out earlier anyway. */
+        line_size = strlen(key) + 1 + strlen(value) + 1;
+        if (line_size > LONG_LINE_MAX)
+                return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), "Attempted to serialize overly long item '%s', refusing.", key);
+
+        fputs(key, f);
+        fputc('=', f);
+        fputs(value, f);
+        fputc('\n', f);
+
+        return 1;
+}
+
+/* Like serialize_item(), but runs 'value' through xescape(value, " ") before writing it. Returns 0 if
+ * 'value' is NULL, the result of log_oom() if escaping fails, and otherwise the result of
+ * serialize_item(). */
+int serialize_item_escaped(FILE *f, const char *key, const char *value) {
+        _cleanup_free_ char *escaped = NULL;
+
+        assert(f);
+        assert(key);
+
+        if (!value)
+                return 0;
+
+        escaped = xescape(value, " ");
+        if (escaped)
+                return serialize_item(f, key, escaped);
+
+        return log_oom();
+}
+
+/* Writes one "key=<formatted value>" line to 'f', with the value produced printf-style from 'format'.
+ * The value is formatted into a small stack buffer first; only if it does not fit it is formatted a
+ * second time into heap memory. Returns 1 if the line was written, and a negative errno-style value if
+ * formatting fails or the line would exceed LONG_LINE_MAX. */
+int serialize_item_format(FILE *f, const char *key, const char *format, ...) {
+ _cleanup_free_ char *allocated = NULL;
+ char buf[256]; /* Something reasonably short that fits nicely on any stack (i.e. is considerably less
+ * than LONG_LINE_MAX (1MiB!)) */
+ const char *b;
+ va_list ap;
+ int k;
+
+ assert(f);
+ assert(key);
+ assert(format);
+
+ /* First, let's try to format this into a stack buffer */
+ va_start(ap, format);
+ k = vsnprintf(buf, sizeof(buf), format, ap);
+ va_end(ap);
+
+ if (k < 0)
+ return log_warning_errno(errno, "Failed to serialize item '%s', ignoring: %m", key);
+ if (strlen(key) + 1 + k + 1 > LONG_LINE_MAX) /* See above */
+ return log_warning_errno(SYNTHETIC_ERRNO(EINVAL), "Attempted to serialize overly long item '%s', refusing.", key);
+
+ /* vsnprintf() returned the length the full output needs, which tells us whether it was truncated */
+ if ((size_t) k < sizeof(buf))
+ b = buf; /* Yay, it fit! */
+ else {
+ /* So the string didn't fit in the short buffer above, but was not above our total limit,
+ * hence let's format it via dynamic memory */
+
+ /* A va_list may only be traversed once, hence start over */
+ va_start(ap, format);
+ k = vasprintf(&allocated, format, ap);
+ va_end(ap);
+
+ if (k < 0)
+ return log_warning_errno(errno, "Failed to serialize item '%s', ignoring: %m", key);
+
+ b = allocated;
+ }
+
+ fputs(key, f);
+ fputc('=', f);
+ fputs(b, f);
+ fputc('\n', f);
+
+ return 1;
+}
+
+/* Duplicates 'fd' into 'fds' and serializes the number of the duplicate under 'key'. Returns 0 (doing
+ * nothing) for a negative 'fd', a negative errno if the duplication fails, and otherwise the result of
+ * serialize_item_format(). */
+int serialize_fd(FILE *f, FDSet *fds, const char *key, int fd) {
+        assert(f);
+        assert(fds);
+        assert(key);
+
+        if (fd < 0)
+                return 0;
+
+        int dup_fd = fdset_put_dup(fds, fd);
+        if (dup_fd < 0)
+                return log_error_errno(dup_fd, "Failed to add file descriptor to serialization set: %m");
+
+        return serialize_item_format(f, key, "%i", dup_fd);
+}
+
+/* Duplicates every fd of 'fd_array' into 'fds' and serializes the numbers of the duplicates as one
+ * space-separated item under 'key'. Returns 0 (doing nothing) for an empty array, -EBADF if the array
+ * contains a negative fd, another negative errno-style value on failure, and otherwise the result of
+ * serialize_item(). */
+int serialize_fd_many(FILE *f, FDSet *fds, const char *key, const int fd_array[], size_t n_fd_array) {
+        _cleanup_free_ char *joined = NULL;
+
+        assert(f);
+
+        if (n_fd_array == 0)
+                return 0;
+
+        assert(fd_array);
+
+        for (const int *fd = fd_array; fd < fd_array + n_fd_array; fd++) {
+                int dup_fd;
+
+                if (*fd < 0)
+                        return -EBADF;
+
+                dup_fd = fdset_put_dup(fds, *fd);
+                if (dup_fd < 0)
+                        return log_error_errno(dup_fd, "Failed to add file descriptor to serialization set: %m");
+
+                if (strextendf_with_separator(&joined, " ", "%i", dup_fd) < 0)
+                        return log_oom();
+        }
+
+        return serialize_item(f, key, joined);
+}
+
+/* Serializes 'usec' under 'key', unless it is USEC_INFINITY, in which case nothing is written and 0 is
+ * returned. */
+int serialize_usec(FILE *f, const char *key, usec_t usec) {
+        assert(f);
+        assert(key);
+
+        if (usec != USEC_INFINITY)
+                return serialize_item_format(f, key, USEC_FMT, usec);
+
+        return 0;
+}
+
+/* Serializes the realtime and monotonic parts of 't', separated by a space, under 'name'. Writes nothing
+ * and returns 0 if the timestamp is not set. */
+int serialize_dual_timestamp(FILE *f, const char *name, const dual_timestamp *t) {
+        assert(f);
+        assert(name);
+        assert(t);
+
+        if (dual_timestamp_is_set(t))
+                return serialize_item_format(f, name, USEC_FMT " " USEC_FMT, t->realtime, t->monotonic);
+
+        return 0;
+}
+
+/* Serializes every string of 'l' as its own escaped item under 'key'. All entries are attempted even
+ * after a failure. */
+int serialize_strv(FILE *f, const char *key, char **l) {
+        int result = 0;
+
+        /* Returns the first error, or positive if anything was serialized, 0 otherwise. */
+
+        STRV_FOREACH(entry, l) {
+                int k = serialize_item_escaped(f, key, *entry);
+
+                if (k < 0) {
+                        /* Remember the first error only */
+                        if (result >= 0)
+                                result = k;
+                } else if (k > 0 && result == 0)
+                        result = k;
+        }
+
+        return result;
+}
+
+/* Serializes 'pidref' under 'key'. Writes nothing and returns 0 if the pidref is not set. */
+int serialize_pidref(FILE *f, FDSet *fds, const char *key, PidRef *pidref) {
+        assert(f);
+        assert(fds);
+
+        if (!pidref_is_set(pidref))
+                return 0;
+
+        /* If we have a pidfd we serialize the fd and encode the fd number prefixed by "@" in the
+         * serialization. Otherwise we serialize the numeric PID as it is. */
+
+        if (pidref->fd >= 0) {
+                int dup_fd = fdset_put_dup(fds, pidref->fd);
+                if (dup_fd < 0)
+                        return log_error_errno(dup_fd, "Failed to add file descriptor to serialization set: %m");
+
+                return serialize_item_format(f, key, "@%i", dup_fd);
+        }
+
+        return serialize_item_format(f, key, PID_FMT, pidref->pid);
+}
+
+/* Serializes the four fields of 'rl' as "<begin> <interval> <num> <burst>" under 'key', which is the
+ * format deserialize_ratelimit() parses. Returns the result of serialize_item_format(). */
+int serialize_ratelimit(FILE *f, const char *key, const RateLimit *rl) {
+ assert(rl);
+
+ return serialize_item_format(f, key,
+ USEC_FMT " " USEC_FMT " %u %u",
+ rl->begin,
+ rl->interval,
+ rl->num,
+ rl->burst);
+}
+
+/* Serializes the 'l' bytes at 'p' under 'key', encoded with hexmem(). Returns 0 (writing nothing) if
+ * 'l' is zero, -EINVAL if 'p' is NULL while 'l' is not zero, 1 on success, and a negative errno-style
+ * value on failure. */
+int serialize_item_hexmem(FILE *f, const char *key, const void *p, size_t l) {
+        _cleanup_free_ char *hex = NULL;
+        int k;
+
+        assert(f);
+        assert(key);
+
+        if (l == 0)
+                return 0;
+        if (!p)
+                return -EINVAL;
+
+        hex = hexmem(p, l);
+        if (!hex)
+                return log_oom_debug();
+
+        k = serialize_item(f, key, hex);
+        return k < 0 ? k : 1;
+}
+
+/* Serializes the 'l' bytes at 'p' under 'key', encoded with base64mem(). Returns 0 (writing nothing) if
+ * 'l' is zero, -EINVAL if 'p' is NULL while 'l' is not zero, 1 on success, and a negative errno-style
+ * value on failure. */
+int serialize_item_base64mem(FILE *f, const char *key, const void *p, size_t l) {
+        _cleanup_free_ char *b64 = NULL;
+        int k;
+
+        assert(f);
+        assert(key);
+
+        if (l == 0)
+                return 0;
+        if (!p)
+                return -EINVAL;
+
+        if (base64mem(p, l, &b64) <= 0)
+                return log_oom_debug();
+
+        k = serialize_item(f, key, b64);
+        return k < 0 ? k : 1;
+}
+
+/* Serializes every string of the set 's' as its own item under 'key'. Returns 0 for an empty set, 1 once
+ * all entries were written, and the first negative result of serialize_item() otherwise. */
+int serialize_string_set(FILE *f, const char *key, Set *s) {
+        const char *entry;
+
+        assert(f);
+        assert(key);
+
+        if (set_isempty(s))
+                return 0;
+
+        /* Serialize as individual items, as each element might contain separators and escapes */
+
+        SET_FOREACH(entry, s) {
+                int k = serialize_item(f, key, entry);
+                if (k < 0)
+                        return k;
+        }
+
+        return 1;
+}
+
+/* Serializes the string form of the image policy 'p' (not simplified) under 'key'. Returns 0 (writing
+ * nothing) if 'p' is NULL, 1 on success, and a negative errno-style value on failure. */
+int serialize_image_policy(FILE *f, const char *key, const ImagePolicy *p) {
+        _cleanup_free_ char *text = NULL;
+        int k;
+
+        assert(f);
+        assert(key);
+
+        if (!p)
+                return 0;
+
+        k = image_policy_to_string(p, /* simplify= */ false, &text);
+        if (k < 0)
+                return k;
+
+        k = serialize_item(f, key, text);
+        return k < 0 ? k : 1;
+}
+
+/* Reads the next stripped line of a serialization from 'f'. Returns 1 and stores the line in *ret if
+ * there is one; returns 0 and stores NULL at EOF or at an empty line (the end marker); returns a negative
+ * errno on read failure. */
+int deserialize_read_line(FILE *f, char **ret) {
+        _cleanup_free_ char *buffer = NULL;
+        int k;
+
+        assert(f);
+        assert(ret);
+
+        k = read_stripped_line(f, LONG_LINE_MAX, &buffer);
+        if (k < 0)
+                return log_error_errno(k, "Failed to read serialization line: %m");
+
+        /* Both EOF and the empty end marker line terminate the section */
+        if (k == 0 || isempty(buffer)) {
+                *ret = NULL;
+                return 0;
+        }
+
+        *ret = TAKE_PTR(buffer);
+        return 1;
+}
+
+/* Parses 'value' as an fd number and removes that fd from 'fds', handing ownership to the caller.
+ * Returns the fd on success, or a negative errno if parsing fails or the fd cannot be taken from the
+ * set. */
+int deserialize_fd(FDSet *fds, const char *value) {
+        int number;
+
+        assert(value);
+
+        number = parse_fd(value);
+        if (number < 0)
+                return log_debug_errno(number, "Failed to parse file descriptor serialization: %s", value);
+
+        _cleanup_close_ int taken = fdset_remove(fds, number); /* Take possession of the fd */
+        if (taken < 0)
+                return log_debug_errno(taken, "Failed to acquire fd from serialization fds: %m");
+
+        return TAKE_FD(taken);
+}
+
+/* Parses 'value' as a whitespace-separated list of exactly 'n' fd numbers, takes each fd out of 'fds' and
+ * copies them to the caller-provided array 'ret'. Returns 0 on success, -EINVAL if the list has fewer or
+ * more than 'n' entries, and another negative errno on failure. On failure the fds taken so far are
+ * closed and 'ret' is left unmodified.
+ *
+ * NOTE(review): 'ret' is not asserted and must have room for 'n' entries — confirm against callers. */
+int deserialize_fd_many(FDSet *fds, const char *value, size_t n, int *ret) {
+ int r, *fd_array = NULL;
+ size_t m = 0;
+
+ assert(value);
+
+ fd_array = new(int, n);
+ if (!fd_array)
+ return -ENOMEM;
+
+ /* From here on, leaving the function closes the first 'm' fds and frees the array, unless the array
+ * pointer was reset to NULL before */
+ CLEANUP_ARRAY(fd_array, m, close_many_and_free);
+
+ for (;;) {
+ _cleanup_free_ char *w = NULL;
+ int fd;
+
+ r = extract_first_word(&value, &w, NULL, 0);
+ if (r < 0)
+ return r;
+ if (r == 0) {
+ if (m < n) /* Too few */
+ return -EINVAL;
+
+ break;
+ }
+
+ if (m >= n) /* Too many */
+ return -EINVAL;
+
+ fd = deserialize_fd(fds, w);
+ if (fd < 0)
+ return fd;
+
+ fd_array[m++] = fd;
+ }
+
+ /* Success: hand the fds over to the caller and free only the array itself, so that the fds stay open */
+ memcpy(ret, fd_array, m * sizeof(int));
+ fd_array = mfree(fd_array);
+
+ return 0;
+}
+
+/* Unescapes 'value' with cunescape() and appends the result to the string list *l via strv_consume().
+ * Returns a negative errno if unescaping fails, and otherwise the result of strv_consume(). */
+int deserialize_strv(const char *value, char ***l) {
+        char *decoded;
+        ssize_t decoded_len;
+
+        assert(l);
+        assert(value);
+
+        decoded_len = cunescape(value, 0, &decoded);
+        if (decoded_len >= 0)
+                return strv_consume(l, decoded);
+
+        return decoded_len;
+}
+
+/* Parses 'value' with safe_atou64() and stores the result in *ret. Returns 0 on success and a negative
+ * errno if parsing fails. */
+int deserialize_usec(const char *value, usec_t *ret) {
+        int k;
+
+        assert(value);
+        assert(ret);
+
+        k = safe_atou64(value, ret);
+        if (k >= 0)
+                return 0;
+
+        return log_debug_errno(k, "Failed to parse usec value \"%s\": %m", value);
+}
+
+/* Parses "<realtime> <monotonic>" as written by serialize_dual_timestamp() and stores both values in
+ * *ret. Returns 0 on success and -EINVAL (or the equivalent from log_debug_errno()) if a number starts
+ * with '-', fewer than two numbers are found, or anything follows the second number. */
+int deserialize_dual_timestamp(const char *value, dual_timestamp *ret) {
+ uint64_t a, b;
+ int r, pos;
+
+ assert(value);
+ assert(ret);
+
+ /* Check by hand that neither number starts with a minus sign, before handing the string to sscanf() */
+ pos = strspn(value, WHITESPACE);
+ if (value[pos] == '-')
+ return -EINVAL;
+ pos += strspn(value + pos, DIGITS);
+ pos += strspn(value + pos, WHITESPACE);
+ if (value[pos] == '-')
+ return -EINVAL;
+
+ /* %n stores the number of characters consumed in 'pos', used below to detect trailing garbage */
+ r = sscanf(value, "%" PRIu64 "%" PRIu64 "%n", &a, &b, &pos);
+ if (r != 2)
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+ "Failed to parse dual timestamp value \"%s\".",
+ value);
+
+ if (value[pos] != '\0')
+ /* trailing garbage */
+ return -EINVAL;
+
+ *ret = (dual_timestamp) {
+ .realtime = a,
+ .monotonic = b,
+ };
+
+ return 0;
+}
+
+/* Unescapes 'value' with cunescape() and merges the resulting assignment into the environment list
+ * *list via strv_env_replace_consume(). Returns 0 on success and a negative errno on failure. */
+int deserialize_environment(const char *value, char ***list) {
+        _cleanup_free_ char *assignment = NULL;
+        ssize_t assignment_len;
+        int k;
+
+        assert(value);
+        assert(list);
+
+        /* Changes the *environment strv inline. */
+
+        assignment_len = cunescape(value, 0, &assignment);
+        if (assignment_len < 0)
+                return log_error_errno(assignment_len, "Failed to unescape: %m");
+
+        k = strv_env_replace_consume(list, TAKE_PTR(assignment));
+        if (k >= 0)
+                return 0;
+
+        return log_error_errno(k, "Failed to append environment variable: %m");
+}
+
+/* Inverse of serialize_pidref(): a value starting with "@" names a pidfd to take out of 'fds', anything
+ * else is parsed as a numeric PID. Initializes *ret accordingly. Returns 0 on success and a negative
+ * errno on failure. */
+int deserialize_pidref(FDSet *fds, const char *value, PidRef *ret) {
+        const char *fd_text;
+        int k;
+
+        assert(value);
+        assert(ret);
+
+        fd_text = startswith(value, "@");
+        if (!fd_text) {
+                pid_t pid;
+
+                k = parse_pid(value, &pid);
+                if (k < 0)
+                        return log_debug_errno(k, "Failed to parse PID: %s", value);
+
+                k = pidref_set_pid(ret, pid);
+        } else {
+                int fd = deserialize_fd(fds, fd_text);
+
+                if (fd < 0)
+                        return fd;
+
+                k = pidref_set_pidfd_consume(ret, fd);
+        }
+
+        if (k < 0)
+                return log_debug_errno(k, "Failed to initialize pidref: %m");
+
+        return 0;
+}
+
+/* Parses "<begin> <interval> <num> <burst>" as written by serialize_ratelimit() and restores the runtime
+ * state of 'rl' from it. 'name' is only used in the log message. On a parse failure a notice is logged
+ * and 'rl' is left untouched.
+ *
+ * Only 'begin' and 'num' are written back; the serialized 'interval' and 'burst' are merely compared
+ * with the values already in 'rl'. */
+void deserialize_ratelimit(RateLimit *rl, const char *name, const char *value) {
+        usec_t begin, interval;
+        unsigned num, burst;
+
+        assert(rl);
+        assert(name);
+        assert(value);
+
+        if (sscanf(value, USEC_FMT " " USEC_FMT " %u %u", &begin, &interval, &num, &burst) != 4) {
+                /* This function returns void, hence log and return in separate statements */
+                log_notice("Failed to parse %s, ignoring: %s", name, value);
+                return;
+        }
+
+        /* Preserve the counter only if the configuration didn't change. */
+        rl->num = (interval == rl->interval && burst == rl->burst) ? num : 0;
+        rl->begin = begin;
+}
+
+/* Opens an fd to serialize into: a memfd named 'ident' if possible, and otherwise an unlinked temporary
+ * file in /run/systemd (if we are PID 1) or /tmp. Returns the fd or a negative errno. */
+int open_serialization_fd(const char *ident) {
+        const char *dir;
+        int fd;
+
+        fd = memfd_create_wrapper(ident, MFD_CLOEXEC | MFD_NOEXEC_SEAL);
+        if (fd >= 0) {
+                log_debug("Serializing %s to memfd.", ident);
+                return fd;
+        }
+
+        /* No memfd available, fall back to a temporary file */
+        dir = getpid_cached() == 1 ? "/run/systemd" : "/tmp";
+        fd = open_tmpfile_unlinkable(dir, O_RDWR|O_CLOEXEC);
+        if (fd < 0)
+                return fd;
+
+        log_debug("Serializing %s to %s.", ident, dir);
+        return fd;
+}
+
+/* Like open_serialization_fd(), but wraps the fd in a FILE* opened in "w+" mode and stores it in *ret.
+ * Returns 0 on success and a negative errno on failure, in which case nothing is leaked. */
+int open_serialization_file(const char *ident, FILE **ret) {
+        _cleanup_fclose_ FILE *f = NULL;
+        /* Always initialize variables with a cleanup handler, so that the handler never sees garbage */
+        _cleanup_close_ int fd = -EBADF;
+
+        assert(ret);
+
+        fd = open_serialization_fd(ident);
+        if (fd < 0)
+                return fd;
+
+        f = take_fdopen(&fd, "w+");
+        if (!f)
+                return -errno;
+
+        *ret = TAKE_PTR(f);
+
+        return 0;
+}
diff --git a/src/shared/serialize.h b/src/shared/serialize.h
new file mode 100644
index 0000000..355eff9
--- /dev/null
+++ b/src/shared/serialize.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdio.h>
+
+#include "fdset.h"
+#include "image-policy.h"
+#include "macro.h"
+#include "pidref.h"
+#include "ratelimit.h"
+#include "set.h"
+#include "string-util.h"
+#include "time-util.h"
+
+int serialize_item(FILE *f, const char *key, const char *value);
+int serialize_item_escaped(FILE *f, const char *key, const char *value);
+int serialize_item_format(FILE *f, const char *key, const char *value, ...) _printf_(3,4);
+int serialize_item_hexmem(FILE *f, const char *key, const void *p, size_t l);
+int serialize_item_base64mem(FILE *f, const char *key, const void *p, size_t l);
+int serialize_fd(FILE *f, FDSet *fds, const char *key, int fd);
+int serialize_fd_many(FILE *f, FDSet *fds, const char *key, const int fd_array[], size_t n_fd_array);
+int serialize_usec(FILE *f, const char *key, usec_t usec);
+int serialize_dual_timestamp(FILE *f, const char *key, const dual_timestamp *t);
+int serialize_strv(FILE *f, const char *key, char **l);
+int serialize_pidref(FILE *f, FDSet *fds, const char *key, PidRef *pidref);
+int serialize_ratelimit(FILE *f, const char *key, const RateLimit *rl);
+int serialize_string_set(FILE *f, const char *key, Set *s);
+int serialize_image_policy(FILE *f, const char *key, const ImagePolicy *p);
+
+/* Serializes a boolean under the given key, using the string yes_no() returns for it. */
+static inline int serialize_bool(FILE *f, const char *key, bool b) {
+ return serialize_item(f, key, yes_no(b));
+}
+/* Like serialize_bool(), but writes nothing (and returns 0) when the value is false. */
+static inline int serialize_bool_elide(FILE *f, const char *key, bool b) {
+        if (!b)
+                return 0;
+
+        return serialize_item(f, key, yes_no(b));
+}
+
+/* Serializes a tristate as a decimal integer. Negative values are not written at all, in which case 0 is
+ * returned. */
+static inline int serialize_item_tristate(FILE *f, const char *key, int value) {
+        if (value < 0)
+                return 0;
+
+        return serialize_item_format(f, key, "%i", value);
+}
+
+int deserialize_read_line(FILE *f, char **ret);
+
+int deserialize_fd(FDSet *fds, const char *value);
+int deserialize_fd_many(FDSet *fds, const char *value, size_t n, int *ret);
+int deserialize_usec(const char *value, usec_t *ret);
+int deserialize_dual_timestamp(const char *value, dual_timestamp *ret);
+int deserialize_environment(const char *value, char ***environment);
+int deserialize_strv(const char *value, char ***l);
+int deserialize_pidref(FDSet *fds, const char *value, PidRef *ret);
+void deserialize_ratelimit(RateLimit *rl, const char *name, const char *value);
+
+int open_serialization_fd(const char *ident);
+int open_serialization_file(const char *ident, FILE **ret);
diff --git a/src/shared/service-util.c b/src/shared/service-util.c
new file mode 100644
index 0000000..b0585ba
--- /dev/null
+++ b/src/shared/service-util.c
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <getopt.h>
+#include <stdio.h>
+
+#include "alloc-util.h"
+#include "build.h"
+#include "pretty-print.h"
+#include "service-util.h"
+#include "terminal-util.h"
+
+/* Prints the generic help text shared by services using service_parse_argv() to stdout.
+ *
+ * program_path   - shown as the invoked command
+ * service        - name of the section 8 man page to link to
+ * description    - one-line description, printed highlighted
+ * bus_introspect - not consulted by this function
+ *
+ * Returns 0 ("no further action") on success, or the result of log_oom() if the man page link could not
+ * be generated. */
+static int help(const char *program_path, const char *service, const char *description, bool bus_introspect) {
+        _cleanup_free_ char *man_link = NULL;
+        int r;
+
+        r = terminal_urlify_man(service, "8", &man_link);
+        if (r < 0)
+                return log_oom();
+
+        printf("%s [OPTIONS...]\n\n"
+               "%s%s%s\n\n"
+               "This program takes no positional arguments.\n\n"
+               "%sOptions%s:\n"
+               " -h --help Show this help\n"
+               " --version Show package version\n"
+               " --bus-introspect=PATH Write D-Bus XML introspection data\n"
+               "\nSee the %s for details.\n",
+               program_path,
+               ansi_highlight(), description, ansi_normal(),
+               ansi_underline(), ansi_normal(),
+               man_link);
+
+        return 0; /* No further action */
+}
+
+/* Generic command line parser for services that take no positional arguments and understand only
+ * --help/-h, --version and --bus-introspect=PATH.
+ *
+ * Returns a positive value if the caller shall go on with its actual work, 0 if the request was handled
+ * completely here (help text printed), and a negative error code on invalid invocation. For --version
+ * and --bus-introspect= the result of version() resp. bus_introspect_implementations() is returned. */
+int service_parse_argv(
+                const char *service,
+                const char *description,
+                const BusObjectImplementation* const* bus_objects,
+                int argc, char *argv[]) {
+
+        enum {
+                ARG_VERSION = 0x100,
+                ARG_BUS_INTROSPECT,
+        };
+
+        static const struct option options[] = {
+                { "help",           no_argument,       NULL, 'h'                },
+                { "version",        no_argument,       NULL, ARG_VERSION        },
+                { "bus-introspect", required_argument, NULL, ARG_BUS_INTROSPECT },
+                {}
+        };
+
+        int opt;
+
+        assert(argc >= 0);
+        assert(argv);
+
+        for (;;) {
+                opt = getopt_long(argc, argv, "h", options, NULL);
+                if (opt < 0)
+                        break;
+
+                /* Every recognized option terminates parsing right away. */
+                switch (opt) {
+
+                case 'h':
+                        return help(argv[0], service, description, bus_objects);
+
+                case ARG_VERSION:
+                        return version();
+
+                case ARG_BUS_INTROSPECT:
+                        return bus_introspect_implementations(stdout, optarg, bus_objects);
+
+                case '?':
+                        return -EINVAL;
+
+                default:
+                        assert_not_reached();
+                }
+        }
+
+        if (optind < argc)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "This program takes no arguments.");
+
+        return 1; /* Further action */
+}
diff --git a/src/shared/service-util.h b/src/shared/service-util.h
new file mode 100644
index 0000000..360341f
--- /dev/null
+++ b/src/shared/service-util.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "bus-object.h"
+
+int service_parse_argv(
+ const char *service,
+ const char *description,
+ const BusObjectImplementation* const* bus_objects,
+ int argc, char *argv[]);
diff --git a/src/shared/sleep-config.c b/src/shared/sleep-config.c
new file mode 100644
index 0000000..7282111
--- /dev/null
+++ b/src/shared/sleep-config.c
@@ -0,0 +1,390 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "conf-parser.h"
+#include "constants.h"
+#include "device-util.h"
+#include "devnum-util.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "hibernate-util.h"
+#include "log.h"
+#include "macro.h"
+#include "path-util.h"
+#include "sleep-config.h"
+#include "stat-util.h"
+#include "stdio-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "strv.h"
+#include "time-util.h"
+
+#define DEFAULT_SUSPEND_ESTIMATION_USEC (1 * USEC_PER_HOUR)
+
+/* Maps each SleepOperation to its string name, covering the "combined" operations too (the table has
+ * _SLEEP_OPERATION_MAX entries). */
+static const char* const sleep_operation_table[_SLEEP_OPERATION_MAX] = {
+ [SLEEP_SUSPEND] = "suspend",
+ [SLEEP_HIBERNATE] = "hibernate",
+ [SLEEP_HYBRID_SLEEP] = "hybrid-sleep",
+ [SLEEP_SUSPEND_THEN_HIBERNATE] = "suspend-then-hibernate",
+};
+
+/* Generates the string lookup helpers for the table above; sleep_operation_to_string() is used throughout
+ * this file. */
+DEFINE_STRING_TABLE_LOOKUP(sleep_operation, SleepOperation);
+
+/* Default sleep states per configurable operation. parse_sleep_config() copies an entry into
+ * SleepConfig.states[] whenever the respective slot is still unset after parsing the configuration. */
+static char* const* const sleep_default_state_table[_SLEEP_OPERATION_CONFIG_MAX] = {
+ [SLEEP_SUSPEND] = STRV_MAKE("mem", "standby", "freeze"),
+ [SLEEP_HIBERNATE] = STRV_MAKE("disk"),
+ [SLEEP_HYBRID_SLEEP] = STRV_MAKE("disk"),
+};
+
+/* Default sleep modes per configurable operation. parse_sleep_config() copies an entry into
+ * SleepConfig.modes[] whenever the respective slot is still unset after parsing the configuration. The
+ * SLEEP_SUSPEND slot is deliberately left NULL. */
+static char* const* const sleep_default_mode_table[_SLEEP_OPERATION_CONFIG_MAX] = {
+ /* Not used by SLEEP_SUSPEND */
+ [SLEEP_HIBERNATE] = STRV_MAKE("platform", "shutdown"),
+ [SLEEP_HYBRID_SLEEP] = STRV_MAKE("suspend"),
+};
+
+/* Releases a SleepConfig object together with all state and mode string lists it owns. Accepts NULL.
+ * Always returns NULL. */
+SleepConfig* sleep_config_free(SleepConfig *sc) {
+        SleepOperation op = 0;
+
+        if (!sc)
+                return NULL;
+
+        while (op < _SLEEP_OPERATION_CONFIG_MAX) {
+                strv_free(sc->states[op]);
+                strv_free(sc->modes[op]);
+                op++;
+        }
+
+        return mfree(sc);
+}
+
+/* Config parser callback for settings that take a list of sleep modes.
+ *
+ * data points to the char** string list to update. An empty right-hand side results in an allocated, but
+ * empty list (as opposed to NULL), so that callers can tell "explicitly set to nothing" apart from "not
+ * set". Otherwise the value is split into words, with quotes removed and escape sequences retained.
+ *
+ * Returns 0 on success and when an unparsable value is ignored (a warning is logged in that case), or
+ * the result of log_oom() if memory allocation failed. */
+static int config_parse_sleep_mode(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        _cleanup_strv_free_ char **modes = NULL;
+        char ***sv = ASSERT_PTR(data);
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        if (isempty(rvalue)) {
+                modes = strv_new(NULL);
+                if (!modes)
+                        return log_oom();
+        } else {
+                r = strv_split_full(&modes, rvalue, NULL, EXTRACT_UNQUOTE|EXTRACT_RETAIN_ESCAPE);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0) {
+                        /* Not every failure is an allocation failure (think: unbalanced quotes). Report
+                         * those as syntax problems and keep the previous value. */
+                        log_syntax(unit, LOG_WARNING, filename, line, r,
+                                   "Failed to parse %s=, ignoring: %s", lvalue, rvalue);
+                        return 0;
+                }
+        }
+
+        /* The setting may be assigned more than once: release the old list including its strings, not
+         * just the array holding them. */
+        strv_free(*sv);
+        *sv = TAKE_PTR(modes);
+
+        return 0;
+}
+
+/* Sanitizes a freshly parsed configuration: drops the 'disk' state from the suspend operation (with a
+ * warning), and warns, without changing anything, if the hibernate operation is configured with the
+ * 'suspend' mode. */
+static void sleep_config_validate_state_and_mode(SleepConfig *sc) {
+        char **suspend_states, **hibernate_modes;
+
+        assert(sc);
+
+        suspend_states = sc->states[SLEEP_SUSPEND];
+        hibernate_modes = sc->modes[SLEEP_HIBERNATE];
+
+        /* So we should really not allow setting SuspendState= to 'disk', which means hibernation. We have
+         * SLEEP_HIBERNATE for proper hibernation support, which includes checks for resume support (through
+         * EFI variable or resume= kernel command line option). It's simply not sensible to call the suspend
+         * operation but eventually do an unsafe hibernation. */
+        if (strv_contains(suspend_states, "disk")) {
+                strv_remove(suspend_states, "disk");
+                log_warning("Sleep state 'disk' is not supported by operation %s, ignoring.",
+                            sleep_operation_to_string(SLEEP_SUSPEND));
+        }
+        assert(!sc->modes[SLEEP_SUSPEND]);
+
+        /* People should use hybrid-sleep instead of setting HibernateMode=suspend. Warn about it but don't
+         * drop it in this case. */
+        if (!strv_contains(hibernate_modes, "suspend"))
+                return;
+
+        log_warning("Sleep mode 'suspend' should not be used by operation %s. Please use %s instead.",
+                    sleep_operation_to_string(SLEEP_HIBERNATE), sleep_operation_to_string(SLEEP_HYBRID_SLEEP));
+}
+
+/* Allocates a SleepConfig, fills it from the [Sleep] section of sleep.conf and applies defaults for
+ * everything left unset. On success the new object is stored in *ret (owned by the caller) and 0 is
+ * returned; on allocation failure the result of log_oom() is returned. The result of the configuration
+ * file parser itself is explicitly ignored. */
+int parse_sleep_config(SleepConfig **ret) {
+ _cleanup_(sleep_config_freep) SleepConfig *sc = NULL;
+ /* Tristates: negative means "not configured" */
+ int allow_suspend = -1, allow_hibernate = -1, allow_s2h = -1, allow_hybrid_sleep = -1;
+
+ assert(ret);
+
+ sc = new(SleepConfig, 1);
+ if (!sc)
+ return log_oom();
+
+ /* Zero-initializes all fields not listed here */
+ *sc = (SleepConfig) {
+ .hibernate_delay_usec = USEC_INFINITY,
+ };
+
+ const ConfigTableItem items[] = {
+ { "Sleep", "AllowSuspend", config_parse_tristate, 0, &allow_suspend },
+ { "Sleep", "AllowHibernation", config_parse_tristate, 0, &allow_hibernate },
+ { "Sleep", "AllowSuspendThenHibernate", config_parse_tristate, 0, &allow_s2h },
+ { "Sleep", "AllowHybridSleep", config_parse_tristate, 0, &allow_hybrid_sleep },
+
+ { "Sleep", "SuspendState", config_parse_strv, 0, sc->states + SLEEP_SUSPEND },
+ { "Sleep", "SuspendMode", config_parse_warn_compat, DISABLED_LEGACY, NULL },
+
+ { "Sleep", "HibernateState", config_parse_warn_compat, DISABLED_LEGACY, NULL },
+ { "Sleep", "HibernateMode", config_parse_sleep_mode, 0, sc->modes + SLEEP_HIBERNATE },
+
+ { "Sleep", "HybridSleepState", config_parse_warn_compat, DISABLED_LEGACY, NULL },
+ { "Sleep", "HybridSleepMode", config_parse_warn_compat, DISABLED_LEGACY, NULL },
+
+ { "Sleep", "HibernateDelaySec", config_parse_sec, 0, &sc->hibernate_delay_usec },
+ { "Sleep", "SuspendEstimationSec", config_parse_sec, 0, &sc->suspend_estimation_usec },
+ {}
+ };
+
+ (void) config_parse_config_file("sleep.conf", "Sleep\0",
+ config_item_table_lookup, items,
+ CONFIG_PARSE_WARN, NULL);
+
+ /* use default values unless set */
+ /* Suspend and hibernate are allowed unless explicitly disabled. The two combined operations follow
+ * their own setting if there is one, and are otherwise allowed only if neither suspend nor hibernate
+ * is disabled. */
+ sc->allow[SLEEP_SUSPEND] = allow_suspend != 0;
+ sc->allow[SLEEP_HIBERNATE] = allow_hibernate != 0;
+ sc->allow[SLEEP_HYBRID_SLEEP] = allow_hybrid_sleep >= 0 ? allow_hybrid_sleep
+ : (allow_suspend != 0 && allow_hibernate != 0);
+ sc->allow[SLEEP_SUSPEND_THEN_HIBERNATE] = allow_s2h >= 0 ? allow_s2h
+ : (allow_suspend != 0 && allow_hibernate != 0);
+
+ /* Fill in built-in defaults for every state/mode list that is still NULL */
+ for (SleepOperation i = 0; i < _SLEEP_OPERATION_CONFIG_MAX; i++) {
+ if (!sc->states[i] && sleep_default_state_table[i]) {
+ sc->states[i] = strv_copy(sleep_default_state_table[i]);
+ if (!sc->states[i])
+ return log_oom();
+ }
+
+ if (!sc->modes[i] && sleep_default_mode_table[i]) {
+ sc->modes[i] = strv_copy(sleep_default_mode_table[i]);
+ if (!sc->modes[i])
+ return log_oom();
+ }
+ }
+
+ /* A value of zero is treated as "not configured" */
+ if (sc->suspend_estimation_usec == 0)
+ sc->suspend_estimation_usec = DEFAULT_SUSPEND_ESTIMATION_USEC;
+
+ sleep_config_validate_state_and_mode(sc);
+
+ *ret = TAKE_PTR(sc);
+ return 0;
+}
+
+/* Checks whether at least one of the specified sleep states is listed in /sys/power/state.
+ *
+ * Returns > 0 if so, 0 if none of them is, -ENOMSG if the list of states is empty, and another negative
+ * error code if /sys/power/state is not writable, cannot be read or cannot be parsed. */
+int sleep_state_supported(char **states) {
+        _cleanup_free_ char *kernel_states = NULL;
+        const char *found;
+        int r;
+
+        if (strv_isempty(states))
+                return log_debug_errno(SYNTHETIC_ERRNO(ENOMSG), "No sleep state configured.");
+
+        if (access("/sys/power/state", W_OK) < 0)
+                return log_debug_errno(errno, "/sys/power/state is not writable: %m");
+
+        r = read_one_line_file("/sys/power/state", &kernel_states);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to read /sys/power/state: %m");
+
+        r = string_contains_word_strv(kernel_states, NULL, states, &found);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to parse /sys/power/state: %m");
+        if (r == 0) {
+                if (DEBUG_LOGGING) {
+                        _cleanup_free_ char *joined = strv_join(states, " ");
+                        log_debug("None of the configured sleep states are supported by kernel: %s", strnull(joined));
+                }
+
+                return false;
+        }
+
+        log_debug("Sleep state '%s' is supported by kernel.", found);
+        return true;
+}
+
+/* Checks whether at least one of the specified sleep modes is listed in /sys/power/disk.
+ *
+ * Returns > 0 if so or if no mode is configured at all, 0 if none of the configured modes is listed, and
+ * a negative error code if /sys/power/disk is not writable, cannot be read or cannot be parsed. */
+int sleep_mode_supported(char **modes) {
+ _cleanup_free_ char *supported_sysfs = NULL;
+ int r;
+
+ /* Unlike state, kernel has its own default choice if not configured */
+ if (strv_isempty(modes)) {
+ log_debug("No sleep mode configured, using kernel default.");
+ return true;
+ }
+
+ if (access("/sys/power/disk", W_OK) < 0)
+ return log_debug_errno(errno, "/sys/power/disk is not writable: %m");
+
+ r = read_one_line_file("/sys/power/disk", &supported_sysfs);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to read /sys/power/disk: %m");
+
+ /* Walk through the line word by word, until the first match or the end of the line */
+ for (const char *p = supported_sysfs;;) {
+ _cleanup_free_ char *word = NULL;
+ char *mode;
+ size_t l;
+
+ r = extract_first_word(&p, &word, NULL, 0);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to parse /sys/power/disk: %m");
+ if (r == 0)
+ break;
+
+ /* 'mode' is a cursor into 'word'; 'word' itself remains the pointer that gets freed */
+ mode = word;
+ l = strlen(word);
+
+ /* Strip a surrounding [...] pair, presumably the kernel's marker for the currently selected
+ * mode (TODO confirm against the kernel documentation) */
+ if (mode[0] == '[' && mode[l - 1] == ']') {
+ mode[l - 1] = '\0';
+ mode++;
+ }
+
+ if (strv_contains(modes, mode)) {
+ log_debug("Disk sleep mode '%s' is supported by kernel.", mode);
+ return true;
+ }
+ }
+
+ if (DEBUG_LOGGING) {
+ _cleanup_free_ char *joined = strv_join(modes, " ");
+ log_debug("None of the configured hibernation power modes are supported by kernel: %s", strnull(joined));
+ }
+ return false;
+}
+
+static int sleep_supported_internal(
+ const SleepConfig *sleep_config,
+ SleepOperation operation,
+ bool check_allowed,
+ SleepSupport *ret_support);
+
+/* Checks whether suspend-then-hibernate can be performed: CLOCK_BOOTTIME_ALARM must be supported, and so
+ * must be both the suspend and the hibernate operation (their "allow" settings are not consulted here).
+ *
+ * Returns > 0 if supported, 0 if not, negative on error. Whenever a non-negative value is returned
+ * *ret_support carries the detailed result. */
+static int s2h_supported(const SleepConfig *sleep_config, SleepSupport *ret_support) {
+
+        static const SleepOperation operations[] = {
+                SLEEP_SUSPEND,
+                SLEEP_HIBERNATE,
+        };
+
+        SleepSupport support;
+        int r;
+
+        assert(sleep_config);
+        assert(ret_support);
+
+        if (!clock_supported(CLOCK_BOOTTIME_ALARM)) {
+                log_debug("CLOCK_BOOTTIME_ALARM is not supported, can't perform %s.", sleep_operation_to_string(SLEEP_SUSPEND_THEN_HIBERNATE));
+                *ret_support = SLEEP_ALARM_NOT_SUPPORTED;
+                return false;
+        }
+
+        for (size_t n = 0; n < ELEMENTSOF(operations); n++) {
+                SleepOperation op = operations[n];
+
+                r = sleep_supported_internal(sleep_config, op, /* check_allowed = */ false, &support);
+                if (r < 0)
+                        return r;
+                if (r > 0)
+                        continue;
+
+                /* The first unsupported component operation determines the result. */
+                log_debug("Sleep operation %s is not supported, can't perform %s.",
+                          sleep_operation_to_string(op), sleep_operation_to_string(SLEEP_SUSPEND_THEN_HIBERNATE));
+                *ret_support = support;
+                return false;
+        }
+
+        assert(support == SLEEP_SUPPORTED);
+        *ret_support = support;
+
+        return true;
+}
+
+/* Core of the sleep support check.
+ *
+ * sleep_config  - parsed configuration
+ * operation     - operation to check
+ * check_allowed - if true, an operation disabled in sleep_config->allow[] is reported as unsupported
+ * ret_support   - receives the detailed result whenever a non-negative value is returned
+ *
+ * Returns > 0 if the operation is supported, 0 if not, negative on unexpected errors. */
+static int sleep_supported_internal(
+ const SleepConfig *sleep_config,
+ SleepOperation operation,
+ bool check_allowed,
+ SleepSupport *ret_support) {
+
+ int r;
+
+ assert(sleep_config);
+ assert(operation >= 0);
+ assert(operation < _SLEEP_OPERATION_MAX);
+ assert(ret_support);
+
+ if (check_allowed && !sleep_config->allow[operation]) {
+ log_debug("Sleep operation %s is disabled by configuration.", sleep_operation_to_string(operation));
+ *ret_support = SLEEP_DISABLED;
+ return false;
+ }
+
+ /* The combined operation has no states/modes of its own and is checked separately */
+ if (operation == SLEEP_SUSPEND_THEN_HIBERNATE)
+ return s2h_supported(sleep_config, ret_support);
+
+ /* From here on 'operation' is used as index into the states[]/modes[] arrays */
+ assert(operation < _SLEEP_OPERATION_CONFIG_MAX);
+
+ r = sleep_state_supported(sleep_config->states[operation]);
+ /* -ENOMSG is what sleep_state_supported() returns for an empty list of states */
+ if (r == -ENOMSG) {
+ *ret_support = SLEEP_NOT_CONFIGURED;
+ return false;
+ }
+ if (r < 0)
+ return r;
+ if (r == 0) {
+ *ret_support = SLEEP_STATE_OR_MODE_NOT_SUPPORTED;
+ return false;
+ }
+
+ if (sleep_operation_is_hibernation(operation)) {
+ r = sleep_mode_supported(sleep_config->modes[operation]);
+ if (r < 0)
+ return r;
+ if (r == 0) {
+ *ret_support = SLEEP_STATE_OR_MODE_NOT_SUPPORTED;
+ return false;
+ }
+
+ /* Two specific errors of hibernation_is_safe() are turned into "not supported" results, all
+ * others are propagated */
+ r = hibernation_is_safe();
+ if (r == -ENOTRECOVERABLE) {
+ *ret_support = SLEEP_RESUME_NOT_SUPPORTED;
+ return false;
+ }
+ if (r == -ENOSPC) {
+ *ret_support = SLEEP_NOT_ENOUGH_SWAP_SPACE;
+ return false;
+ }
+ if (r < 0)
+ return r;
+ } else
+ assert(!sleep_config->modes[operation]);
+
+ *ret_support = SLEEP_SUPPORTED;
+ return true;
+}
+
+/* Parses the sleep configuration and checks whether the specified operation is both allowed and
+ * supported.
+ *
+ * Returns > 0 if so, 0 if not, negative on error. If ret_support is non-NULL it receives the detailed
+ * result whenever a non-negative value is returned. */
+int sleep_supported_full(SleepOperation operation, SleepSupport *ret_support) {
+        _cleanup_(sleep_config_freep) SleepConfig *config = NULL;
+        SleepSupport result;
+        int r;
+
+        assert(operation >= 0);
+        assert(operation < _SLEEP_OPERATION_MAX);
+
+        r = parse_sleep_config(&config);
+        if (r < 0)
+                return r;
+
+        r = sleep_supported_internal(config, operation, /* check_allowed = */ true, &result);
+        if (r < 0)
+                return r;
+
+        /* The return value and the detailed result must agree */
+        assert((r > 0) == (result == SLEEP_SUPPORTED));
+
+        if (ret_support)
+                *ret_support = result;
+
+        return r;
+}
diff --git a/src/shared/sleep-config.h b/src/shared/sleep-config.h
new file mode 100644
index 0000000..bc5aeb9
--- /dev/null
+++ b/src/shared/sleep-config.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "time-util.h"
+
+/* The sleep operations. The enumerators before _SLEEP_OPERATION_CONFIG_MAX double as indexes into
+ * SleepConfig.states[]/.modes[]; all enumerators before _SLEEP_OPERATION_MAX index SleepConfig.allow[]. */
+typedef enum SleepOperation {
+ SLEEP_SUSPEND,
+ SLEEP_HIBERNATE,
+ SLEEP_HYBRID_SLEEP,
+ _SLEEP_OPERATION_CONFIG_MAX,
+ /* The operations above require configuration for mode and state. The ones below are "combined"
+ * operations that use config from those individual operations. */
+
+ SLEEP_SUSPEND_THEN_HIBERNATE,
+
+ _SLEEP_OPERATION_MAX,
+ _SLEEP_OPERATION_INVALID = -EINVAL,
+} SleepOperation;
+
+const char* sleep_operation_to_string(SleepOperation s) _const_;
+SleepOperation sleep_operation_from_string(const char *s) _pure_;
+
+/* Returns true for hibernate and hybrid-sleep, false for every other operation. */
+static inline bool sleep_operation_is_hibernation(SleepOperation operation) {
+        return operation == SLEEP_HIBERNATE ||
+                operation == SLEEP_HYBRID_SLEEP;
+}
+
+/* Parsed sleep configuration, see parse_sleep_config(). */
+typedef struct SleepConfig {
+ bool allow[_SLEEP_OPERATION_MAX]; /* Whether each operation is enabled, indexed by SleepOperation */
+
+ char **states[_SLEEP_OPERATION_CONFIG_MAX]; /* Sleep states per configurable operation */
+ char **modes[_SLEEP_OPERATION_CONFIG_MAX]; /* Power mode after writing hibernation image */
+
+ usec_t hibernate_delay_usec;
+ usec_t suspend_estimation_usec;
+} SleepConfig;
+
+SleepConfig* sleep_config_free(SleepConfig *sc);
+DEFINE_TRIVIAL_CLEANUP_FUNC(SleepConfig*, sleep_config_free);
+
+int parse_sleep_config(SleepConfig **sleep_config);
+
+/* Detailed result of a sleep support check, as returned via sleep_supported_full(). */
+typedef enum SleepSupport {
+ SLEEP_SUPPORTED,
+ SLEEP_DISABLED, /* Disabled in SleepConfig.allow */
+ SLEEP_NOT_CONFIGURED, /* SleepConfig.states is not configured */
+ SLEEP_STATE_OR_MODE_NOT_SUPPORTED, /* SleepConfig.states/modes are not supported by kernel */
+ SLEEP_RESUME_NOT_SUPPORTED, /* hibernation_is_safe() failed with -ENOTRECOVERABLE */
+ SLEEP_NOT_ENOUGH_SWAP_SPACE, /* hibernation_is_safe() failed with -ENOSPC */
+ SLEEP_ALARM_NOT_SUPPORTED, /* CLOCK_BOOTTIME_ALARM is unsupported by kernel (only used by s2h) */
+} SleepSupport;
+
+int sleep_supported_full(SleepOperation operation, SleepSupport *ret_support);
+/* Shortcut for sleep_supported_full() for callers not interested in the detailed result. */
+static inline int sleep_supported(SleepOperation operation) {
+ return sleep_supported_full(operation, NULL);
+}
+
+/* Only for test-sleep-config */
+int sleep_state_supported(char **states);
+int sleep_mode_supported(char **modes);
diff --git a/src/shared/smack-util.c b/src/shared/smack-util.c
new file mode 100644
index 0000000..1f88e72
--- /dev/null
+++ b/src/shared/smack-util.c
@@ -0,0 +1,311 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+/***
+ Copyright © 2013 Intel Corporation
+
+ Author: Auke Kok <auke-jan.h.kok@intel.com>
+***/
+
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/xattr.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "label.h"
+#include "log.h"
+#include "macro.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "smack-util.h"
+#include "stdio-util.h"
+#include "string-table.h"
+#include "xattr-util.h"
+
+#if ENABLE_SMACK
+/* Returns whether SMACK is in use, determined by checking once for the existence of /sys/fs/smackfs/.
+ * The result is cached in a static variable for subsequent calls. */
+bool mac_smack_use(void) {
+        static int cached_use = -1;
+
+        if (cached_use >= 0)
+                return cached_use;
+
+        /* First call: probe and remember */
+        cached_use = access("/sys/fs/smackfs/", F_OK) >= 0;
+        return cached_use;
+}
+
+/* Maps each SmackAttr to the name of the extended attribute it is stored in. */
+static const char* const smack_attr_table[_SMACK_ATTR_MAX] = {
+ [SMACK_ATTR_ACCESS] = "security.SMACK64",
+ [SMACK_ATTR_EXEC] = "security.SMACK64EXEC",
+ [SMACK_ATTR_MMAP] = "security.SMACK64MMAP",
+ [SMACK_ATTR_TRANSMUTE] = "security.SMACK64TRANSMUTE",
+ [SMACK_ATTR_IPIN] = "security.SMACK64IPIN",
+ [SMACK_ATTR_IPOUT] = "security.SMACK64IPOUT",
+};
+
+/* Generates the string lookup helpers for the table above; smack_attr_to_string() is used below. */
+DEFINE_STRING_TABLE_LOOKUP(smack_attr, SmackAttr);
+
+/* Reads the specified SMACK attribute of the file at 'path' into a newly allocated string.
+ * If SMACK is not in use this returns 0 without touching *label; otherwise the result of
+ * getxattr_malloc() is returned. */
+int mac_smack_read(const char *path, SmackAttr attr, char **label) {
+        const char *xattr_name;
+
+        assert(path);
+        assert(attr >= 0 && attr < _SMACK_ATTR_MAX);
+        assert(label);
+
+        if (!mac_smack_use())
+                return 0;
+
+        xattr_name = smack_attr_to_string(attr);
+        return getxattr_malloc(path, xattr_name, label);
+}
+
+/* Like mac_smack_read(), but operates on a file descriptor: returns 0 without touching *label if SMACK
+ * is not in use, otherwise the result of fgetxattr_malloc(). */
+int mac_smack_read_fd(int fd, SmackAttr attr, char **label) {
+        const char *xattr_name;
+
+        assert(fd >= 0);
+        assert(attr >= 0 && attr < _SMACK_ATTR_MAX);
+        assert(label);
+
+        if (!mac_smack_use())
+                return 0;
+
+        xattr_name = smack_attr_to_string(attr);
+        return fgetxattr_malloc(fd, xattr_name, label);
+}
+
+/* Sets (or, if 'label' is NULL, removes) the specified SMACK attribute on the inode 'path' refers to,
+ * relative to 'dir_fd'. The inode is opened with O_PATH|O_NOFOLLOW and passed to mac_smack_apply_fd().
+ * Returns 0 if SMACK is not in use, -errno if the inode cannot be opened. */
+int mac_smack_apply_at(int dir_fd, const char *path, SmackAttr attr, const char *label) {
+        _cleanup_close_ int inode_fd = -EBADF;
+
+        assert(path);
+        assert(attr >= 0 && attr < _SMACK_ATTR_MAX);
+
+        if (!mac_smack_use())
+                return 0;
+
+        inode_fd = openat(dir_fd, path, O_PATH|O_CLOEXEC|O_NOFOLLOW);
+        if (inode_fd < 0)
+                return -errno;
+
+        return mac_smack_apply_fd(inode_fd, attr, label);
+}
+
+/* Sets the specified SMACK attribute on the inode behind 'fd' to 'label', or removes the attribute if
+ * 'label' is NULL. The inode is addressed through the path FORMAT_PROC_FD_PATH() generates for the fd.
+ * Returns 0 on success or if SMACK is not in use, -errno on failure. */
+int mac_smack_apply_fd(int fd, SmackAttr attr, const char *label) {
+        assert(fd >= 0);
+        assert(attr >= 0 && attr < _SMACK_ATTR_MAX);
+
+        if (!mac_smack_use())
+                return 0;
+
+        if (label) {
+                if (setxattr(FORMAT_PROC_FD_PATH(fd), smack_attr_to_string(attr), label, strlen(label), 0) < 0)
+                        return -errno;
+        } else {
+                if (removexattr(FORMAT_PROC_FD_PATH(fd), smack_attr_to_string(attr)) < 0)
+                        return -errno;
+        }
+
+        return 0;
+}
+
+/* Writes 'label' to the "attr/current" procfs file of the specified process. Returns 0 if SMACK is not
+ * in use, otherwise the result of write_string_file(). */
+int mac_smack_apply_pid(pid_t pid, const char *label) {
+        const char *attr_path;
+
+        assert(label);
+
+        if (!mac_smack_use())
+                return 0;
+
+        attr_path = procfs_file_alloca(pid, "attr/current");
+        return write_string_file(attr_path, label, WRITE_STRING_FILE_DISABLE_BUFFER);
+}
+
+/* Applies the default SMACK access label to the inode behind 'fd', provided 'label_path' is located
+ * below /dev: directories and character devices get SMACK_STAR_LABEL, symlinks SMACK_FLOOR_LABEL, all
+ * other inode types are left alone.
+ *
+ * fd         - file descriptor referring to the inode to label
+ * label_path - absolute path, used for the /dev check and for logging
+ * flags      - LABEL_IGNORE_EROFS suppresses failures caused by a read-only file system
+ *
+ * Returns 0 on success or if there was nothing to do, a negative error code otherwise. */
+static int smack_fix_fd(
+ int fd,
+ const char *label_path,
+ LabelFixFlags flags) {
+
+ const char *label;
+ struct stat st;
+ int r;
+
+ /* The caller should have done the sanity checks. */
+ assert(fd >= 0);
+ assert(label_path);
+ assert(path_is_absolute(label_path));
+
+ /* Path must be in /dev. */
+ if (!path_startswith(label_path, "/dev"))
+ return 0;
+
+ if (fstat(fd, &st) < 0)
+ return -errno;
+
+ /*
+ * Label directories and character devices "*".
+ * Label symlinks "_".
+ * Don't change anything else.
+ */
+
+ if (S_ISDIR(st.st_mode))
+ label = SMACK_STAR_LABEL;
+ else if (S_ISLNK(st.st_mode))
+ label = SMACK_FLOOR_LABEL;
+ else if (S_ISCHR(st.st_mode))
+ label = SMACK_STAR_LABEL;
+ else
+ return 0;
+
+ if (setxattr(FORMAT_PROC_FD_PATH(fd), "security.SMACK64", label, strlen(label), 0) < 0) {
+ _cleanup_free_ char *old_label = NULL;
+
+ /* Save errno right away, the calls below may overwrite it */
+ r = -errno;
+
+ /* If the FS doesn't support labels, then exit without warning */
+ if (ERRNO_IS_NOT_SUPPORTED(r))
+ return 0;
+
+ /* If the FS is read-only and we were told to ignore failures caused by that, suppress error */
+ if (r == -EROFS && (flags & LABEL_IGNORE_EROFS))
+ return 0;
+
+ /* If the old label is identical to the new one, suppress any kind of error */
+ if (lgetxattr_malloc(FORMAT_PROC_FD_PATH(fd), "security.SMACK64", &old_label) >= 0 &&
+ streq(old_label, label))
+ return 0;
+
+ return log_debug_errno(r, "Unable to fix SMACK label of %s: %m", label_path);
+ }
+
+ return 0;
+}
+
+/* Fixes the SMACK label of an inode, see smack_fix_fd() for the labelling rules.
+ *
+ * atfd       - directory fd 'inode_path' is relative to (or AT_FDCWD); if 'inode_path' is NULL this is
+ *              the fd of the inode itself
+ * inode_path - path of the inode to open, or NULL
+ * label_path - path to base the labelling decision on; if NULL it is derived from 'inode_path' if that
+ *              is absolute, and queried from the fd otherwise
+ * flags      - LABEL_IGNORE_ENOENT makes a missing inode a non-error; passed on to smack_fix_fd()
+ *
+ * Returns 0 if SMACK is not in use, otherwise 0 on success and a negative error code on failure. */
+int mac_smack_fix_full(
+ int atfd,
+ const char *inode_path,
+ const char *label_path,
+ LabelFixFlags flags) {
+
+ _cleanup_close_ int opened_fd = -EBADF;
+ _cleanup_free_ char *p = NULL;
+ int r, inode_fd;
+
+ assert(atfd >= 0 || atfd == AT_FDCWD);
+ assert(atfd >= 0 || inode_path);
+
+ if (!mac_smack_use())
+ return 0;
+
+ if (inode_path) {
+ opened_fd = openat(atfd, inode_path, O_NOFOLLOW|O_CLOEXEC|O_PATH);
+ if (opened_fd < 0) {
+ if ((flags & LABEL_IGNORE_ENOENT) && errno == ENOENT)
+ return 0;
+
+ return -errno;
+ }
+ inode_fd = opened_fd;
+ } else
+ /* No path given: operate on the passed fd directly, which stays owned by the caller */
+ inode_fd = atfd;
+
+ if (!label_path) {
+ if (path_is_absolute(inode_path))
+ label_path = inode_path;
+ else {
+ r = fd_get_path(inode_fd, &p);
+ if (r < 0)
+ return r;
+
+ label_path = p;
+ }
+ }
+
+ return smack_fix_fd(inode_fd, label_path, flags);
+}
+
+int mac_smack_copy(const char *dest, const char *src) {
+ int r;
+ _cleanup_free_ char *label = NULL;
+
+ assert(dest);
+ assert(src);
+
+ r = mac_smack_read(src, SMACK_ATTR_ACCESS, &label);
+ if (r < 0)
+ return r;
+
+ r = mac_smack_apply(dest, SMACK_ATTR_ACCESS, label);
+ if (r < 0)
+ return r;
+
+ return r;
+}
+
+#else
+/* Stub implementations for builds without ENABLE_SMACK: SMACK is reported as not in use, reading labels
+ * fails with -EOPNOTSUPP, and all operations that would apply, fix or copy labels succeed without doing
+ * anything. */
+bool mac_smack_use(void) {
+ return false;
+}
+
+int mac_smack_read(const char *path, SmackAttr attr, char **label) {
+ return -EOPNOTSUPP;
+}
+
+int mac_smack_read_fd(int fd, SmackAttr attr, char **label) {
+ return -EOPNOTSUPP;
+}
+
+int mac_smack_apply_at(int dir_fd, const char *path, SmackAttr attr, const char *label) {
+ return 0;
+}
+
+int mac_smack_apply_fd(int fd, SmackAttr attr, const char *label) {
+ return 0;
+}
+
+int mac_smack_apply_pid(pid_t pid, const char *label) {
+ return 0;
+}
+
+int mac_smack_fix_full(int atfd, const char *inode_path, const char *label_path, LabelFixFlags flags) {
+ return 0;
+}
+
+int mac_smack_copy(const char *dest, const char *src) {
+ return 0;
+}
+#endif
+
+/* Renames 'from' (relative to 'fdf') to 'to' (relative to 'fdt'). If built with HAVE_SMACK_RUN_LABEL the
+ * renamed inode is additionally given the SMACK floor label as access label.
+ * Returns -errno if renameat() fails; otherwise 0, or the result of mac_smack_apply_at(). */
+int renameat_and_apply_smack_floor_label(int fdf, const char *from, int fdt, const char *to) {
+
+ assert(fdf >= 0 || fdf == AT_FDCWD);
+ assert(fdt >= 0 || fdt == AT_FDCWD);
+
+ if (renameat(fdf, from, fdt, to) < 0)
+ return -errno;
+
+#if HAVE_SMACK_RUN_LABEL
+ return mac_smack_apply_at(fdt, to, SMACK_ATTR_ACCESS, SMACK_FLOOR_LABEL);
+#else
+ return 0;
+#endif
+}
+
+/* The .pre callback of the LabelOps registered by mac_smack_init(): does nothing. */
+static int mac_smack_label_pre(int dir_fd, const char *path, mode_t mode) {
+ return 0;
+}
+
+/* The .post callback of the LabelOps registered by mac_smack_init(): fixes the SMACK label of the
+ * specified inode, with no label path override and no flags. */
+static int mac_smack_label_post(int dir_fd, const char *path) {
+ return mac_smack_fix_full(dir_fd, path, NULL, 0);
+}
+
+/* Registers the SMACK label callbacks via label_ops_set(), but only if SMACK is in use. Returns 0 if it
+ * is not, otherwise the result of label_ops_set(). */
+int mac_smack_init(void) {
+        static const LabelOps label_ops = {
+                .pre = mac_smack_label_pre,
+                .post = mac_smack_label_post,
+        };
+
+        if (mac_smack_use())
+                return label_ops_set(&label_ops);
+
+        return 0;
+}
diff --git a/src/shared/smack-util.h b/src/shared/smack-util.h
new file mode 100644
index 0000000..f6ed2ec
--- /dev/null
+++ b/src/shared/smack-util.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+/***
+ Copyright © 2013 Intel Corporation
+
+ Author: Auke Kok <auke-jan.h.kok@intel.com>
+***/
+
+#include <stdbool.h>
+#include <sys/types.h>
+
+#include "label-util.h"
+#include "macro.h"
+
+#define SMACK_FLOOR_LABEL "_"
+#define SMACK_STAR_LABEL "*"
+
+/* The SMACK attributes that can be read or applied. smack_attr_to_string() maps each of them to the name
+ * of an extended attribute. */
+typedef enum SmackAttr {
+ SMACK_ATTR_ACCESS,
+ SMACK_ATTR_EXEC,
+ SMACK_ATTR_MMAP,
+ SMACK_ATTR_TRANSMUTE,
+ SMACK_ATTR_IPIN,
+ SMACK_ATTR_IPOUT,
+ _SMACK_ATTR_MAX,
+ _SMACK_ATTR_INVALID = -EINVAL,
+} SmackAttr;
+
+bool mac_smack_use(void);
+int mac_smack_init(void);
+
+int mac_smack_fix_full(int atfd, const char *inode_path, const char *label_path, LabelFixFlags flags);
+/* Shortcut for mac_smack_fix_full() that resolves 'path' relative to the current working directory and
+ * uses the same path as label path. */
+static inline int mac_smack_fix(const char *path, LabelFixFlags flags) {
+ return mac_smack_fix_full(AT_FDCWD, path, path, flags);
+}
+
+const char* smack_attr_to_string(SmackAttr i) _const_;
+SmackAttr smack_attr_from_string(const char *s) _pure_;
+int mac_smack_read(const char *path, SmackAttr attr, char **label);
+int mac_smack_read_fd(int fd, SmackAttr attr, char **label);
+int mac_smack_apply_at(int dir_fd, const char *path, SmackAttr attr, const char *label);
+/* Shortcut for mac_smack_apply_at() that resolves 'path' relative to the current working directory. */
+static inline int mac_smack_apply(const char *path, SmackAttr attr, const char *label) {
+ return mac_smack_apply_at(AT_FDCWD, path, attr, label);
+}
+int mac_smack_apply_fd(int fd, SmackAttr attr, const char *label);
+int mac_smack_apply_pid(pid_t pid, const char *label);
+int mac_smack_copy(const char *dest, const char *src);
+
+int renameat_and_apply_smack_floor_label(int fdf, const char *from, int fdt, const char *to);
+/* Shortcut for renameat_and_apply_smack_floor_label() that resolves both paths relative to the current
+ * working directory. */
+static inline int rename_and_apply_smack_floor_label(const char *from, const char *to) {
+ return renameat_and_apply_smack_floor_label(AT_FDCWD, from, AT_FDCWD, to);
+}
diff --git a/src/shared/socket-label.c b/src/shared/socket-label.c
new file mode 100644
index 0000000..b86a6ad
--- /dev/null
+++ b/src/shared/socket-label.c
@@ -0,0 +1,132 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <netinet/in.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <string.h>
+#include <sys/un.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "fd-util.h"
+#include "fs-util.h"
+#include "log.h"
+#include "macro.h"
+#include "missing_socket.h"
+#include "mkdir-label.h"
+#include "selinux-util.h"
+#include "socket-util.h"
+#include "umask-util.h"
+
+/* Creates a socket for the specified address, configures it, binds it and, if the address can accept
+ * connections, puts it into listening mode.
+ *
+ * a              - address to bind to; verified first
+ * flags          - additional flags ORed into the socket type
+ * backlog        - backlog passed to listen()
+ * only           - IPV6_V6ONLY policy; only applied to AF_INET6 sockets, and not if set to
+ *                  SOCKET_ADDRESS_DEFAULT
+ * bind_to_device - interface to bind to, or NULL (AF_INET/AF_INET6 only)
+ * reuse_port, free_bind, transparent
+ *                - optional socket options (AF_INET/AF_INET6 only); failure to set them is logged as a
+ *                  warning, but is not fatal
+ * directory_mode - access mode for parent directories created for sockets that have a path
+ * socket_mode    - access mode of the socket node, enforced via the umask while binding
+ * label          - SELinux label to create the socket with, or NULL
+ *
+ * Returns the new file descriptor on success (owned by the caller), a negative error code otherwise. */
+int socket_address_listen(
+ const SocketAddress *a,
+ int flags,
+ int backlog,
+ SocketAddressBindIPv6Only only,
+ const char *bind_to_device,
+ bool reuse_port,
+ bool free_bind,
+ bool transparent,
+ mode_t directory_mode,
+ mode_t socket_mode,
+ const char *label) {
+
+ _cleanup_close_ int fd = -EBADF;
+ const char *p;
+ int r;
+
+ assert(a);
+
+ r = socket_address_verify(a, true);
+ if (r < 0)
+ return r;
+
+ if (socket_address_family(a) == AF_INET6 && !socket_ipv6_is_supported())
+ return -EAFNOSUPPORT;
+
+ if (label) {
+ r = mac_selinux_create_socket_prepare(label);
+ if (r < 0)
+ return r;
+ }
+
+ fd = RET_NERRNO(socket(socket_address_family(a), a->type | flags, a->protocol));
+
+ /* Undo the preparation above regardless of whether socket() succeeded; only then check the result */
+ if (label)
+ mac_selinux_create_socket_clear();
+
+ if (fd < 0)
+ return fd;
+
+ if (socket_address_family(a) == AF_INET6 && only != SOCKET_ADDRESS_DEFAULT) {
+ r = setsockopt_int(fd, IPPROTO_IPV6, IPV6_V6ONLY, only == SOCKET_ADDRESS_IPV6_ONLY);
+ if (r < 0)
+ return r;
+ }
+
+ if (IN_SET(socket_address_family(a), AF_INET, AF_INET6)) {
+ if (bind_to_device) {
+ r = socket_bind_to_ifname(fd, bind_to_device);
+ if (r < 0)
+ return r;
+ }
+
+ /* The following three options are applied on a best-effort basis */
+ if (reuse_port) {
+ r = setsockopt_int(fd, SOL_SOCKET, SO_REUSEPORT, true);
+ if (r < 0)
+ log_warning_errno(r, "SO_REUSEPORT failed: %m");
+ }
+
+ if (free_bind) {
+ r = socket_set_freebind(fd, socket_address_family(a), true);
+ if (r < 0)
+ log_warning_errno(r, "IP_FREEBIND/IPV6_FREEBIND failed: %m");
+ }
+
+ if (transparent) {
+ r = socket_set_transparent(fd, socket_address_family(a), true);
+ if (r < 0)
+ log_warning_errno(r, "IP_TRANSPARENT/IPV6_TRANSPARENT failed: %m");
+ }
+ }
+
+ r = setsockopt_int(fd, SOL_SOCKET, SO_REUSEADDR, true);
+ if (r < 0)
+ return r;
+
+ /* Non-NULL only for addresses that have a file system path */
+ p = socket_address_get_path(a);
+ if (p) {
+ /* Create parents */
+ (void) mkdir_parents_label(p, directory_mode);
+
+ /* Enforce the right access mode for the socket */
+ WITH_UMASK(~socket_mode) {
+ r = mac_selinux_bind(fd, &a->sockaddr.sa, a->size);
+ if (r == -EADDRINUSE) {
+ /* Unlink and try again */
+
+ if (unlink(p) < 0)
+ return r; /* didn't work, return original error */
+
+ r = mac_selinux_bind(fd, &a->sockaddr.sa, a->size);
+ }
+ if (r < 0)
+ return r;
+ }
+ } else {
+ if (bind(fd, &a->sockaddr.sa, a->size) < 0)
+ return -errno;
+ }
+
+ if (socket_address_can_accept(a))
+ if (listen(fd, backlog) < 0)
+ return -errno;
+
+ /* Let's trigger an inotify event on the socket node, so that anyone waiting for this socket to be connectable
+ * gets notified */
+ if (p)
+ (void) touch(p);
+
+ /* Hand the fd over to the caller, so that the cleanup handler doesn't close it */
+ return TAKE_FD(fd);
+}
diff --git a/src/shared/socket-netlink.c b/src/shared/socket-netlink.c
new file mode 100644
index 0000000..0ba5762
--- /dev/null
+++ b/src/shared/socket-netlink.c
@@ -0,0 +1,409 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <net/if.h>
+#include <string.h>
+
+#include "alloc-util.h"
+#include "errno-util.h"
+#include "extract-word.h"
+#include "log.h"
+#include "memory-util.h"
+#include "netlink-util.h"
+#include "parse-util.h"
+#include "socket-netlink.h"
+#include "socket-util.h"
+#include "string-util.h"
+
+int socket_address_parse(SocketAddress *a, const char *s) {
+        union in_addr_union addr;
+        int family, ifindex, r;
+        uint16_t port;
+
+        assert(a);
+        assert(s);
+
+        /* Try the AF_UNIX syntax first, then AF_VSOCK. A parser answering -EPROTO hands the string on to
+         * the next one; any other result (success or failure) is final. */
+        r = socket_address_parse_unix(a, s);
+        if (r == -EPROTO)
+                r = socket_address_parse_vsock(a, s);
+        if (r != -EPROTO)
+                return r;
+
+        r = parse_ip_port(s, &port);
+        if (r == -ERANGE)
+                return r; /* Syntactically a port, but the number is not valid for one. */
+
+        if (r >= 0) {
+                /* Only a port was given: use the wildcard address, IPv6 if supported, IPv4 otherwise. */
+                if (socket_ipv6_is_supported())
+                        *a = (SocketAddress) {
+                                .sockaddr.in6 = {
+                                        .sin6_family = AF_INET6,
+                                        .sin6_port = htobe16(port),
+                                        .sin6_addr = in6addr_any,
+                                },
+                                .size = sizeof(struct sockaddr_in6),
+                        };
+                else
+                        *a = (SocketAddress) {
+                                .sockaddr.in = {
+                                        .sin_family = AF_INET,
+                                        .sin_port = htobe16(port),
+                                        .sin_addr.s_addr = INADDR_ANY,
+                                },
+                                .size = sizeof(struct sockaddr_in),
+                        };
+
+                return 0;
+        }
+
+        /* Not a bare port, so it has to be an IP address with a port (and possibly an interface). */
+        r = in_addr_port_ifindex_name_from_string_auto(s, &family, &addr, &port, &ifindex, NULL);
+        if (r < 0)
+                return r;
+        if (port == 0) /* No port, no go. */
+                return -EINVAL;
+
+        switch (family) {
+
+        case AF_INET:
+                *a = (SocketAddress) {
+                        .sockaddr.in = {
+                                .sin_family = AF_INET,
+                                .sin_addr = addr.in,
+                                .sin_port = htobe16(port),
+                        },
+                        .size = sizeof(struct sockaddr_in),
+                };
+                break;
+
+        case AF_INET6:
+                *a = (SocketAddress) {
+                        .sockaddr.in6 = {
+                                .sin6_family = AF_INET6,
+                                .sin6_addr = addr.in6,
+                                .sin6_port = htobe16(port),
+                                .sin6_scope_id = ifindex,
+                        },
+                        .size = sizeof(struct sockaddr_in6),
+                };
+                break;
+
+        default:
+                assert_not_reached();
+        }
+
+        return 0;
+}
+
+int socket_address_parse_and_warn(SocketAddress *a, const char *s) {
+        SocketAddress parsed;
+        int r;
+
+        /* Like socket_address_parse(), but rejects IPv6 addresses with a warning when IPv6 is not
+         * supported. *a is only written on success. */
+
+        r = socket_address_parse(&parsed, s);
+        if (r < 0)
+                return r;
+
+        if (!socket_ipv6_is_supported()) {
+                if (parsed.sockaddr.sa.sa_family == AF_INET6) {
+                        log_warning("Binding to IPv6 address not available since kernel does not support IPv6.");
+                        return -EAFNOSUPPORT;
+                }
+        }
+
+        *a = parsed;
+        return 0;
+}
+
+int socket_address_parse_netlink(SocketAddress *a, const char *s) {
+        _cleanup_free_ char *family_name = NULL;
+        unsigned group = 0;
+        int protocol, r;
+
+        assert(a);
+        assert(s);
+
+        /* The first word names the netlink family. s is passed by reference here and whatever it holds
+         * afterwards is taken as the optional multicast group number. */
+        r = extract_first_word(&s, &family_name, NULL, 0);
+        if (r < 0)
+                return r;
+        if (r == 0)
+                return -EINVAL;
+
+        protocol = netlink_family_from_string(family_name);
+        if (protocol < 0)
+                return -EINVAL;
+
+        if (!isempty(s)) {
+                r = safe_atou(s, &group);
+                if (r < 0)
+                        return r;
+        }
+
+        *a = (SocketAddress) {
+                .type = SOCK_RAW,
+                .protocol = protocol,
+                .size = sizeof(struct sockaddr_nl),
+                .sockaddr.nl.nl_family = AF_NETLINK,
+                .sockaddr.nl.nl_groups = group,
+        };
+
+        return 0;
+}
+
+bool socket_address_is(const SocketAddress *a, const char *s, int type) {
+        struct SocketAddress parsed;
+        int r;
+
+        assert(a);
+        assert(s);
+
+        /* Checks whether a equals the address described by the string s with the given socket type.
+         * A string that does not parse never matches. */
+        r = socket_address_parse(&parsed, s);
+        if (r >= 0) {
+                parsed.type = type;
+                return socket_address_equal(a, &parsed);
+        }
+
+        return false;
+}
+
+bool socket_address_is_netlink(const SocketAddress *a, const char *s) {
+        struct SocketAddress parsed;
+        int r;
+
+        assert(a);
+        assert(s);
+
+        /* Netlink counterpart of socket_address_is(): a string that does not parse never matches. */
+        r = socket_address_parse_netlink(&parsed, s);
+        if (r >= 0)
+                return socket_address_equal(a, &parsed);
+
+        return false;
+}
+
+int make_socket_fd(int log_level, const char* address, int type, int flags) {
+        SocketAddress a;
+        int fd, r;
+
+        /* Parses address, creates a socket of the given type for it and binds/listens via
+         * socket_address_listen(). Returns the new fd on success, a negative errno-style value on failure.
+         * Failures are logged as errors; success is logged at log_level. */
+
+        r = socket_address_parse(&a, address);
+        if (r < 0)
+                return log_error_errno(r, "Failed to parse socket address \"%s\": %m", address);
+
+        a.type = type;
+
+        fd = socket_address_listen(&a, type | flags, SOMAXCONN_DELUXE, SOCKET_ADDRESS_DEFAULT,
+                                   NULL, false, false, false, 0755, 0644, NULL);
+        if (fd < 0 || log_get_max_level() >= log_level) {
+                _cleanup_free_ char *p = NULL;
+                const char *what = address;
+
+                /* The formatted address is only needed for the log message. If formatting fails, fall back
+                 * to the string the caller passed in rather than returning early: an early return would
+                 * leak a successfully opened fd, or replace the real listen error with the formatting
+                 * error. */
+                if (socket_address_print(&a, &p) >= 0)
+                        what = p;
+
+                if (fd < 0)
+                        log_error_errno(fd, "Failed to listen on %s: %m", what);
+                else
+                        log_full(log_level, "Listening on %s", what);
+        }
+
+        return fd;
+}
+
+int in_addr_port_ifindex_name_from_string_auto(
+ const char *s,
+ int *ret_family,
+ union in_addr_union *ret_address,
+ uint16_t *ret_port,
+ int *ret_ifindex,
+ char **ret_server_name) {
+
+ _cleanup_free_ char *buf1 = NULL, *buf2 = NULL, *name = NULL;
+ int family, ifindex = 0, r;
+ union in_addr_union a;
+ uint16_t port = 0;
+ const char *m;
+
+ assert(s);
+
+ /* This accepts the following:
+ * 192.168.0.1:53#example.com
+ * [2001:4860:4860::8888]:53%eth0#example.com
+ *
+ * If ret_port is NULL, then the port cannot be specified.
+ * If ret_ifindex is NULL, then the interface index cannot be specified.
+ * If ret_server_name is NULL, then server_name cannot be specified.
+ *
+ * ret_family is always AF_INET or AF_INET6.
+ *
+ * The suffixes are stripped from the right: first "#name", then "%ifname", and what remains is
+ * parsed as the address with an optional port. Port and ifindex are reported as 0 when absent.
+ * Returns a negative errno-style value on failure (-EINVAL for syntax that is malformed or not
+ * permitted by the NULL output pointers, -ENOMEM, or whatever the helpers called below return);
+ * on success the non-negative result of the last helper is returned.
+ */
+ /* Step 1: split off "#server_name". The part in front of '#' is copied to buf1 and parsed from there. */
+ m = strchr(s, '#');
+ if (m) {
+ if (!ret_server_name)
+ return -EINVAL;
+
+ if (isempty(m + 1))
+ return -EINVAL;
+
+ name = strdup(m + 1);
+ if (!name)
+ return -ENOMEM;
+
+ s = buf1 = strndup(s, m - s);
+ if (!buf1)
+ return -ENOMEM;
+ }
+ /* Step 2: split off "%ifname", validate its syntax and turn it into an ifindex. */
+ m = strchr(s, '%');
+ if (m) {
+ if (!ret_ifindex)
+ return -EINVAL;
+
+ if (isempty(m + 1))
+ return -EINVAL;
+
+ if (!ifname_valid_full(m + 1, IFNAME_VALID_ALTERNATIVE | IFNAME_VALID_NUMERIC))
+ return -EINVAL; /* We want to return -EINVAL for syntactically invalid names,
+ * and -ENODEV for valid but nonexistent interfaces. */
+
+ ifindex = rtnl_resolve_interface(NULL, m + 1);
+ if (ifindex < 0)
+ return ifindex;
+
+ s = buf2 = strndup(s, m - s);
+ if (!buf2)
+ return -ENOMEM;
+ }
+ /* Step 3: the address itself. The last ':' (if any) separates the three accepted forms:
+ * "[ipv6]:port", plain "ipv6", and "ipv4:port". No ':' at all means plain "ipv4". */
+ m = strrchr(s, ':');
+ if (m) {
+ if (*s == '[') {
+ _cleanup_free_ char *ip_str = NULL;
+
+ if (!ret_port)
+ return -EINVAL;
+
+ if (*(m - 1) != ']')
+ return -EINVAL;
+
+ family = AF_INET6;
+
+ r = parse_ip_port(m + 1, &port);
+ if (r < 0)
+ return r;
+
+ ip_str = strndup(s + 1, m - s - 2); /* the text between '[' and ']' */
+ if (!ip_str)
+ return -ENOMEM;
+
+ r = in_addr_from_string(family, ip_str, &a);
+ if (r < 0)
+ return r;
+ } else {
+ /* First try to parse the string as IPv6 address without port number */
+ r = in_addr_from_string(AF_INET6, s, &a);
+ if (r < 0) {
+ /* Then the input should be IPv4 address with port number */
+ _cleanup_free_ char *ip_str = NULL;
+
+ if (!ret_port)
+ return -EINVAL;
+
+ family = AF_INET;
+
+ ip_str = strndup(s, m - s);
+ if (!ip_str)
+ return -ENOMEM;
+
+ r = in_addr_from_string(family, ip_str, &a);
+ if (r < 0)
+ return r;
+
+ r = parse_ip_port(m + 1, &port);
+ if (r < 0)
+ return r;
+ } else
+ family = AF_INET6;
+ }
+ } else {
+ family = AF_INET;
+ r = in_addr_from_string(family, s, &a);
+ if (r < 0)
+ return r;
+ }
+ /* Results are only handed back once everything parsed; every output pointer may be NULL. */
+ if (ret_family)
+ *ret_family = family;
+ if (ret_address)
+ *ret_address = a;
+ if (ret_port)
+ *ret_port = port;
+ if (ret_ifindex)
+ *ret_ifindex = ifindex;
+ if (ret_server_name)
+ *ret_server_name = TAKE_PTR(name);
+
+ return r;
+}
+
+struct in_addr_full *in_addr_full_free(struct in_addr_full *a) {
+        /* Releases a together with the strings it owns. NULL is accepted; always returns NULL. */
+        if (a) {
+                free(a->server_name);
+                free(a->cached_server_string);
+        }
+
+        return mfree(a);
+}
+
+int in_addr_full_new(
+                int family,
+                const union in_addr_union *a,
+                uint16_t port,
+                int ifindex,
+                const char *server_name,
+                struct in_addr_full **ret) {
+
+        _cleanup_free_ char *name_copy = NULL;
+        struct in_addr_full *full;
+
+        assert(ret);
+
+        /* Allocates a new in_addr_full from the given parts and returns it in *ret. The server name is
+         * duplicated; a NULL or empty name is stored as NULL. Returns 0 on success, -ENOMEM otherwise. */
+
+        if (!isempty(server_name)) {
+                name_copy = strdup(server_name);
+                if (!name_copy)
+                        return -ENOMEM;
+        }
+
+        full = new(struct in_addr_full, 1);
+        if (!full)
+                return -ENOMEM;
+
+        /* Fields not listed here (i.e. the cached string) start out zeroed. */
+        *full = (struct in_addr_full) {
+                .family = family,
+                .address = *a,
+                .port = port,
+                .ifindex = ifindex,
+                .server_name = TAKE_PTR(name_copy),
+        };
+
+        *ret = full;
+        return 0;
+}
+
+int in_addr_full_new_from_string(const char *s, struct in_addr_full **ret) {
+        _cleanup_free_ char *server_name = NULL;
+        union in_addr_union address;
+        int family, ifindex, r;
+        uint16_t port;
+
+        assert(s);
+
+        /* Parses s with all optional parts (port, interface, server name) permitted and wraps the
+         * result in a newly allocated in_addr_full. */
+        r = in_addr_port_ifindex_name_from_string_auto(s, &family, &address, &port, &ifindex, &server_name);
+
+        return r < 0 ? r : in_addr_full_new(family, &address, port, ifindex, server_name, ret);
+}
+
+const char *in_addr_full_to_string(struct in_addr_full *a) {
+        assert(a);
+
+        /* The string form is generated on first use and cached in the object afterwards. */
+        if (a->cached_server_string)
+                return a->cached_server_string;
+
+        /* Failures are ignored here; the cache (and hence the return value) then stays as it was. */
+        (void) in_addr_port_ifindex_name_to_string(
+                        a->family,
+                        &a->address,
+                        a->port,
+                        a->ifindex,
+                        a->server_name,
+                        &a->cached_server_string);
+
+        return a->cached_server_string;
+}
diff --git a/src/shared/socket-netlink.h b/src/shared/socket-netlink.h
new file mode 100644
index 0000000..6256a83
--- /dev/null
+++ b/src/shared/socket-netlink.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "in-addr-util.h"
+#include "macro.h"
+#include "socket-util.h"
+
+int make_socket_fd(int log_level, const char* address, int type, int flags); /* parse + listen; returns the fd or a negative error */
+/* Parsers: fill in *a from the string s, return 0 or a negative error. */
+int socket_address_parse(SocketAddress *a, const char *s);
+int socket_address_parse_and_warn(SocketAddress *a, const char *s); /* additionally warns about and refuses IPv6 if unsupported */
+int socket_address_parse_netlink(SocketAddress *a, const char *s);
+/* Comparisons: parse s and check whether the result equals a; unparsable strings never match. */
+bool socket_address_is(const SocketAddress *a, const char *s, int type);
+bool socket_address_is_netlink(const SocketAddress *a, const char *s);
+/* Parses "address[:port][%ifname][#server_name]"; a NULL output pointer forbids the respective part
+ * where the implementation says so (port, ifindex, server name). */
+int in_addr_port_ifindex_name_from_string_auto(
+ const char *s,
+ int *ret_family,
+ union in_addr_union *ret_address,
+ uint16_t *ret_port,
+ int *ret_ifindex,
+ char **ret_server_name);
+static inline int in_addr_ifindex_name_from_string_auto(const char *s, int *family, union in_addr_union *ret, int *ifindex, char **server_name) { /* same as the full parser, but passes NULL for the port output */
+ return in_addr_port_ifindex_name_from_string_auto(s, family, ret, NULL, ifindex, server_name);
+}
+static inline int in_addr_ifindex_from_string_auto(const char *s, int *family, union in_addr_union *ret, int *ifindex) { /* passes NULL for both the port and the server name output */
+ return in_addr_ifindex_name_from_string_auto(s, family, ret, ifindex, NULL);
+}
+
+struct in_addr_full { /* An IP address plus the optional parts the string parser understands */
+ int family; /* AF_INET or AF_INET6 when produced by the string parser */
+ union in_addr_union address;
+ uint16_t port; /* 0 if the parsed string carried no port */
+ int ifindex; /* 0 if the parsed string carried no interface */
+ char *server_name; /* owned by the object, NULL if unset */
+ char *cached_server_string; /* Should not be handled directly, but through in_addr_full_to_string(). */
+};
+
+struct in_addr_full *in_addr_full_free(struct in_addr_full *a); /* accepts NULL, always returns NULL */
+DEFINE_TRIVIAL_CLEANUP_FUNC(struct in_addr_full*, in_addr_full_free); /* cleanup helper generated from in_addr_full_free() */
+int in_addr_full_new(int family, const union in_addr_union *a, uint16_t port, int ifindex, const char *server_name, struct in_addr_full **ret); /* copies server_name */
+int in_addr_full_new_from_string(const char *s, struct in_addr_full **ret);
+const char *in_addr_full_to_string(struct in_addr_full *a); /* result is cached inside a and owned by it */
diff --git a/src/shared/spawn-ask-password-agent.c b/src/shared/spawn-ask-password-agent.c
new file mode 100644
index 0000000..d34cfff
--- /dev/null
+++ b/src/shared/spawn-ask-password-agent.c
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <signal.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "exec-util.h"
+#include "log.h"
+#include "process-util.h"
+#include "spawn-ask-password-agent.h"
+
+static pid_t agent_pid = 0; /* PID of the agent we spawned; a value <= 0 means none is running */
+
+int ask_password_agent_open(void) {
+        int r;
+
+        /* Spawns the TTY password agent in "--watch" mode as a child process. Returns 1 if an agent was
+         * started, 0 if there was nothing to do, a negative error otherwise. */
+
+        /* Nothing to do if an agent is already running, or if there is no terminal to ask on. STDIN is
+         * checked rather than STDOUT since this is about input, not output. */
+        if (agent_pid > 0 || !isatty(STDIN_FILENO))
+                return 0;
+
+        if (!is_main_thread())
+                return -EPERM;
+
+        r = fork_agent("(sd-askpwagent)",
+                       NULL, 0,
+                       &agent_pid,
+                       SYSTEMD_TTY_ASK_PASSWORD_AGENT_BINARY_PATH,
+                       SYSTEMD_TTY_ASK_PASSWORD_AGENT_BINARY_PATH, "--watch", NULL);
+        if (r >= 0)
+                return 1;
+
+        return log_error_errno(r, "Failed to fork TTY ask password agent: %m");
+}
+
+void ask_password_agent_close(void) {
+        /* If we started an agent, tell it that we are done and forget its PID. */
+        if (agent_pid > 0)
+                sigterm_wait(TAKE_PID(agent_pid));
+}
+
+int ask_password_agent_open_if_enabled(BusTransport transport, bool ask_password) {
+        /* Open the ask password agent as a child process if necessary: only for the local transport,
+         * and only if asking for passwords is wanted at all. */
+        if (transport != BUS_TRANSPORT_LOCAL || !ask_password)
+                return 0;
+
+        return ask_password_agent_open();
+}
diff --git a/src/shared/spawn-ask-password-agent.h b/src/shared/spawn-ask-password-agent.h
new file mode 100644
index 0000000..a76cdb1
--- /dev/null
+++ b/src/shared/spawn-ask-password-agent.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "bus-util.h"
+
+int ask_password_agent_open(void); /* 1 if an agent was spawned, 0 if not needed, negative on error */
+void ask_password_agent_close(void); /* terminates the agent spawned by ask_password_agent_open(), if any */
+/* Spawns the agent only for BUS_TRANSPORT_LOCAL and only if ask_password is true. */
+int ask_password_agent_open_if_enabled(BusTransport transport, bool ask_password);
diff --git a/src/shared/spawn-polkit-agent.c b/src/shared/spawn-polkit-agent.c
new file mode 100644
index 0000000..ce3c5fb
--- /dev/null
+++ b/src/shared/spawn-polkit-agent.c
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <poll.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "exec-util.h"
+#include "fd-util.h"
+#include "io-util.h"
+#include "log.h"
+#include "macro.h"
+#include "process-util.h"
+#include "spawn-polkit-agent.h"
+#include "stdio-util.h"
+#include "time-util.h"
+
+#if ENABLE_POLKIT
+static pid_t agent_pid = 0; /* PID of the polkit agent we spawned; a value <= 0 means none is running */
+
+int polkit_agent_open(void) {
+        char notify_fd[DECIMAL_STR_MAX(int) + 1];
+        int pipe_fd[2], r;
+
+        /* Spawns the polkit agent as a child process and waits until it has closed the notification pipe
+         * handed to it via --notify-fd. Returns 0 if there was nothing to do, otherwise the result of
+         * fork_agent() (negative on failure). */
+
+        if (agent_pid > 0)
+                return 0;
+
+        /* Clients that run as root don't need to activate/query polkit */
+        if (geteuid() == 0)
+                return 0;
+
+        /* We check STDIN here, not STDOUT, since this is about input, not output */
+        if (!isatty(STDIN_FILENO))
+                return 0;
+
+        if (!is_main_thread())
+                return -EPERM;
+
+        if (pipe2(pipe_fd, 0) < 0)
+                return -errno;
+
+        /* The write end is passed to the agent by number on its command line */
+        xsprintf(notify_fd, "%i", pipe_fd[1]);
+
+        r = fork_agent("(polkit-agent)",
+                       &pipe_fd[1], 1,
+                       &agent_pid,
+                       POLKIT_AGENT_BINARY_PATH,
+                       POLKIT_AGENT_BINARY_PATH, "--notify-fd", notify_fd, "--fallback", NULL);
+
+        /* Close the writing side, because that's the one for the agent */
+        safe_close(pipe_fd[1]);
+
+        if (r < 0)
+                log_error_errno(r, "Failed to fork polkit agent: %m");
+        else
+                /* Wait until the agent closes the fd */
+                (void) fd_wait_for_event(pipe_fd[0], POLLHUP, USEC_INFINITY);
+
+        safe_close(pipe_fd[0]);
+
+        return r;
+}
+
+void polkit_agent_close(void) {
+        /* If we started an agent, tell it that we are done and forget its PID. */
+        if (agent_pid > 0)
+                sigterm_wait(TAKE_PID(agent_pid));
+}
+
+#else
+
+int polkit_agent_open(void) { /* stub for builds without ENABLE_POLKIT: nothing to spawn */
+ return 0;
+}
+
+void polkit_agent_close(void) { /* stub for builds without ENABLE_POLKIT: nothing to stop */
+}
+
+#endif
+
+int polkit_agent_open_if_enabled(BusTransport transport, bool ask_password) {
+        /* Open the polkit agent as a child process if necessary: only for the local transport, and only
+         * if interactive authentication is wanted at all. */
+        if (transport != BUS_TRANSPORT_LOCAL || !ask_password)
+                return 0;
+
+        return polkit_agent_open();
+}
diff --git a/src/shared/spawn-polkit-agent.h b/src/shared/spawn-polkit-agent.h
new file mode 100644
index 0000000..325dfdd
--- /dev/null
+++ b/src/shared/spawn-polkit-agent.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "bus-util.h"
+
+int polkit_agent_open(void); /* 0 if nothing had to be done, negative on error */
+void polkit_agent_close(void); /* terminates the agent spawned by polkit_agent_open(), if any */
+/* Spawns the agent only for BUS_TRANSPORT_LOCAL and only if ask_password is true. */
+int polkit_agent_open_if_enabled(BusTransport transport, bool ask_password);
diff --git a/src/shared/specifier.c b/src/shared/specifier.c
new file mode 100644
index 0000000..e5a1f94
--- /dev/null
+++ b/src/shared/specifier.c
@@ -0,0 +1,498 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <sys/utsname.h>
+
+#include "sd-id128.h"
+
+#include "alloc-util.h"
+#include "architecture.h"
+#include "chase.h"
+#include "fd-util.h"
+#include "format-util.h"
+#include "fs-util.h"
+#include "hostname-util.h"
+#include "id128-util.h"
+#include "macro.h"
+#include "os-util.h"
+#include "path-lookup.h"
+#include "path-util.h"
+#include "specifier.h"
+#include "string-util.h"
+#include "strv.h"
+#include "user-util.h"
+
+/*
+ * Generic infrastructure for replacing %x style specifiers in
+ * strings. Will call a callback for each replacement.
+ */
+
+/* Any ASCII letter or digit: our pool of potential specifiers,
+ * and "%" used for escaping. */
+#define POSSIBLE_SPECIFIERS ALPHANUMERICAL "%"
+
+int specifier_printf(const char *text, size_t max_length, const Specifier table[], const char *root, const void *userdata, char **ret) {
+ _cleanup_free_ char *result = NULL;
+ bool percent = false;
+ size_t l;
+ char *t;
+ int r;
+ /* Expands the %-specifiers in text according to table and returns the newly allocated result in
+ * *ret. "%%" yields a literal '%'. A specifier with a lookup callback in table is replaced by what
+ * the callback returns (possibly nothing). A letter or digit after '%' without such an entry fails
+ * with -EBADSLT; any other character is copied verbatim together with its '%'. Fails with
+ * -ENAMETOOLONG once the output grows beyond max_length bytes (NUL not counted), with -ENOMEM on
+ * allocation failure, or with the callback's own error. Returns 0 on success. */
+ assert(ret);
+ assert(text);
+ assert(table);
+
+ l = strlen(text);
+ if (!GREEDY_REALLOC(result, l + 1))
+ return -ENOMEM;
+ t = result;
+ /* t is the write cursor into result. l counts the input characters not consumed yet (including
+ * the current one), i.e. an upper bound for what still has to fit after an expansion. */
+ for (const char *f = text; *f != '\0'; f++, l--) {
+ if (percent) {
+ percent = false;
+
+ if (*f == '%')
+ *(t++) = '%';
+ else {
+ const Specifier *i;
+
+ for (i = table; i->specifier; i++)
+ if (i->specifier == *f)
+ break;
+ /* i now points at the matching entry, or at the zeroed entry that ends the table. */
+ if (i->lookup) {
+ _cleanup_free_ char *w = NULL;
+ size_t k, j;
+
+ r = i->lookup(i->specifier, i->data, root, userdata, &w);
+ if (r < 0)
+ return r;
+ if (isempty(w))
+ continue;
+ /* Make room for what was written so far (j), the expansion (k), the remaining
+ * input (l) and the NUL. t is recomputed from result afterwards, since the buffer
+ * may have been reallocated. */
+ j = t - result;
+ k = strlen(w);
+
+ if (!GREEDY_REALLOC(result, j + k + l + 1))
+ return -ENOMEM;
+ memcpy(result + j, w, k);
+ t = result + j + k;
+ } else if (strchr(POSSIBLE_SPECIFIERS, *f))
+ /* Oops, an unknown specifier. */
+ return -EBADSLT;
+ else {
+ *(t++) = '%';
+ *(t++) = *f;
+ }
+ }
+ } else if (*f == '%')
+ percent = true;
+ else
+ *(t++) = *f;
+ /* Length check after every input character; the "continue" above skips it, but an empty
+ * expansion writes nothing anyway. */
+ if ((size_t) (t - result) > max_length)
+ return -ENAMETOOLONG;
+ }
+
+ /* If string ended with a stray %, also end with % */
+ if (percent) {
+ *(t++) = '%';
+ if ((size_t) (t - result) > max_length)
+ return -ENAMETOOLONG;
+ }
+ *(t++) = 0;
+
+ *ret = TAKE_PTR(result);
+ return 0;
+}
+
+/* Generic handler for simple string replacements */
+
+int specifier_string(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+        char *copy;
+
+        assert(ret);
+
+        /* data is the replacement string itself. NULL or "" resolves to NULL, i.e. to nothing. */
+        if (isempty(data)) {
+                *ret = NULL;
+                return 0;
+        }
+
+        copy = strdup(data);
+        if (!copy)
+                return -ENOMEM;
+
+        *ret = copy;
+        return 0;
+}
+
+int specifier_real_path(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+        const char *unresolved = data;
+
+        assert(ret);
+
+        /* data is a path; it is resolved with chase() relative to root. No path at all is -ENOENT. */
+        if (unresolved)
+                return chase(unresolved, root, 0, ret, NULL);
+
+        return -ENOENT;
+}
+
+int specifier_real_directory(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+        _cleanup_free_ char *resolved = NULL;
+        int r;
+
+        assert(ret);
+
+        /* Same as specifier_real_path(), but only the directory part of the resolved path is returned. */
+        r = specifier_real_path(specifier, data, root, userdata, &resolved);
+        if (r >= 0) {
+                assert(resolved);
+                return path_extract_directory(resolved, ret);
+        }
+
+        return r;
+}
+
+int specifier_id128(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+        const sd_id128_t *id = ASSERT_PTR(data);
+        char *buf;
+
+        /* data points to a 128-bit ID, which is formatted in its plain string form. */
+        buf = new(char, SD_ID128_STRING_MAX);
+        if (!buf)
+                return -ENOMEM;
+
+        sd_id128_to_string(*id, buf);
+        *ret = buf;
+        return 0;
+}
+
+int specifier_uuid(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+        const sd_id128_t *id = ASSERT_PTR(data);
+        char *buf;
+
+        /* data points to a 128-bit ID, which is formatted in its UUID string form. */
+        buf = new(char, SD_ID128_UUID_STRING_MAX);
+        if (!buf)
+                return -ENOMEM;
+
+        sd_id128_to_uuid_string(*id, buf);
+        *ret = buf;
+        return 0;
+}
+
+int specifier_uint64(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+        const uint64_t *value = ASSERT_PTR(data);
+
+        /* data points to a uint64_t, which is formatted as a decimal number. */
+        if (asprintf(ret, "%" PRIu64, *value) < 0)
+                return -ENOMEM;
+
+        return 0;
+}
+
+int specifier_machine_id(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+        sd_id128_t machine_id;
+        int r;
+
+        assert(ret);
+
+        r = id128_get_machine(root, &machine_id);
+        if (r == -ENOENT) /* Translate error for missing /etc/machine-id file to EUNATCH. */
+                return -EUNATCH;
+        if (r < 0)
+                return r;
+
+        return specifier_id128(specifier, &machine_id, root, userdata, ret);
+}
+
+int specifier_boot_id(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+        sd_id128_t boot_id;
+        int r;
+
+        assert(ret);
+
+        /* Resolves to the ID reported by sd_id128_get_boot(), formatted like any other 128-bit ID. */
+        r = sd_id128_get_boot(&boot_id);
+        if (r >= 0)
+                return specifier_id128(specifier, &boot_id, root, userdata, ret);
+
+        return r;
+}
+
+int specifier_hostname(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+        char *hostname;
+
+        assert(ret);
+
+        /* Resolves to a freshly allocated copy of the hostname; *ret is left alone on failure. */
+        hostname = gethostname_malloc();
+        if (hostname) {
+                *ret = hostname;
+                return 0;
+        }
+
+        return -ENOMEM;
+}
+
+int specifier_short_hostname(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+        char *hostname;
+
+        assert(ret);
+
+        /* Resolves to a freshly allocated copy of the short hostname; *ret is left alone on failure. */
+        hostname = gethostname_short_malloc();
+        if (hostname) {
+                *ret = hostname;
+                return 0;
+        }
+
+        return -ENOMEM;
+}
+
+int specifier_pretty_hostname(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+        char *hostname = NULL;
+        int r;
+
+        assert(ret);
+
+        /* Prefer the pretty hostname; if it cannot be determined, fall back to the short hostname. */
+        r = get_pretty_hostname(&hostname);
+        if (r < 0) {
+                hostname = gethostname_short_malloc();
+                if (!hostname)
+                        return -ENOMEM;
+        }
+
+        *ret = hostname;
+        return 0;
+}
+
+int specifier_kernel_release(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+        struct utsname info;
+        char *release;
+
+        assert(ret);
+
+        /* Resolves to a copy of the "release" field reported by uname(). */
+        if (uname(&info) < 0)
+                return -errno;
+
+        release = strdup(info.release);
+        if (release) {
+                *ret = release;
+                return 0;
+        }
+
+        return -ENOMEM;
+}
+
+int specifier_architecture(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+        const char *arch;
+        char *copy;
+
+        assert(ret);
+
+        /* Resolves to a copy of the name of the architecture reported by uname_architecture(). */
+        arch = architecture_to_string(uname_architecture());
+
+        copy = strdup(arch);
+        if (!copy)
+                return -ENOMEM;
+
+        *ret = copy;
+        return 0;
+}
+
+/* Note: fields in /etc/os-release might quite possibly be missing, even if everything is entirely valid
+ * otherwise. We'll return an empty value or NULL in that case from the functions below. But if the
+ * os-release file is missing, we'll return -EUNATCH. This means that something is seriously wrong with the
+ * installation. */
+
+static int parse_os_release_specifier(const char *root, const char *id, char **ret) {
+        _cleanup_free_ char *value = NULL;
+        int r;
+
+        assert(ret);
+
+        r = parse_os_release(root, id, &value);
+        if (r == -ENOENT) /* Translate error for missing os-release file to EUNATCH. */
+                return -EUNATCH;
+        if (r < 0)
+                return r;
+
+        /* parse_os_release() calls parse_env_file() which only sets the return value for entries found.
+         * Let's make sure we set the return value in all cases, i.e. to NULL if the field is missing. */
+        *ret = TAKE_PTR(value);
+        return r;
+}
+
+int specifier_os_id(char specifier, const void *data, const char *root, const void *userdata, char **ret) { /* os-release field ID= */
+ return parse_os_release_specifier(root, "ID", ret);
+}
+
+int specifier_os_version_id(char specifier, const void *data, const char *root, const void *userdata, char **ret) { /* os-release field VERSION_ID= */
+ return parse_os_release_specifier(root, "VERSION_ID", ret);
+}
+
+int specifier_os_build_id(char specifier, const void *data, const char *root, const void *userdata, char **ret) { /* os-release field BUILD_ID= */
+ return parse_os_release_specifier(root, "BUILD_ID", ret);
+}
+
+int specifier_os_variant_id(char specifier, const void *data, const char *root, const void *userdata, char **ret) { /* os-release field VARIANT_ID= */
+ return parse_os_release_specifier(root, "VARIANT_ID", ret);
+}
+
+int specifier_os_image_id(char specifier, const void *data, const char *root, const void *userdata, char **ret) { /* os-release field IMAGE_ID= */
+ return parse_os_release_specifier(root, "IMAGE_ID", ret);
+}
+
+int specifier_os_image_version(char specifier, const void *data, const char *root, const void *userdata, char **ret) { /* os-release field IMAGE_VERSION= */
+ return parse_os_release_specifier(root, "IMAGE_VERSION", ret);
+}
+
+int specifier_group_name(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+        RuntimeScope scope = PTR_TO_INT(data);
+        char *name;
+        gid_t gid;
+
+        assert(ret);
+
+        /* data carries the RuntimeScope. There is no single group in the global scope; in the user scope
+         * our own GID is used, otherwise GID 0. */
+        if (scope == RUNTIME_SCOPE_GLOBAL)
+                return -EINVAL;
+
+        if (scope == RUNTIME_SCOPE_USER)
+                gid = getgid();
+        else
+                gid = 0;
+
+        name = gid_to_name(gid);
+        if (!name)
+                return -ENOMEM;
+
+        *ret = name;
+        return 0;
+}
+
+int specifier_group_id(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+        RuntimeScope scope = PTR_TO_INT(data);
+        gid_t gid;
+
+        assert(ret);
+
+        /* data carries the RuntimeScope. Resolves to the numeric GID: our own GID in the user scope, 0
+         * otherwise. The global scope has no single group, hence -EINVAL. */
+
+        if (scope == RUNTIME_SCOPE_GLOBAL)
+                return -EINVAL;
+
+        gid = scope == RUNTIME_SCOPE_USER ? getgid() : 0;
+
+        /* Format with the gid_t format string, not the uid_t one */
+        if (asprintf(ret, GID_FMT, gid) < 0)
+                return -ENOMEM;
+
+        return 0;
+}
+
+int specifier_user_name(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+        RuntimeScope scope = PTR_TO_INT(data);
+        char *name;
+        uid_t uid;
+
+        assert(ret);
+
+        if (scope == RUNTIME_SCOPE_GLOBAL)
+                return -EINVAL;
+
+        if (scope == RUNTIME_SCOPE_USER)
+                uid = getuid();
+        else
+                uid = 0;
+
+        /* If we are UID 0 (root), this will not result in NSS, otherwise it might. This is good, as we want
+         * to be able to run this in PID 1, where our user ID is 0, but where NSS lookups are not allowed.
+         *
+         * We don't use getusername_malloc() here, because we don't want to look at $USER, to remain
+         * consistent with specifier_user_id() below.
+         */
+        name = uid_to_name(uid);
+        if (!name)
+                return -ENOMEM;
+
+        *ret = name;
+        return 0;
+}
+
+int specifier_user_id(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+        RuntimeScope scope = PTR_TO_INT(data);
+        uid_t uid;
+
+        assert(ret);
+
+        /* data carries the RuntimeScope. Resolves to the numeric UID: our own UID in the user scope, 0
+         * otherwise. The global scope has no single user, hence -EINVAL. */
+        if (scope == RUNTIME_SCOPE_GLOBAL)
+                return -EINVAL;
+
+        if (scope == RUNTIME_SCOPE_USER)
+                uid = getuid();
+        else
+                uid = 0;
+
+        return asprintf(ret, UID_FMT, uid) < 0 ? -ENOMEM : 0;
+}
+
+int specifier_user_home(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+ assert(ret);
+
+ /* Resolves to whatever get_home_dir() reports; its return value is passed through unchanged.
+ *
+ * On PID 1 (which runs as root) this will not result in NSS,
+ * which is good. See the comment in specifier_user_name() above. */
+
+ return get_home_dir(ret);
+}
+
+int specifier_user_shell(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+ assert(ret);
+
+ /* Resolves to whatever get_shell() reports; its return value is passed through unchanged.
+ *
+ * On PID 1 (which runs as root) this will not result in NSS,
+ * which is good. See the comment in specifier_user_name() above. */
+
+ return get_shell(ret);
+}
+
+int specifier_tmp_dir(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+        const char *dir = "/tmp";
+        char *copy;
+        int r;
+
+        assert(ret);
+
+        /* If root dir is set, don't honour $TMP or similar and stick to the fixed default */
+        if (!root) {
+                r = tmp_dir(&dir);
+                if (r < 0)
+                        return r;
+        }
+
+        copy = strdup(dir);
+        if (!copy)
+                return -ENOMEM;
+
+        *ret = copy;
+        return 0;
+}
+
+int specifier_var_tmp_dir(char specifier, const void *data, const char *root, const void *userdata, char **ret) {
+        const char *dir = "/var/tmp";
+        char *copy;
+        int r;
+
+        assert(ret);
+
+        /* Same logic as specifier_tmp_dir(): with a root dir set the fixed default is used as is */
+        if (!root) {
+                r = var_tmp_dir(&dir);
+                if (r < 0)
+                        return r;
+        }
+
+        copy = strdup(dir);
+        if (!copy)
+                return -ENOMEM;
+
+        *ret = copy;
+        return 0;
+}
+
+int specifier_escape_strv(char **l, char ***ret) {
+        _cleanup_strv_free_ char **escaped = NULL;
+        size_t i;
+
+        assert(ret);
+
+        /* Returns a copy of the string list l with every entry passed through specifier_escape(). An
+         * empty or NULL list yields NULL. */
+
+        if (strv_isempty(l)) {
+                *ret = NULL;
+                return 0;
+        }
+
+        escaped = new(char*, strv_length(l)+1);
+        if (!escaped)
+                return -ENOMEM;
+
+        for (i = 0; l[i]; i++) {
+                /* On failure this slot holds NULL, which terminates the list for the cleanup handler */
+                escaped[i] = specifier_escape(l[i]);
+                if (!escaped[i])
+                        return -ENOMEM;
+        }
+
+        escaped[i] = NULL;
+        *ret = TAKE_PTR(escaped);
+
+        return 0;
+}
+
+const Specifier system_and_tmp_specifier_table[] = { /* ready-made table for specifier_printf() */
+ COMMON_SYSTEM_SPECIFIERS,
+ COMMON_TMP_SPECIFIERS,
+ {} /* zeroed end marker: specifier_printf() stops scanning at an entry whose specifier is 0 */
+};
diff --git a/src/shared/specifier.h b/src/shared/specifier.h
new file mode 100644
index 0000000..df72bdc
--- /dev/null
+++ b/src/shared/specifier.h
@@ -0,0 +1,108 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "string-util.h"
+
+typedef int (*SpecifierCallback)(char specifier, const void *data, const char *root, const void *userdata, char **ret); /* returns 0 and the replacement string in *ret, or a negative error */
+/* One entry of a specifier table, as consumed by specifier_printf(). */
+typedef struct Specifier {
+ const char specifier; /* the character following '%'; 0 marks the end of a table */
+ const SpecifierCallback lookup; /* called to produce the replacement */
+ const void *data; /* passed to lookup as its data argument */
+} Specifier;
+
+int specifier_printf(const char *text, size_t max_length, const Specifier table[], const char *root, const void *userdata, char **ret); /* expands text using table; result is newly allocated */
+/* Generic callbacks: what they resolve to is determined by the data pointer of the table entry. */
+int specifier_string(char specifier, const void *data, const char *root, const void *userdata, char **ret);
+int specifier_real_path(char specifier, const void *data, const char *root, const void *userdata, char **ret);
+int specifier_real_directory(char specifier, const void *data, const char *root, const void *userdata, char **ret);
+int specifier_id128(char specifier, const void *data, const char *root, const void *userdata, char **ret);
+int specifier_uuid(char specifier, const void *data, const char *root, const void *userdata, char **ret);
+int specifier_uint64(char specifier, const void *data, const char *root, const void *userdata, char **ret);
+/* Callbacks describing the system and its OS; see COMMON_SYSTEM_SPECIFIERS below. */
+int specifier_machine_id(char specifier, const void *data, const char *root, const void *userdata, char **ret);
+int specifier_boot_id(char specifier, const void *data, const char *root, const void *userdata, char **ret);
+int specifier_hostname(char specifier, const void *data, const char *root, const void *userdata, char **ret);
+int specifier_short_hostname(char specifier, const void *data, const char *root, const void *userdata, char **ret);
+int specifier_pretty_hostname(char specifier, const void *data, const char *root, const void *userdata, char **ret);
+int specifier_kernel_release(char specifier, const void *data, const char *root, const void *userdata, char **ret);
+int specifier_architecture(char specifier, const void *data, const char *root, const void *userdata, char **ret);
+int specifier_os_id(char specifier, const void *data, const char *root, const void *userdata, char **ret);
+int specifier_os_version_id(char specifier, const void *data, const char *root, const void *userdata, char **ret);
+int specifier_os_build_id(char specifier, const void *data, const char *root, const void *userdata, char **ret);
+int specifier_os_variant_id(char specifier, const void *data, const char *root, const void *userdata, char **ret);
+int specifier_os_image_id(char specifier, const void *data, const char *root, const void *userdata, char **ret);
+int specifier_os_image_version(char specifier, const void *data, const char *root, const void *userdata, char **ret);
+/* Callbacks for user/group credentials; the name and ID ones expect a RuntimeScope in data, see
+ * COMMON_CREDS_SPECIFIERS() below. */
+int specifier_group_name(char specifier, const void *data, const char *root, const void *userdata, char **ret);
+int specifier_group_id(char specifier, const void *data, const char *root, const void *userdata, char **ret);
+int specifier_user_name(char specifier, const void *data, const char *root, const void *userdata, char **ret);
+int specifier_user_id(char specifier, const void *data, const char *root, const void *userdata, char **ret);
+int specifier_user_home(char specifier, const void *data, const char *root, const void *userdata, char **ret);
+int specifier_user_shell(char specifier, const void *data, const char *root, const void *userdata, char **ret);
+/* Callbacks for the temporary directories; see COMMON_TMP_SPECIFIERS below. */
+int specifier_tmp_dir(char specifier, const void *data, const char *root, const void *userdata, char **ret);
+int specifier_var_tmp_dir(char specifier, const void *data, const char *root, const void *userdata, char **ret);
+
+/* Typically, in places where one of the above specifier is to be resolved the other similar ones are to be
+ * resolved, too. Hence let's define common macros for the relevant array entries.
+ *
+ * COMMON_SYSTEM_SPECIFIERS:
+ * %a: the native userspace architecture
+ * %A: the OS image version, according to /etc/os-release
+ * %b: the boot ID of the running system
+ * %B: the OS build ID, according to /etc/os-release
+ * %H: the hostname of the running system
+ * %l: the short hostname of the running system
+ * %q: the 'pretty' hostname as per /etc/machine-info
+ * %m: the machine ID of the running system
+ * %M: the OS image ID, according to /etc/os-release
+ * %o: the OS ID according to /etc/os-release
+ * %v: the kernel version
+ * %w: the OS version ID, according to /etc/os-release
+ * %W: the OS variant ID, according to /etc/os-release
+ *
+ * COMMON_CREDS_SPECIFIERS:
+ * %g: the groupname of the running user
+ * %G: the GID of the running user
+ * %u: the username of the running user
+ * %U: the UID of the running user
+ *
+ * COMMON_TMP_SPECIFIERS:
+ * %T: the temporary directory (e.g. /tmp, or $TMPDIR, $TEMP, $TMP)
+ * %V: the temporary directory for large, persistent stuff (e.g. /var/tmp, or $TMPDIR, $TEMP, $TMP)
+ */
+/* Each macro expands to a comma-separated run of { specifier, callback, data } table entries without a
+ * trailing comma, so that it can be followed by further entries and the terminating {} in a table. */
+#define COMMON_SYSTEM_SPECIFIERS \
+ { 'a', specifier_architecture, NULL }, \
+ { 'A', specifier_os_image_version, NULL }, \
+ { 'b', specifier_boot_id, NULL }, \
+ { 'B', specifier_os_build_id, NULL }, \
+ { 'H', specifier_hostname, NULL }, \
+ { 'l', specifier_short_hostname, NULL }, \
+ { 'q', specifier_pretty_hostname, NULL }, \
+ { 'm', specifier_machine_id, NULL }, \
+ { 'M', specifier_os_image_id, NULL }, \
+ { 'o', specifier_os_id, NULL }, \
+ { 'v', specifier_kernel_release, NULL }, \
+ { 'w', specifier_os_version_id, NULL }, \
+ { 'W', specifier_os_variant_id, NULL }
+/* scope is a RuntimeScope, smuggled through the data pointer and unpacked by the callbacks. */
+#define COMMON_CREDS_SPECIFIERS(scope) \
+ { 'g', specifier_group_name, INT_TO_PTR(scope) }, \
+ { 'G', specifier_group_id, INT_TO_PTR(scope) }, \
+ { 'u', specifier_user_name, INT_TO_PTR(scope) }, \
+ { 'U', specifier_user_id, INT_TO_PTR(scope) }
+/* With a root directory passed to specifier_printf(), these resolve to the fixed /tmp and /var/tmp. */
+#define COMMON_TMP_SPECIFIERS \
+ { 'T', specifier_tmp_dir, NULL }, \
+ { 'V', specifier_var_tmp_dir, NULL }
+
+static inline char* specifier_escape(const char *string) { /* doubles every '%' so that specifier_printf() emits the text literally */
+ return strreplace(string, "%", "%%");
+}
+
+int specifier_escape_strv(char **l, char ***ret); /* applies specifier_escape() to every entry of l; an empty list yields NULL */
+
+/* A generic specifier table consisting of COMMON_SYSTEM_SPECIFIERS and COMMON_TMP_SPECIFIERS */
+extern const Specifier system_and_tmp_specifier_table[];
diff --git a/src/shared/switch-root.c b/src/shared/switch-root.c
new file mode 100644
index 0000000..b620156
--- /dev/null
+++ b/src/shared/switch-root.c
@@ -0,0 +1,212 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "base-filesystem.h"
+#include "chase.h"
+#include "creds-util.h"
+#include "fd-util.h"
+#include "initrd-util.h"
+#include "log.h"
+#include "missing_syscall.h"
+#include "mkdir-label.h"
+#include "mount-util.h"
+#include "mountpoint-util.h"
+#include "path-util.h"
+#include "rm-rf.h"
+#include "stdio-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "switch-root.h"
+#include "user-util.h"
+
+int switch_root(const char *new_root,
+ const char *old_root_after, /* path below the new root, where to place the old root after the transition; may be NULL to unmount it */
+ SwitchRootFlags flags) {
+ /* Makes new_root the root directory: transfers the mounts listed below, then pivot_root(), else MS_MOVE + chroot(). Returns 0 on success; failures are logged and propagated. */
+ /* Stuff mounted below /run/ we don't save on soft reboot, as it might have lost its relevance, i.e.
+ * credentials, removable media and such, we rather want that the new boot mounts this fresh. But on
+ * the switch from initrd we do use MS_REC, as it is expected that mounts set up in /run/ are
+ * maintained. */
+ static const struct {
+ const char *path;
+ unsigned long mount_flags; /* Flags to apply if SWITCH_ROOT_RECURSIVE_RUN is unset */
+ unsigned long mount_flags_recursive_run; /* Flags to apply if SWITCH_ROOT_RECURSIVE_RUN is set (0 if shall be skipped) */
+ } transfer_table[] = {
+ { "/dev", MS_BIND|MS_REC, MS_BIND|MS_REC }, /* Recursive, because we want to save the original /dev/shm/ + /dev/pts/ and similar */
+ { "/sys", MS_BIND|MS_REC, MS_BIND|MS_REC }, /* Similar, we want to retain various API VFS, or the cgroupv1 /sys/fs/cgroup/ tree */
+ { "/proc", MS_BIND|MS_REC, MS_BIND|MS_REC }, /* Similar */
+ { "/run", MS_BIND, MS_BIND|MS_REC }, /* Recursive except on soft reboot, see above */
+ { SYSTEM_CREDENTIALS_DIRECTORY, MS_BIND, 0 /* skip! */ }, /* Credentials passed into the system should survive */
+ { ENCRYPTED_SYSTEM_CREDENTIALS_DIRECTORY, MS_BIND, 0 /* skip! */ }, /* Similar */
+ { "/run/host", MS_BIND|MS_REC, 0 /* skip! */ }, /* Host supplied hierarchy should also survive */
+ };
+
+ _cleanup_close_ int old_root_fd = -EBADF, new_root_fd = -EBADF;
+ _cleanup_free_ char *resolved_old_root_after = NULL;
+ int r, istmp;
+
+ assert(new_root);
+
+ /* Open the old and the new root directory; the old root's fd is needed after the switch in case we shall remove its contents */
+ old_root_fd = open("/", O_DIRECTORY|O_CLOEXEC);
+ if (old_root_fd < 0)
+ return log_error_errno(errno, "Failed to open root directory: %m");
+
+ new_root_fd = open(new_root, O_DIRECTORY|O_CLOEXEC);
+ if (new_root_fd < 0)
+ return log_error_errno(errno, "Failed to open target directory '%s': %m", new_root);
+
+ r = inode_same_at(old_root_fd, "", new_root_fd, "", AT_EMPTY_PATH);
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine if old and new root directory are the same: %m");
+ if (r > 0) {
+ log_debug("Skipping switch root, as old and new root directory are the same.");
+ return 0;
+ }
+
+ /* Make the new root directory a mount point if it isn't */
+ r = fd_make_mount_point(new_root_fd);
+ if (r < 0)
+ return log_error_errno(r, "Failed to make new root directory a mount point: %m");
+ if (r > 0) {
+ int fd;
+
+ /* When the path was not a mount point, then we need to reopen the path, otherwise, it still
+ * points to the underlying directory. */
+
+ fd = open(new_root, O_DIRECTORY|O_CLOEXEC);
+ if (fd < 0)
+ return log_error_errno(errno, "Failed to reopen target directory '%s': %m", new_root);
+
+ close_and_replace(new_root_fd, fd);
+ }
+
+ if (FLAGS_SET(flags, SWITCH_ROOT_DESTROY_OLD_ROOT)) {
+ istmp = fd_is_temporary_fs(old_root_fd);
+ if (istmp < 0)
+ return log_error_errno(istmp, "Failed to stat root directory: %m");
+ if (istmp > 0)
+ log_debug("Root directory is on tmpfs, will do cleanup later.");
+ } else
+ istmp = -1; /* don't know */
+
+ if (old_root_after) {
+ /* Determine where we shall place the old root after the transition */
+ r = chase(old_root_after, new_root, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &resolved_old_root_after, NULL);
+ if (r < 0)
+ return log_error_errno(r, "Failed to resolve %s/%s: %m", new_root, old_root_after);
+ if (r == 0) /* Doesn't exist yet. Let's create it */
+ (void) mkdir_p_label(resolved_old_root_after, 0755);
+ }
+
+ /* We are about to unmount various file systems with MNT_DETACH (either explicitly via umount() or
+ * indirectly via pivot_root()), and thus do not synchronously wait for them to be fully sync'ed —
+ * all while making them invisible/inaccessible in the file system tree for later code. That makes
+ * sync'ing them then difficult. Let's hence issue a manual sync() here, so that we at least can
+ * guarantee all file systems are in a good state before entering this state. */
+ if (!FLAGS_SET(flags, SWITCH_ROOT_DONT_SYNC))
+ sync();
+
+ /* Work-around for kernel design: the kernel refuses MS_MOVE if any file systems are mounted
+ * MS_SHARED. Hence remount them MS_PRIVATE here as a work-around.
+ *
+ * https://bugzilla.redhat.com/show_bug.cgi?id=847418 */
+ if (mount(NULL, "/", NULL, MS_REC|MS_PRIVATE, NULL) < 0)
+ return log_error_errno(errno, "Failed to set \"/\" mount propagation to private: %m");
+
+ /* Do not fail if base_filesystem_create() fails. Not all switch roots are like base_filesystem_create() wants
+ * them to look like. They might even boot, if they are RO and don't have the FS layout. Just ignore the error
+ * and switch_root() nevertheless. */
+ (void) base_filesystem_create_fd(new_root_fd, new_root, UID_INVALID, GID_INVALID);
+
+ FOREACH_ARRAY(transfer, transfer_table, ELEMENTSOF(transfer_table)) {
+ _cleanup_free_ char *chased = NULL;
+ unsigned long mount_flags;
+
+ mount_flags = FLAGS_SET(flags, SWITCH_ROOT_RECURSIVE_RUN) ? transfer->mount_flags_recursive_run : transfer->mount_flags;
+ if (mount_flags == 0) /* skip if zero */
+ continue;
+
+ if (access(transfer->path, F_OK) < 0) {
+ log_debug_errno(errno, "Path '%s' to move to target root directory, not found, ignoring: %m", transfer->path);
+ continue;
+ }
+
+ r = chase(transfer->path, new_root, CHASE_PREFIX_ROOT, &chased, NULL);
+ if (r < 0)
+ return log_error_errno(r, "Failed to resolve %s/%s: %m", new_root, transfer->path);
+
+ /* Let's see if it is a mount point already. */
+ r = path_is_mount_point(chased, NULL, 0);
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine whether %s is a mount point: %m", chased);
+ if (r > 0) /* If it is already mounted, then do nothing */
+ continue;
+
+ r = mount_nofollow_verbose(LOG_ERR, transfer->path, chased, NULL, mount_flags, NULL);
+ if (r < 0)
+ return r;
+ }
+
+ if (fchdir(new_root_fd) < 0)
+ return log_error_errno(errno, "Failed to change directory to %s: %m", new_root);
+
+ /* We first try a pivot_root() so that we can umount the old root dir. In many cases (i.e. where rootfs is /),
+ * that's not possible however, and hence we simply overmount root */
+ if (resolved_old_root_after)
+ r = RET_NERRNO(pivot_root(".", resolved_old_root_after));
+ else {
+ r = RET_NERRNO(pivot_root(".", "."));
+ if (r >= 0) {
+ /* Now unmount the upper of the two stacked file systems */
+ if (umount2(".", MNT_DETACH) < 0)
+ return log_error_errno(errno, "Failed to unmount the old root: %m");
+ }
+ }
+ if (r < 0) {
+ log_debug_errno(r, "Pivoting root file system failed, moving mounts instead: %m");
+
+ if (resolved_old_root_after) {
+ r = mount_nofollow_verbose(LOG_ERR, "/", resolved_old_root_after, NULL, MS_BIND|MS_REC, NULL);
+ if (r < 0)
+ return r;
+ }
+
+ /* If we have to use MS_MOVE let's first try to get rid of *all* mounts we can, with the
+ * exception of the path we want to switch to, plus everything leading to it and within
+ * it. This is necessary because unlike pivot_root() just moving the mount to the root via
+ * MS_MOVE won't magically unmount anything below it. Once the chroot() succeeds the mounts
+ * below would still be around but invisible to us, because not accessible via
+ * /proc/self/mountinfo. Hence, let's clean everything up first, as long as we still can. */
+ (void) umount_recursive_full(NULL, MNT_DETACH, STRV_MAKE(new_root));
+
+ if (mount(".", "/", NULL, MS_MOVE, NULL) < 0)
+ return log_error_errno(errno, "Failed to move %s to /: %m", new_root);
+
+ if (chroot(".") < 0)
+ return log_error_errno(errno, "Failed to change root: %m");
+
+ if (chdir(".") < 0)
+ return log_error_errno(errno, "Failed to change directory: %m");
+ }
+
+ if (istmp > 0) {
+ struct stat rb;
+
+ if (fstat(old_root_fd, &rb) < 0)
+ return log_error_errno(errno, "Failed to stat old root directory: %m");
+
+ /* Note: the below won't operate on non-memory file systems (i.e. only on tmpfs, ramfs), and
+ * it will stop at mount boundaries */
+ (void) rm_rf_children(TAKE_FD(old_root_fd), 0, &rb); /* takes possession of the dir fd, even on failure */
+ }
+
+ return 0;
+}
diff --git a/src/shared/switch-root.h b/src/shared/switch-root.h
new file mode 100644
index 0000000..ba0d280
--- /dev/null
+++ b/src/shared/switch-root.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+typedef enum SwitchRootFlags {
+ SWITCH_ROOT_DESTROY_OLD_ROOT = 1 << 0, /* rm -rf old root when switching – under the condition
+ * that it is backed by non-persistent tmpfs/ramfs/… */
+ SWITCH_ROOT_DONT_SYNC = 1 << 1, /* don't call sync() immediately before switching root */
+ SWITCH_ROOT_RECURSIVE_RUN = 1 << 2, /* move /run/ with MS_REC from old to new root */
+} SwitchRootFlags;
+
+int switch_root(const char *new_root, const char *old_root_after, SwitchRootFlags flags);
diff --git a/src/shared/test-tables.h b/src/shared/test-tables.h
new file mode 100644
index 0000000..3f20318
--- /dev/null
+++ b/src/shared/test-tables.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <errno.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "string-util.h"
+
+#define _test_table(name, lookup, reverse, size, sparse) \
+ for (int64_t _i = -EINVAL, _boring = 0; _i < size + 1; _i++) { \
+ const char* _val; \
+ int64_t _rev; \
+ \
+ _val = lookup(_i); \
+ if (_val) { \
+ _rev = reverse(_val); \
+ _boring = 0; \
+ } else { \
+ _rev = reverse("--no-such--value----"); \
+ _boring += _i >= 0; \
+ } \
+ if (_boring == 0 || _i == size) \
+ printf("%s: %" PRIi64 " → %s → %" PRIi64 "\n", name, _i, strnull(_val), _rev); \
+ else if (_boring == 1) \
+ printf("%*s ...\n", (int) strlen(name), ""); \
+ \
+ if (_i >= 0 && _i < size) { \
+ if (sparse) \
+ assert_se(_rev == _i || _rev == -EINVAL); \
+ else \
+ assert_se(_val && _rev == _i); \
+ } else \
+ assert_se(!_val && _rev == -EINVAL); \
+ }
+
+#define test_table(lower, upper) \
+ _test_table(STRINGIFY(lower), lower##_to_string, lower##_from_string, _##upper##_MAX, false)
+
+#define test_table_sparse(lower, upper) \
+ _test_table(STRINGIFY(lower), lower##_to_string, lower##_from_string, _##upper##_MAX, true)
diff --git a/src/shared/tests.c b/src/shared/tests.c
new file mode 100644
index 0000000..3882a18
--- /dev/null
+++ b/src/shared/tests.c
@@ -0,0 +1,346 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sched.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/mount.h>
+#include <sys/wait.h>
+
+#include "sd-bus.h"
+
+#include "alloc-util.h"
+#include "bus-error.h"
+#include "bus-locator.h"
+#include "bus-util.h"
+#include "bus-wait-for-jobs.h"
+#include "cgroup-setup.h"
+#include "cgroup-util.h"
+#include "env-file.h"
+#include "env-util.h"
+#include "fd-util.h"
+#include "fs-util.h"
+#include "log.h"
+#include "namespace-util.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "random-util.h"
+#include "strv.h"
+#include "tests.h"
+#include "tmpfile-util.h"
+
+char* setup_fake_runtime_dir(void) {
+ char t[] = "/tmp/fake-xdg-runtime-XXXXXX", *p;
+ /* Create a directory from the template, export it as $XDG_RUNTIME_DIR, and return a strdup()ed copy of its path. */
+ assert_se(mkdtemp(t));
+ assert_se(setenv("XDG_RUNTIME_DIR", t, 1) >= 0);
+ assert_se(p = strdup(t));
+
+ return p;
+}
+
+static void load_testdata_env(void) {
+ static bool called = false;
+ _cleanup_free_ char *s = NULL, *d = NULL, *envpath = NULL;
+ _cleanup_strv_free_ char **pairs = NULL;
+ int r;
+ /* Imports the key/value pairs of "systemd-runtest.env" into the environment; does its work only on the first call. */
+ if (called)
+ return;
+ called = true;
+ /* The file is looked up in the directory of the executable /proc/self/exe refers to */
+ assert_se(readlink_and_make_absolute("/proc/self/exe", &s) >= 0);
+ assert_se(path_extract_directory(s, &d) >= 0);
+ assert_se(envpath = path_join(d, "systemd-runtest.env"));
+
+ r = load_env_file_pairs(NULL, envpath, &pairs);
+ if (r < 0) {
+ log_debug_errno(r, "Reading %s failed: %m", envpath);
+ return;
+ }
+
+ STRV_FOREACH_PAIR(k, v, pairs)
+ assert_se(setenv(*k, *v, 0) >= 0); /* overwrite=0: variables that are already set win */
+}
+
+int get_testdata_dir(const char *suffix, char **ret) {
+        /* Stores the path_join()ed "<test data dir>/<suffix>" in *ret and returns 0; failures are logged and returned. */
+        const char *base;
+        char *joined;
+
+        load_testdata_env();
+
+        /* $SYSTEMD_TEST_DATA takes precedence over the compiled-in default */
+        base = getenv("SYSTEMD_TEST_DATA") ?: SYSTEMD_TEST_DATA;
+
+        if (access(base, F_OK) < 0)
+                return log_error_errno(errno, "ERROR: $SYSTEMD_TEST_DATA directory [%s] not accessible: %m", base);
+
+        joined = path_join(base, suffix);
+        if (!joined)
+                return log_oom();
+
+        *ret = joined;
+        return 0;
+}
+
+const char* get_catalog_dir(void) {
+        /* Returns the catalog directory to use; prints an error and exits the process if it is not accessible. */
+        const char *dir;
+
+        load_testdata_env();
+
+        /* $SYSTEMD_CATALOG_DIR takes precedence over the compiled-in default */
+        dir = getenv("SYSTEMD_CATALOG_DIR") ?: SYSTEMD_CATALOG_DIR;
+
+        if (access(dir, F_OK) >= 0)
+                return dir;
+
+        fprintf(stderr, "ERROR: $SYSTEMD_CATALOG_DIR directory [%s] does not exist\n", dir);
+        exit(EXIT_FAILURE);
+}
+
+bool slow_tests_enabled(void) {
+        /* $SYSTEMD_SLOW_TESTS, parsed as boolean, decides; if that yields an error SYSTEMD_SLOW_TESTS_DEFAULT applies. */
+        int parsed = getenv_bool("SYSTEMD_SLOW_TESTS");
+
+        if (parsed < 0) {
+                if (parsed != -ENXIO) /* -ENXIO is the one error we stay silent about */
+                        log_warning_errno(parsed, "Cannot parse $SYSTEMD_SLOW_TESTS, ignoring.");
+                return SYSTEMD_SLOW_TESTS_DEFAULT;
+        }
+        return parsed;
+}
+
+void test_setup_logging(int level) {
+ log_set_max_level(level); /* the caller-supplied level is applied first */
+ log_parse_environment(); /* runs after the line above — presumably so that environment settings can override it; confirm */
+ log_open();
+}
+
+int write_tmpfile(char *pattern, const char *contents) {
+ _cleanup_close_ int fd = -EBADF;
+ /* Creates a file via mkostemp_safe(pattern) and writes contents to it with one write(). Returns 0 on success; the file is not removed here. */
+ assert(pattern);
+ assert(contents);
+
+ fd = mkostemp_safe(pattern);
+ if (fd < 0)
+ return fd;
+
+ ssize_t l = strlen(contents);
+ errno = 0; /* reset, so that a short write (which leaves errno untouched) presumably maps to the EIO fallback below */
+ if (write(fd, contents, l) != l)
+ return errno_or_else(EIO);
+ return 0;
+}
+
+bool have_namespaces(void) {
+        siginfo_t si = {};
+        pid_t pid;
+
+        /* Probes whether namespaces are usable: a forked-off child tries to detach its mount namespace (so
+         * that our own process stays unaffected) and reports the outcome through its exit status. */
+
+        pid = fork();
+        assert_se(pid >= 0);
+
+        if (pid == 0) {
+                /* child */
+                _exit(detach_mount_namespace() < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
+        }
+
+        assert_se(waitid(P_PID, pid, &si, WEXITED) >= 0);
+        assert_se(si.si_code == CLD_EXITED);
+
+        switch (si.si_status) {
+        case EXIT_SUCCESS:
+                return true;
+
+        case EXIT_FAILURE:
+                return false;
+
+        default:
+                assert_not_reached();
+        }
+}
+
+bool can_memlock(void) {
+ /* Let's see if we can mlock() a larger blob of memory. BPF programs are charged against
+ * RLIMIT_MEMLOCK, hence let's first make sure we can lock memory at all, and skip the test if we
+ * cannot. Why not check RLIMIT_MEMLOCK explicitly? Because in container environments the
+ * RLIMIT_MEMLOCK value we see might not match the RLIMIT_MEMLOCK value actually in effect. */
+
+ void *p = mmap(NULL, CAN_MEMLOCK_SIZE, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_SHARED, -1, 0);
+ if (p == MAP_FAILED)
+ return false; /* a failing mmap() is reported the same way as a failing mlock() */
+
+ bool b = mlock(p, CAN_MEMLOCK_SIZE) >= 0;
+ if (b)
+ assert_se(munlock(p, CAN_MEMLOCK_SIZE) >= 0);
+
+ assert_se(munmap(p, CAN_MEMLOCK_SIZE) >= 0); /* the probe mapping is released whether or not locking worked */
+ return b;
+}
+
+static int allocate_scope(void) {
+ _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL, *reply = NULL;
+ _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+ _cleanup_(bus_wait_for_jobs_freep) BusWaitForJobs *w = NULL;
+ _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
+ _cleanup_free_ char *scope = NULL;
+ const char *object;
+ int r;
+ /* Returns 0 if every step below succeeded, otherwise the error of the step that failed. */
+ /* Let's try to run this test in a scope of its own, with delegation turned on, so that PID 1 doesn't
+ * interfere with our cgroup management. */
+
+ r = sd_bus_default_system(&bus);
+ if (r < 0)
+ return log_error_errno(r, "Failed to connect to system bus: %m");
+
+ r = bus_wait_for_jobs_new(bus, &w);
+ if (r < 0)
+ return log_error_errno(r, "Could not watch jobs: %m");
+
+ if (asprintf(&scope, "%s-%" PRIx64 ".scope", program_invocation_short_name, random_u64()) < 0)
+ return log_oom();
+
+ r = bus_message_new_method_call(bus, &m, bus_systemd_mgr, "StartTransientUnit");
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ /* Name and Mode */
+ r = sd_bus_message_append(m, "ss", scope, "fail");
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ /* Properties */
+ r = sd_bus_message_open_container(m, 'a', "(sv)");
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ r = sd_bus_message_append(m, "(sv)", "PIDs", "au", 1, (uint32_t) getpid_cached()); /* one-element array: our own PID */
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ r = sd_bus_message_append(m, "(sv)", "Delegate", "b", 1); /* boolean true */
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ r = sd_bus_message_append(m, "(sv)", "CollectMode", "s", "inactive-or-failed");
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ r = sd_bus_message_close_container(m);
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ /* Auxiliary units */
+ r = sd_bus_message_append(m, "a(sa(sv))", 0); /* array with zero entries */
+ if (r < 0)
+ return bus_log_create_error(r);
+
+ r = sd_bus_call(bus, m, 0, &error, &reply);
+ if (r < 0)
+ return log_error_errno(r, "Failed to start transient scope unit: %s", bus_error_message(&error, r));
+
+ r = sd_bus_message_read(reply, "o", &object);
+ if (r < 0)
+ return bus_log_parse_error(r);
+
+ r = bus_wait_for_jobs_one(w, object, false, NULL); /* presumably waits for the job behind the returned object path — confirm */
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+static int enter_cgroup(char **ret_cgroup, bool enter_subroot) {
+        _cleanup_free_ char *cgroup_root = NULL, *cgroup_subroot = NULL;
+        CGroupMask supported;
+        int r;
+        /* Creates + attaches to a test cgroup (random child of our cgroup if enter_subroot, else our cgroup); path optionally returned in *ret_cgroup. */
+        r = allocate_scope();
+        if (r < 0)
+                log_warning_errno(r, "Couldn't allocate a scope unit for this test, proceeding without.");
+
+        r = cg_pid_get_path(NULL, 0, &cgroup_root);
+        if (r == -ENOMEDIUM)
+                return log_warning_errno(r, "cg_pid_get_path(NULL, 0, ...) failed: %m");
+        assert_se(r >= 0); /* assert_se(), not assert(): cgroup_root is used right below, so this check must never be compiled out */
+
+        if (enter_subroot)
+                assert_se(asprintf(&cgroup_subroot, "%s/%" PRIx64, cgroup_root, random_u64()) >= 0);
+        else {
+                cgroup_subroot = strdup(cgroup_root);
+                assert_se(cgroup_subroot != NULL);
+        }
+
+        assert_se(cg_mask_supported(&supported) >= 0);
+
+        /* If this fails, then we don't mind as the later cgroup operations will fail too, and it's fine if
+         * we handle any errors at that point. */
+
+        r = cg_create_everywhere(supported, _CGROUP_MASK_ALL, cgroup_subroot);
+        if (r < 0)
+                return r;
+
+        r = cg_attach_everywhere(supported, cgroup_subroot, 0, NULL, NULL);
+        if (r < 0)
+                return r;
+
+        if (ret_cgroup)
+                *ret_cgroup = TAKE_PTR(cgroup_subroot); /* ownership of the string passes to the caller */
+
+        return 0;
+}
+
+int enter_cgroup_subroot(char **ret_cgroup) {
+ return enter_cgroup(ret_cgroup, true); /* enter_subroot=true: use a randomly named child of the current cgroup */
+}
+
+int enter_cgroup_root(char **ret_cgroup) {
+ return enter_cgroup(ret_cgroup, false); /* enter_subroot=false: use the current cgroup path itself */
+}
+
+const char *ci_environment(void) {
+        /* We return a string because we might want to provide multiple bits of information later on: not
+         * just the general CI environment type, but also whether we're sanitizing or not, etc. The caller is
+         * expected to use strstr on the returned value. */
+        static const char *ans = POINTER_MAX; /* POINTER_MAX: not determined yet; afterwards the cached answer (NULL: no CI detected) */
+        int r;
+
+        if (ans != POINTER_MAX)
+                return ans;
+
+        /* We allow specifying the environment with $CITYPE. Nobody uses this so far, but we are ready. */
+        const char *citype = getenv("CITYPE");
+        if (!isempty(citype))
+                return (ans = citype);
+
+        if (getenv_bool("TRAVIS") > 0)
+                return (ans = "travis");
+        if (getenv_bool("SEMAPHORE") > 0)
+                return (ans = "semaphore");
+        if (getenv_bool("GITHUB_ACTIONS") > 0)
+                return (ans = "github-actions");
+        if (getenv("AUTOPKGTEST_ARTIFACTS") || getenv("AUTOPKGTEST_TMP"))
+                return (ans = "autopkgtest");
+        if (getenv("SALSA_CI_IMAGES"))
+                return (ans = "salsa-ci");
+
+        FOREACH_STRING(var, "CI", "CONTINUOUS_INTEGRATION") {
+                /* Those vars are booleans according to Semaphore and Travis docs:
+                 * https://docs.travis-ci.com/user/environment-variables/#default-environment-variables
+                 * https://docs.semaphoreci.com/ci-cd-environment/environment-variables/#ci
+                 */
+                r = getenv_bool(var);
+                if (r > 0)
+                        return (ans = "unknown"); /* Some other unknown thing */
+                if (r == 0)
+                        return (ans = NULL); /* explicitly set to false: not a CI run */
+        }
+
+        return (ans = NULL);
+}
diff --git a/src/shared/tests.h b/src/shared/tests.h
new file mode 100644
index 0000000..d76cf2e
--- /dev/null
+++ b/src/shared/tests.h
@@ -0,0 +1,181 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "sd-daemon.h"
+
+#include "argv-util.h"
+#include "macro.h"
+#include "static-destruct.h"
+#include "strv.h"
+
+static inline bool manager_errno_skip_test(int r) { /* true if r is one of the errors listed below (presumably: reasons to skip a test rather than fail it) */
+ return IN_SET(abs(r), /* abs(): r may be passed with either sign */
+ EPERM,
+ EACCES,
+ EADDRINUSE,
+ EHOSTDOWN,
+ ENOENT,
+ ENOMEDIUM /* cannot determine cgroup */
+ );
+}
+
+char* setup_fake_runtime_dir(void);
+int enter_cgroup_subroot(char **ret_cgroup);
+int enter_cgroup_root(char **ret_cgroup);
+int get_testdata_dir(const char *suffix, char **ret);
+const char* get_catalog_dir(void);
+bool slow_tests_enabled(void);
+void test_setup_logging(int level);
+
+#define log_tests_skipped(fmt, ...) \
+ ({ \
+ log_notice("%s: " fmt ", skipping tests.", \
+ program_invocation_short_name, \
+ ##__VA_ARGS__); \
+ EXIT_TEST_SKIP; \
+ })
+
+#define log_tests_skipped_errno(error, fmt, ...) \
+ ({ \
+ log_notice_errno(error, \
+ "%s: " fmt ", skipping tests: %m", \
+ program_invocation_short_name, \
+ ##__VA_ARGS__); \
+ EXIT_TEST_SKIP; \
+ })
+
+int write_tmpfile(char *pattern, const char *contents);
+
+bool have_namespaces(void);
+
+/* We use the small but non-trivial limit here */
+#define CAN_MEMLOCK_SIZE (512 * 1024U)
+bool can_memlock(void);
+
+/* Define void* buffer and size_t length variables from a hex string. */
+#define DEFINE_HEX_PTR(name, hex) \
+ _cleanup_free_ void *name = NULL; \
+ size_t name##_len = 0; \
+ assert_se(unhexmem(hex, strlen_ptr(hex), &name, &name##_len) >= 0);
+
+#define TEST_REQ_RUNNING_SYSTEMD(x) \
+ if (sd_booted() > 0) { \
+ x; \
+ } else { \
+ printf("systemd not booted, skipping '%s'\n", #x); \
+ }
+
+/* Provide a convenient way to check if we're running in CI. */
+const char *ci_environment(void);
+
+typedef struct TestFunc { /* one registered test, as placed into the SYSTEMD_TEST_TABLE section by REGISTER_TEST() */
+ union f {
+ void (*void_func)(void); /* called by run_test_table() when has_ret is unset */
+ int (*int_func)(void); /* called by run_test_table() when has_ret is set */
+ } f;
+ const char * const name; /* STRINGIFY(func); compared against the entries of $TESTFUNCS */
+ bool has_ret:1; /* set by REGISTER_TEST() if the function's type is compatible with int_func */
+ bool sd_booted:1; /* if set, run_test_table() skips the test unless sd_booted() > 0 */
+} TestFunc;
+
+/* See static-destruct.h for an explanation of how this works. */
+#define REGISTER_TEST(func, ...) \
+ _Pragma("GCC diagnostic ignored \"-Wattributes\"") \
+ _section_("SYSTEMD_TEST_TABLE") _alignptr_ _used_ _retain_ _variable_no_sanitize_address_ \
+ static const TestFunc UNIQ_T(static_test_table_entry, UNIQ) = { \
+ .f = (union f) &(func), \
+ .name = STRINGIFY(func), \
+ .has_ret = __builtin_types_compatible_p(typeof((union f){}.int_func), typeof(&(func))), \
+ ##__VA_ARGS__ \
+ }
+
+extern const TestFunc _weak_ __start_SYSTEMD_TEST_TABLE[];
+extern const TestFunc _weak_ __stop_SYSTEMD_TEST_TABLE[];
+
+#define TEST(name, ...) \
+ static void test_##name(void); \
+ REGISTER_TEST(test_##name, ##__VA_ARGS__); \
+ static void test_##name(void)
+
+#define TEST_RET(name, ...) \
+ static int test_##name(void); \
+ REGISTER_TEST(test_##name, ##__VA_ARGS__); \
+ static int test_##name(void)
+
+#define TEST_LOG_FUNC() \
+ log_info("/* %s() */", __func__)
+
+static inline int run_test_table(void) {
+        _cleanup_strv_free_ char **tests = NULL;
+        int r = EXIT_SUCCESS;
+        bool ran = false;
+        const char *e;
+        /* Runs the registered tests ($TESTFUNCS: optional ':'-separated filter). Returns the first non-EXIT_SUCCESS result, or a negative error. */
+        if (!__start_SYSTEMD_TEST_TABLE)
+                return r;
+
+        e = getenv("TESTFUNCS");
+        if (e) {
+                int k = strv_split_full(&tests, e, ":", EXTRACT_DONT_COALESCE_SEPARATORS); /* kept out of r: a non-negative success value must not become the exit status */
+                if (k < 0)
+                        return log_error_errno(k, "Failed to parse $TESTFUNCS: %m");
+        }
+
+        for (const TestFunc *t = ALIGN_PTR(__start_SYSTEMD_TEST_TABLE);
+             t + 1 <= __stop_SYSTEMD_TEST_TABLE;
+             t = ALIGN_PTR(t + 1)) {
+                /* If a filter list was given, only run the tests named in it */
+                if (tests && !strv_contains(tests, t->name))
+                        continue;
+
+                if (t->sd_booted && sd_booted() <= 0) {
+                        log_info("/* systemd not booted, skipping %s */", t->name);
+                        if (t->has_ret && r == EXIT_SUCCESS)
+                                r = EXIT_TEST_SKIP;
+                } else {
+                        log_info("/* %s */", t->name);
+
+                        if (t->has_ret) {
+                                int r2 = t->f.int_func();
+                                if (r == EXIT_SUCCESS) /* only the first non-success result is remembered */
+                                        r = r2;
+                        } else
+                                t->f.void_func();
+                }
+
+                ran = true;
+        }
+
+        if (!ran)
+                return log_error_errno(SYNTHETIC_ERRNO(ENXIO), "No matching tests found.");
+
+        return r;
+}
+
+#define DEFINE_TEST_MAIN_FULL(log_level, intro, outro) \
+ int main(int argc, char *argv[]) { \
+ int (*_intro)(void) = intro; \
+ int (*_outro)(void) = outro; \
+ int _r, _q; \
+ test_setup_logging(log_level); \
+ save_argc_argv(argc, argv); \
+ _r = _intro ? _intro() : EXIT_SUCCESS; \
+ if (_r == EXIT_SUCCESS) \
+ _r = run_test_table(); \
+ _q = _outro ? _outro() : EXIT_SUCCESS; \
+ static_destruct(); \
+ if (_r < 0) \
+ return EXIT_FAILURE; \
+ if (_r != EXIT_SUCCESS) \
+ return _r; \
+ if (_q < 0) \
+ return EXIT_FAILURE; \
+ return _q; \
+ }
+
+#define DEFINE_TEST_MAIN_WITH_INTRO(log_level, intro) \
+ DEFINE_TEST_MAIN_FULL(log_level, intro, NULL)
+#define DEFINE_TEST_MAIN(log_level) \
+ DEFINE_TEST_MAIN_FULL(log_level, NULL, NULL)
diff --git a/src/shared/tmpfile-util-label.c b/src/shared/tmpfile-util-label.c
new file mode 100644
index 0000000..a5f364c
--- /dev/null
+++ b/src/shared/tmpfile-util-label.c
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/stat.h>
+
+#include "selinux-util.h"
+#include "tmpfile-util-label.h"
+#include "tmpfile-util.h"
+
+int fopen_temporary_at_label(
+ int dir_fd,
+ const char *target,
+ const char *path,
+ FILE **f,
+ char **temp_path) {
+ /* Calls fopen_temporary_at(dir_fd, path, f, temp_path), bracketed by the SELinux file creation prepare/clear calls for target. */
+ int r;
+
+ assert(dir_fd >= 0 || dir_fd == AT_FDCWD);
+ assert(path);
+
+ r = mac_selinux_create_file_prepare_at(dir_fd, target, S_IFREG);
+ if (r < 0)
+ return r;
+
+ r = fopen_temporary_at(dir_fd, path, f, temp_path);
+ /* The clear call below runs whether or not the open succeeded; r is returned unchanged */
+ mac_selinux_create_file_clear();
+
+ return r;
+}
diff --git a/src/shared/tmpfile-util-label.h b/src/shared/tmpfile-util-label.h
new file mode 100644
index 0000000..68ab075
--- /dev/null
+++ b/src/shared/tmpfile-util-label.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <fcntl.h>
+#include <stdio.h>
+
+/* These functions are split out of tmpfile-util.h (and not for example just flags to the functions they
+ * wrap) in order to optimize linking: this way, -lselinux is needed only for the callers of these functions
+ * that need selinux, but not for all. */
+
+int fopen_temporary_at_label(int dir_fd, const char *target, const char *path, FILE **f, char **temp_path);
+static inline int fopen_temporary_label(const char *target, const char *path, FILE **f, char **temp_path) {
+ return fopen_temporary_at_label(AT_FDCWD, target, path, f, temp_path); /* same call, with AT_FDCWD as the directory fd */
+}
diff --git a/src/shared/tomoyo-util.c b/src/shared/tomoyo-util.c
new file mode 100644
index 0000000..2347179
--- /dev/null
+++ b/src/shared/tomoyo-util.c
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <unistd.h>
+
+#include "tomoyo-util.h"
+
+bool mac_tomoyo_use(void) {
+        /* -1: not probed yet; 0/1: remembered outcome of the access() check below */
+        static int cached = -1;
+
+        if (cached >= 0)
+                return cached;
+
+        return (cached = access("/sys/kernel/security/tomoyo/version", F_OK) == 0);
+}
diff --git a/src/shared/tomoyo-util.h b/src/shared/tomoyo-util.h
new file mode 100644
index 0000000..a6ee7d4
--- /dev/null
+++ b/src/shared/tomoyo-util.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+bool mac_tomoyo_use(void);
diff --git a/src/shared/tpm2-event-log.c b/src/shared/tpm2-event-log.c
new file mode 100644
index 0000000..2e23846
--- /dev/null
+++ b/src/shared/tpm2-event-log.c
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "tpm2-event-log.h"
+
+#include "sort-util.h"
+
+typedef struct tpm2_log_event_type_info {
+ uint32_t event_type;
+ const char *name;
+} tpm2_log_event_type_info;
+
+static tpm2_log_event_type_info tpm2_log_event_type_table[] = {
+        /* Maps event type values to names. Unfortunately the types are defined all over the place, hence
+         * we are not using a dense table here.
+         *
+         * Keep this sorted by event type, so that we can do bisection! Every name must be unique. */
+        { EV_PREBOOT_CERT,                   "preboot-cert"                  },
+        { EV_POST_CODE,                      "post-code"                     },
+        { EV_NO_ACTION,                      "no-action"                     },
+        { EV_SEPARATOR,                      "separator"                     },
+        { EV_ACTION,                         "action"                        },
+        { EV_EVENT_TAG,                      "event-tag"                     },
+        { EV_S_CRTM_CONTENTS,                "s-crtm-contents"               },
+        { EV_S_CRTM_VERSION,                 "s-crtm-version"                },
+        { EV_CPU_MICROCODE,                  "cpu-microcode"                 },
+        { EV_PLATFORM_CONFIG_FLAGS,          "platform-config-flags"         },
+        { EV_TABLE_OF_DEVICES,               "table-of-devices"              },
+        { EV_COMPACT_HASH,                   "compact-hash"                  },
+        { EV_IPL,                            "ipl"                           },
+        { EV_IPL_PARTITION_DATA,             "ipl-partition-data"            },
+        { EV_NONHOST_CODE,                   "nonhost-code"                  },
+        { EV_NONHOST_CONFIG,                 "nonhost-config"                },
+        { EV_NONHOST_INFO,                   "nonhost-info"                  },
+        { EV_OMIT_BOOT_DEVICE_EVENTS,        "omit-boot-device-events"       },
+        /* omitting EV_EFI_EVENT_BASE, since it's not an event, but just a base value for other events */
+        { EV_EFI_VARIABLE_DRIVER_CONFIG,     "efi-variable-driver-config"    },
+        { EV_EFI_VARIABLE_BOOT,              "efi-variable-boot"             },
+        { EV_EFI_BOOT_SERVICES_APPLICATION,  "efi-boot-services-application" },
+        { EV_EFI_BOOT_SERVICES_DRIVER,       "efi-boot-services-driver"      },
+        { EV_EFI_RUNTIME_SERVICES_DRIVER,    "efi-runtime-services-driver"   },
+        { EV_EFI_GPT_EVENT,                  "efi-gpt-event"                 },
+        { EV_EFI_ACTION,                     "efi-action"                    },
+        { EV_EFI_PLATFORM_FIRMWARE_BLOB,     "efi-platform-firmware-blob"    },
+        { EV_EFI_HANDOFF_TABLES,             "efi-handoff-tables"            },
+        { EV_EFI_PLATFORM_FIRMWARE_BLOB2,    "efi-platform-firmware-blob2"   },
+        { EV_EFI_HANDOFF_TABLES2,            "efi-handoff-tables2"           },
+        { EV_EFI_VARIABLE_BOOT2,             "efi-variable-boot2"            },
+        { EV_EFI_HCRTM_EVENT,                "efi-hcrtm-event"               },
+        { EV_EFI_VARIABLE_AUTHORITY,         "efi-variable-authority"        },
+        { EV_EFI_SPDM_FIRMWARE_BLOB,         "efi-spdm-firmware-blob"        },
+        { EV_EFI_SPDM_FIRMWARE_CONFIG,       "efi-spdm-firmware-config"      },
+};
+
+static int tpm2_log_event_type_info_cmp(const tpm2_log_event_type_info *a, const tpm2_log_event_type_info *b) {
+ return CMP(ASSERT_PTR(a)->event_type, ASSERT_PTR(b)->event_type); /* entries are ordered by event_type only; the name is not compared */
+}
+
+const char *tpm2_log_event_type_to_string(uint32_t type) {
+        /* Looks the event type up in the (sorted) table via binary search; returns its name, or NULL if not listed. */
+        tpm2_log_event_type_info needle = { .event_type = type }, *hit;
+
+        hit = typesafe_bsearch(&needle, tpm2_log_event_type_table, ELEMENTSOF(tpm2_log_event_type_table), tpm2_log_event_type_info_cmp);
+        if (!hit)
+                return NULL;
+
+        return hit->name;
+}
diff --git a/src/shared/tpm2-event-log.h b/src/shared/tpm2-event-log.h
new file mode 100644
index 0000000..916b805
--- /dev/null
+++ b/src/shared/tpm2-event-log.h
@@ -0,0 +1,139 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <uchar.h>
+
+#include "tpm2-util.h"
+
+/* Definitions as per "TCG PC Client Specific Platform Firmware Profile Specification"
+ * (https://trustedcomputinggroup.org/resource/pc-client-specific-platform-firmware-profile-specification/),
+ * section 10.4.1 "Event Types" (at least in version 1.05 Revision 23 of the spec) */
+/* The whole group is guarded by a single check for EV_PREBOOT_CERT: if that macro is already defined
+ * elsewhere, none of the definitions below are provided by this header. Note that no name is defined for
+ * the value 0x00000002 here. */
+#ifndef EV_PREBOOT_CERT
+#define EV_PREBOOT_CERT UINT32_C(0x00000000)
+#define EV_POST_CODE UINT32_C(0x00000001)
+#define EV_NO_ACTION UINT32_C(0x00000003)
+#define EV_SEPARATOR UINT32_C(0x00000004)
+#define EV_ACTION UINT32_C(0x00000005)
+#define EV_EVENT_TAG UINT32_C(0x00000006)
+#define EV_S_CRTM_CONTENTS UINT32_C(0x00000007)
+#define EV_S_CRTM_VERSION UINT32_C(0x00000008)
+#define EV_CPU_MICROCODE UINT32_C(0x00000009)
+#define EV_PLATFORM_CONFIG_FLAGS UINT32_C(0x0000000a)
+#define EV_TABLE_OF_DEVICES UINT32_C(0x0000000b)
+#define EV_COMPACT_HASH UINT32_C(0x0000000c)
+#define EV_IPL UINT32_C(0x0000000d)
+#define EV_IPL_PARTITION_DATA UINT32_C(0x0000000e)
+#define EV_NONHOST_CODE UINT32_C(0x0000000f)
+#define EV_NONHOST_CONFIG UINT32_C(0x00000010)
+#define EV_NONHOST_INFO UINT32_C(0x00000011)
+#define EV_OMIT_BOOT_DEVICE_EVENTS UINT32_C(0x00000012)
+/* All EV_EFI_* values below are EV_EFI_EVENT_BASE plus a small offset. */
+#define EV_EFI_EVENT_BASE UINT32_C(0x80000000)
+#define EV_EFI_VARIABLE_DRIVER_CONFIG UINT32_C(0x80000001)
+#define EV_EFI_VARIABLE_BOOT UINT32_C(0x80000002)
+#define EV_EFI_BOOT_SERVICES_APPLICATION UINT32_C(0x80000003)
+#define EV_EFI_BOOT_SERVICES_DRIVER UINT32_C(0x80000004)
+#define EV_EFI_RUNTIME_SERVICES_DRIVER UINT32_C(0x80000005)
+#define EV_EFI_GPT_EVENT UINT32_C(0x80000006)
+#define EV_EFI_ACTION UINT32_C(0x80000007)
+#define EV_EFI_PLATFORM_FIRMWARE_BLOB UINT32_C(0x80000008)
+#define EV_EFI_HANDOFF_TABLES UINT32_C(0x80000009)
+#define EV_EFI_PLATFORM_FIRMWARE_BLOB2 UINT32_C(0x8000000A)
+#define EV_EFI_HANDOFF_TABLES2 UINT32_C(0x8000000B)
+#define EV_EFI_VARIABLE_BOOT2 UINT32_C(0x8000000C)
+#define EV_EFI_HCRTM_EVENT UINT32_C(0x80000010)
+#define EV_EFI_VARIABLE_AUTHORITY UINT32_C(0x800000E0)
+#define EV_EFI_SPDM_FIRMWARE_BLOB UINT32_C(0x800000E1)
+#define EV_EFI_SPDM_FIRMWARE_CONFIG UINT32_C(0x800000E2)
+#endif
+
+/* Defined in drivers/firmware/efi/libstub/efistub.h in the Linux kernel sources */
+/* Unlike the EV_* group above, each tag ID has its own #ifndef guard. */
+#ifndef INITRD_EVENT_TAG_ID
+#define INITRD_EVENT_TAG_ID UINT32_C(0x8F3B22EC)
+#endif
+
+#ifndef LOAD_OPTIONS_EVENT_TAG_ID
+#define LOAD_OPTIONS_EVENT_TAG_ID UINT32_C(0x8F3B22ED)
+#endif
+
+/* Maps an event type value (one of the EV_* constants above) to a short lower-case name, or returns NULL
+ * if the value is not known. Declared outside of the HAVE_TPM2 guard below. */
+const char *tpm2_log_event_type_to_string(uint32_t type) _const_;
+
+#if HAVE_TPM2
+
+/* UEFI event log data structures */
+/* Packed event log record carrying exactly one fixed-size 20 byte digest, with a variable-length payload
+ * in the trailing flexible array.
+ * NOTE(review): 20 bytes matches the size of a SHA-1 digest, so this is presumably the legacy record
+ * layout; eventDataSize presumably gives the payload size in bytes, even though event[] is declared with
+ * 32 bit elements — confirm against the spec and the users of this struct. */
+typedef struct _packed_ TCG_PCClientPCREvent {
+ uint32_t pcrIndex;
+ uint32_t eventType;
+ uint8_t digest[20];
+ uint32_t eventDataSize;
+ uint32_t event[];
+} TCG_PCClientPCREvent;
+
+/* Packed pairing of a 16 bit hash algorithm identifier with a TPMU_HA digest union.
+ * NOTE(review): presumably mirrors tpm2-tss' TPMT_HA, but without padding — confirm. */
+typedef struct _packed_ packed_TPMT_HA {
+ uint16_t hashAlg;
+ TPMU_HA digest;
+} packed_TPMT_HA;
+
+/* Packed digest list header: a 32 bit count followed by a flexible array of packed_TPMT_HA entries.
+ * NOTE(review): count is presumably the number of entries in digests[] — confirm. */
+typedef struct _packed_ packed_TPML_DIGEST_VALUES {
+ uint32_t count;
+ packed_TPMT_HA digests[];
+} packed_TPML_DIGEST_VALUES;
+
+/* Packed event log record whose digests are a variable-length list rather than one fixed-size digest.
+ * Since 'digests' ends in a flexible array, only the leading fields can be declared here; the trailing
+ * ellipsis comment marks that further data follows in the record. */
+typedef struct _packed_ TCG_PCR_EVENT2 {
+ uint32_t pcrIndex;
+ uint32_t eventType;
+ packed_TPML_DIGEST_VALUES digests;
+ /* … */
+} TCG_PCR_EVENT2;
+
+/* Packed (algorithm ID, digest size) pair; used as the element type of TCG_EfiSpecIDEvent.digestSizes[]. */
+typedef struct _packed_ TCG_EfiSpecIdEventAlgorithmSize {
+ uint16_t algorithmId;
+ uint16_t digestSize;
+} TCG_EfiSpecIdEventAlgorithmSize;
+
+/* Packed header with a 16 byte signature, version fields and a variable-length list of per-algorithm
+ * digest sizes. Note that the struct tag (tdTCG_EfiSpecIdEvent) and the typedef name (TCG_EfiSpecIDEvent)
+ * differ in capitalization. The trailing ellipsis comment marks that further data follows digestSizes[].
+ * NOTE(review): numberOfAlgorithms is presumably the element count of digestSizes[] — confirm. */
+typedef struct _packed_ tdTCG_EfiSpecIdEvent {
+ uint8_t signature[16];
+ uint32_t platformClass;
+ uint8_t specVersionMinor;
+ uint8_t specVersionMajor;
+ uint8_t specErrata;
+ uint8_t uintnSize;
+ uint32_t numberOfAlgorithms;
+ TCG_EfiSpecIdEventAlgorithmSize digestSizes[];
+ /* … */
+} TCG_EfiSpecIDEvent;
+
+/* Packed event payload describing a UEFI variable: a 16 byte field, two 64 bit lengths and the variable
+ * name as char16_t string. The trailing ellipsis comment marks that further data follows unicodeName[].
+ * NOTE(review): despite its name, variableName is 16 bytes and presumably holds the vendor GUID, while
+ * the lengths presumably count char16_t units resp. bytes of the data that follows — confirm. */
+typedef struct _packed_ UEFI_VARIABLE_DATA {
+ uint8_t variableName[16];
+ uint64_t unicodeNameLength;
+ uint64_t variableDataLength;
+ char16_t unicodeName[];
+ /* … */
+} UEFI_VARIABLE_DATA;
+
+/* Packed tagged event payload: a 32 bit tag ID, a 32 bit size and the data as trailing byte array.
+ * NOTE(review): taggedEventID presumably takes values such as INITRD_EVENT_TAG_ID and
+ * LOAD_OPTIONS_EVENT_TAG_ID, and taggedEventDataSize presumably is the size of taggedEventData[] in
+ * bytes — confirm. */
+typedef struct _packed_ TCG_PCClientTaggedEvent{
+ uint32_t taggedEventID;
+ uint32_t taggedEventDataSize;
+ uint8_t taggedEventData[];
+} TCG_PCClientTaggedEvent;
+
+/* Packed device path node header (type, sub-type, 16 bit length) followed by the node's data bytes.
+ * NOTE(review): whether 'length' covers the header itself is not visible here — confirm against the UEFI
+ * spec before using it for pointer arithmetic. */
+typedef struct _packed_ packed_EFI_DEVICE_PATH {
+ uint8_t type;
+ uint8_t subType;
+ uint16_t length;
+ uint8_t path[];
+} packed_EFI_DEVICE_PATH;
+
+/* Packed event payload describing a loaded image: three 64 bit location/size fields, the length of the
+ * device path and the device path itself as trailing flexible array of packed_EFI_DEVICE_PATH.
+ * NOTE(review): lengthOfDevicePath is presumably a byte count, not an element count — confirm. */
+typedef struct _packed_ UEFI_IMAGE_LOAD_EVENT {
+ uint64_t imageLocationInMemory;
+ uint64_t imageLengthInMemory;
+ uint64_t imageLinkTimeAddress;
+ uint64_t lengthOfDevicePath;
+ packed_EFI_DEVICE_PATH devicePath[];
+} UEFI_IMAGE_LOAD_EVENT;
+
+/* Packed event payload describing a firmware blob by a 64 bit base and a 64 bit length. */
+typedef struct _packed_ UEFI_PLATFORM_FIRMWARE_BLOB {
+ uint64_t blobBase;
+ uint64_t blobLength;
+} UEFI_PLATFORM_FIRMWARE_BLOB;
+
+#endif
diff --git a/src/shared/tpm2-util.c b/src/shared/tpm2-util.c
new file mode 100644
index 0000000..30b4f57
--- /dev/null
+++ b/src/shared/tpm2-util.c
@@ -0,0 +1,7664 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/file.h>
+
+#include "alloc-util.h"
+#include "constants.h"
+#include "cryptsetup-util.h"
+#include "dirent-util.h"
+#include "dlfcn-util.h"
+#include "efi-api.h"
+#include "extract-word.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "format-table.h"
+#include "fs-util.h"
+#include "hexdecoct.h"
+#include "hmac.h"
+#include "initrd-util.h"
+#include "io-util.h"
+#include "lock-util.h"
+#include "log.h"
+#include "logarithm.h"
+#include "memory-util.h"
+#include "mkdir.h"
+#include "nulstr-util.h"
+#include "parse-util.h"
+#include "random-util.h"
+#include "sha256.h"
+#include "sort-util.h"
+#include "stat-util.h"
+#include "string-table.h"
+#include "sync-util.h"
+#include "time-util.h"
+#include "tpm2-util.h"
+#include "virt.h"
+
+#if HAVE_TPM2
+/* Library handles for libtss2-esys, libtss2-rc and libtss2-mu; filled in by dlopen_tpm2(). */
+static void *libtss2_esys_dl = NULL;
+static void *libtss2_rc_dl = NULL;
+static void *libtss2_mu_dl = NULL;
+
+static TSS2_RC (*sym_Esys_Create)(ESYS_CONTEXT *esysContext, ESYS_TR parentHandle, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3, const TPM2B_SENSITIVE_CREATE *inSensitive, const TPM2B_PUBLIC *inPublic, const TPM2B_DATA *outsideInfo, const TPML_PCR_SELECTION *creationPCR, TPM2B_PRIVATE **outPrivate, TPM2B_PUBLIC **outPublic, TPM2B_CREATION_DATA **creationData, TPM2B_DIGEST **creationHash, TPMT_TK_CREATION **creationTicket) = NULL;
+static TSS2_RC (*sym_Esys_CreateLoaded)(ESYS_CONTEXT *esysContext, ESYS_TR parentHandle, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3, const TPM2B_SENSITIVE_CREATE *inSensitive, const TPM2B_TEMPLATE *inPublic, ESYS_TR *objectHandle, TPM2B_PRIVATE **outPrivate, TPM2B_PUBLIC **outPublic) = NULL;
+static TSS2_RC (*sym_Esys_CreatePrimary)(ESYS_CONTEXT *esysContext, ESYS_TR primaryHandle, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3, const TPM2B_SENSITIVE_CREATE *inSensitive, const TPM2B_PUBLIC *inPublic, const TPM2B_DATA *outsideInfo, const TPML_PCR_SELECTION *creationPCR, ESYS_TR *objectHandle, TPM2B_PUBLIC **outPublic, TPM2B_CREATION_DATA **creationData, TPM2B_DIGEST **creationHash, TPMT_TK_CREATION **creationTicket) = NULL;
+static TSS2_RC (*sym_Esys_EvictControl)(ESYS_CONTEXT *esysContext, ESYS_TR auth, ESYS_TR objectHandle, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3, TPMI_DH_PERSISTENT persistentHandle, ESYS_TR *newObjectHandle) = NULL;
+static void (*sym_Esys_Finalize)(ESYS_CONTEXT **context) = NULL;
+static TSS2_RC (*sym_Esys_FlushContext)(ESYS_CONTEXT *esysContext, ESYS_TR flushHandle) = NULL;
+static void (*sym_Esys_Free)(void *ptr) = NULL;
+static TSS2_RC (*sym_Esys_GetCapability)(ESYS_CONTEXT *esysContext, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3, TPM2_CAP capability, UINT32 property, UINT32 propertyCount, TPMI_YES_NO *moreData, TPMS_CAPABILITY_DATA **capabilityData) = NULL;
+static TSS2_RC (*sym_Esys_GetRandom)(ESYS_CONTEXT *esysContext, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3, UINT16 bytesRequested, TPM2B_DIGEST **randomBytes) = NULL;
+static TSS2_RC (*sym_Esys_Import)(ESYS_CONTEXT *esysContext, ESYS_TR parentHandle, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3, const TPM2B_DATA *encryptionKey, const TPM2B_PUBLIC *objectPublic, const TPM2B_PRIVATE *duplicate, const TPM2B_ENCRYPTED_SECRET *inSymSeed, const TPMT_SYM_DEF_OBJECT *symmetricAlg, TPM2B_PRIVATE **outPrivate) = NULL;
+static TSS2_RC (*sym_Esys_Initialize)(ESYS_CONTEXT **esys_context, TSS2_TCTI_CONTEXT *tcti, TSS2_ABI_VERSION *abiVersion) = NULL;
+static TSS2_RC (*sym_Esys_Load)(ESYS_CONTEXT *esysContext, ESYS_TR parentHandle, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3, const TPM2B_PRIVATE *inPrivate, const TPM2B_PUBLIC *inPublic, ESYS_TR *objectHandle) = NULL;
+static TSS2_RC (*sym_Esys_LoadExternal)(ESYS_CONTEXT *esysContext, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3, const TPM2B_SENSITIVE *inPrivate, const TPM2B_PUBLIC *inPublic, ESYS_TR hierarchy, ESYS_TR *objectHandle) = NULL;
+/* Like all other sym_* pointers in this file, these are explicitly initialized to NULL and resolved by
+ * dlopen_tpm2(). */
+static TSS2_RC (*sym_Esys_NV_DefineSpace)(ESYS_CONTEXT *esysContext, ESYS_TR authHandle, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3, const TPM2B_AUTH *auth, const TPM2B_NV_PUBLIC *publicInfo, ESYS_TR *nvHandle) = NULL;
+static TSS2_RC (*sym_Esys_NV_UndefineSpace)(ESYS_CONTEXT *esysContext, ESYS_TR authHandle, ESYS_TR nvIndex, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3) = NULL;
+static TSS2_RC (*sym_Esys_NV_Write)(ESYS_CONTEXT *esysContext, ESYS_TR authHandle, ESYS_TR nvIndex, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3, const TPM2B_MAX_NV_BUFFER *data, UINT16 offset) = NULL;
+static TSS2_RC (*sym_Esys_PCR_Extend)(ESYS_CONTEXT *esysContext, ESYS_TR pcrHandle, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3, const TPML_DIGEST_VALUES *digests) = NULL;
+static TSS2_RC (*sym_Esys_PCR_Read)(ESYS_CONTEXT *esysContext, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3, const TPML_PCR_SELECTION *pcrSelectionIn, UINT32 *pcrUpdateCounter, TPML_PCR_SELECTION **pcrSelectionOut, TPML_DIGEST **pcrValues) = NULL;
+static TSS2_RC (*sym_Esys_PolicyAuthValue)(ESYS_CONTEXT *esysContext, ESYS_TR policySession, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3) = NULL;
+static TSS2_RC (*sym_Esys_PolicyAuthorize)(ESYS_CONTEXT *esysContext, ESYS_TR policySession, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3, const TPM2B_DIGEST *approvedPolicy, const TPM2B_NONCE *policyRef, const TPM2B_NAME *keySign, const TPMT_TK_VERIFIED *checkTicket) = NULL;
+static TSS2_RC (*sym_Esys_PolicyAuthorizeNV)(ESYS_CONTEXT *esysContext, ESYS_TR authHandle, ESYS_TR nvIndex, ESYS_TR policySession, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3) = NULL;
+static TSS2_RC (*sym_Esys_PolicyGetDigest)(ESYS_CONTEXT *esysContext, ESYS_TR policySession, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3, TPM2B_DIGEST **policyDigest) = NULL;
+static TSS2_RC (*sym_Esys_PolicyOR)(ESYS_CONTEXT *esysContext, ESYS_TR policySession, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3, const TPML_DIGEST *pHashList) = NULL;
+static TSS2_RC (*sym_Esys_PolicyPCR)(ESYS_CONTEXT *esysContext, ESYS_TR policySession, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3, const TPM2B_DIGEST *pcrDigest, const TPML_PCR_SELECTION *pcrs) = NULL;
+static TSS2_RC (*sym_Esys_ReadPublic)(ESYS_CONTEXT *esysContext, ESYS_TR objectHandle, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3, TPM2B_PUBLIC **outPublic, TPM2B_NAME **name, TPM2B_NAME **qualifiedName) = NULL;
+static TSS2_RC (*sym_Esys_StartAuthSession)(ESYS_CONTEXT *esysContext, ESYS_TR tpmKey, ESYS_TR bind, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3, const TPM2B_NONCE *nonceCaller, TPM2_SE sessionType, const TPMT_SYM_DEF *symmetric, TPMI_ALG_HASH authHash, ESYS_TR *sessionHandle) = NULL;
+static TSS2_RC (*sym_Esys_Startup)(ESYS_CONTEXT *esysContext, TPM2_SU startupType) = NULL;
+static TSS2_RC (*sym_Esys_TestParms)(ESYS_CONTEXT *esysContext, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3, const TPMT_PUBLIC_PARMS *parameters) = NULL;
+static TSS2_RC (*sym_Esys_TR_Close)(ESYS_CONTEXT *esys_context, ESYS_TR *rsrc_handle) = NULL;
+static TSS2_RC (*sym_Esys_TR_Deserialize)(ESYS_CONTEXT *esys_context, uint8_t const *buffer, size_t buffer_size, ESYS_TR *esys_handle) = NULL;
+static TSS2_RC (*sym_Esys_TR_FromTPMPublic)(ESYS_CONTEXT *esysContext, TPM2_HANDLE tpm_handle, ESYS_TR optionalSession1, ESYS_TR optionalSession2, ESYS_TR optionalSession3, ESYS_TR *object) = NULL;
+static TSS2_RC (*sym_Esys_TR_GetName)(ESYS_CONTEXT *esysContext, ESYS_TR handle, TPM2B_NAME **name) = NULL;
+static TSS2_RC (*sym_Esys_TR_GetTpmHandle)(ESYS_CONTEXT *esys_context, ESYS_TR esys_handle, TPM2_HANDLE *tpm_handle) = NULL;
+static TSS2_RC (*sym_Esys_TR_Serialize)(ESYS_CONTEXT *esys_context, ESYS_TR object, uint8_t **buffer, size_t *buffer_size) = NULL;
+static TSS2_RC (*sym_Esys_TR_SetAuth)(ESYS_CONTEXT *esysContext, ESYS_TR handle, TPM2B_AUTH const *authValue) = NULL;
+static TSS2_RC (*sym_Esys_TRSess_GetAttributes)(ESYS_CONTEXT *esysContext, ESYS_TR session, TPMA_SESSION *flags) = NULL;
+static TSS2_RC (*sym_Esys_TRSess_SetAttributes)(ESYS_CONTEXT *esysContext, ESYS_TR session, TPMA_SESSION flags, TPMA_SESSION mask) = NULL;
+static TSS2_RC (*sym_Esys_Unseal)(ESYS_CONTEXT *esysContext, ESYS_TR itemHandle, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3, TPM2B_SENSITIVE_DATA **outData) = NULL;
+static TSS2_RC (*sym_Esys_VerifySignature)(ESYS_CONTEXT *esysContext, ESYS_TR keyHandle, ESYS_TR shandle1, ESYS_TR shandle2, ESYS_TR shandle3, const TPM2B_DIGEST *digest, const TPMT_SIGNATURE *signature, TPMT_TK_VERIFIED **validation) = NULL;
+
+static TSS2_RC (*sym_Tss2_MU_TPM2_CC_Marshal)(TPM2_CC src, uint8_t buffer[], size_t buffer_size, size_t *offset) = NULL;
+static TSS2_RC (*sym_Tss2_MU_TPM2_HANDLE_Marshal)(TPM2_HANDLE src, uint8_t buffer[], size_t buffer_size, size_t *offset) = NULL;
+static TSS2_RC (*sym_Tss2_MU_TPM2B_DIGEST_Marshal)(TPM2B_DIGEST const *src, uint8_t buffer[], size_t buffer_size, size_t *offset) = NULL;
+static TSS2_RC (*sym_Tss2_MU_TPM2B_ENCRYPTED_SECRET_Marshal)(TPM2B_ENCRYPTED_SECRET const *src, uint8_t buffer[], size_t buffer_size, size_t *offset) = NULL;
+static TSS2_RC (*sym_Tss2_MU_TPM2B_ENCRYPTED_SECRET_Unmarshal)(uint8_t const buffer[], size_t buffer_size, size_t *offset, TPM2B_ENCRYPTED_SECRET *dest) = NULL;
+static TSS2_RC (*sym_Tss2_MU_TPM2B_NAME_Marshal)(TPM2B_NAME const *src, uint8_t buffer[], size_t buffer_size, size_t *offset) = NULL;
+static TSS2_RC (*sym_Tss2_MU_TPM2B_PRIVATE_Marshal)(TPM2B_PRIVATE const *src, uint8_t buffer[], size_t buffer_size, size_t *offset) = NULL;
+static TSS2_RC (*sym_Tss2_MU_TPM2B_PRIVATE_Unmarshal)(uint8_t const buffer[], size_t buffer_size, size_t *offset, TPM2B_PRIVATE *dest) = NULL;
+static TSS2_RC (*sym_Tss2_MU_TPM2B_PUBLIC_Marshal)(TPM2B_PUBLIC const *src, uint8_t buffer[], size_t buffer_size, size_t *offset) = NULL;
+static TSS2_RC (*sym_Tss2_MU_TPM2B_PUBLIC_Unmarshal)(uint8_t const buffer[], size_t buffer_size, size_t *offset, TPM2B_PUBLIC *dest) = NULL;
+static TSS2_RC (*sym_Tss2_MU_TPM2B_SENSITIVE_Marshal)(TPM2B_SENSITIVE const *src, uint8_t buffer[], size_t buffer_size, size_t *offset) = NULL;
+static TSS2_RC (*sym_Tss2_MU_TPML_PCR_SELECTION_Marshal)(TPML_PCR_SELECTION const *src, uint8_t buffer[], size_t buffer_size, size_t *offset) = NULL;
+static TSS2_RC (*sym_Tss2_MU_TPMS_NV_PUBLIC_Marshal)(TPMS_NV_PUBLIC const *src, uint8_t buffer[], size_t buffer_size, size_t *offset) = NULL;
+static TSS2_RC (*sym_Tss2_MU_TPM2B_NV_PUBLIC_Marshal)(TPM2B_NV_PUBLIC const *src, uint8_t buffer[], size_t buffer_size, size_t *offset) = NULL;
+static TSS2_RC (*sym_Tss2_MU_TPM2B_NV_PUBLIC_Unmarshal)(uint8_t const buffer[], size_t buffer_size, size_t *offset, TPM2B_NV_PUBLIC *dest) = NULL;
+static TSS2_RC (*sym_Tss2_MU_TPMS_ECC_POINT_Marshal)(TPMS_ECC_POINT const *src, uint8_t buffer[], size_t buffer_size, size_t *offset) = NULL;
+static TSS2_RC (*sym_Tss2_MU_TPMT_HA_Marshal)(TPMT_HA const *src, uint8_t buffer[], size_t buffer_size, size_t *offset) = NULL;
+static TSS2_RC (*sym_Tss2_MU_TPMT_PUBLIC_Marshal)(TPMT_PUBLIC const *src, uint8_t buffer[], size_t buffer_size, size_t *offset) = NULL;
+static TSS2_RC (*sym_Tss2_MU_UINT32_Marshal)(UINT32 src, uint8_t buffer[], size_t buffer_size, size_t *offset) = NULL;
+
+static const char* (*sym_Tss2_RC_Decode)(TSS2_RC rc) = NULL;
+
+/* Loads the three tpm2-tss libraries (libtss2-esys, libtss2-rc, libtss2-mu) and resolves the sym_* function
+ * pointers declared above. Returns a negative value if loading libtss2-esys or libtss2-rc (or one of their
+ * listed symbols) fails; otherwise returns whatever loading libtss2-mu returns. A missing
+ * Esys_TR_GetTpmHandle symbol is not treated as an error. */
+int dlopen_tpm2(void) {
+ int r;
+
+ r = dlopen_many_sym_or_warn(
+ &libtss2_esys_dl, "libtss2-esys.so.0", LOG_DEBUG,
+ DLSYM_ARG(Esys_Create),
+ DLSYM_ARG(Esys_CreateLoaded),
+ DLSYM_ARG(Esys_CreatePrimary),
+ DLSYM_ARG(Esys_EvictControl),
+ DLSYM_ARG(Esys_Finalize),
+ DLSYM_ARG(Esys_FlushContext),
+ DLSYM_ARG(Esys_Free),
+ DLSYM_ARG(Esys_GetCapability),
+ DLSYM_ARG(Esys_GetRandom),
+ DLSYM_ARG(Esys_Import),
+ DLSYM_ARG(Esys_Initialize),
+ DLSYM_ARG(Esys_Load),
+ DLSYM_ARG(Esys_LoadExternal),
+ DLSYM_ARG(Esys_NV_DefineSpace),
+ DLSYM_ARG(Esys_NV_UndefineSpace),
+ DLSYM_ARG(Esys_NV_Write),
+ DLSYM_ARG(Esys_PCR_Extend),
+ DLSYM_ARG(Esys_PCR_Read),
+ DLSYM_ARG(Esys_PolicyAuthValue),
+ DLSYM_ARG(Esys_PolicyAuthorize),
+ DLSYM_ARG(Esys_PolicyAuthorizeNV),
+ DLSYM_ARG(Esys_PolicyGetDigest),
+ DLSYM_ARG(Esys_PolicyOR),
+ DLSYM_ARG(Esys_PolicyPCR),
+ DLSYM_ARG(Esys_ReadPublic),
+ DLSYM_ARG(Esys_StartAuthSession),
+ DLSYM_ARG(Esys_Startup),
+ DLSYM_ARG(Esys_TestParms),
+ DLSYM_ARG(Esys_TR_Close),
+ DLSYM_ARG(Esys_TR_Deserialize),
+ DLSYM_ARG(Esys_TR_FromTPMPublic),
+ DLSYM_ARG(Esys_TR_GetName),
+ DLSYM_ARG(Esys_TR_Serialize),
+ DLSYM_ARG(Esys_TR_SetAuth),
+ DLSYM_ARG(Esys_TRSess_GetAttributes),
+ DLSYM_ARG(Esys_TRSess_SetAttributes),
+ DLSYM_ARG(Esys_Unseal),
+ DLSYM_ARG(Esys_VerifySignature));
+ if (r < 0)
+ return r;
+
+ /* Esys_TR_GetTpmHandle was added to tpm2-tss in version 2.4.0. Once we can set a minimum tpm2-tss
+ * version of 2.4.0 this sym can be moved up to the normal list above. */
+ r = dlsym_many_or_warn(libtss2_esys_dl, LOG_DEBUG, DLSYM_ARG_FORCE(Esys_TR_GetTpmHandle));
+ if (r < 0)
+ /* Deliberately only logged: r is overwritten by the next call, so this failure is not propagated. */
+ log_debug("libtss2-esys too old, does not include Esys_TR_GetTpmHandle.");
+
+ r = dlopen_many_sym_or_warn(
+ &libtss2_rc_dl, "libtss2-rc.so.0", LOG_DEBUG,
+ DLSYM_ARG(Tss2_RC_Decode));
+ if (r < 0)
+ return r;
+
+ return dlopen_many_sym_or_warn(
+ &libtss2_mu_dl, "libtss2-mu.so.0", LOG_DEBUG,
+ DLSYM_ARG(Tss2_MU_TPM2_CC_Marshal),
+ DLSYM_ARG(Tss2_MU_TPM2_HANDLE_Marshal),
+ DLSYM_ARG(Tss2_MU_TPM2B_DIGEST_Marshal),
+ DLSYM_ARG(Tss2_MU_TPM2B_ENCRYPTED_SECRET_Marshal),
+ DLSYM_ARG(Tss2_MU_TPM2B_ENCRYPTED_SECRET_Unmarshal),
+ DLSYM_ARG(Tss2_MU_TPM2B_NAME_Marshal),
+ DLSYM_ARG(Tss2_MU_TPM2B_PRIVATE_Marshal),
+ DLSYM_ARG(Tss2_MU_TPM2B_PRIVATE_Unmarshal),
+ DLSYM_ARG(Tss2_MU_TPM2B_PUBLIC_Marshal),
+ DLSYM_ARG(Tss2_MU_TPM2B_PUBLIC_Unmarshal),
+ DLSYM_ARG(Tss2_MU_TPM2B_SENSITIVE_Marshal),
+ DLSYM_ARG(Tss2_MU_TPML_PCR_SELECTION_Marshal),
+ DLSYM_ARG(Tss2_MU_TPMS_NV_PUBLIC_Marshal),
+ DLSYM_ARG(Tss2_MU_TPM2B_NV_PUBLIC_Marshal),
+ DLSYM_ARG(Tss2_MU_TPM2B_NV_PUBLIC_Unmarshal),
+ DLSYM_ARG(Tss2_MU_TPMS_ECC_POINT_Marshal),
+ DLSYM_ARG(Tss2_MU_TPMT_HA_Marshal),
+ DLSYM_ARG(Tss2_MU_TPMT_PUBLIC_Marshal),
+ DLSYM_ARG(Tss2_MU_UINT32_Marshal));
+}
+
+/* Cleanup helper (used with _cleanup_() in this file): 'p' is the address of a pointer variable; if that
+ * variable is non-NULL, the object it points to is released via Esys_Free(). The variable itself is not
+ * modified. */
+void Esys_Freep(void *p) {
+        void *obj = *(void**) p;
+
+        if (!obj)
+                return;
+
+        sym_Esys_Free(obj);
+}
+
+/* Query the TPM for one capability category, starting at 'property' and asking for up to 'count' entries.
+ *
+ * Return value:
+ *   1         → success, and the TPM indicated that more properties of this type are available
+ *   0         → success, and the TPM indicated that there are no further properties
+ *   -ENXIO    → the TPM answered TPM2_RC_VALUE, i.e. the capability/property apparently doesn't exist
+ *   other < 0 → any other failure, including the TPM replying with a different capability than requested
+ *
+ * Neither 0 nor 1 says how many properties were actually delivered in 'ret_capability_data'; for that,
+ * look at the 'count' field of the capability-specific member (e.g. ret_capability_data->algorithms.count
+ * for TPM2_CAP_ALGS). 'ret_capability_data' may be NULL if the caller is not interested in the data.
+ *
+ * This is a thin wrapper around TPM2_GetCapability() that passes the reply through unmodified, hence the
+ * semantics of that command apply (see TCG TPM specification Part 3 ("Commands")). In short: on 0, either
+ * everything available was returned or nothing was available; on 1, between 1 and 'count' properties were
+ * returned and further ones exist. The TPM is free to return fewer than 'count' properties even if it has
+ * more, and individual capability categories may impose additional rules; consult the spec for those. */
+static int tpm2_get_capability(
+                Tpm2Context *c,
+                TPM2_CAP capability,
+                uint32_t property,
+                uint32_t count,
+                TPMU_CAPABILITIES *ret_capability_data) {
+
+        _cleanup_(Esys_Freep) TPMS_CAPABILITY_DATA *reply = NULL;
+        TPMI_YES_NO more_data;
+        TSS2_RC rc;
+
+        assert(c);
+
+        log_debug("Getting TPM2 capability 0x%04" PRIx32 " property 0x%04" PRIx32 " count %" PRIu32 ".",
+                  capability, property, count);
+
+        rc = sym_Esys_GetCapability(
+                        c->esys_context,
+                        ESYS_TR_NONE,
+                        ESYS_TR_NONE,
+                        ESYS_TR_NONE,
+                        capability,
+                        property,
+                        count,
+                        &more_data,
+                        &reply);
+        if (rc != TSS2_RC_SUCCESS) {
+                /* TPM2_RC_VALUE gets its own error code, so that callers can tell "doesn't exist" apart
+                 * from real failures. */
+                if (rc == TPM2_RC_VALUE)
+                        return log_debug_errno(SYNTHETIC_ERRNO(ENXIO),
+                                               "Requested TPM2 capability 0x%04" PRIx32 " property 0x%04" PRIx32 " apparently doesn't exist: %s",
+                                               capability, property, sym_Tss2_RC_Decode(rc));
+
+                return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+                                       "Failed to get TPM2 capability 0x%04" PRIx32 " property 0x%04" PRIx32 ": %s",
+                                       capability, property, sym_Tss2_RC_Decode(rc));
+        }
+
+        /* Paranoia: make sure the reply is actually about what we asked for. */
+        if (reply->capability != capability)
+                return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+                                       "TPM provided wrong capability: 0x%04" PRIx32 " instead of 0x%04" PRIx32 ".",
+                                       reply->capability, capability);
+
+        if (ret_capability_data)
+                *ret_capability_data = reply->data;
+
+        if (more_data == TPM2_YES)
+                return 1;
+
+        return 0;
+}
+
+/* Extracts the command index field from a TPMA_CC attribute value (mask with TPMA_CC_COMMANDINDEX_MASK,
+ * then shift right by TPMA_CC_COMMANDINDEX_SHIFT). */
+#define TPMA_CC_TO_TPM2_CC(cca) (((cca) & TPMA_CC_COMMANDINDEX_MASK) >> TPMA_CC_COMMANDINDEX_SHIFT)
+
+/* Queries the TPM for its supported algorithms, commands and ECC curves and for its PCR allocation, and
+ * stores the results in the context (capability_algorithms, capability_commands, capability_ecc_curves and
+ * capability_pcrs). The first three are read in a loop, since tpm2_get_capability() may deliver them in
+ * several batches.
+ *
+ * Returns 0 on success, a negative error from tpm2_get_capability(), or the result of log_oom_debug() if
+ * growing one of the arrays fails. On failure, entries appended so far are left in the context. */
+static int tpm2_cache_capabilities(Tpm2Context *c) {
+ TPMU_CAPABILITIES capability;
+ int r;
+
+ assert(c);
+
+ /* Cache the algorithms. The spec indicates supported algorithms can only be modified during runtime
+ * by the SetAlgorithmSet() command. Unfortunately, the spec doesn't require a TPM reinitialization
+ * after changing the algorithm set (unless the PCR algorithms are changed). However, the spec also
+ * indicates the TPM behavior after SetAlgorithmSet() is "vendor-dependent", giving the example of
+ * flushing sessions and objects, erasing policies, etc. So, if the algorithm set is programmatically
+ * changed while we are performing some operation, it's reasonable to assume it will break us even if
+ * we don't cache the algorithms, thus they should be "safe" to cache. */
+ TPM2_ALG_ID current_alg = TPM2_ALG_FIRST;
+ for (;;) {
+ r = tpm2_get_capability(
+ c,
+ TPM2_CAP_ALGS,
+ (uint32_t) current_alg, /* The spec states to cast TPM2_ALG_ID to uint32_t. */
+ TPM2_MAX_CAP_ALGS,
+ &capability);
+ if (r < 0)
+ return r;
+
+ TPML_ALG_PROPERTY algorithms = capability.algorithms;
+
+ /* We should never get 0; the TPM must support some algorithms, and it must not set 'more' if
+ * there are no more. */
+ assert(algorithms.count > 0);
+
+ if (!GREEDY_REALLOC_APPEND(
+ c->capability_algorithms,
+ c->n_capability_algorithms,
+ algorithms.algProperties,
+ algorithms.count))
+ return log_oom_debug();
+
+ /* r == 0 means the TPM reported that no further entries are available. */
+ if (r == 0)
+ break;
+
+ /* Set current_alg to alg id after last alg id the TPM provided */
+ current_alg = algorithms.algProperties[algorithms.count - 1].alg + 1;
+ }
+
+ /* Cache the command capabilities. The spec isn't actually clear if commands can be added/removed
+ * while running, but that would be crazy, so let's hope it is not possible. */
+ TPM2_CC current_cc = TPM2_CC_FIRST;
+ for (;;) {
+ r = tpm2_get_capability(
+ c,
+ TPM2_CAP_COMMANDS,
+ current_cc,
+ TPM2_MAX_CAP_CC,
+ &capability);
+ if (r < 0)
+ return r;
+
+ TPML_CCA commands = capability.command;
+
+ /* We should never get 0; the TPM must support some commands, and it must not set 'more' if
+ * there are no more. */
+ assert(commands.count > 0);
+
+ if (!GREEDY_REALLOC_APPEND(
+ c->capability_commands,
+ c->n_capability_commands,
+ commands.commandAttributes,
+ commands.count))
+ return log_oom_debug();
+
+ if (r == 0)
+ break;
+
+ /* Set current_cc to index after last cc the TPM provided */
+ current_cc = TPMA_CC_TO_TPM2_CC(commands.commandAttributes[commands.count - 1]) + 1;
+ }
+
+ /* Cache the ECC curves. The spec isn't actually clear if ECC curves can be added/removed
+ * while running, but that would be crazy, so let's hope it is not possible. */
+ TPM2_ECC_CURVE current_ecc_curve = TPM2_ECC_NONE;
+ for (;;) {
+ r = tpm2_get_capability(
+ c,
+ TPM2_CAP_ECC_CURVES,
+ current_ecc_curve,
+ TPM2_MAX_ECC_CURVES,
+ &capability);
+ if (r == -ENXIO) /* If the TPM doesn't support ECC, it might return TPM2_RC_VALUE rather than capability.eccCurves == 0 */
+ break;
+ if (r < 0)
+ return r;
+
+ TPML_ECC_CURVE ecc_curves = capability.eccCurves;
+
+ /* ECC support isn't required */
+ if (ecc_curves.count == 0)
+ break;
+
+ if (!GREEDY_REALLOC_APPEND(
+ c->capability_ecc_curves,
+ c->n_capability_ecc_curves,
+ ecc_curves.eccCurves,
+ ecc_curves.count))
+ return log_oom_debug();
+
+ if (r == 0)
+ break;
+
+ /* Set current_ecc_curve to index after last ecc curve the TPM provided */
+ current_ecc_curve = ecc_curves.eccCurves[ecc_curves.count - 1] + 1;
+ }
+
+ /* Cache the PCR capabilities, which are safe to cache, as the only way they can change is
+ * TPM2_PCR_Allocate(), which changes the allocation after the next _TPM_Init(). If the TPM is
+ * reinitialized while we are using it, all our context and sessions will be invalid, so we can
+ * safely assume the TPM PCR allocation will not change while we are using it. */
+ r = tpm2_get_capability(
+ c,
+ TPM2_CAP_PCRS,
+ /* property= */ 0,
+ /* count= */ 1,
+ &capability);
+ if (r < 0)
+ return r;
+ if (r == 1)
+ /* This should never happen. Part 3 ("Commands") of the TCG TPM2 spec in the section for
+ * TPM2_GetCapability states: "TPM_CAP_PCRS – Returns the current allocation of PCR in a
+ * TPML_PCR_SELECTION. The property parameter shall be zero. The TPM will always respond to
+ * this command with the full PCR allocation and moreData will be NO." */
+ log_debug("TPM bug: reported multiple PCR sets; using only first set.");
+ c->capability_pcrs = capability.assignedPCR;
+
+ return 0;
+}
+
+/* Looks up 'alg' in the algorithm list cached in the context. If found, returns true and (if 'ret' is
+ * non-NULL) stores the algorithm's TPMA_ALGORITHM attributes in *ret. If not found, logs at debug level,
+ * zeroes *ret (if non-NULL) and returns false. */
+static bool tpm2_get_capability_alg(Tpm2Context *c, TPM2_ALG_ID alg, TPMA_ALGORITHM *ret) {
+        assert(c);
+
+        FOREACH_ARRAY(entry, c->capability_algorithms, c->n_capability_algorithms) {
+                if (entry->alg != alg)
+                        continue;
+
+                if (ret)
+                        *ret = entry->algProperties;
+
+                return true;
+        }
+
+        /* Not in the cache, hence not supported by this TPM. */
+        log_debug("TPM does not support alg 0x%02" PRIx16 ".", alg);
+
+        if (ret)
+                *ret = 0;
+
+        return false;
+}
+
+/* Returns true if the TPM supports the algorithm, otherwise false. Thin wrapper around
+ * tpm2_get_capability_alg() that discards the algorithm attributes. */
+bool tpm2_supports_alg(Tpm2Context *c, TPM2_ALG_ID alg) {
+ return tpm2_get_capability_alg(c, alg, NULL);
+}
+
+/* Looks up 'command' in the command list cached in the context, comparing against the command index
+ * extracted from each cached TPMA_CC. If found, returns true and (if 'ret' is non-NULL) stores the full
+ * TPMA_CC in *ret. If not found, logs at debug level, zeroes *ret (if non-NULL) and returns false. */
+static bool tpm2_get_capability_command(Tpm2Context *c, TPM2_CC command, TPMA_CC *ret) {
+        assert(c);
+
+        FOREACH_ARRAY(attr, c->capability_commands, c->n_capability_commands) {
+                if (TPMA_CC_TO_TPM2_CC(*attr) != command)
+                        continue;
+
+                if (ret)
+                        *ret = *attr;
+
+                return true;
+        }
+
+        /* Not in the cache, hence not supported by this TPM. */
+        log_debug("TPM does not support command 0x%04" PRIx32 ".", command);
+
+        if (ret)
+                *ret = 0;
+
+        return false;
+}
+
+/* Returns true if the TPM supports the command, otherwise false. Thin wrapper around
+ * tpm2_get_capability_command() that discards the command attributes. */
+bool tpm2_supports_command(Tpm2Context *c, TPM2_CC command) {
+ return tpm2_get_capability_command(c, command, NULL);
+}
+
+/* Checks whether 'ecc_curve' is contained in the ECC curve list cached in the context. Returns true if it
+ * is; otherwise logs at debug level and returns false. */
+bool tpm2_supports_ecc_curve(Tpm2Context *c, TPM2_ECC_CURVE ecc_curve) {
+        assert(c);
+
+        FOREACH_ARRAY(cached, c->capability_ecc_curves, c->n_capability_ecc_curves) {
+                if (*cached != ecc_curve)
+                        continue;
+
+                return true;
+        }
+
+        log_debug("TPM does not support ECC curve 0x%" PRIx16 ".", ecc_curve);
+
+        return false;
+}
+
+/* Query the TPM for populated handles.
+ *
+ * This provides an array of handle indexes populated in the TPM, starting at the requested handle. The array will
+ * contain only populated handle addresses (which might not include the requested handle). The number of
+ * handles will be no more than the 'max' number requested. This will not search past the end of the handle
+ * range (i.e. handle & 0xff000000).
+ *
+ * Returns 0 if all populated handles in the range (starting at the requested handle) were provided (or no
+ * handles were in the range), or 1 if there are more populated handles in the range, or < 0 on any error.
+ *
+ * On success the caller owns the array returned in 'ret_handles' (ownership is passed via TAKE_PTR()); it
+ * may be NULL if no handles were found. If 'max' is 0 the TPM is not queried at all and 0 is returned. */
+static int tpm2_get_capability_handles(
+ Tpm2Context *c,
+ TPM2_HANDLE start,
+ size_t max,
+ TPM2_HANDLE **ret_handles,
+ size_t *ret_n_handles) {
+
+ _cleanup_free_ TPM2_HANDLE *handles = NULL;
+ size_t n_handles = 0;
+ TPM2_HANDLE current = start;
+ int r = 0;
+
+ assert(c);
+ assert(ret_handles);
+ assert(ret_n_handles);
+
+ /* The count is passed to tpm2_get_capability() as uint32_t below, hence clamp it first. */
+ max = MIN(max, UINT32_MAX);
+
+ /* 'max' is the number of handles we may still collect; it shrinks with every batch. */
+ while (max > 0) {
+ TPMU_CAPABILITIES capability;
+ r = tpm2_get_capability(c, TPM2_CAP_HANDLES, current, (uint32_t) max, &capability);
+ if (r < 0)
+ return r;
+
+ TPML_HANDLE handle_list = capability.handles;
+ if (handle_list.count == 0)
+ break;
+
+ assert(handle_list.count <= max);
+
+ /* Guard against overflow of the total element count before appending. */
+ if (n_handles > SIZE_MAX - handle_list.count)
+ return log_oom_debug();
+
+ if (!GREEDY_REALLOC_APPEND(handles, n_handles, handle_list.handle, handle_list.count))
+ return log_oom_debug();
+
+ max -= handle_list.count;
+
+ /* Update current to the handle index after the last handle in the list. */
+ current = handles[n_handles - 1] + 1;
+
+ if (r == 0)
+ /* No more handles in this range. */
+ break;
+ }
+
+ *ret_handles = TAKE_PTR(handles);
+ *ret_n_handles = n_handles;
+
+ /* r still holds the result of the last tpm2_get_capability() call (or 0 if none was made). */
+ return r;
+}
+
+/* Masks a handle down to its range bits (TPM2_HR_RANGE_MASK). */
+#define TPM2_HANDLE_RANGE(h) ((TPM2_HANDLE)((h) & TPM2_HR_RANGE_MASK))
+/* Shifts the range bits of a handle down by TPM2_HR_SHIFT, yielding the handle type (TPM2_HT). */
+#define TPM2_HANDLE_TYPE(h) ((TPM2_HT)(TPM2_HANDLE_RANGE(h) >> TPM2_HR_SHIFT))
+
+/* Checks whether one specific handle is populated in the TPM, by asking for a single populated handle
+ * starting at 'handle' and checking whether the one we got back is the one we asked for. Returns 1 if it
+ * is populated, 0 if it is not, and < 0 on any error. */
+static int tpm2_get_capability_handle(Tpm2Context *c, TPM2_HANDLE handle) {
+        _cleanup_free_ TPM2_HANDLE *found = NULL;
+        size_t n_found = 0;
+        int r;
+
+        r = tpm2_get_capability_handles(c, handle, /* max= */ 1, &found, &n_found);
+        if (r < 0)
+                return r;
+
+        /* Nothing populated at or after 'handle' in this range. */
+        if (n_found == 0)
+                return false;
+
+        /* The query returns the first populated handle at or after 'handle', which may be a later one. */
+        return found[0] == handle;
+}
+
+/* Asks the TPM (via TestParms) whether it supports the given public parameters for algorithm 'alg'.
+ * Returns true if the TPM accepts them, and false on any non-success response. */
+bool tpm2_test_parms(Tpm2Context *c, TPMI_ALG_PUBLIC alg, const TPMU_PUBLIC_PARMS *parms) {
+        TSS2_RC rc;
+
+        assert(c);
+        assert(parms);
+
+        TPMT_PUBLIC_PARMS parameters = {
+                .type = alg,
+                .parameters = *parms,
+        };
+
+        rc = sym_Esys_TestParms(c->esys_context, ESYS_TR_NONE, ESYS_TR_NONE, ESYS_TR_NONE, &parameters);
+        if (rc == TSS2_RC_SUCCESS)
+                return true;
+
+        /* According to the spec, unsupported parms make the TPM return "...the appropriate unmarshaling
+         * error if a parameter is not valid". The spec (currently) defines 15 such unmarshaling errors;
+         * rather than matching each of them, treat every error as "not supported" and log what we got. */
+        log_debug("TPM does not support tested parms: %s", sym_Tss2_RC_Decode(rc));
+
+        return false;
+}
+
+/* Returns true if the TPM supports the type and parameters of the given public template, as determined by
+ * tpm2_test_parms(). Only the 'type' and 'parameters' fields of 'public' are looked at. */
+static bool tpm2_supports_tpmt_public(Tpm2Context *c, const TPMT_PUBLIC *public) {
+ assert(c);
+ assert(public);
+
+ return tpm2_test_parms(c, public->type, &public->parameters);
+}
+
+/* Returns true if the TPM supports the given symmetric cipher definition. The definition is wrapped into
+ * TPMU_PUBLIC_PARMS (symDetail.sym) and tested as TPM2_ALG_SYMCIPHER via tpm2_test_parms(). */
+static bool tpm2_supports_tpmt_sym_def_object(Tpm2Context *c, const TPMT_SYM_DEF_OBJECT *parameters) {
+        assert(c);
+        assert(parameters);
+
+        return tpm2_test_parms(
+                        c,
+                        TPM2_ALG_SYMCIPHER,
+                        &(TPMU_PUBLIC_PARMS) {
+                                .symDetail.sym = *parameters,
+                        });
+}
+
+/* Returns true if the TPM supports the given symmetric definition. The check is delegated to
+ * tpm2_supports_tpmt_sym_def_object() after copying the fields over. */
+static bool tpm2_supports_tpmt_sym_def(Tpm2Context *c, const TPMT_SYM_DEF *parameters) {
+        assert(c);
+        assert(parameters);
+
+        /* TPMT_SYM_DEF and TPMT_SYM_DEF_OBJECT are, unfortunately, two separately defined types, even
+         * though they are functionally identical. Hence convert field by field. */
+        return tpm2_supports_tpmt_sym_def_object(
+                        c,
+                        &(TPMT_SYM_DEF_OBJECT) {
+                                .algorithm = parameters->algorithm,
+                                .keyBits = parameters->keyBits,
+                                .mode = parameters->mode,
+                        });
+}
+
+/* Releases a Tpm2Context and everything it owns: the cached capability arrays, the ESYS context (if
+ * set), the TCTI context memory and the TCTI library handle. Accepts NULL, in which case nothing is
+ * done. Returns NULL for a NULL argument, otherwise the result of mfree(). */
+static Tpm2Context *tpm2_context_free(Tpm2Context *c) {
+        if (!c)
+                return NULL;
+
+        /* The capability caches are plain arrays that do not depend on the library state. */
+        c->capability_ecc_curves = mfree(c->capability_ecc_curves);
+        c->capability_commands = mfree(c->capability_commands);
+        c->capability_algorithms = mfree(c->capability_algorithms);
+
+        /* Keep this order: ESYS context first, then the TCTI context memory, and the TCTI library handle
+         * last. */
+        if (c->esys_context)
+                sym_Esys_Finalize(&c->esys_context);
+
+        c->tcti_context = mfree(c->tcti_context);
+        c->tcti_dl = safe_dlclose(c->tcti_dl);
+
+        return mfree(c);
+}
+
+ /* Generates the reference counting helpers for Tpm2Context (tpm2_context_ref() is used by
+  * tpm2_handle_new() in this file), with tpm2_context_free() as the destructor. The macro itself is defined
+  * outside of this file section. */
+ DEFINE_TRIVIAL_REF_UNREF_FUNC(Tpm2Context, tpm2_context, tpm2_context_free);
+
+ /* Symmetric algorithm template (AES, 128 bit key, CFB mode). tpm2_context_new() refuses TPMs that do not
+  * support it. */
+ static const TPMT_SYM_DEF SESSION_TEMPLATE_SYM_AES_128_CFB = {
+ .algorithm = TPM2_ALG_AES,
+ .keyBits.aes = 128,
+ .mode.aes = TPM2_ALG_CFB, /* The spec requires sessions to use CFB. */
+ };
+
+ /* Allocate and fully initialize a new Tpm2Context.
+  *
+  * 'device' selects the TCTI driver and its parameter:
+  *   - "DRIVER:PARAM" loads libtss2-tcti-DRIVER.so.0 and passes PARAM to it,
+  *   - an absolute, valid path is used as parameter for the "device" driver,
+  *   - NULL falls back to $SYSTEMD_TPM2_DEVICE; if that is set but empty, tpm2-tss' own device picking logic
+  *     is used, and if it is unset "device:/dev/tpmrm0" is used.
+  *
+  * After setting up the TCTI and ESYS contexts this starts up the TPM, caches its capabilities, and verifies
+  * that AES-128-CFB is available.
+  *
+  * Returns 0 on success and stores the new context (with one reference) in *ret_context. Returns a negative
+  * errno-style error on failure, in which case *ret_context is left untouched and the partially initialized
+  * context is released again. */
+ int tpm2_context_new(const char *device, Tpm2Context **ret_context) {
+         _cleanup_(tpm2_context_unrefp) Tpm2Context *context = NULL;
+         TSS2_RC rc;
+         int r;
+
+         assert(ret_context);
+
+         context = new(Tpm2Context, 1);
+         if (!context)
+                 return log_oom_debug();
+
+         *context = (Tpm2Context) {
+                 .n_ref = 1,
+         };
+
+         r = dlopen_tpm2();
+         if (r < 0)
+                 return log_debug_errno(r, "TPM2 support not installed: %m");
+
+         if (!device) {
+                 device = secure_getenv("SYSTEMD_TPM2_DEVICE");
+                 if (device)
+                         /* Setting the env var to an empty string forces tpm2-tss' own device picking
+                          * logic to be used. */
+                         device = empty_to_null(device);
+                 else
+                         /* If nothing was specified explicitly, we'll use a hardcoded default: the "device" tcti
+                          * driver and the "/dev/tpmrm0" device. We do this since on some distributions the tpm2-abrmd
+                          * might be used and we really don't want that, since it is a system service and that creates
+                          * various ordering issues/deadlocks during early boot. */
+                         device = "device:/dev/tpmrm0";
+         }
+
+         if (device) {
+                 const char *param, *driver, *fn;
+                 const TSS2_TCTI_INFO* info;
+                 TSS2_TCTI_INFO_FUNC func;
+                 size_t sz = 0;
+
+                 param = strchr(device, ':');
+                 if (param) {
+                         /* Syntax #1: Pair of driver string and arbitrary parameter */
+                         driver = strndupa_safe(device, param - device);
+                         if (isempty(driver))
+                                 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "TPM2 driver name is empty, refusing.");
+
+                         param++;
+                 } else if (path_is_absolute(device) && path_is_valid(device)) {
+                         /* Syntax #2: TPM device node */
+                         driver = "device";
+                         param = device;
+                 } else
+                         return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid TPM2 driver string, refusing.");
+
+                 log_debug("Using TPM2 TCTI driver '%s' with device '%s'.", driver, param);
+
+                 fn = strjoina("libtss2-tcti-", driver, ".so.0");
+
+                 /* Better safe than sorry, let's refuse strings that cannot possibly be valid driver early, before going to disk. */
+                 if (!filename_is_valid(fn))
+                         return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "TPM2 driver name '%s' not valid, refusing.", driver);
+
+                 context->tcti_dl = dlopen(fn, RTLD_NOW);
+                 if (!context->tcti_dl)
+                         return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "Failed to load %s: %s", fn, dlerror());
+
+                 func = dlsym(context->tcti_dl, TSS2_TCTI_INFO_SYMBOL);
+                 if (!func)
+                         return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+                                                "Failed to find TCTI info symbol " TSS2_TCTI_INFO_SYMBOL ": %s",
+                                                dlerror());
+
+                 info = func();
+                 if (!info)
+                         return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "Unable to get TCTI info data.");
+
+                 log_debug("Loaded TCTI module '%s' (%s) [Version %" PRIu32 "]", info->name, info->description, info->version);
+
+                 /* First call with a NULL context only queries the size of the TCTI context. The result is
+                  * stored in a TSS2_RC, hence compare against TSS2_RC_SUCCESS like every other check here. */
+                 rc = info->init(NULL, &sz, NULL);
+                 if (rc != TSS2_RC_SUCCESS)
+                         return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+                                                "Failed to initialize TCTI context: %s", sym_Tss2_RC_Decode(rc));
+
+                 context->tcti_context = malloc0(sz);
+                 if (!context->tcti_context)
+                         return log_oom_debug();
+
+                 /* Second call actually initializes the context in the buffer we just allocated. */
+                 rc = info->init(context->tcti_context, &sz, param);
+                 if (rc != TSS2_RC_SUCCESS)
+                         return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+                                                "Failed to initialize TCTI context: %s", sym_Tss2_RC_Decode(rc));
+         }
+
+         /* If no device was selected, context->tcti_context is still NULL here. */
+         rc = sym_Esys_Initialize(&context->esys_context, context->tcti_context, NULL);
+         if (rc != TSS2_RC_SUCCESS)
+                 return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+                                        "Failed to initialize TPM context: %s", sym_Tss2_RC_Decode(rc));
+
+         rc = sym_Esys_Startup(context->esys_context, TPM2_SU_CLEAR);
+         if (rc == TPM2_RC_INITIALIZE)
+                 log_debug("TPM already started up.");
+         else if (rc == TSS2_RC_SUCCESS)
+                 log_debug("TPM successfully started up.");
+         else
+                 return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+                                        "Failed to start up TPM: %s", sym_Tss2_RC_Decode(rc));
+
+         r = tpm2_cache_capabilities(context);
+         if (r < 0)
+                 return log_debug_errno(r, "Failed to cache TPM capabilities: %m");
+
+         /* We require AES and CFB support for session encryption. */
+         if (!tpm2_supports_alg(context, TPM2_ALG_AES))
+                 return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "TPM does not support AES.");
+
+         if (!tpm2_supports_alg(context, TPM2_ALG_CFB))
+                 return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "TPM does not support CFB.");
+
+         if (!tpm2_supports_tpmt_sym_def(context, &SESSION_TEMPLATE_SYM_AES_128_CFB))
+                 return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "TPM does not support AES-128-CFB.");
+
+         *ret_context = TAKE_PTR(context);
+
+         return 0;
+ }
+
+ /* Release the ESYS handle 'esys_handle' of 'esys_context'. If 'flush' is true the handle is flushed via
+  * Esys_FlushContext(); otherwise nothing is done at the moment (see the FIXME below). Does nothing if the
+  * context is NULL or the handle is ESYS_TR_NONE. Failures are only logged at debug level. */
+ static void tpm2_handle_cleanup(ESYS_CONTEXT *esys_context, ESYS_TR esys_handle, bool flush) {
+ TSS2_RC rc;
+
+ if (!esys_context || esys_handle == ESYS_TR_NONE)
+ return;
+
+ /* Closing the handle removes its reference from the esys_context, but leaves the corresponding
+ * handle in the actual TPM. Flushing the handle removes its reference from the esys_context as well
+ * as removing its corresponding handle from the actual TPM. */
+ if (flush)
+ rc = sym_Esys_FlushContext(esys_context, esys_handle);
+ else
+ /* We can't use Esys_TR_Close() because the tpm2-tss library does not use reference counting
+ * for handles, and a single Esys_TR_Close() will remove the handle (internal to the tpm2-tss
+ * library) that might be in use by other code that is using the same ESYS_CONTEXT. This
+ * directly affects us; for example the src/test/test-tpm2.c test function
+ * check_seal_unseal() will encounter this issue and will result in a failure when trying to
+ * cleanup (i.e. Esys_FlushContext) the transient primary key that the test function
+ * generates. However, not calling Esys_TR_Close() here should be ok, since any leaked handle
+ * references will be cleaned up when we free our ESYS_CONTEXT.
+ *
+ * An upstream bug is open here: https://github.com/tpm2-software/tpm2-tss/issues/2693 */
+ rc = TSS2_RC_SUCCESS; // FIXME: restore sym_Esys_TR_Close() use once tpm2-tss is fixed and adopted widely enough
+ if (rc != TSS2_RC_SUCCESS)
+ /* We ignore failures here (besides debug logging), since this is called in error paths,
+ * where we cannot do anything about failures anymore. And when it is called in successful
+ * codepaths by this time we already did what we wanted to do, and got the results we wanted
+ * so there's no reason to make this fail more loudly than necessary. */
+ log_debug("Failed to %s TPM handle, ignoring: %s", flush ? "flush" : "close", sym_Tss2_RC_Decode(rc));
+ }
+
+ /* Release a Tpm2Handle: clean up its ESYS handle (flushing it if handle->flush is set), free the object,
+  * and drop the context reference it holds. Accepts NULL. Always returns NULL. */
+ Tpm2Handle *tpm2_handle_free(Tpm2Handle *handle) {
+ if (!handle)
+ return NULL;
+
+ /* Take over the handle's context reference; the cleanup attribute drops it when this function returns,
+ * i.e. only after the ESYS handle was cleaned up below. */
+ _cleanup_(tpm2_context_unrefp) Tpm2Context *context = (Tpm2Context*)handle->tpm2_context;
+ if (context)
+ tpm2_handle_cleanup(context->esys_context, handle->esys_handle, handle->flush);
+
+ return mfree(handle);
+ }
+
+ /* Allocate a new, empty Tpm2Handle that holds a reference on 'context'. The ESYS handle starts out as
+  * ESYS_TR_NONE and the handle is marked to be flushed on release. Returns 0 and stores the object in
+  * *ret_handle on success, or a negative error if memory allocation fails. */
+ int tpm2_handle_new(Tpm2Context *context, Tpm2Handle **ret_handle) {
+         Tpm2Handle *h;
+
+         assert(ret_handle);
+
+         h = new(Tpm2Handle, 1);
+         if (!h)
+                 return log_oom_debug();
+
+         *h = (Tpm2Handle) {
+                 .tpm2_context = tpm2_context_ref(context),
+                 .esys_handle = ESYS_TR_NONE,
+                 .flush = true,
+         };
+
+         /* Nothing can fail past this point, hence hand the object over directly. */
+         *ret_handle = h;
+         return 0;
+ }
+
+ /* Read the public area, name and qualified name of 'handle' via Esys_ReadPublic(), optionally using
+  * 'session'. The ret_* pointers are passed straight through to Esys_ReadPublic(). Returns 0 on success,
+  * -ENOTRECOVERABLE on failure. */
+ static int tpm2_read_public(
+                 Tpm2Context *c,
+                 const Tpm2Handle *session,
+                 const Tpm2Handle *handle,
+                 TPM2B_PUBLIC **ret_public,
+                 TPM2B_NAME **ret_name,
+                 TPM2B_NAME **ret_qname) {
+
+         assert(c);
+         assert(handle);
+
+         ESYS_TR session_tr = session ? session->esys_handle : ESYS_TR_NONE;
+
+         TSS2_RC rc = sym_Esys_ReadPublic(
+                         c->esys_context,
+                         handle->esys_handle,
+                         session_tr,
+                         ESYS_TR_NONE,
+                         ESYS_TR_NONE,
+                         ret_public,
+                         ret_name,
+                         ret_qname);
+         if (rc == TSS2_RC_SUCCESS)
+                 return 0;
+
+         return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+                                "Failed to read public info: %s", sym_Tss2_RC_Decode(rc));
+ }
+
+ /* Create a Tpm2Handle object that references a pre-existing handle in the TPM, at the handle index provided.
+ * This should be used only for persistent, transient, or NV handles; and the handle must already exist in
+ * the TPM at the specified handle index. The handle index should not be 0. Returns 1 if found, 0 if the
+ * index is empty, or < 0 on error. Also see tpm2_get_srk() below; the SRK is a commonly used persistent
+ * Tpm2Handle.
+ *
+ * All ret_* parameters are optional. When 0 is returned, each provided ret_* pointer is set to NULL. When 1
+ * is returned and ret_handle is NULL, the temporary Tpm2Handle is released again before returning. */
+ int tpm2_index_to_handle(
+ Tpm2Context *c,
+ TPM2_HANDLE index,
+ const Tpm2Handle *session,
+ TPM2B_PUBLIC **ret_public,
+ TPM2B_NAME **ret_name,
+ TPM2B_NAME **ret_qname,
+ Tpm2Handle **ret_handle) {
+
+ TSS2_RC rc;
+ int r;
+
+ assert(c);
+
+ /* Only allow persistent, transient, or NV index handle types. */
+ switch (TPM2_HANDLE_TYPE(index)) {
+ case TPM2_HT_PERSISTENT:
+ case TPM2_HT_NV_INDEX:
+ case TPM2_HT_TRANSIENT:
+ break;
+ case TPM2_HT_PCR:
+ /* PCR handles are referenced by their actual index number and do not need a Tpm2Handle */
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+ "Invalid handle 0x%08" PRIx32 " (in PCR range).", index);
+ case TPM2_HT_HMAC_SESSION:
+ case TPM2_HT_POLICY_SESSION:
+ /* Session indexes are only used internally by tpm2-tss (or lower code) */
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+ "Invalid handle 0x%08" PRIx32 " (in session range).", index);
+ case TPM2_HT_PERMANENT:
+ /* Permanent handles are defined, e.g. ESYS_TR_RH_OWNER. */
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+ "Invalid handle 0x%08" PRIx32 " (in permanent range).", index);
+ default:
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+ "Invalid handle 0x%08" PRIx32 " (in unknown range).", index);
+ }
+
+ /* For transient handles, the kernel tpm "resource manager" (i.e. /dev/tpmrm0) performs mapping
+ * which breaks GetCapability requests, so only check GetCapability if it's not a transient handle.
+ * https://bugzilla.kernel.org/show_bug.cgi?id=218009 */
+ if (TPM2_HANDLE_TYPE(index) != TPM2_HT_TRANSIENT) { // FIXME: once kernel bug is fixed, check transient handles too
+ r = tpm2_get_capability_handle(c, index);
+ if (r < 0)
+ return r;
+ if (r == 0) {
+ /* Not an error: report "not found" and make sure no output is left uninitialized. */
+ log_debug("TPM handle 0x%08" PRIx32 " not populated.", index);
+ if (ret_public)
+ *ret_public = NULL;
+ if (ret_name)
+ *ret_name = NULL;
+ if (ret_qname)
+ *ret_qname = NULL;
+ if (ret_handle)
+ *ret_handle = NULL;
+ return 0;
+ }
+ }
+
+ _cleanup_(tpm2_handle_freep) Tpm2Handle *handle = NULL;
+ r = tpm2_handle_new(c, &handle);
+ if (r < 0)
+ return r;
+
+ /* Since we didn't create this handle in the TPM (this is only creating an ESYS_TR handle for the
+ * pre-existing TPM handle), we shouldn't flush (or evict) it on cleanup. */
+ handle->flush = false;
+
+ rc = sym_Esys_TR_FromTPMPublic(
+ c->esys_context,
+ index,
+ session ? session->esys_handle : ESYS_TR_NONE,
+ ESYS_TR_NONE,
+ ESYS_TR_NONE,
+ &handle->esys_handle);
+ if (rc != TSS2_RC_SUCCESS)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+ "Failed to read public info: %s", sym_Tss2_RC_Decode(rc));
+
+ /* Only issue the additional ReadPublic request if the caller asked for any of its outputs. */
+ if (ret_public || ret_name || ret_qname) {
+ r = tpm2_read_public(c, session, handle, ret_public, ret_name, ret_qname);
+ if (r < 0)
+ return r;
+ }
+
+ if (ret_handle)
+ *ret_handle = TAKE_PTR(handle);
+
+ return 1;
+ }
+
+ /* Look up the TPM handle index that the given Tpm2Handle refers to and store it in *ret_index. Returns 0 on
+  * success, -EOPNOTSUPP if the loaded libtss2-esys lacks Esys_TR_GetTpmHandle(), or -ENOTRECOVERABLE if the
+  * lookup itself fails. */
+ int tpm2_index_from_handle(Tpm2Context *c, const Tpm2Handle *handle, TPM2_HANDLE *ret_index) {
+         assert(c);
+         assert(handle);
+         assert(ret_index);
+
+         /* Esys_TR_GetTpmHandle was added to tpm2-tss in version 2.4.0. Once we can set a minimum tpm2-tss
+          * version of 2.4.0 this check can be removed. */
+         if (!sym_Esys_TR_GetTpmHandle)
+                 return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                        "libtss2-esys too old, does not include Esys_TR_GetTpmHandle.");
+
+         TSS2_RC rc = sym_Esys_TR_GetTpmHandle(c->esys_context, handle->esys_handle, ret_index);
+         if (rc == TSS2_RC_SUCCESS)
+                 return 0;
+
+         return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+                                "Failed to get handle index: %s", sym_Tss2_RC_Decode(rc));
+ }
+
+ /* Copy an object in the TPM at a transient handle to a persistent handle.
+ *
+ * The provided transient handle must exist in the TPM in the transient range. The persistent handle may be 0
+ * or any handle in the persistent range. If 0, this will try each handle in the persistent range, in
+ * ascending order, until an available one is found. If non-zero, only the requested persistent handle will
+ * be used.
+ *
+ * Note that the persistent handle parameter is a handle index (i.e. number), while the transient handle is
+ * a Tpm2Handle object. The returned persistent handle will be a Tpm2Handle object that is located in the TPM
+ * at the requested persistent handle index (or the first available if none was requested).
+ *
+ * Returns 1 if the object was successfully persisted, or 0 if there is already a key at the requested
+ * handle, or < 0 on error. Theoretically, this would also return 0 if no specific persistent handle is
+ * requested but all persistent handles are used, but it is extremely unlikely the TPM has enough internal
+ * memory to store the entire persistent range, in which case an error will be returned if the TPM is out of
+ * memory for persistent storage. The persistent handle is only provided when returning 1; when returning 0
+ * *ret_persistent_handle (if provided) is set to NULL. */
+ static int tpm2_persist_handle(
+ Tpm2Context *c,
+ const Tpm2Handle *transient_handle,
+ const Tpm2Handle *session,
+ TPMI_DH_PERSISTENT persistent_handle_index,
+ Tpm2Handle **ret_persistent_handle) {
+
+ /* We don't use TPM2_PERSISTENT_FIRST and TPM2_PERSISTENT_LAST here due to:
+ * https://github.com/systemd/systemd/pull/27713#issuecomment-1591864753 */
+ TPMI_DH_PERSISTENT first = UINT32_C(0x81000000), last = UINT32_C(0x81ffffff);
+ TSS2_RC rc;
+ int r;
+
+ assert(c);
+ assert(transient_handle);
+
+ /* If persistent handle index specified, only try that. */
+ if (persistent_handle_index != 0) {
+ if (TPM2_HANDLE_TYPE(persistent_handle_index) != TPM2_HT_PERSISTENT)
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+ "Handle not in persistent range: 0x%x", persistent_handle_index);
+
+ first = last = persistent_handle_index;
+ }
+
+ for (TPMI_DH_PERSISTENT requested = first; requested <= last; requested++) {
+ /* A fresh handle object per attempt; it is released automatically if the attempt fails. */
+ _cleanup_(tpm2_handle_freep) Tpm2Handle *persistent_handle = NULL;
+ r = tpm2_handle_new(c, &persistent_handle);
+ if (r < 0)
+ return r;
+
+ /* Since this is a persistent handle, don't flush it. */
+ persistent_handle->flush = false;
+
+ rc = sym_Esys_EvictControl(
+ c->esys_context,
+ ESYS_TR_RH_OWNER,
+ transient_handle->esys_handle,
+ session ? session->esys_handle : ESYS_TR_PASSWORD,
+ ESYS_TR_NONE,
+ ESYS_TR_NONE,
+ requested,
+ &persistent_handle->esys_handle);
+ if (rc == TSS2_RC_SUCCESS) {
+ if (ret_persistent_handle)
+ *ret_persistent_handle = TAKE_PTR(persistent_handle);
+
+ return 1;
+ }
+ /* TPM2_RC_NV_DEFINED is the only failure that lets us move on to the next index; anything else
+ * is fatal. */
+ if (rc != TPM2_RC_NV_DEFINED)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+ "Failed to persist handle: %s", sym_Tss2_RC_Decode(rc));
+ }
+
+ if (ret_persistent_handle)
+ *ret_persistent_handle = NULL;
+
+ return 0;
+ }
+
+ /* Flag file whose existence tells us that TPM entropy was already fed into the kernel pool. */
+ #define TPM2_CREDIT_RANDOM_FLAG_PATH "/run/systemd/tpm-rng-credited"
+
+ /* Pull random_pool_size() bytes of entropy from the TPM, in chunks of at most 32 bytes, and write them into
+  * the kernel random pool without crediting them. This is done only once: if the flag file above exists the
+  * function returns immediately, and after a successful run the flag file is created (best effort).
+  *
+  * Returns 0 on success (including "already done"), or a negative errno-style error if the TPM does not
+  * deliver entropy or the entropy cannot be written to the kernel. */
+ static int tpm2_credit_random(Tpm2Context *c) {
+         size_t rps, done = 0;
+         TSS2_RC rc;
+         usec_t t;
+         int r;
+
+         assert(c);
+
+         /* Pulls some entropy from the TPM and adds it into the kernel RNG pool. That way we can say that the
+          * key we will ultimately generate with the kernel random pool is at least as good as the TPM's RNG,
+          * but likely better. Note that we don't trust the TPM RNG very much, hence do not actually credit
+          * any entropy. */
+
+         if (access(TPM2_CREDIT_RANDOM_FLAG_PATH, F_OK) < 0) {
+                 /* A missing flag file is the expected "not done yet" case; other errors are only logged. */
+                 if (errno != ENOENT)
+                         log_debug_errno(errno, "Failed to detect if '" TPM2_CREDIT_RANDOM_FLAG_PATH "' exists, ignoring: %m");
+         } else {
+                 log_debug("Not adding TPM2 entropy to the kernel random pool again.");
+                 return 0; /* Already done */
+         }
+
+         t = now(CLOCK_MONOTONIC);
+
+         for (rps = random_pool_size(); rps > 0;) {
+                 _cleanup_(Esys_Freep) TPM2B_DIGEST *buffer = NULL;
+
+                 rc = sym_Esys_GetRandom(
+                                 c->esys_context,
+                                 ESYS_TR_NONE,
+                                 ESYS_TR_NONE,
+                                 ESYS_TR_NONE,
+                                 MIN(rps, 32U), /* 32 is supposedly a safe choice, given that AES 256bit keys are this long, and TPM2 baseline requires support for those. */
+                                 &buffer);
+                 if (rc != TSS2_RC_SUCCESS)
+                         return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+                                                "Failed to acquire entropy from TPM: %s", sym_Tss2_RC_Decode(rc));
+
+                 /* Refuse empty replies, otherwise this loop would never make progress. */
+                 if (buffer->size == 0)
+                         return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+                                                "Zero-sized entropy returned from TPM.");
+
+                 r = random_write_entropy(-1, buffer->buffer, buffer->size, /* credit= */ false);
+                 if (r < 0)
+                         return log_debug_errno(r, "Failed to write entropy to kernel: %m");
+
+                 done += buffer->size;
+                 rps = LESS_BY(rps, buffer->size);
+         }
+
+         log_debug("Added %zu bytes of TPM2 entropy to the kernel random pool in %s.", done, FORMAT_TIMESPAN(now(CLOCK_MONOTONIC) - t, 0));
+
+         r = touch(TPM2_CREDIT_RANDOM_FLAG_PATH);
+         if (r < 0)
+                 log_debug_errno(r, "Failed to touch '" TPM2_CREDIT_RANDOM_FLAG_PATH "', ignoring: %m");
+
+         return 0;
+ }
+
+ /* Return one of the two legacy primary key templates in *ret_template.
+  *
+  * These templates exist only for older sealed data that was not sealed against the SRK: back then a
+  * transient key was created for sealing and flushed afterwards, and unsealing requires recreating the very
+  * same key from the very same template. 'alg' must be TPM2_ALG_ECC or TPM2_ALG_RSA, anything else yields
+  * -EOPNOTSUPP. Whether the TPM actually supports 'alg' is not checked here. */
+ static int tpm2_get_legacy_template(TPMI_ALG_PUBLIC alg, TPMT_PUBLIC *ret_template) {
+         /* Do not modify. */
+         static const TPMT_PUBLIC legacy_ecc = {
+                 .type = TPM2_ALG_ECC,
+                 .nameAlg = TPM2_ALG_SHA256,
+                 .objectAttributes =
+                         TPMA_OBJECT_RESTRICTED|
+                         TPMA_OBJECT_DECRYPT|
+                         TPMA_OBJECT_FIXEDTPM|
+                         TPMA_OBJECT_FIXEDPARENT|
+                         TPMA_OBJECT_SENSITIVEDATAORIGIN|
+                         TPMA_OBJECT_USERWITHAUTH,
+                 .parameters.eccDetail = {
+                         .symmetric = {
+                                 .algorithm = TPM2_ALG_AES,
+                                 .keyBits.aes = 128,
+                                 .mode.aes = TPM2_ALG_CFB,
+                         },
+                         .scheme.scheme = TPM2_ALG_NULL,
+                         .curveID = TPM2_ECC_NIST_P256,
+                         .kdf.scheme = TPM2_ALG_NULL,
+                 },
+         };
+
+         /* Do not modify. */
+         static const TPMT_PUBLIC legacy_rsa = {
+                 .type = TPM2_ALG_RSA,
+                 .nameAlg = TPM2_ALG_SHA256,
+                 .objectAttributes =
+                         TPMA_OBJECT_RESTRICTED|
+                         TPMA_OBJECT_DECRYPT|
+                         TPMA_OBJECT_FIXEDTPM|
+                         TPMA_OBJECT_FIXEDPARENT|
+                         TPMA_OBJECT_SENSITIVEDATAORIGIN|
+                         TPMA_OBJECT_USERWITHAUTH,
+                 .parameters.rsaDetail = {
+                         .symmetric = {
+                                 .algorithm = TPM2_ALG_AES,
+                                 .keyBits.aes = 128,
+                                 .mode.aes = TPM2_ALG_CFB,
+                         },
+                         .scheme.scheme = TPM2_ALG_NULL,
+                         .keyBits = 2048,
+                 },
+         };
+
+         assert(ret_template);
+
+         switch (alg) {
+         case TPM2_ALG_ECC:
+                 *ret_template = legacy_ecc;
+                 return 0;
+         case TPM2_ALG_RSA:
+                 *ret_template = legacy_rsa;
+                 return 0;
+         default:
+                 return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                        "Unsupported legacy SRK alg: 0x%x", alg);
+         }
+ }
+
+ /* Return the Storage Root Key (SRK) template for 'alg' in *ret_template.
+  *
+  * The values follow the "TCG TPM v2.0 Provisioning Guidance" document, section 7.5.1 "Storage Primary Key
+  * (SRK) Templates", which in turn references the "TCG EK Credential Profile for TPM Family 2.0". Version
+  * 2.0 of the EK Credential Profile has exactly one template each for RSA and ECC; later versions add more
+  * and keep the original ones under the names "L-1" (RSA) and "L-2" (ECC).
+  *
+  * https://trustedcomputinggroup.org/resource/tcg-tpm-v2-0-provisioning-guidance
+  * https://trustedcomputinggroup.org/resource/http-trustedcomputinggroup-org-wp-content-uploads-tcg-ek-credential-profile
+  *
+  * A template is only needed when creating a new persistent SRK (or a new transient key that is
+  * SRK-compatible). Preferably the TPM already carries a shared SRK at the reserved shared SRK handle (see
+  * TPM2_SRK_HANDLE in tpm2-util.h, and tpm2_get_srk() below).
+  *
+  * Returns 0 if 'alg' is ECC or RSA, otherwise -EOPNOTSUPP. */
+ int tpm2_get_srk_template(TPMI_ALG_PUBLIC alg, TPMT_PUBLIC *ret_template) {
+         /* Attributes shared by the ECC and RSA templates, with the changes the Provisioning Guidance
+          * document asks for:
+          *   TPMA_OBJECT_USERWITHAUTH is added.
+          *   TPMA_OBJECT_ADMINWITHPOLICY is removed.
+          *   TPMA_OBJECT_NODA is added. */
+         const TPMA_OBJECT attributes =
+                 TPMA_OBJECT_DECRYPT |
+                 TPMA_OBJECT_FIXEDPARENT |
+                 TPMA_OBJECT_FIXEDTPM |
+                 TPMA_OBJECT_NODA |
+                 TPMA_OBJECT_RESTRICTED |
+                 TPMA_OBJECT_SENSITIVEDATAORIGIN |
+                 TPMA_OBJECT_USERWITHAUTH;
+
+         /* Symmetric configuration shared by the ECC and RSA templates. */
+         const TPMT_SYM_DEF_OBJECT symmetric = {
+                 .algorithm = TPM2_ALG_AES,
+                 .keyBits.aes = 128,
+                 .mode.aes = TPM2_ALG_CFB,
+         };
+
+         assert(ret_template);
+
+         /* Both templates leave authPolicy empty, as specified by the Provisioning Guidance document. */
+
+         if (alg == TPM2_ALG_ECC) {
+                 /* From the EK Credential Profile template "L-2". */
+                 *ret_template = (TPMT_PUBLIC) {
+                         .type = TPM2_ALG_ECC,
+                         .nameAlg = TPM2_ALG_SHA256,
+                         .objectAttributes = attributes,
+                         .parameters.eccDetail = {
+                                 .symmetric = symmetric,
+                                 .scheme.scheme = TPM2_ALG_NULL,
+                                 .curveID = TPM2_ECC_NIST_P256,
+                                 .kdf.scheme = TPM2_ALG_NULL,
+                         },
+                 };
+                 return 0;
+         }
+
+         if (alg == TPM2_ALG_RSA) {
+                 /* From the EK Credential Profile template "L-1". */
+                 *ret_template = (TPMT_PUBLIC) {
+                         .type = TPM2_ALG_RSA,
+                         .nameAlg = TPM2_ALG_SHA256,
+                         .objectAttributes = attributes,
+                         .parameters.rsaDetail = {
+                                 .symmetric = symmetric,
+                                 .scheme.scheme = TPM2_ALG_NULL,
+                                 .keyBits = 2048,
+                         },
+                 };
+                 return 0;
+         }
+
+         return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "No SRK for algorithm 0x%" PRIx16, alg);
+ }
+
+ /* Pick the best SRK template this TPM supports and store it in *ret_template. The ECC template is tried
+  * first, the RSA template second. Returns 0 on success, -EOPNOTSUPP if the TPM supports neither. */
+ int tpm2_get_best_srk_template(Tpm2Context *c, TPMT_PUBLIC *ret_template) {
+         TPMT_PUBLIC candidate;
+         int r;
+
+         assert(c);
+         assert(ret_template);
+
+         /* First choice: ECC. The algorithm, the curve and the template as a whole must all be supported. */
+         r = tpm2_get_srk_template(TPM2_ALG_ECC, &candidate);
+         if (r < 0)
+                 return r;
+
+         if (tpm2_supports_alg(c, TPM2_ALG_ECC)) {
+                 if (tpm2_supports_ecc_curve(c, candidate.parameters.eccDetail.curveID)) {
+                         if (tpm2_supports_tpmt_public(c, &candidate)) {
+                                 *ret_template = candidate;
+                                 return 0;
+                         }
+
+                         log_debug("TPM does not support SRK ECC template L-2.");
+                 } else
+                         log_debug("TPM does not support ECC-NIST-P256 curve.");
+         } else
+                 log_debug("TPM does not support ECC.");
+
+         /* Second choice: RSA. */
+         r = tpm2_get_srk_template(TPM2_ALG_RSA, &candidate);
+         if (r < 0)
+                 return r;
+
+         if (tpm2_supports_alg(c, TPM2_ALG_RSA)) {
+                 if (tpm2_supports_tpmt_public(c, &candidate)) {
+                         *ret_template = candidate;
+                         return 0;
+                 }
+
+                 log_debug("TPM does not support SRK RSA template L-1.");
+         } else
+                 log_debug("TPM does not support RSA.");
+
+         return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                "TPM does not support either SRK template L-1 (RSA) or L-2 (ECC).");
+ }
+
+ /* Look up the SRK at the TPM2_SRK_HANDLE index. Return values and outputs are exactly those of
+  * tpm2_index_to_handle(): 1 if the SRK is found, 0 if there is no SRK, < 0 on error. Also see
+  * tpm2_get_or_create_srk() below. */
+ int tpm2_get_srk(
+                 Tpm2Context *c,
+                 const Tpm2Handle *session,
+                 TPM2B_PUBLIC **ret_public,
+                 TPM2B_NAME **ret_name,
+                 TPM2B_NAME **ret_qname,
+                 Tpm2Handle **ret_handle) {
+
+         const TPM2_HANDLE srk_index = TPM2_SRK_HANDLE;
+
+         return tpm2_index_to_handle(c, srk_index, session, ret_public, ret_name, ret_qname, ret_handle);
+ }
+
+ /* Get the SRK, creating one if needed. Returns 1 if a new SRK was created and persisted, 0 if an SRK already
+ * exists, or < 0 on error. Note that this is the inverse of tpm2_get_srk(), which returns 1 for an existing
+ * SRK. The ret_* outputs are filled in by tpm2_get_srk(). */
+ int tpm2_get_or_create_srk(
+ Tpm2Context *c,
+ const Tpm2Handle *session,
+ TPM2B_PUBLIC **ret_public,
+ TPM2B_NAME **ret_name,
+ TPM2B_NAME **ret_qname,
+ Tpm2Handle **ret_handle) {
+
+ int r;
+
+ r = tpm2_get_srk(c, session, ret_public, ret_name, ret_qname, ret_handle);
+ if (r < 0)
+ return r;
+ if (r == 1)
+ return 0; /* 0 → SRK already set up */
+
+ /* No SRK, create and persist one */
+ TPM2B_PUBLIC template = {
+ .size = sizeof(TPMT_PUBLIC),
+ };
+ r = tpm2_get_best_srk_template(c, &template.publicArea);
+ if (r < 0)
+ return log_debug_errno(r, "Could not get best SRK template: %m");
+
+ /* The transient key is released automatically when this function returns. */
+ _cleanup_(tpm2_handle_freep) Tpm2Handle *transient_handle = NULL;
+ r = tpm2_create_primary(
+ c,
+ session,
+ &template,
+ /* sensitive= */ NULL,
+ /* ret_public= */ NULL,
+ &transient_handle);
+ if (r < 0)
+ return r;
+
+ /* Try to persist the transient SRK we created. No locking needed; if multiple threads are trying to
+ * persist SRKs concurrently, only one will succeed (r == 1) while the rest will fail (r == 0). In
+ * either case, all threads will get the persistent SRK below. */
+ r = tpm2_persist_handle(c, transient_handle, session, TPM2_SRK_HANDLE, /* ret_persistent_handle= */ NULL);
+ if (r < 0)
+ return r;
+
+ /* The SRK should exist now. */
+ r = tpm2_get_srk(c, session, ret_public, ret_name, ret_qname, ret_handle);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ /* This should never happen. */
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "SRK we just persisted couldn't be found.");
+
+ return 1; /* > 0 → SRK newly set up */
+ }
+
+/* Utility functions for TPMS_PCR_SELECTION. */
+
+ /* Flatten the selection bytes of a TPMS_PCR_SELECTION into one 32 bit mask: byte 0 supplies bits 0…7,
+  * byte 1 supplies bits 8…15, and so on for the first 'sizeofSelect' bytes. */
+ uint32_t tpm2_tpms_pcr_selection_to_mask(const TPMS_PCR_SELECTION *s) {
+         uint32_t bits = 0;
+
+         assert(s);
+         assert(s->sizeofSelect <= sizeof(s->pcrSelect));
+
+         for (unsigned byte = 0; byte < s->sizeofSelect; byte++)
+                 bits |= (uint32_t) s->pcrSelect[byte] << (byte * 8);
+
+         return bits;
+ }
+
+ /* Fill *ret with a TPMS_PCR_SELECTION for 'hash_alg' that selects the PCRs set in the low 24 bits of
+  * 'mask'. Bits outside the valid PCR mask are logged and dropped. */
+ void tpm2_tpms_pcr_selection_from_mask(uint32_t mask, TPMI_ALG_HASH hash_alg, TPMS_PCR_SELECTION *ret) {
+         assert(ret);
+
+         /* Only three selection bytes (24 PCRs) are populated below. */
+         if (!TPM2_PCR_MASK_VALID(mask))
+                 log_debug("PCR mask selections (%x) out of range, ignoring.",
+                           mask & ~((uint32_t)TPM2_PCRS_MASK));
+
+         TPMS_PCR_SELECTION selection = {
+                 .hash = hash_alg,
+                 .sizeofSelect = TPM2_PCRS_MAX / 8,
+                 .pcrSelect = {
+                         mask & 0xff,
+                         (mask >> 8) & 0xff,
+                         (mask >> 16) & 0xff,
+                 },
+         };
+
+         *ret = selection;
+ }
+
+ /* Returns true if every PCR bit set in 'mask' is also selected in 's'. */
+ bool tpm2_tpms_pcr_selection_has_mask(const TPMS_PCR_SELECTION *s, uint32_t mask) {
+         assert(s);
+
+         uint32_t selected = tpm2_tpms_pcr_selection_to_mask(s);
+
+         /* No requested bit may be missing from the selection. */
+         return (~selected & mask) == 0;
+ }
+
+ /* Set (b == true) or clear (b == false) the PCR bits of 'mask' in 's', keeping the hash algorithm. */
+ static void tpm2_tpms_pcr_selection_update_mask(TPMS_PCR_SELECTION *s, uint32_t mask, bool b) {
+         assert(s);
+
+         uint32_t current = tpm2_tpms_pcr_selection_to_mask(s);
+         uint32_t updated = b ? (current | mask) : (current & ~mask);
+
+         tpm2_tpms_pcr_selection_from_mask(updated, s->hash, s);
+ }
+
+ /* Select every PCR whose bit is set in 'mask', in addition to what 's' already selects. */
+ void tpm2_tpms_pcr_selection_add_mask(TPMS_PCR_SELECTION *s, uint32_t mask) {
+         tpm2_tpms_pcr_selection_update_mask(s, mask, /* b= */ true);
+ }
+
+ /* Deselect every PCR whose bit is set in 'mask'. */
+ void tpm2_tpms_pcr_selection_sub_mask(TPMS_PCR_SELECTION *s, uint32_t mask) {
+         tpm2_tpms_pcr_selection_update_mask(s, mask, /* b= */ false);
+ }
+
+ /* Merge the PCR selections of 'b' into 'a'. Both must use the same hash algorithm. */
+ void tpm2_tpms_pcr_selection_add(TPMS_PCR_SELECTION *a, const TPMS_PCR_SELECTION *b) {
+         assert(a);
+         assert(b);
+         assert(a->hash == b->hash);
+
+         uint32_t to_add = tpm2_tpms_pcr_selection_to_mask(b);
+
+         tpm2_tpms_pcr_selection_add_mask(a, to_add);
+ }
+
+ /* Drop from 'a' every PCR selection that is present in 'b'. Both must use the same hash algorithm. */
+ void tpm2_tpms_pcr_selection_sub(TPMS_PCR_SELECTION *a, const TPMS_PCR_SELECTION *b) {
+         assert(a);
+         assert(b);
+         assert(a->hash == b->hash);
+
+         uint32_t to_remove = tpm2_tpms_pcr_selection_to_mask(b);
+
+         tpm2_tpms_pcr_selection_sub_mask(a, to_remove);
+ }
+
+ /* Transfer the PCR selections of 'b' into 'a' and leave 'b' empty (its hash algorithm is kept). Both must
+  * use the same hash algorithm. Does nothing if 'a' and 'b' are the same object. */
+ void tpm2_tpms_pcr_selection_move(TPMS_PCR_SELECTION *a, TPMS_PCR_SELECTION *b) {
+         if (a != b) {
+                 tpm2_tpms_pcr_selection_add(a, b);
+                 tpm2_tpms_pcr_selection_from_mask(0, b->hash, b);
+         }
+ }
+
+ /* Iterate 'tpms' over the first 'count' entries of the pcrSelections array of 'tpml'. The outer
+  * single-pass for loop in the helper below evaluates 'tpml' once, into a uniquely named local. */
+ #define FOREACH_TPMS_PCR_SELECTION_IN_TPML_PCR_SELECTION(tpms, tpml) \
+ _FOREACH_TPMS_PCR_SELECTION_IN_TPML_PCR_SELECTION(tpms, tpml, UNIQ_T(l, UNIQ))
+ #define _FOREACH_TPMS_PCR_SELECTION_IN_TPML_PCR_SELECTION(tpms, tpml, l) \
+ for (typeof(tpml) (l) = (tpml); (l); (l) = NULL) \
+ FOREACH_ARRAY(tpms, (l)->pcrSelections, (l)->count)
+
+ /* Iterate 'pcr' over the PCRs selected in the TPMS_PCR_SELECTION 'tpms', by way of its mask. */
+ #define FOREACH_PCR_IN_TPMS_PCR_SELECTION(pcr, tpms) \
+ FOREACH_PCR_IN_MASK(pcr, tpm2_tpms_pcr_selection_to_mask(tpms))
+
+ /* Nested iteration: every entry 'tpms' of 'tpml', and within each entry every selected 'pcr'. */
+ #define FOREACH_PCR_IN_TPML_PCR_SELECTION(pcr, tpms, tpml) \
+ FOREACH_TPMS_PCR_SELECTION_IN_TPML_PCR_SELECTION(tpms, tpml) \
+ FOREACH_PCR_IN_TPMS_PCR_SELECTION(pcr, tpms)
+
+ /* Format a TPMS_PCR_SELECTION as "<hash alg>(<pcr mask string>)". Returns a newly allocated string, or
+  * NULL if the mask string cannot be produced or strjoin() fails. */
+ char *tpm2_tpms_pcr_selection_to_string(const TPMS_PCR_SELECTION *s) {
+         assert(s);
+
+         const char *hash_name = strna(tpm2_hash_alg_to_string(s->hash));
+         _cleanup_free_ char *pcrs = tpm2_pcr_mask_to_string(tpm2_tpms_pcr_selection_to_mask(s));
+
+         return pcrs ? strjoin(hash_name, "(", pcrs, ")") : NULL;
+ }
+
+ /* Returns the number of PCRs selected in 's', i.e. the number of bits set in its mask. */
+ size_t tpm2_tpms_pcr_selection_weight(const TPMS_PCR_SELECTION *s) {
+         assert(s);
+
+         uint32_t selected = tpm2_tpms_pcr_selection_to_mask(s);
+
+         return popcount(selected);
+ }
+
+/* Utility functions for TPML_PCR_SELECTION. */
+
+ /* Delete entry number 'index' (0-based) from 'l': all later entries move down by one slot and the count
+  * shrinks by one. */
+ static void tpm2_tpml_pcr_selection_remove_index(TPML_PCR_SELECTION *l, uint32_t index) {
+         assert(l);
+         assert(l->count <= ELEMENTSOF(l->pcrSelections));
+         assert(index < l->count);
+
+         /* Number of entries located behind the one being removed. */
+         size_t n_following = l->count - (index + 1);
+
+         memmove(l->pcrSelections + index,
+                 l->pcrSelections + index + 1,
+                 n_following * sizeof(l->pcrSelections[0]));
+         l->count--;
+ }
+
+ /* Get a TPMS_PCR_SELECTION from a TPML_PCR_SELECTION for the given hash alg. Returns NULL if there is no
+ * entry for the hash alg. This guarantees the returned entry contains all the PCR selections for the given
+ * hash alg, which may require modifying the TPML_PCR_SELECTION by removing duplicate entries. The returned
+ * pointer points into 'l'. */
+ static TPMS_PCR_SELECTION *tpm2_tpml_pcr_selection_get_tpms_pcr_selection(
+ TPML_PCR_SELECTION *l,
+ TPMI_ALG_HASH hash_alg) {
+
+ assert(l);
+ assert(l->count <= ELEMENTSOF(l->pcrSelections));
+
+ /* Find the first entry for the hash alg; it becomes the one all others are merged into. */
+ TPMS_PCR_SELECTION *selection = NULL;
+ FOREACH_TPMS_PCR_SELECTION_IN_TPML_PCR_SELECTION(s, l)
+ if (s->hash == hash_alg) {
+ selection = s;
+ break;
+ }
+
+ if (!selection)
+ return NULL;
+
+ /* Iterate backwards through the entries, removing any other entries for the hash alg. Index 0 never
+ * needs a visit: it is either 'selection' itself or, since 'selection' is the first match, an entry
+ * for a different hash alg. Going backwards means a removal only shifts entries that were already
+ * visited, and never moves 'selection'. */
+ for (uint32_t i = l->count - 1; i > 0; i--) {
+ TPMS_PCR_SELECTION *s = &l->pcrSelections[i];
+
+ if (selection == s)
+ break;
+
+ if (s->hash == hash_alg) {
+ tpm2_tpms_pcr_selection_move(selection, s);
+ tpm2_tpml_pcr_selection_remove_index(l, i);
+ }
+ }
+
+ return selection;
+ }
+
+ /* Combine all duplicate (same hash alg) TPMS_PCR_SELECTION entries in 'l', so that afterwards each hash
+  * alg appears in at most one entry. This may reduce l->count. */
+ static void tpm2_tpml_pcr_selection_cleanup(TPML_PCR_SELECTION *l) {
+         /* 'l' is dereferenced by the loop condition before any callee gets to check it. */
+         assert(l);
+
+         /* Can't use FOREACH_TPMS_PCR_SELECTION_IN_TPML_PCR_SELECTION() because we might modify l->count */
+         for (uint32_t i = 0; i < l->count; i++)
+                 /* This removes all duplicate TPMS_PCR_SELECTION entries for this hash. */
+                 (void) tpm2_tpml_pcr_selection_get_tpms_pcr_selection(l, l->pcrSelections[i].hash);
+ }
+
+ /* Return the combined PCR mask that 'l' selects for 'hash_alg', or 0 if 'l' has no entry for that hash
+  * algorithm. 'l' itself is never modified. */
+ uint32_t tpm2_tpml_pcr_selection_to_mask(const TPML_PCR_SELECTION *l, TPMI_ALG_HASH hash_alg) {
+         assert(l);
+
+         /* Work on a private copy: the lookup below merges duplicate entries for the hash alg in place. */
+         TPML_PCR_SELECTION scratch = *l;
+
+         TPMS_PCR_SELECTION *entry = tpm2_tpml_pcr_selection_get_tpms_pcr_selection(&scratch, hash_alg);
+
+         return entry ? tpm2_tpms_pcr_selection_to_mask(entry) : 0;
+ }
+
+ /* Fill *ret with a TPML_PCR_SELECTION that has exactly one entry, built from 'mask' and 'hash_alg'. */
+ void tpm2_tpml_pcr_selection_from_mask(uint32_t mask, TPMI_ALG_HASH hash_alg, TPML_PCR_SELECTION *ret) {
+         assert(ret);
+
+         /* Reset the whole list first, then build the single entry directly in its first slot. */
+         *ret = (TPML_PCR_SELECTION){
+                 .count = 1,
+         };
+
+         tpm2_tpms_pcr_selection_from_mask(mask, hash_alg, &ret->pcrSelections[0]);
+ }
+
+/* Merge the PCRs selected in 's' into 'l'. If 'l' already has an entry for the hash alg of 's', the PCRs
+ * are added to that entry; otherwise a new entry is appended. Looking up the entry may combine entries of
+ * 'l' that use the same hash alg, so 'l' can be reorganized by this call. An empty 's' is a no-op. */
+void tpm2_tpml_pcr_selection_add_tpms_pcr_selection(TPML_PCR_SELECTION *l, const TPMS_PCR_SELECTION *s) {
+        TPMS_PCR_SELECTION *existing;
+
+        assert(l);
+        assert(s);
+
+        if (tpm2_tpms_pcr_selection_is_empty(s))
+                return;
+
+        existing = tpm2_tpml_pcr_selection_get_tpms_pcr_selection(l, s->hash);
+        if (!existing) {
+                /* A count beyond the capacity of the array means the object was broken to begin with. */
+                assert(l->count <= ELEMENTSOF(l->pcrSelections));
+
+                /* The array is full: combining duplicate entries is expected to free at least one slot. */
+                if (l->count == ELEMENTSOF(l->pcrSelections))
+                        tpm2_tpml_pcr_selection_cleanup(l);
+
+                assert(l->count < ELEMENTSOF(l->pcrSelections));
+                l->pcrSelections[l->count] = *s;
+                l->count++;
+                return;
+        }
+
+        tpm2_tpms_pcr_selection_add(existing, s);
+}
+
+/* Clear the PCRs selected in 's' from the entry of 'l' that uses the same hash alg, if there is one. As a
+ * side effect all entries of 'l' for 's->hash' are combined into one. An empty 's' is a no-op. */
+void tpm2_tpml_pcr_selection_sub_tpms_pcr_selection(TPML_PCR_SELECTION *l, const TPMS_PCR_SELECTION *s) {
+        assert(l);
+        assert(s);
+
+        if (!tpm2_tpms_pcr_selection_is_empty(s)) {
+                TPMS_PCR_SELECTION *existing = tpm2_tpml_pcr_selection_get_tpms_pcr_selection(l, s->hash);
+
+                if (existing)
+                        tpm2_tpms_pcr_selection_sub(existing, s);
+        }
+}
+
+/* Returns true if every bit of 'mask' is selected in 'l' for the given hash, false otherwise. */
+bool tpm2_tpml_pcr_selection_has_mask(const TPML_PCR_SELECTION *l, TPMI_ALG_HASH hash, uint32_t mask) {
+        uint32_t selected;
+
+        assert(l);
+
+        selected = tpm2_tpml_pcr_selection_to_mask(l, hash);
+
+        return FLAGS_SET(selected, mask);
+}
+
+/* Select the PCRs given by 'mask' in 'l', for the bank given by 'hash'. */
+void tpm2_tpml_pcr_selection_add_mask(TPML_PCR_SELECTION *l, TPMI_ALG_HASH hash, uint32_t mask) {
+        assert(l);
+
+        /* Wrap mask + hash into a temporary entry and let the entry-based helper do the merging. */
+        TPMS_PCR_SELECTION entry;
+        tpm2_tpms_pcr_selection_from_mask(mask, hash, &entry);
+
+        tpm2_tpml_pcr_selection_add_tpms_pcr_selection(l, &entry);
+}
+
+/* Deselect the PCRs given by 'mask' in 'l', for the bank given by 'hash'. */
+void tpm2_tpml_pcr_selection_sub_mask(TPML_PCR_SELECTION *l, TPMI_ALG_HASH hash, uint32_t mask) {
+        assert(l);
+
+        /* Wrap mask + hash into a temporary entry and let the entry-based helper do the removal. */
+        TPMS_PCR_SELECTION entry;
+        tpm2_tpms_pcr_selection_from_mask(mask, hash, &entry);
+
+        tpm2_tpml_pcr_selection_sub_tpms_pcr_selection(l, &entry);
+}
+
+/* Merge every entry of 'b' into 'a'. 'b' is left untouched. */
+void tpm2_tpml_pcr_selection_add(TPML_PCR_SELECTION *a, const TPML_PCR_SELECTION *b) {
+        assert(a);
+        assert(b);
+
+        FOREACH_TPMS_PCR_SELECTION_IN_TPML_PCR_SELECTION(entry, b) {
+                tpm2_tpml_pcr_selection_add_tpms_pcr_selection(a, entry);
+        }
+}
+
+/* Deselect in 'a' every PCR that is selected in 'b'. 'b' is left untouched. */
+void tpm2_tpml_pcr_selection_sub(TPML_PCR_SELECTION *a, const TPML_PCR_SELECTION *b) {
+        assert(a);
+        assert(b);
+
+        FOREACH_TPMS_PCR_SELECTION_IN_TPML_PCR_SELECTION(entry, b) {
+                tpm2_tpml_pcr_selection_sub_tpms_pcr_selection(a, entry);
+        }
+}
+
+/* Format 'l' as a newly allocated string: the string forms of all non-empty entries, joined with ",", and
+ * wrapped in "[" and "]". Returns NULL if formatting an entry or building the string fails. */
+char *tpm2_tpml_pcr_selection_to_string(const TPML_PCR_SELECTION *l) {
+        _cleanup_free_ char *joined = NULL;
+
+        assert(l);
+
+        FOREACH_TPMS_PCR_SELECTION_IN_TPML_PCR_SELECTION(entry, l) {
+                _cleanup_free_ char *one = NULL;
+
+                /* Entries that select nothing are left out of the output. */
+                if (tpm2_tpms_pcr_selection_is_empty(entry))
+                        continue;
+
+                one = tpm2_tpms_pcr_selection_to_string(entry);
+                if (!one)
+                        return NULL;
+
+                if (!strextend_with_separator(&joined, ",", one))
+                        return NULL;
+        }
+
+        return strjoin("[", strempty(joined), "]");
+}
+
+/* Return the sum of tpm2_tpms_pcr_selection_weight() over all entries of 'l'. */
+size_t tpm2_tpml_pcr_selection_weight(const TPML_PCR_SELECTION *l) {
+        size_t weight = 0;
+
+        assert(l);
+        assert(l->count <= ELEMENTSOF(l->pcrSelections));
+
+        FOREACH_TPMS_PCR_SELECTION_IN_TPML_PCR_SELECTION(entry, l) {
+                size_t w;
+
+                w = tpm2_tpms_pcr_selection_weight(entry);
+
+                /* Make sure the running total cannot wrap around. */
+                assert(weight <= SIZE_MAX - w);
+                weight = weight + w;
+        }
+
+        return weight;
+}
+
+/* Check a single Tpm2PCRValue: it must be non-NULL, carry a valid PCR index and, if it carries a digest,
+ * the digest size must be the size of its hash algorithm. Returns true if the entry passes. */
+bool tpm2_pcr_value_valid(const Tpm2PCRValue *pcr_value) {
+        if (!pcr_value)
+                return false;
+
+        if (!TPM2_PCR_INDEX_VALID(pcr_value->index)) {
+                log_debug("PCR index %u invalid.", pcr_value->index);
+                return false;
+        }
+
+        /* Without a digest there is no size to verify. */
+        if (pcr_value->value.size == 0)
+                return true;
+
+        /* A digest is present, so the hash algorithm must be known and its size must match. */
+        int expected = tpm2_hash_alg_to_size(pcr_value->hash);
+        if (expected < 0)
+                return false;
+
+        if (pcr_value->value.size == (size_t) expected)
+                return true;
+
+        log_debug("PCR hash 0x%" PRIx16 " expected size %d does not match actual size %" PRIu16 ".",
+                  pcr_value->hash, expected, pcr_value->value.size);
+        return false;
+}
+
+/* Verify all entries are valid, and consistent with each other. The requirements for consistency are:
+ *
+ * 1) all entries must be sorted in ascending order (e.g. using tpm2_sort_pcr_values())
+ * 2) all entries must be unique, i.e. there cannot be 2 entries with the same hash and index
+ *
+ * Each entry is compared against the entry immediately before it, which is sufficient to establish both
+ * requirements for the whole array.
+ *
+ * Returns true if all entries are valid (or if no entries are provided), false otherwise.
+ */
+bool tpm2_pcr_values_valid(const Tpm2PCRValue *pcr_values, size_t n_pcr_values) {
+        if (!pcr_values && n_pcr_values > 0)
+                return false;
+
+        const Tpm2PCRValue *previous = NULL;
+        FOREACH_ARRAY(current, pcr_values, n_pcr_values) {
+                if (!tpm2_pcr_value_valid(current))
+                        return false;
+
+                /* The first entry has no predecessor to be compared with. */
+                if (previous) {
+                        /* Hashes must be sorted in ascending order */
+                        if (current->hash < previous->hash) {
+                                log_debug("PCR values not in ascending order, hash %" PRIu16 " is after %" PRIu16 ".",
+                                          current->hash, previous->hash);
+                                return false;
+                        }
+
+                        if (current->hash == previous->hash) {
+                                /* Indexes (for the same hash) must be sorted in ascending order */
+                                if (current->index < previous->index) {
+                                        log_debug("PCR values not in ascending order, hash %" PRIu16 " index %u is after %u.",
+                                                  current->hash, current->index, previous->index);
+                                        return false;
+                                }
+
+                                /* Indexes (for the same hash) must not be duplicates */
+                                if (current->index == previous->index) {
+                                        log_debug("PCR values contain duplicates for hash %" PRIu16 " index %u.",
+                                                  current->hash, previous->index);
+                                        return false;
+                                }
+                        }
+                }
+
+                /* Advance the predecessor. Without this every entry would only ever be compared with the
+                 * first one, and unsorted or duplicate entries further down the array would go unnoticed. */
+                previous = current;
+        }
+
+        return true;
+}
+
+/* Returns true if at least one of the provided PCR values carries a digest (i.e. value.size > 0), false
+ * otherwise, including for an empty array. */
+bool tpm2_pcr_values_has_any_values(const Tpm2PCRValue *pcr_values, size_t n_pcr_values) {
+        assert(pcr_values || n_pcr_values == 0);
+
+        FOREACH_ARRAY(entry, pcr_values, n_pcr_values) {
+                if (entry->value.size > 0)
+                        return true;
+        }
+
+        return false;
+}
+
+/* Returns true if every one of the provided PCR values carries a digest (i.e. value.size > 0), false
+ * otherwise. An empty array yields true. */
+bool tpm2_pcr_values_has_all_values(const Tpm2PCRValue *pcr_values, size_t n_pcr_values) {
+        assert(pcr_values || n_pcr_values == 0);
+
+        FOREACH_ARRAY(entry, pcr_values, n_pcr_values) {
+                if (entry->value.size == 0)
+                        return false;
+        }
+
+        return true;
+}
+
+/* Comparison callback for sorting Tpm2PCRValue entries: order by hash first, and by index among entries
+ * with equal hash. */
+static int cmp_pcr_values(const Tpm2PCRValue *a, const Tpm2PCRValue *b) {
+        int by_hash;
+
+        assert(a);
+        assert(b);
+
+        by_hash = CMP(a->hash, b->hash);
+        if (by_hash != 0)
+                return by_hash;
+
+        return CMP(a->index, b->index);
+}
+
+/* Sort 'pcr_values' in place using cmp_pcr_values(), i.e. ascending by TPM2 hash algorithm number, and
+ * ascending by PCR index among entries with the same hash algorithm. */
+void tpm2_sort_pcr_values(Tpm2PCRValue *pcr_values, size_t n_pcr_values) {
+        typesafe_qsort(
+                        pcr_values,
+                        n_pcr_values,
+                        cmp_pcr_values);
+}
+
+/* Build a newly allocated array with one Tpm2PCRValue (no digest) per PCR selected in 'mask', all using
+ * 'hash'. On success returns 0 and hands the array and its length to the caller, who owns the array. */
+int tpm2_pcr_values_from_mask(uint32_t mask, TPMI_ALG_HASH hash, Tpm2PCRValue **ret_pcr_values, size_t *ret_n_pcr_values) {
+        _cleanup_free_ Tpm2PCRValue *values = NULL;
+        size_t n_values = 0;
+
+        assert(ret_pcr_values);
+        assert(ret_n_pcr_values);
+
+        FOREACH_PCR_IN_MASK(index, mask) {
+                if (!GREEDY_REALLOC_APPEND(values, n_values, &TPM2_PCR_VALUE_MAKE(index, hash, {}), 1))
+                        return log_oom_debug();
+        }
+
+        *ret_n_pcr_values = n_values;
+        *ret_pcr_values = TAKE_PTR(values);
+
+        return 0;
+}
+
+/* Compute the mask of the PCR indexes of all entries in 'pcr_values' that use 'hash'. The array must pass
+ * tpm2_pcr_values_valid(). Returns 0 and stores the mask in 'ret_mask' on success. */
+int tpm2_pcr_values_to_mask(const Tpm2PCRValue *pcr_values, size_t n_pcr_values, TPMI_ALG_HASH hash, uint32_t *ret_mask) {
+        uint32_t mask = 0;
+
+        assert(pcr_values || n_pcr_values == 0);
+        assert(ret_mask);
+
+        if (!tpm2_pcr_values_valid(pcr_values, n_pcr_values))
+                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid PCR values.");
+
+        FOREACH_ARRAY(entry, pcr_values, n_pcr_values) {
+                /* Entries for other banks do not contribute. */
+                if (entry->hash != hash)
+                        continue;
+
+                SET_BIT(mask, entry->index);
+        }
+
+        *ret_mask = mask;
+        return 0;
+}
+
+/* Convert an array of Tpm2PCRValue entries into a TPML_PCR_SELECTION plus a newly allocated array holding
+ * the 'value' field of each entry, in input order. The input must pass tpm2_pcr_values_valid(). Each of the
+ * three output parameters is optional (may be NULL). Returns 0 on success. */
+int tpm2_tpml_pcr_selection_from_pcr_values(
+                const Tpm2PCRValue *pcr_values,
+                size_t n_pcr_values,
+                TPML_PCR_SELECTION *ret_selection,
+                TPM2B_DIGEST **ret_values,
+                size_t *ret_n_values) {
+
+        _cleanup_free_ TPM2B_DIGEST *digests = NULL;
+        TPML_PCR_SELECTION built = {};
+        size_t n_digests = 0;
+
+        assert(pcr_values || n_pcr_values == 0);
+
+        if (!tpm2_pcr_values_valid(pcr_values, n_pcr_values))
+                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "PCR values are not valid.");
+
+        FOREACH_ARRAY(entry, pcr_values, n_pcr_values) {
+                /* Select the entry's PCR in the bank of the entry's hash ... */
+                tpm2_tpml_pcr_selection_add_mask(&built, entry->hash, INDEX_TO_MASK(uint32_t, entry->index));
+
+                /* ... and record its digest. */
+                if (!GREEDY_REALLOC_APPEND(digests, n_digests, &entry->value, 1))
+                        return log_oom_debug();
+        }
+
+        if (ret_selection)
+                *ret_selection = built;
+        if (ret_n_values)
+                *ret_n_values = n_digests;
+        if (ret_values)
+                *ret_values = TAKE_PTR(digests);
+
+        return 0;
+}
+
+/* Determine how many different hash algorithms are used by the entries of 'pcr_values'. The entries are
+ * converted into a TPML_PCR_SELECTION and the number of entries of that selection is reported. Returns 0
+ * on success, or the error of the conversion. */
+int tpm2_pcr_values_hash_count(const Tpm2PCRValue *pcr_values, size_t n_pcr_values, size_t *ret_count) {
+        TPML_PCR_SELECTION merged;
+        int r;
+
+        assert(pcr_values);
+        assert(ret_count);
+
+        /* Only the selection is of interest here, the digests are not requested. */
+        r = tpm2_tpml_pcr_selection_from_pcr_values(pcr_values, n_pcr_values, &merged, /* ret_values= */ NULL, /* ret_n_values= */ NULL);
+        if (r < 0)
+                return r;
+
+        *ret_count = merged.count;
+        return 0;
+}
+
+/* Parse a string argument into a Tpm2PCRValue object.
+ *
+ * The format is <index>[:hash[=value]] where index is the index number (or name) of the PCR, e.g. 0 (or
+ * platform-code), hash is the name of the hash algorithm (e.g. sha256) and value is the hex hash digest
+ * value, optionally with a leading 0x. This does not check for validity of the fields.
+ *
+ * Returns 0 on success, in which case 'ret_pcr_value' is filled in. Returns a negative error if the string
+ * cannot be parsed; this includes the case where the index or the hash algorithm field turns out to be
+ * empty (reported as EINVAL). 'ret_pcr_value' is only written on success. */
+int tpm2_pcr_value_from_string(const char *arg, Tpm2PCRValue *ret_pcr_value) {
+        Tpm2PCRValue pcr_value = {};
+        const char *p = arg;
+        int r;
+
+        assert(arg);
+        assert(ret_pcr_value);
+
+        _cleanup_free_ char *index = NULL;
+        r = extract_first_word(&p, &index, ":", /* flags= */ 0);
+        if (r < 0)
+                return log_debug_errno(r, "Could not parse pcr value '%s': %m", p);
+        if (r == 0)
+                /* No word was extracted. This must be turned into a real error: passing 0 on as the return
+                 * value would signal success to the caller without 'ret_pcr_value' having been set. */
+                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Could not parse pcr value '%s': no pcr index specified.", arg);
+
+        r = tpm2_pcr_index_from_string(index);
+        if (r < 0)
+                return log_debug_errno(r, "Invalid pcr index '%s': %m", index);
+        pcr_value.index = (unsigned) r;
+
+        if (!isempty(p)) {
+                _cleanup_free_ char *hash = NULL;
+                r = extract_first_word(&p, &hash, "=", /* flags= */ 0);
+                if (r < 0)
+                        return log_debug_errno(r, "Could not parse pcr hash algorithm '%s': %m", p);
+                if (r == 0)
+                        /* Same as above: a missing hash name (e.g. "0:=") is an error, not a success. */
+                        return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "Could not parse pcr value '%s': no pcr hash algorithm specified.", arg);
+
+                r = tpm2_hash_alg_from_string(hash);
+                if (r < 0)
+                        return log_debug_errno(r, "Invalid pcr hash algorithm '%s': %m", hash);
+                pcr_value.hash = (TPMI_ALG_HASH) r;
+
+                if (!isempty(p)) {
+                        /* Remove leading 0x if present */
+                        p = startswith_no_case(p, "0x") ?: p;
+
+                        _cleanup_free_ void *buf = NULL;
+                        size_t buf_size = 0;
+                        r = unhexmem(p, SIZE_MAX, &buf, &buf_size);
+                        if (r < 0)
+                                return log_debug_errno(r, "Invalid pcr hash value '%s': %m", p);
+
+                        /* The digest has to fit into the fixed-size buffer of a TPM2B_DIGEST. */
+                        r = TPM2B_DIGEST_CHECK_SIZE(buf_size);
+                        if (r < 0)
+                                return log_debug_errno(r, "PCR hash value size %zu too large.", buf_size);
+
+                        pcr_value.value = TPM2B_DIGEST_MAKE(buf, buf_size);
+                }
+        }
+
+        *ret_pcr_value = pcr_value;
+
+        return 0;
+}
+
+/* Format a PCR value as a newly allocated string, in the format described in
+ * tpm2_pcr_value_from_string(). If the hash algorithm is unset or not recognized, only the index is
+ * emitted; the digest is only emitted if the hash name is. This does not check for validity. Returns NULL
+ * on allocation failure. */
+char *tpm2_pcr_value_to_string(const Tpm2PCRValue *pcr_value) {
+        _cleanup_free_ char *index = NULL, *value = NULL;
+        const char *hash = NULL;
+
+        if (asprintf(&index, "%u", pcr_value->index) < 0)
+                return NULL;
+
+        if (pcr_value->hash > 0)
+                hash = tpm2_hash_alg_to_string(pcr_value->hash);
+
+        if (hash && pcr_value->value.size > 0) {
+                value = hexmem(pcr_value->value.buffer, pcr_value->value.size);
+                if (!value)
+                        return NULL;
+        }
+
+        /* The separators are only emitted together with the field they introduce. */
+        const char *hash_separator = hash ? ":" : "";
+        const char *value_separator = value ? "=" : "";
+
+        return strjoin(index, hash_separator, strempty(hash), value_separator, strempty(value));
+}
+
+/* Parse a string into a newly allocated array of Tpm2PCRValue objects.
+ *
+ * The string consists of zero or more entries separated by ',' or '+'; each entry is parsed with
+ * tpm2_pcr_value_from_string(). The entries are not checked for validity. On success returns 0 and hands
+ * the array and its length to the caller. */
+int tpm2_pcr_values_from_string(const char *arg, Tpm2PCRValue **ret_pcr_values, size_t *ret_n_pcr_values) {
+        _cleanup_free_ Tpm2PCRValue *values = NULL;
+        size_t n_values = 0;
+        const char *p = arg;
+        int r;
+
+        assert(arg);
+        assert(ret_pcr_values);
+        assert(ret_n_pcr_values);
+
+        for (;;) {
+                _cleanup_free_ char *word = NULL;
+                Tpm2PCRValue parsed;
+
+                r = extract_first_word(&p, &word, ",+", /* flags= */ 0);
+                if (r == 0)
+                        break; /* no further entries */
+                if (r < 0)
+                        return log_debug_errno(r, "Could not parse pcr values '%s': %m", p);
+
+                r = tpm2_pcr_value_from_string(word, &parsed);
+                if (r < 0)
+                        return r;
+
+                if (!GREEDY_REALLOC_APPEND(values, n_values, &parsed, 1))
+                        return log_oom_debug();
+        }
+
+        *ret_n_pcr_values = n_values;
+        *ret_pcr_values = TAKE_PTR(values);
+
+        return 0;
+}
+
+/* Format an array of PCR values as a newly allocated string: the string form of each entry (see
+ * tpm2_pcr_value_to_string()), joined with '+'. An empty array yields an empty string. This does not
+ * check for validity. Returns NULL on allocation failure. */
+char *tpm2_pcr_values_to_string(const Tpm2PCRValue *pcr_values, size_t n_pcr_values) {
+        _cleanup_free_ char *joined = NULL;
+
+        FOREACH_ARRAY(entry, pcr_values, n_pcr_values) {
+                _cleanup_free_ char *one = tpm2_pcr_value_to_string(entry);
+
+                if (!one)
+                        return NULL;
+                if (!strextend_with_separator(&joined, "+", one))
+                        return NULL;
+        }
+
+        /* Nothing was appended: still hand out an allocated (empty) string. */
+        if (!joined)
+                return strdup("");
+
+        return TAKE_PTR(joined);
+}
+
+/* Log 'l' in string form at debug level, prefixed by 'msg' (or "PCR selection" if 'msg' is NULL). Does
+ * nothing if debug logging is off or 'l' is NULL. */
+void tpm2_log_debug_tpml_pcr_selection(const TPML_PCR_SELECTION *l, const char *msg) {
+        /* Skip the string conversion unless the result is actually going to be logged. */
+        if (DEBUG_LOGGING && l) {
+                _cleanup_free_ char *text = tpm2_tpml_pcr_selection_to_string(l);
+
+                log_debug("%s: %s", msg ?: "PCR selection", strna(text));
+        }
+}
+
+/* Log 'pcr_value' in string form at debug level, prefixed by 'msg' (or "PCR value" if 'msg' is NULL). Does
+ * nothing if debug logging is off or 'pcr_value' is NULL. */
+void tpm2_log_debug_pcr_value(const Tpm2PCRValue *pcr_value, const char *msg) {
+        /* Skip the string conversion unless the result is actually going to be logged. */
+        if (DEBUG_LOGGING && pcr_value) {
+                _cleanup_free_ char *text = tpm2_pcr_value_to_string(pcr_value);
+
+                log_debug("%s: %s", msg ?: "PCR value", strna(text));
+        }
+}
+
+/* Log 'size' bytes of 'buffer' as hex at debug level, prefixed by 'msg' (or "Buffer" if 'msg' is NULL).
+ * Does nothing if debug logging is off, 'buffer' is NULL or 'size' is 0. */
+void tpm2_log_debug_buffer(const void *buffer, size_t size, const char *msg) {
+        /* Skip the hex conversion unless the result is actually going to be logged. */
+        if (DEBUG_LOGGING && buffer && size > 0) {
+                _cleanup_free_ char *hex = hexmem(buffer, size);
+
+                log_debug("%s: %s", msg ?: "Buffer", strna(hex));
+        }
+}
+
+/* Log the contents of 'digest' via tpm2_log_debug_buffer(), prefixed by 'msg' (or "Digest" if 'msg' is
+ * NULL). A NULL 'digest' is ignored. */
+void tpm2_log_debug_digest(const TPM2B_DIGEST *digest, const char *msg) {
+        if (!digest)
+                return;
+
+        tpm2_log_debug_buffer(digest->buffer, digest->size, msg ?: "Digest");
+}
+
+/* Log the contents of 'name' via tpm2_log_debug_buffer(), prefixed by 'msg' (or "Name" if 'msg' is NULL).
+ * A NULL 'name' is ignored. */
+void tpm2_log_debug_name(const TPM2B_NAME *name, const char *msg) {
+        if (!name)
+                return;
+
+        tpm2_log_debug_buffer(name->name, name->size, msg ?: "Name");
+}
+
+/* Query the current policy digest of 'session' with Esys_PolicyGetDigest(), log it at debug level and, if
+ * 'ret_policy_digest' is non-NULL, hand it to the caller. Returns 0 on success and a negative error
+ * (ENOTRECOVERABLE) if the TPM call fails. */
+static int tpm2_get_policy_digest(
+ Tpm2Context *c,
+ const Tpm2Handle *session,
+ TPM2B_DIGEST **ret_policy_digest) {
+
+ TSS2_RC rc;
+
+ /* The digest is only used for the debug log and for the return parameter. If neither is wanted the
+ * TPM round trip is skipped. Note that this happens before the arguments are asserted. */
+ if (!DEBUG_LOGGING && !ret_policy_digest)
+ return 0;
+
+ assert(c);
+ assert(session);
+
+ log_debug("Acquiring policy digest.");
+
+ _cleanup_(Esys_Freep) TPM2B_DIGEST *policy_digest = NULL;
+ rc = sym_Esys_PolicyGetDigest(
+ c->esys_context,
+ session->esys_handle,
+ ESYS_TR_NONE,
+ ESYS_TR_NONE,
+ ESYS_TR_NONE,
+ &policy_digest);
+ if (rc != TSS2_RC_SUCCESS)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+ "Failed to get policy digest from TPM: %s", sym_Tss2_RC_Decode(rc));
+
+ tpm2_log_debug_digest(policy_digest, "Session policy digest");
+
+ /* Pass ownership to the caller if requested, otherwise the cleanup handler releases the digest. */
+ if (ret_policy_digest)
+ *ret_policy_digest = TAKE_PTR(policy_digest);
+
+ return 0;
+}
+
+/* Create a primary key with Esys_CreatePrimary(), under ESYS_TR_RH_OWNER, from 'template'.
+ *
+ * 'session' and 'sensitive' are optional: without a session ESYS_TR_PASSWORD is used, without 'sensitive'
+ * an all-zero TPM2B_SENSITIVE_CREATE is passed. 'ret_public' and 'ret_handle' are optional outputs.
+ * Returns 0 on success, the error of tpm2_handle_new(), or a negative error (ENOTRECOVERABLE) if the TPM
+ * call fails. */
+int tpm2_create_primary(
+ Tpm2Context *c,
+ const Tpm2Handle *session,
+ const TPM2B_PUBLIC *template,
+ const TPM2B_SENSITIVE_CREATE *sensitive,
+ TPM2B_PUBLIC **ret_public,
+ Tpm2Handle **ret_handle) {
+
+ usec_t ts;
+ TSS2_RC rc;
+ int r;
+
+ assert(c);
+ assert(template);
+
+ log_debug("Creating primary key on TPM.");
+
+ /* Remember when we started, so that the duration can be logged below. */
+ ts = now(CLOCK_MONOTONIC);
+
+ _cleanup_(tpm2_handle_freep) Tpm2Handle *handle = NULL;
+ r = tpm2_handle_new(c, &handle);
+ if (r < 0)
+ return r;
+
+ _cleanup_(Esys_Freep) TPM2B_PUBLIC *public = NULL;
+ rc = sym_Esys_CreatePrimary(
+ c->esys_context,
+ ESYS_TR_RH_OWNER,
+ session ? session->esys_handle : ESYS_TR_PASSWORD,
+ ESYS_TR_NONE,
+ ESYS_TR_NONE,
+ sensitive ? sensitive : &(TPM2B_SENSITIVE_CREATE) {},
+ template,
+ /* outsideInfo= */ NULL,
+ &(TPML_PCR_SELECTION) {},
+ &handle->esys_handle,
+ &public,
+ /* creationData= */ NULL,
+ /* creationHash= */ NULL,
+ /* creationTicket= */ NULL);
+ if (rc != TSS2_RC_SUCCESS)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+ "Failed to generate primary key in TPM: %s",
+ sym_Tss2_RC_Decode(rc));
+
+ log_debug("Successfully created primary key on TPM in %s.",
+ FORMAT_TIMESPAN(now(CLOCK_MONOTONIC) - ts, USEC_PER_MSEC));
+
+ /* Outputs the caller did not ask for are released by the cleanup handlers. */
+ if (ret_public)
+ *ret_public = TAKE_PTR(public);
+ if (ret_handle)
+ *ret_handle = TAKE_PTR(handle);
+
+ return 0;
+}
+
+/* Create a TPM object. Do not use this to create primary keys, because some HW TPMs refuse to allow that;
+ * instead use tpm2_create_primary().
+ *
+ * The object is created with Esys_Create() under 'parent', from a copy of 'template' whose unique area is
+ * zeroed. 'session' and 'sensitive' are optional: without a session ESYS_TR_PASSWORD is used, without
+ * 'sensitive' an all-zero TPM2B_SENSITIVE_CREATE is passed. 'ret_public' and 'ret_private' are optional
+ * outputs. Returns 0 on success or a negative error (ENOTRECOVERABLE) if the TPM call fails. */
+int tpm2_create(Tpm2Context *c,
+ const Tpm2Handle *parent,
+ const Tpm2Handle *session,
+ const TPMT_PUBLIC *template,
+ const TPMS_SENSITIVE_CREATE *sensitive,
+ TPM2B_PUBLIC **ret_public,
+ TPM2B_PRIVATE **ret_private) {
+
+ usec_t ts;
+ TSS2_RC rc;
+
+ assert(c);
+ assert(parent);
+ assert(template);
+
+ log_debug("Creating object on TPM.");
+
+ /* Remember when we started, so that the duration can be logged below. */
+ ts = now(CLOCK_MONOTONIC);
+
+ /* Wrap the template into a TPM2B_PUBLIC. The size is set to the template size minus the unique field. */
+ TPM2B_PUBLIC tpm2b_public = {
+ .size = sizeof(*template) - sizeof(template->unique),
+ .publicArea = *template,
+ };
+
+ /* Zero the unique area. */
+ zero(tpm2b_public.publicArea.unique);
+
+ TPM2B_SENSITIVE_CREATE tpm2b_sensitive;
+ if (sensitive)
+ tpm2b_sensitive = (TPM2B_SENSITIVE_CREATE) {
+ .size = sizeof(*sensitive),
+ .sensitive = *sensitive,
+ };
+ else
+ tpm2b_sensitive = (TPM2B_SENSITIVE_CREATE) {};
+
+ _cleanup_(Esys_Freep) TPM2B_PUBLIC *public = NULL;
+ _cleanup_(Esys_Freep) TPM2B_PRIVATE *private = NULL;
+ rc = sym_Esys_Create(
+ c->esys_context,
+ parent->esys_handle,
+ session ? session->esys_handle : ESYS_TR_PASSWORD,
+ ESYS_TR_NONE,
+ ESYS_TR_NONE,
+ &tpm2b_sensitive,
+ &tpm2b_public,
+ /* outsideInfo= */ NULL,
+ &(TPML_PCR_SELECTION) {},
+ &private,
+ &public,
+ /* creationData= */ NULL,
+ /* creationHash= */ NULL,
+ /* creationTicket= */ NULL);
+ if (rc != TSS2_RC_SUCCESS)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+ "Failed to generate object in TPM: %s",
+ sym_Tss2_RC_Decode(rc));
+
+ log_debug("Successfully created object on TPM in %s.",
+ FORMAT_TIMESPAN(now(CLOCK_MONOTONIC) - ts, USEC_PER_MSEC));
+
+ /* Outputs the caller did not ask for are released by the cleanup handlers. */
+ if (ret_public)
+ *ret_public = TAKE_PTR(public);
+ if (ret_private)
+ *ret_private = TAKE_PTR(private);
+
+ return 0;
+}
+
+/* Load an object, given by its 'public' and 'private' parts, into the TPM with Esys_Load() and return a
+ * handle for it in 'ret_handle'.
+ *
+ * 'parent' and 'session' are optional: without a parent ESYS_TR_RH_OWNER is used, without a session
+ * ESYS_TR_PASSWORD. Returns 0 on success, the error of tpm2_handle_new(), a negative error (ENOLCK) if the
+ * TPM reports TPM2_RC_LOCKOUT, or a negative error (ENOTRECOVERABLE) for any other TPM failure. */
+int tpm2_load(
+ Tpm2Context *c,
+ const Tpm2Handle *parent,
+ const Tpm2Handle *session,
+ const TPM2B_PUBLIC *public,
+ const TPM2B_PRIVATE *private,
+ Tpm2Handle **ret_handle) {
+
+ TSS2_RC rc;
+ int r;
+
+ assert(c);
+ assert(public);
+ assert(private);
+ assert(ret_handle);
+
+ log_debug("Loading object into TPM.");
+
+ _cleanup_(tpm2_handle_freep) Tpm2Handle *handle = NULL;
+ r = tpm2_handle_new(c, &handle);
+ if (r < 0)
+ return r;
+
+ rc = sym_Esys_Load(
+ c->esys_context,
+ parent ? parent->esys_handle : ESYS_TR_RH_OWNER,
+ session ? session->esys_handle : ESYS_TR_PASSWORD,
+ ESYS_TR_NONE,
+ ESYS_TR_NONE,
+ private,
+ public,
+ &handle->esys_handle);
+ /* Lockout gets its own error code so that callers can tell it apart from other failures. */
+ if (rc == TPM2_RC_LOCKOUT)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOLCK),
+ "TPM2 device is in dictionary attack lockout mode.");
+ if (rc != TSS2_RC_SUCCESS)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+ "Failed to load key into TPM: %s", sym_Tss2_RC_Decode(rc));
+
+ *ret_handle = TAKE_PTR(handle);
+
+ return 0;
+}
+
+/* Load an external object, given by 'public' and 'private', into the TPM with Esys_LoadExternal(), for
+ * the owner hierarchy, and return a handle for it in 'ret_handle'.
+ *
+ * 'public' and 'private' are passed through to the TPM call as they are, without being checked here.
+ * 'session' is optional; without it ESYS_TR_NONE is used. Returns 0 on success, the error of
+ * tpm2_handle_new(), or a negative error (ENOTRECOVERABLE) if the TPM call fails. */
+static int tpm2_load_external(
+ Tpm2Context *c,
+ const Tpm2Handle *session,
+ const TPM2B_PUBLIC *public,
+ const TPM2B_SENSITIVE *private,
+ Tpm2Handle **ret_handle) {
+
+ TSS2_RC rc;
+ int r;
+
+ assert(c);
+ assert(ret_handle);
+
+ log_debug("Loading external key into TPM.");
+
+ _cleanup_(tpm2_handle_freep) Tpm2Handle *handle = NULL;
+ r = tpm2_handle_new(c, &handle);
+ if (r < 0)
+ return r;
+
+ rc = sym_Esys_LoadExternal(
+ c->esys_context,
+ session ? session->esys_handle : ESYS_TR_NONE,
+ ESYS_TR_NONE,
+ ESYS_TR_NONE,
+ private,
+ public,
+#if HAVE_TSS2_ESYS3
+ /* tpm2-tss >= 3.0.0 requires a ESYS_TR_RH_* constant specifying the requested
+ * hierarchy, older versions need TPM2_RH_* instead. */
+ ESYS_TR_RH_OWNER,
+#else
+ TPM2_RH_OWNER,
+#endif
+ &handle->esys_handle);
+ if (rc != TSS2_RC_SUCCESS)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+ "Failed to load public key into TPM: %s", sym_Tss2_RC_Decode(rc));
+
+ *ret_handle = TAKE_PTR(handle);
+
+ return 0;
+}
+
+/* This calls TPM2_CreateLoaded() directly, without checking if the TPM supports it. Callers should instead
+ * use tpm2_create_loaded().
+ *
+ * The object is created under 'parent' from a copy of 'template' whose unique area is zeroed and which is
+ * marshalled into a TPM2B_TEMPLATE. 'session' and 'sensitive' are optional: without a session
+ * ESYS_TR_PASSWORD is used, without 'sensitive' an all-zero TPM2B_SENSITIVE_CREATE is passed. All three
+ * outputs are optional. Returns 0 on success, the error of tpm2_handle_new(), or a negative error
+ * (ENOTRECOVERABLE) if marshalling or the TPM call fails. */
+static int _tpm2_create_loaded(
+ Tpm2Context *c,
+ const Tpm2Handle *parent,
+ const Tpm2Handle *session,
+ const TPMT_PUBLIC *template,
+ const TPMS_SENSITIVE_CREATE *sensitive,
+ TPM2B_PUBLIC **ret_public,
+ TPM2B_PRIVATE **ret_private,
+ Tpm2Handle **ret_handle) {
+
+ usec_t ts;
+ TSS2_RC rc;
+ int r;
+
+ assert(c);
+ assert(parent);
+ assert(template);
+
+ log_debug("Creating loaded object on TPM.");
+
+ /* Remember when we started, so that the duration can be logged below. */
+ ts = now(CLOCK_MONOTONIC);
+
+ /* Copy the input template and zero the unique area. */
+ TPMT_PUBLIC template_copy = *template;
+ zero(template_copy.unique);
+
+ /* The template is handed to the TPM in marshalled form, hence serialize it into the TPM2B buffer. */
+ TPM2B_TEMPLATE tpm2b_template;
+ size_t size = 0;
+ rc = sym_Tss2_MU_TPMT_PUBLIC_Marshal(
+ &template_copy,
+ tpm2b_template.buffer,
+ sizeof(tpm2b_template.buffer),
+ &size);
+ if (rc != TSS2_RC_SUCCESS)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+ "Failed to marshal public key template: %s", sym_Tss2_RC_Decode(rc));
+ /* 'size' is stored into the size field of the TPM2B below, make sure it is in range for it. */
+ assert(size <= UINT16_MAX);
+ tpm2b_template.size = size;
+
+ TPM2B_SENSITIVE_CREATE tpm2b_sensitive;
+ if (sensitive)
+ tpm2b_sensitive = (TPM2B_SENSITIVE_CREATE) {
+ .size = sizeof(*sensitive),
+ .sensitive = *sensitive,
+ };
+ else
+ tpm2b_sensitive = (TPM2B_SENSITIVE_CREATE) {};
+
+ _cleanup_(tpm2_handle_freep) Tpm2Handle *handle = NULL;
+ r = tpm2_handle_new(c, &handle);
+ if (r < 0)
+ return r;
+
+ _cleanup_(Esys_Freep) TPM2B_PUBLIC *public = NULL;
+ _cleanup_(Esys_Freep) TPM2B_PRIVATE *private = NULL;
+ rc = sym_Esys_CreateLoaded(
+ c->esys_context,
+ parent->esys_handle,
+ session ? session->esys_handle : ESYS_TR_PASSWORD,
+ ESYS_TR_NONE,
+ ESYS_TR_NONE,
+ &tpm2b_sensitive,
+ &tpm2b_template,
+ &handle->esys_handle,
+ &private,
+ &public);
+ if (rc != TSS2_RC_SUCCESS)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+ "Failed to generate loaded object in TPM: %s",
+ sym_Tss2_RC_Decode(rc));
+
+ log_debug("Successfully created loaded object on TPM in %s.",
+ FORMAT_TIMESPAN(now(CLOCK_MONOTONIC) - ts, USEC_PER_MSEC));
+
+ /* Outputs the caller did not ask for are released by the cleanup handlers. */
+ if (ret_public)
+ *ret_public = TAKE_PTR(public);
+ if (ret_private)
+ *ret_private = TAKE_PTR(private);
+ if (ret_handle)
+ *ret_handle = TAKE_PTR(handle);
+
+ return 0;
+}
+
+/* Create an object under 'parent' and load it into the TPM. If the TPM supports TPM2_CreateLoaded() this
+ * is done in a single command, otherwise TPM2_Create() and TPM2_Load() are issued one after the other. Do
+ * not use this to create primary keys, because some HW TPMs refuse to allow that; instead use
+ * tpm2_create_primary(). All three outputs are optional. Returns 0 on success or the error of the helper
+ * that failed. */
+int tpm2_create_loaded(
+                Tpm2Context *c,
+                const Tpm2Handle *parent,
+                const Tpm2Handle *session,
+                const TPMT_PUBLIC *template,
+                const TPMS_SENSITIVE_CREATE *sensitive,
+                TPM2B_PUBLIC **ret_public,
+                TPM2B_PRIVATE **ret_private,
+                Tpm2Handle **ret_handle) {
+
+        _cleanup_(Esys_Freep) TPM2B_PUBLIC *created_public = NULL;
+        _cleanup_(Esys_Freep) TPM2B_PRIVATE *created_private = NULL;
+        _cleanup_(tpm2_handle_freep) Tpm2Handle *loaded = NULL;
+        int r;
+
+        /* Preferred path: a single command that creates and loads the object. */
+        if (tpm2_supports_command(c, TPM2_CC_CreateLoaded))
+                return _tpm2_create_loaded(c, parent, session, template, sensitive, ret_public, ret_private, ret_handle);
+
+        /* Fallback for TPMs that predate CreateLoaded (which was added at spec revision 130): create the
+         * object first, then load what was just created. */
+        r = tpm2_create(c, parent, session, template, sensitive, &created_public, &created_private);
+        if (r < 0)
+                return r;
+
+        r = tpm2_load(c, parent, session, created_public, created_private, &loaded);
+        if (r < 0)
+                return r;
+
+        if (ret_public)
+                *ret_public = TAKE_PTR(created_public);
+        if (ret_private)
+                *ret_private = TAKE_PTR(created_private);
+        if (ret_handle)
+                *ret_handle = TAKE_PTR(loaded);
+
+        return 0;
+}
+
+/* Marshal 'private' into a newly allocated buffer. On success returns 0 and hands the buffer and the
+ * number of marshalled bytes to the caller, who owns the buffer. */
+static int tpm2_marshal_private(const TPM2B_PRIVATE *private, void **ret, size_t *ret_size) {
+        _cleanup_free_ void *buf = NULL;
+        size_t capacity, used = 0;
+        TSS2_RC rc;
+
+        assert(private);
+        assert(ret);
+        assert(ret_size);
+
+        /* The size of the in-memory structure serves as the upper bound for the marshalled form. */
+        capacity = sizeof(*private);
+
+        buf = malloc0(capacity);
+        if (!buf)
+                return log_oom_debug();
+
+        rc = sym_Tss2_MU_TPM2B_PRIVATE_Marshal(private, buf, capacity, &used);
+        if (rc != TSS2_RC_SUCCESS)
+                return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+                                       "Failed to marshal private key: %s", sym_Tss2_RC_Decode(rc));
+
+        *ret_size = used;
+        *ret = TAKE_PTR(buf);
+        return 0;
+}
+
+/* Unmarshal a TPM2B_PRIVATE from 'data'. The whole input must be consumed: trailing bytes are treated as
+ * an error. Returns 0 and fills in 'ret_private' on success, a negative error (ENOTRECOVERABLE)
+ * otherwise. 'ret_private' is only written on success. */
+static int tpm2_unmarshal_private(const void *data, size_t size, TPM2B_PRIVATE *ret_private) {
+        TPM2B_PRIVATE private = {};
+        size_t offset = 0;
+        TSS2_RC rc;
+
+        assert(data || size == 0);
+        assert(ret_private);
+
+        rc = sym_Tss2_MU_TPM2B_PRIVATE_Unmarshal(data, size, &offset, &private);
+        if (rc != TSS2_RC_SUCCESS)
+                return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+                                       "Failed to unmarshal private key: %s", sym_Tss2_RC_Decode(rc));
+        /* Logged at debug level like every other failure in this helper; it is up to the caller to decide
+         * whether the problem is worth reporting at a higher level. */
+        if (offset != size)
+                return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+                                       "Garbage at end of private key marshal data.");
+
+        *ret_private = private;
+        return 0;
+}
+
+/* Marshal 'public' into a newly allocated buffer. On success returns 0 and hands the buffer and the number
+ * of marshalled bytes to the caller, who owns the buffer. */
+int tpm2_marshal_public(const TPM2B_PUBLIC *public, void **ret, size_t *ret_size) {
+        _cleanup_free_ void *buf = NULL;
+        size_t capacity, used = 0;
+        TSS2_RC rc;
+
+        assert(public);
+        assert(ret);
+        assert(ret_size);
+
+        /* The size of the in-memory structure serves as the upper bound for the marshalled form. */
+        capacity = sizeof(*public);
+
+        buf = malloc0(capacity);
+        if (!buf)
+                return log_oom_debug();
+
+        rc = sym_Tss2_MU_TPM2B_PUBLIC_Marshal(public, buf, capacity, &used);
+        if (rc != TSS2_RC_SUCCESS)
+                return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+                                       "Failed to marshal public key: %s", sym_Tss2_RC_Decode(rc));
+
+        *ret_size = used;
+        *ret = TAKE_PTR(buf);
+        return 0;
+}
+
+/* Unmarshal a TPM2B_PUBLIC from 'data'. The whole input must be consumed: trailing bytes are treated as
+ * an error. Returns 0 and fills in 'ret_public' on success, a negative error (ENOTRECOVERABLE) otherwise.
+ * 'ret_public' is only written on success. */
+static int tpm2_unmarshal_public(const void *data, size_t size, TPM2B_PUBLIC *ret_public) {
+        TPM2B_PUBLIC public = {};
+        size_t offset = 0;
+        TSS2_RC rc;
+
+        assert(data || size == 0);
+        assert(ret_public);
+
+        rc = sym_Tss2_MU_TPM2B_PUBLIC_Unmarshal(data, size, &offset, &public);
+        if (rc != TSS2_RC_SUCCESS)
+                return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+                                       "Failed to unmarshal public key: %s", sym_Tss2_RC_Decode(rc));
+        /* Logged at debug level like every other failure in this helper; it is up to the caller to decide
+         * whether the problem is worth reporting at a higher level. */
+        if (offset != size)
+                return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+                                       "Garbage at end of public key marshal data.");
+
+        *ret_public = public;
+        return 0;
+}
+
+/* Marshal 'nv_public' into a newly allocated buffer. On success returns 0 and hands the buffer and the
+ * number of marshalled bytes to the caller, who owns the buffer. */
+int tpm2_marshal_nv_public(const TPM2B_NV_PUBLIC *nv_public, void **ret, size_t *ret_size) {
+        _cleanup_free_ void *buf = NULL;
+        size_t capacity, used = 0;
+        TSS2_RC rc;
+
+        assert(nv_public);
+        assert(ret);
+        assert(ret_size);
+
+        /* The size of the in-memory structure serves as the upper bound for the marshalled form. */
+        capacity = sizeof(*nv_public);
+
+        buf = malloc0(capacity);
+        if (!buf)
+                return log_oom_debug();
+
+        rc = sym_Tss2_MU_TPM2B_NV_PUBLIC_Marshal(nv_public, buf, capacity, &used);
+        if (rc != TSS2_RC_SUCCESS)
+                return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+                                       "Failed to marshal NV public structure: %s", sym_Tss2_RC_Decode(rc));
+
+        *ret_size = used;
+        *ret = TAKE_PTR(buf);
+        return 0;
+}
+
+/* Unmarshal a TPM2B_NV_PUBLIC from 'data'. The whole input must be consumed: trailing bytes are treated
+ * as an error. Returns 0 and fills in 'ret_nv_public' on success, a negative error (ENOTRECOVERABLE)
+ * otherwise. 'ret_nv_public' is only written on success. */
+int tpm2_unmarshal_nv_public(const void *data, size_t size, TPM2B_NV_PUBLIC *ret_nv_public) {
+        TPM2B_NV_PUBLIC nv_public = {};
+        size_t offset = 0;
+        TSS2_RC rc;
+
+        assert(data || size == 0);
+        assert(ret_nv_public);
+
+        rc = sym_Tss2_MU_TPM2B_NV_PUBLIC_Unmarshal(data, size, &offset, &nv_public);
+        if (rc != TSS2_RC_SUCCESS)
+                return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+                                       "Failed to unmarshal NV public structure: %s", sym_Tss2_RC_Decode(rc));
+        /* Logged at debug level like every other failure in this helper; it is up to the caller to decide
+         * whether the problem is worth reporting at a higher level. */
+        if (offset != size)
+                return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+                                       "Garbage at end of NV public structure marshal data.");
+
+        *ret_nv_public = nv_public;
+        return 0;
+}
+
+/* Import an object, given by 'public', 'private' and 'seed', under 'parent' with Esys_Import(). The
+ * resulting private part is returned in 'ret_private'.
+ *
+ * 'encryption_key' and 'symmetric' must either both be set or both be NULL; in the latter case a
+ * TPMT_SYM_DEF_OBJECT with algorithm TPM2_ALG_NULL is passed. 'session' is optional; without it
+ * ESYS_TR_PASSWORD is used. Returns 0 on success or a negative error (ENOTRECOVERABLE) if the TPM call
+ * fails. */
+static int tpm2_import(
+ Tpm2Context *c,
+ const Tpm2Handle *parent,
+ const Tpm2Handle *session,
+ const TPM2B_PUBLIC *public,
+ const TPM2B_PRIVATE *private,
+ const TPM2B_ENCRYPTED_SECRET *seed,
+ const TPM2B_DATA *encryption_key,
+ const TPMT_SYM_DEF_OBJECT *symmetric,
+ TPM2B_PRIVATE **ret_private) {
+
+ TSS2_RC rc;
+
+ assert(c);
+ assert(parent);
+ /* Both or neither of the two must be provided. */
+ assert(!!encryption_key == !!symmetric);
+ assert(public);
+ assert(private);
+ assert(seed);
+ assert(ret_private);
+
+ log_debug("Importing key into TPM.");
+
+ /* The result is written straight into the caller's 'ret_private' by the TPM call. */
+ rc = sym_Esys_Import(
+ c->esys_context,
+ parent->esys_handle,
+ session ? session->esys_handle : ESYS_TR_PASSWORD,
+ ESYS_TR_NONE,
+ ESYS_TR_NONE,
+ encryption_key,
+ public,
+ private,
+ seed,
+ symmetric ?: &(TPMT_SYM_DEF_OBJECT){ .algorithm = TPM2_ALG_NULL, },
+ ret_private);
+ if (rc != TSS2_RC_SUCCESS)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+ "Failed to import key into TPM: %s", sym_Tss2_RC_Decode(rc));
+
+ return 0;
+}
+
+/* Read hash values from the specified PCR selection. Provides a Tpm2PCRValue array that contains all
+ * requested PCR values, in the order provided by the TPM. Normally, the provided pcr values will match
+ * exactly what is in the provided selection, but the TPM may ignore some selected PCRs (for example, if an
+ * unimplemented PCR index is requested), in which case those PCRs will be absent from the provided pcr
+ * values.
+ *
+ * Note that the collected values are passed through tpm2_sort_pcr_values() and checked with
+ * tpm2_pcr_values_valid() before they are returned. Returns 0 on success, in which case the caller owns
+ * the returned array, and a negative error otherwise. */
+int tpm2_pcr_read(
+ Tpm2Context *c,
+ const TPML_PCR_SELECTION *pcr_selection,
+ Tpm2PCRValue **ret_pcr_values,
+ size_t *ret_n_pcr_values) {
+
+ _cleanup_free_ Tpm2PCRValue *pcr_values = NULL;
+ size_t n_pcr_values = 0;
+ TSS2_RC rc;
+
+ assert(c);
+ assert(pcr_selection);
+ assert(ret_pcr_values);
+ assert(ret_n_pcr_values);
+
+ /* Work on a copy of the selection: whatever has been read is removed from it, and reading continues
+ * until nothing is left (or the TPM stops returning anything). */
+ TPML_PCR_SELECTION remaining = *pcr_selection;
+ while (!tpm2_tpml_pcr_selection_is_empty(&remaining)) {
+ _cleanup_(Esys_Freep) TPML_PCR_SELECTION *current_read = NULL;
+ _cleanup_(Esys_Freep) TPML_DIGEST *current_values = NULL;
+
+ tpm2_log_debug_tpml_pcr_selection(&remaining, "Reading PCR selection");
+
+ /* Unfortunately, PCR_Read will not return more than 8 values. */
+ rc = sym_Esys_PCR_Read(
+ c->esys_context,
+ ESYS_TR_NONE,
+ ESYS_TR_NONE,
+ ESYS_TR_NONE,
+ &remaining,
+ NULL,
+ &current_read,
+ &current_values);
+ if (rc != TSS2_RC_SUCCESS)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+ "Failed to read TPM2 PCRs: %s", sym_Tss2_RC_Decode(rc));
+
+ tpm2_log_debug_tpml_pcr_selection(current_read, "Read PCR selection");
+
+ /* Nothing was read in this round: stop, otherwise we would ask for the same PCRs forever. */
+ if (tpm2_tpml_pcr_selection_is_empty(current_read)) {
+ log_debug("TPM2 refused to read possibly unimplemented PCRs, ignoring.");
+ break;
+ }
+
+ /* Pair each PCR of the returned selection with the next digest of the returned digest list. */
+ unsigned i = 0;
+ FOREACH_PCR_IN_TPML_PCR_SELECTION(index, tpms, current_read) {
+ assert(i < current_values->count);
+ Tpm2PCRValue pcr_value = {
+ .index = index,
+ .hash = tpms->hash,
+ .value = current_values->digests[i++],
+ };
+
+ tpm2_log_debug_pcr_value(&pcr_value, /* msg= */ NULL);
+
+ if (!GREEDY_REALLOC_APPEND(pcr_values, n_pcr_values, &pcr_value, 1))
+ return log_oom_debug();
+ }
+ /* Every returned digest must have been consumed. */
+ assert(i == current_values->count);
+
+ /* Remove what we got from what is still to be read. */
+ tpm2_tpml_pcr_selection_sub(&remaining, current_read);
+ }
+
+ tpm2_sort_pcr_values(pcr_values, n_pcr_values);
+
+ if (!tpm2_pcr_values_valid(pcr_values, n_pcr_values))
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "PCR values read from TPM are not valid.");
+
+ *ret_pcr_values = TAKE_PTR(pcr_values);
+ *ret_n_pcr_values = n_pcr_values;
+
+ return 0;
+}
+
+/* Read the PCR value for each TPM2PCRValue entry in the array that does not have a value set. If all entries
+ * have an unset hash (i.e. hash == 0), this first detects the "best" PCR bank to use; otherwise, all entries
+ * must have a valid hash set. All entries must have a valid index. If this cannot read a PCR value for all
+ * appropriate entries, this returns an error.
+ *
+ * The array is updated in place: entries with an unset hash get the detected bank assigned, and entries
+ * without a value get the value read from the TPM. Returns 0 on success, a negative error otherwise. */
+int tpm2_pcr_read_missing_values(Tpm2Context *c, Tpm2PCRValue *pcr_values, size_t n_pcr_values) {
+        TPMI_ALG_HASH pcr_bank = 0;
+        int r;
+
+        assert(c);
+        assert(pcr_values || n_pcr_values == 0);
+
+        if (n_pcr_values > 0) {
+                size_t hash_count;
+                r = tpm2_pcr_values_hash_count(pcr_values, n_pcr_values, &hash_count);
+                if (r < 0)
+                        return log_debug_errno(r, "Could not get hash count from pcr values: %m");
+
+                /* A single hash alg that is 0 means no entry has a hash set: pick the bank ourselves. */
+                if (hash_count == 1 && pcr_values[0].hash == 0) {
+                        uint32_t mask;
+                        r = tpm2_pcr_values_to_mask(pcr_values, n_pcr_values, 0, &mask);
+                        if (r < 0)
+                                return r;
+
+                        r = tpm2_get_best_pcr_bank(c, mask, &pcr_bank);
+                        if (r < 0)
+                                return r;
+                }
+        }
+
+        FOREACH_ARRAY(v, pcr_values, n_pcr_values) {
+                if (v->hash == 0)
+                        v->hash = pcr_bank;
+
+                /* Entries that already carry a value are left alone. */
+                if (v->value.size > 0)
+                        continue;
+
+                /* Read this one PCR from the TPM. */
+                TPML_PCR_SELECTION selection;
+                r = tpm2_tpml_pcr_selection_from_pcr_values(v, 1, &selection, NULL, NULL);
+                if (r < 0)
+                        return r;
+
+                _cleanup_free_ Tpm2PCRValue *read_values = NULL;
+                size_t n_read_values;
+                r = tpm2_pcr_read(c, &selection, &read_values, &n_read_values);
+                if (r < 0)
+                        return r;
+
+                /* The hash is printed in hex, matching the "0x" prefix of the message. */
+                if (n_read_values == 0)
+                        return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+                                               "Could not read PCR hash 0x%" PRIx16 " index %u",
+                                               v->hash, v->index);
+
+                assert(n_read_values == 1);
+                assert(read_values[0].hash == v->hash);
+                assert(read_values[0].index == v->index);
+
+                v->value = read_values[0].value;
+        }
+
+        return 0;
+}
+
+static int tpm2_pcr_mask_good(
+                Tpm2Context *c,
+                TPMI_ALG_HASH bank,
+                uint32_t mask) {
+
+        _cleanup_free_ Tpm2PCRValue *values = NULL;
+        TPML_PCR_SELECTION sel;
+        size_t n_values;
+        int r;
+
+        assert(c);
+
+        /* Some systems have a working TPM2 chip the firmware never measures into, or measures only into a
+         * suboptimal bank. PCRs of such a bank stay at all zero or all 0xFF. Returns > 0 if at least one
+         * of the PCRs selected by 'mask' in 'bank' holds something else, 0 if none does, and a negative
+         * error if the PCRs cannot be read. */
+
+        tpm2_tpml_pcr_selection_from_mask(mask, bank, &sel);
+
+        r = tpm2_pcr_read(c, &sel, &values, &n_values);
+        if (r < 0)
+                return r;
+
+        FOREACH_ARRAY(v, values, n_values) {
+                if (memeqbyte(0x00, v->value.buffer, v->value.size))
+                        continue; /* never extended */
+
+                if (memeqbyte(0xFF, v->value.buffer, v->value.size))
+                        continue; /* all bits set, not a real measurement either */
+
+                /* One PCR with real content is enough to make us happy. */
+                return true;
+        }
+
+        return false;
+}
+
+static int tpm2_bank_has24(const TPMS_PCR_SELECTION *selection) {
+
+        assert(selection);
+
+        /* The selection bitmap is inspected in whole bytes below. */
+        assert_cc(TPM2_PCRS_MAX % 8 == 0);
+
+        /* As per https://trustedcomputinggroup.org/wp-content/uploads/TCG_PCClient_PFP_r1p05_v23_pub.pdf a
+         * TPM2 on a Client PC must have at least 24 PCRs. A bank with a shorter bitmap is skipped. */
+        if (selection->sizeofSelect < TPM2_PCRS_MAX/8) {
+                log_debug("Skipping TPM2 PCR bank %s with fewer than 24 PCRs.",
+                          strna(tpm2_hash_alg_to_string(selection->hash)));
+                return false;
+        }
+
+        /* Having room for 24 PCRs is not sufficient, all of them must actually be enabled in this bank,
+         * i.e. every byte of the bitmap must have all bits set. */
+        for (size_t j = 0; j < TPM2_PCRS_MAX/8; j++) {
+                if (selection->pcrSelect[j] == 0xFF)
+                        continue;
+
+                log_debug("TPM2 PCR bank %s has fewer than 24 PCR bits enabled, ignoring.",
+                          strna(tpm2_hash_alg_to_string(selection->hash)));
+                return false;
+        }
+
+        return true;
+}
+/* Picks the PCR bank (SHA256 or SHA1) to use for the PCRs selected by 'pcr_mask' and stores it in '*ret'.
+ *
+ * Only banks listed in c->capability_pcrs that pass tpm2_bank_has24() are considered. For each of them
+ * tpm2_pcr_mask_good() is consulted to learn whether the selected PCRs look initialized. If 'pcr_mask' is 0,
+ * SHA256 is returned right away without looking at any bank.
+ *
+ * Returns 0 on success, -EOPNOTSUPP if neither a SHA256 nor a SHA1 bank qualifies, or a negative error
+ * propagated from the helpers. */
+int tpm2_get_best_pcr_bank(
+ Tpm2Context *c,
+ uint32_t pcr_mask,
+ TPMI_ALG_HASH *ret) {
+
+ TPMI_ALG_HASH supported_hash = 0, hash_with_valid_pcr = 0; /* 0 means "none found so far" */
+ int r;
+
+ assert(c);
+ assert(ret);
+
+ if (pcr_mask == 0) {
+ log_debug("Asked to pick best PCR bank but no PCRs selected we could derive this from. Defaulting to SHA256.");
+ *ret = TPM2_ALG_SHA256; /* if no PCRs are selected this doesn't matter anyway... */
+ return 0;
+ }
+
+ FOREACH_TPMS_PCR_SELECTION_IN_TPML_PCR_SELECTION(selection, &c->capability_pcrs) {
+ TPMI_ALG_HASH hash = selection->hash;
+ int good;
+
+ /* For now we are only interested in the SHA1 and SHA256 banks */
+ if (!IN_SET(hash, TPM2_ALG_SHA256, TPM2_ALG_SHA1))
+ continue;
+
+ r = tpm2_bank_has24(selection);
+ if (r < 0)
+ return r;
+ if (!r)
+ continue;
+
+ good = tpm2_pcr_mask_good(c, hash, pcr_mask);
+ if (good < 0)
+ return good;
+
+ if (hash == TPM2_ALG_SHA256) {
+ supported_hash = TPM2_ALG_SHA256; /* SHA256 always overrides a previously seen SHA1 here */
+ if (good) {
+ /* Great, SHA256 is supported and has initialized PCR values, we are done. */
+ hash_with_valid_pcr = TPM2_ALG_SHA256;
+ break;
+ }
+ } else {
+ assert(hash == TPM2_ALG_SHA1);
+
+ if (supported_hash == 0)
+ supported_hash = TPM2_ALG_SHA1;
+
+ if (good && hash_with_valid_pcr == 0)
+ hash_with_valid_pcr = TPM2_ALG_SHA1;
+ }
+ }
+
+ /* We preferably pick SHA256, but only if its PCRs are initialized or neither the SHA1 nor the SHA256
+ * PCRs are initialized. If SHA256 is not supported but SHA1 is and its PCRs are too, we prefer
+ * SHA1.
+ *
+ * We log at LOG_NOTICE level whenever we end up using the SHA1 bank or when the PCRs we bind to are
+ * not initialized. */
+
+ if (hash_with_valid_pcr == TPM2_ALG_SHA256) {
+ assert(supported_hash == TPM2_ALG_SHA256);
+ log_debug("TPM2 device supports SHA256 PCR bank and SHA256 PCRs are valid, yay!");
+ *ret = TPM2_ALG_SHA256;
+ } else if (hash_with_valid_pcr == TPM2_ALG_SHA1) {
+ if (supported_hash == TPM2_ALG_SHA256)
+ log_notice("TPM2 device supports both SHA1 and SHA256 PCR banks, but only SHA1 PCRs are valid, falling back to SHA1 bank. This reduces the security level substantially.");
+ else {
+ assert(supported_hash == TPM2_ALG_SHA1);
+ log_notice("TPM2 device lacks support for SHA256 PCR bank, but SHA1 bank is supported and SHA1 PCRs are valid, falling back to SHA1 bank. This reduces the security level substantially.");
+ }
+
+ *ret = TPM2_ALG_SHA1;
+ } else if (supported_hash == TPM2_ALG_SHA256) {
+ log_notice("TPM2 device supports SHA256 PCR bank but none of the selected PCRs are valid! Firmware apparently did not initialize any of the selected PCRs. Proceeding anyway with SHA256 bank. PCR policy effectively unenforced!");
+ *ret = TPM2_ALG_SHA256;
+ } else if (supported_hash == TPM2_ALG_SHA1) {
+ log_notice("TPM2 device lacks support for SHA256 bank, but SHA1 bank is supported, but none of the selected PCRs are valid! Firmware apparently did not initialize any of the selected PCRs. Proceeding anyway with SHA1 bank. PCR policy effectively unenforced!");
+ *ret = TPM2_ALG_SHA1;
+ } else
+ return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "TPM2 module supports neither SHA1 nor SHA256 PCR banks, cannot operate.");
+
+ return 0;
+}
+/* Collects the hash algorithms of all PCR banks that are usable for the PCRs selected by 'pcr_mask'.
+ *
+ * Banks from c->capability_pcrs that pass tpm2_bank_has24() are sorted into "good" banks (those for which
+ * tpm2_pcr_mask_good() returned true) and "fallback" banks (those for which it returned false). If there is
+ * at least one good bank only the good ones are returned, otherwise the fallback ones are.
+ *
+ * On success the number of entries is returned and '*ret' is set to a newly allocated array the caller
+ * takes ownership of, or to NULL if no bank qualified (return value 0). On failure a negative error is
+ * returned. */
+int tpm2_get_good_pcr_banks(
+ Tpm2Context *c,
+ uint32_t pcr_mask,
+ TPMI_ALG_HASH **ret) {
+
+ _cleanup_free_ TPMI_ALG_HASH *good_banks = NULL, *fallback_banks = NULL;
+ size_t n_good_banks = 0, n_fallback_banks = 0;
+ int r;
+
+ assert(c);
+ assert(ret);
+
+ FOREACH_TPMS_PCR_SELECTION_IN_TPML_PCR_SELECTION(selection, &c->capability_pcrs) {
+ TPMI_ALG_HASH hash = selection->hash;
+
+ /* Let's see if this bank is superficially OK, i.e. has at least 24 enabled registers */
+ r = tpm2_bank_has24(selection);
+ if (r < 0)
+ return r;
+ if (!r)
+ continue;
+
+ /* Let's now see if this bank has any of the selected PCRs actually initialized */
+ r = tpm2_pcr_mask_good(c, hash, pcr_mask);
+ if (r < 0)
+ return r;
+
+ if (n_good_banks + n_fallback_banks >= INT_MAX) /* the count is returned as int below */
+ return log_debug_errno(SYNTHETIC_ERRNO(E2BIG), "Too many good TPM2 banks?");
+
+ if (r) {
+ if (!GREEDY_REALLOC(good_banks, n_good_banks+1))
+ return log_oom_debug();
+
+ good_banks[n_good_banks++] = hash;
+ } else {
+ if (!GREEDY_REALLOC(fallback_banks, n_fallback_banks+1))
+ return log_oom_debug();
+
+ fallback_banks[n_fallback_banks++] = hash;
+ }
+ }
+
+ /* Preferably, use the good banks (i.e. the ones the PCR values are actually initialized so
+ * far). Otherwise use the fallback banks (i.e. which exist and are enabled, but so far not used). */
+ if (n_good_banks > 0) {
+ log_debug("Found %zu fully initialized TPM2 banks.", n_good_banks);
+ *ret = TAKE_PTR(good_banks);
+ return (int) n_good_banks;
+ }
+ if (n_fallback_banks > 0) {
+ log_debug("Found %zu enabled but un-initialized TPM2 banks.", n_fallback_banks);
+ *ret = TAKE_PTR(fallback_banks);
+ return (int) n_fallback_banks;
+ }
+
+ /* No suitable banks found. */
+ *ret = NULL;
+ return 0;
+}
+/* Same as tpm2_get_good_pcr_banks(), but returns the banks as a string list in '*ret'. Each entry is the
+ * digest name reported by OpenSSL's EVP_MD_name() for the bank's hash algorithm, converted to lower case.
+ *
+ * Returns 0 on success (note: not the number of entries), -ENOTRECOVERABLE if a bank uses an algorithm
+ * that has no string name or is unknown to OpenSSL, -EOPNOTSUPP if built without OpenSSL support, or
+ * another negative error. */
+int tpm2_get_good_pcr_banks_strv(
+ Tpm2Context *c,
+ uint32_t pcr_mask,
+ char ***ret) {
+
+#if HAVE_OPENSSL
+ _cleanup_free_ TPMI_ALG_HASH *algs = NULL;
+ _cleanup_strv_free_ char **l = NULL;
+ int n_algs;
+
+ assert(c);
+ assert(ret);
+
+ n_algs = tpm2_get_good_pcr_banks(c, pcr_mask, &algs);
+ if (n_algs < 0)
+ return n_algs;
+
+ FOREACH_ARRAY(a, algs, n_algs) {
+ _cleanup_free_ char *n = NULL;
+ const EVP_MD *implementation;
+ const char *salg;
+
+ salg = tpm2_hash_alg_to_string(*a);
+ if (!salg)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "TPM2 operates with unknown PCR algorithm, can't measure.");
+
+ implementation = EVP_get_digestbyname(salg);
+ if (!implementation)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "TPM2 operates with unsupported PCR algorithm, can't measure.");
+
+ n = strdup(ASSERT_PTR(EVP_MD_name(implementation)));
+ if (!n)
+ return log_oom_debug();
+
+ ascii_strlower(n); /* OpenSSL uses uppercase digest names, we prefer them lower case. */
+
+ if (strv_consume(&l, TAKE_PTR(n)) < 0)
+ return log_oom_debug();
+ }
+
+ *ret = TAKE_PTR(l); /* NULL if no bank was found */
+ return 0;
+#else /* HAVE_OPENSSL */
+ return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "OpenSSL support is disabled.");
+#endif
+}
+
+/* Hash data into the digest.
+ *
+ * If 'extend' is true, the hashing operation starts with the existing digest hash (and the digest is
+ * required to have a hash and its size must be correct). If 'extend' is false, the digest size is
+ * initialized to the correct size for 'alg' and the hashing operation does not include any existing digest
+ * hash. If 'extend' is false and no data is provided, the digest is initialized to a zero digest.
+ *
+ * On success, the digest hash will be updated with the hashing operation result and the digest size will be
+ * correct for 'alg'.
+ *
+ * This currently only provides SHA256, so 'alg' must be TPM2_ALG_SHA256.
+ *
+ * The 'data' array holds 'n_data' buffers that are hashed in array order; it may be NULL if 'n_data' is 0.
+ *
+ * Returns 0 on success, or -EOPNOTSUPP if 'alg' is not TPM2_ALG_SHA256 or if 'extend' is true and the
+ * digest size is not SHA256_DIGEST_SIZE. */
+int tpm2_digest_many(
+ TPMI_ALG_HASH alg,
+ TPM2B_DIGEST *digest,
+ const struct iovec data[],
+ size_t n_data,
+ bool extend) {
+
+ struct sha256_ctx ctx;
+
+ assert(digest);
+ assert(data || n_data == 0);
+
+ if (alg != TPM2_ALG_SHA256)
+ return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "Hash algorithm not supported: 0x%x", alg);
+
+ if (extend && digest->size != SHA256_DIGEST_SIZE)
+ return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "Digest size 0x%x, require 0x%x",
+ digest->size, (unsigned)SHA256_DIGEST_SIZE);
+
+ /* Since we're hardcoding SHA256 (for now), we can check this at compile time. */
+ assert_cc(sizeof(digest->buffer) >= SHA256_DIGEST_SIZE);
+
+ CLEANUP_ERASE(ctx); /* NOTE(review): presumably wipes the hash state when the function returns */
+
+ sha256_init_ctx(&ctx);
+
+ if (extend)
+ sha256_process_bytes(digest->buffer, digest->size, &ctx);
+ else {
+ *digest = (TPM2B_DIGEST) {
+ .size = SHA256_DIGEST_SIZE,
+ };
+ if (n_data == 0) /* If not extending and no data, return zero hash */
+ return 0;
+ }
+
+ FOREACH_ARRAY(d, data, n_data)
+ sha256_process_bytes(d->iov_base, d->iov_len, &ctx);
+
+ sha256_finish_ctx(&ctx, digest->buffer);
+
+ return 0;
+}
+
+/* Variant of tpm2_digest_many() that takes its input as an array of TPM2B_DIGEST rather than as an array
+ * of iovecs. The input digests may be of any size; only the first 'size' bytes of each are hashed.
+ * Returns whatever tpm2_digest_many() returns, or an out-of-memory error. */
+int tpm2_digest_many_digests(
+                TPMI_ALG_HASH alg,
+                TPM2B_DIGEST *digest,
+                const TPM2B_DIGEST data[],
+                size_t n_data,
+                bool extend) {
+
+        _cleanup_free_ struct iovec *vec = NULL;
+        size_t k = 0;
+
+        assert(data || n_data == 0);
+
+        vec = new(struct iovec, n_data);
+        if (!vec)
+                return log_oom_debug();
+
+        /* Wrap each input digest in an iovec, the buffers themselves are not copied. */
+        FOREACH_ARRAY(d, data, n_data)
+                vec[k++] = IOVEC_MAKE((void*) d->buffer, d->size);
+
+        return tpm2_digest_many(alg, digest, vec, n_data, extend);
+}
+
+/* This hashes the provided pin into a digest value, but also verifies that the final byte is not 0, because
+ * the TPM specification Part 1 ("Architecture") section Authorization Values (subsection "Authorization Size
+ * Convention") states "Trailing octets of zero are to be removed from any string before it is used as an
+ * authValue". Since the TPM doesn't know if the auth value is a "string" or just a hash digest, any hash
+ * digest that randomly happens to end in 0 must have the final 0(s) trimmed.
+ *
+ * This is required at 2 points. First, when setting the authValue during creation of new sealed objects, in
+ * tpm2_seal(). This only applies to newly created objects, of course. Second, when using a previously
+ * created sealed object that has an authValue set, we use the sealed objects as the session bind key. This
+ * requires calling SetAuth so tpm2-tss can correctly calculate the HMAC to use for the encryption session.
+ *
+ * TPM implementations will perform the trimming for any authValue for existing sealed objects, so the
+ * tpm2-tss library must also perform the trimming before HMAC calculation, but it does not yet; this bug is
+ * open to add the trimming: https://github.com/tpm2-software/tpm2-tss/issues/2664
+ *
+ * Until our minimum tpm2-tss version contains a fix for that bug, we must perform the trimming
+ * ourselves. Note that since we are trimming, which is exactly what a TPM implementation would do, this will
+ * work for both existing objects with a authValue ending in 0(s) as well as new sealed objects we create,
+ * which we will trim the 0(s) from before sending to the TPM.
+ */
+static void tpm2_trim_auth_value(TPM2B_AUTH *auth) {
+        assert(auth);
+
+        /* Determine the length that remains once all trailing zero bytes are dropped. */
+        uint16_t n = auth->size;
+        while (n > 0 && auth->buffer[n - 1] == 0)
+                n--;
+
+        /* Nothing to drop, leave the value alone and stay silent. */
+        if (n == auth->size)
+                return;
+
+        auth->size = n;
+        log_debug("authValue ends in 0, trimming as required by the TPM2 specification Part 1 section 'HMAC Computation' authValue Note 2.");
+}
+
+/* Derives the TPM authValue for 'pin': the PIN string is hashed with 'hash' (via tpm2_digest_buffer()) and
+ * trailing zero bytes are trimmed from the result (see tpm2_trim_auth_value()).
+ *
+ * On success the authValue is stored in '*ret_auth' and 0 is returned; on failure a negative error is
+ * returned and '*ret_auth' is left untouched. The caller is responsible for erasing '*ret_auth' once it is
+ * no longer needed. */
+int tpm2_get_pin_auth(TPMI_ALG_HASH hash, const char *pin, TPM2B_AUTH *ret_auth) {
+        TPM2B_AUTH auth = {};
+        int r;
+
+        assert(pin);
+        assert(ret_auth);
+
+        /* The local copy is derived from the PIN, make sure it does not linger on the stack on any return
+         * path (same precaution tpm2_set_auth() takes for its copy). */
+        CLEANUP_ERASE(auth);
+
+        r = tpm2_digest_buffer(hash, &auth, pin, strlen(pin), /* extend= */ false);
+        if (r < 0)
+                return r;
+
+        tpm2_trim_auth_value(&auth);
+
+        *ret_auth = TAKE_STRUCT(auth);
+
+        return 0;
+}
+
+/* Sets 'auth' as authorization value on 'handle' via Esys_TR_SetAuth(). A NULL 'auth' is a NOP.
+ * Returns 0 on success, -ENOTRECOVERABLE if the call fails. */
+int tpm2_set_auth_binary(Tpm2Context *c, const Tpm2Handle *handle, const TPM2B_AUTH *auth) {
+        assert(c);
+        assert(handle);
+
+        if (auth) {
+                TSS2_RC rc = sym_Esys_TR_SetAuth(c->esys_context, handle->esys_handle, auth);
+                if (rc != TSS2_RC_SUCCESS)
+                        return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+                                               "Failed to load PIN in TPM: %s", sym_Tss2_RC_Decode(rc));
+        }
+
+        return 0;
+}
+
+/* Turns 'pin' into an authValue (SHA256 based, see tpm2_get_pin_auth()) and sets it on 'handle'. A NULL
+ * 'pin' is a NOP. Returns 0 on success, a negative error otherwise. */
+int tpm2_set_auth(Tpm2Context *c, const Tpm2Handle *handle, const char *pin) {
+        TPM2B_AUTH pin_auth = {};
+        int r;
+
+        assert(c);
+        assert(handle);
+
+        if (!pin)
+                return 0;
+
+        /* The derived value is sensitive, erase it when leaving this function. */
+        CLEANUP_ERASE(pin_auth);
+
+        r = tpm2_get_pin_auth(TPM2_ALG_SHA256, pin, &pin_auth);
+        if (r >= 0)
+                r = tpm2_set_auth_binary(c, handle, &pin_auth);
+
+        return r;
+}
+
+/* Returns true if 'session' has both the decrypt and the encrypt session attribute set. If the attributes
+ * cannot be queried the session is treated as not being an encryption session. */
+static bool tpm2_is_encryption_session(Tpm2Context *c, const Tpm2Handle *session) {
+        TPMA_SESSION attrs = 0;
+
+        assert(c);
+        assert(session);
+
+        if (sym_Esys_TRSess_GetAttributes(c->esys_context, session->esys_handle, &attrs) != TSS2_RC_SUCCESS)
+                return false;
+
+        if (!(attrs & TPMA_SESSION_DECRYPT))
+                return false;
+
+        return (attrs & TPMA_SESSION_ENCRYPT) != 0;
+}
+/* Starts an HMAC session that is used for parameter encryption. 'primary' is passed as the key the session
+ * salt is encrypted with; if 'bind_key' is non-NULL the session is additionally bound to it. After the
+ * session is started, the decrypt, encrypt and continue-session attributes are set on it.
+ *
+ * On success the new session handle is stored in '*ret_session' and 0 is returned. Returns
+ * -ENOTRECOVERABLE if the TPM refuses to start or configure the session, or the error of
+ * tpm2_handle_new(). */
+int tpm2_make_encryption_session(
+ Tpm2Context *c,
+ const Tpm2Handle *primary,
+ const Tpm2Handle *bind_key,
+ Tpm2Handle **ret_session) {
+
+ const TPMA_SESSION sessionAttributes = TPMA_SESSION_DECRYPT | TPMA_SESSION_ENCRYPT |
+ TPMA_SESSION_CONTINUESESSION;
+ TSS2_RC rc;
+ int r;
+
+ assert(c);
+ assert(primary);
+ assert(ret_session);
+
+ log_debug("Starting HMAC encryption session.");
+
+ /* Start a salted, unbound HMAC session with a well-known key (e.g. primary key) as tpmKey, which
+ * means that the random salt will be encrypted with the well-known key. That way, only the TPM can
+ * recover the salt, which is then used for key derivation. (If the caller passed a bind key the
+ * session is bound to it, see below.) */
+ _cleanup_(tpm2_handle_freep) Tpm2Handle *session = NULL;
+ r = tpm2_handle_new(c, &session);
+ if (r < 0)
+ return r;
+
+ rc = sym_Esys_StartAuthSession(
+ c->esys_context,
+ primary->esys_handle,
+ bind_key ? bind_key->esys_handle : ESYS_TR_NONE,
+ ESYS_TR_NONE,
+ ESYS_TR_NONE,
+ ESYS_TR_NONE,
+ NULL,
+ TPM2_SE_HMAC,
+ &SESSION_TEMPLATE_SYM_AES_128_CFB,
+ TPM2_ALG_SHA256,
+ &session->esys_handle);
+ if (rc != TSS2_RC_SUCCESS)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+ "Failed to open session in TPM: %s", sym_Tss2_RC_Decode(rc));
+
+ /* Enable parameter encryption/decryption with AES in CFB mode. Together with HMAC digests (which are
+ * always used for sessions), this provides confidentiality, integrity and replay protection for
+ * operations that use this session. */
+ rc = sym_Esys_TRSess_SetAttributes(c->esys_context, session->esys_handle, sessionAttributes, 0xff);
+ if (rc != TSS2_RC_SUCCESS)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+ "Failed to configure TPM session: %s", sym_Tss2_RC_Decode(rc));
+
+ *ret_session = TAKE_PTR(session);
+
+ return 0;
+}
+/* Starts a policy session. 'primary' is passed as the key the session salt is encrypted with, and
+ * 'encryption_session' is passed along as session of the StartAuthSession command itself.
+ *
+ * On success the new session handle is stored in '*ret_session' and 0 is returned. Returns -EINVAL if
+ * 'encryption_session' is not an encryption session (see tpm2_is_encryption_session()), -ENOTRECOVERABLE
+ * if the TPM refuses to start the session, or the error of tpm2_handle_new(). */
+int tpm2_make_policy_session(
+ Tpm2Context *c,
+ const Tpm2Handle *primary,
+ const Tpm2Handle *encryption_session,
+ Tpm2Handle **ret_session) {
+
+ TSS2_RC rc;
+ int r;
+
+ assert(c);
+ assert(primary);
+ assert(encryption_session);
+ assert(ret_session);
+
+ if (!tpm2_is_encryption_session(c, encryption_session))
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+ "Missing encryption session");
+
+ log_debug("Starting policy session.");
+
+ _cleanup_(tpm2_handle_freep) Tpm2Handle *session = NULL;
+ r = tpm2_handle_new(c, &session);
+ if (r < 0)
+ return r;
+
+ rc = sym_Esys_StartAuthSession(
+ c->esys_context,
+ primary->esys_handle,
+ ESYS_TR_NONE,
+ encryption_session->esys_handle,
+ ESYS_TR_NONE,
+ ESYS_TR_NONE,
+ NULL,
+ TPM2_SE_POLICY,
+ &SESSION_TEMPLATE_SYM_AES_128_CFB,
+ TPM2_ALG_SHA256,
+ &session->esys_handle);
+ if (rc != TSS2_RC_SUCCESS)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+ "Failed to open session in TPM: %s", sym_Tss2_RC_Decode(rc));
+
+ *ret_session = TAKE_PTR(session);
+
+ return 0;
+}
+
+/* Looks up a signature in the JSON signature object 'v'.
+ *
+ * The object is expected to map PCR bank names to arrays of entries; each entry is an object with the
+ * fields "pcrs" (PCR index list), "pkfp" (hex public key fingerprint), "pol" (hex policy digest) and "sig"
+ * (base64 signature). The bank is taken from the first element of 'pcr_selection'. The first entry whose
+ * PCR mask, fingerprint ('fp'/'fp_size') and policy digest ('policy'/'policy_size') all match wins; entries
+ * lacking one of the fields are skipped.
+ *
+ * Returns the result of json_variant_unbase64() on the matching entry's "sig" field (which fills in
+ * 'ret_signature'/'ret_signature_size'), -ENXIO if the bank or a matching entry is missing, -EINVAL if the
+ * JSON data is not structured as expected, -EOPNOTSUPP if the bank is unknown or OpenSSL support is
+ * disabled, or another negative error if a field cannot be decoded. */
+static int find_signature(
+                JsonVariant *v,
+                const TPML_PCR_SELECTION *pcr_selection,
+                const void *fp,
+                size_t fp_size,
+                const void *policy,
+                size_t policy_size,
+                void *ret_signature,
+                size_t *ret_signature_size) {
+
+#if HAVE_OPENSSL
+        JsonVariant *b, *i;
+        const char *k;
+        int r;
+
+        /* Searches for a signature blob in the specified JSON object. Search keys are PCR bank, PCR mask,
+         * public key, and policy digest. */
+
+        if (!json_variant_is_object(v))
+                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Signature is not a JSON object.");
+
+        uint16_t pcr_bank = pcr_selection->pcrSelections[0].hash;
+        uint32_t pcr_mask = tpm2_tpml_pcr_selection_to_mask(pcr_selection, pcr_bank);
+
+        k = tpm2_hash_alg_to_string(pcr_bank);
+        if (!k)
+                return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Don't know PCR bank %" PRIu16, pcr_bank);
+
+        /* First, find field by bank */
+        b = json_variant_by_key(v, k);
+        if (!b)
+                return log_debug_errno(SYNTHETIC_ERRNO(ENXIO), "Signature lacks data for PCR bank '%s'.", k);
+
+        if (!json_variant_is_array(b))
+                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Bank data is not a JSON array.");
+
+        /* Now iterate through all signatures known for this bank */
+        JSON_VARIANT_ARRAY_FOREACH(i, b) {
+                _cleanup_free_ void *fpj_data = NULL, *polj_data = NULL;
+                JsonVariant *maskj, *fpj, *sigj, *polj;
+                size_t fpj_size, polj_size;
+                uint32_t parsed_mask;
+
+                if (!json_variant_is_object(i))
+                        return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Bank data element is not a JSON object");
+
+                /* Check if the PCR mask matches our expectations */
+                maskj = json_variant_by_key(i, "pcrs");
+                if (!maskj)
+                        continue;
+
+                /* Include the error in the message, like the other decode failures below do. */
+                r = tpm2_parse_pcr_json_array(maskj, &parsed_mask);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to parse JSON PCR mask: %m");
+
+                if (parsed_mask != pcr_mask)
+                        continue; /* Not for this PCR mask */
+
+                /* Then check if this is for the public key we operate with */
+                fpj = json_variant_by_key(i, "pkfp");
+                if (!fpj)
+                        continue;
+
+                r = json_variant_unhex(fpj, &fpj_data, &fpj_size);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to decode fingerprint in JSON data: %m");
+
+                if (memcmp_nn(fp, fp_size, fpj_data, fpj_size) != 0)
+                        continue; /* Not for this public key */
+
+                /* Finally, check if this is for the PCR policy we expect this to be */
+                polj = json_variant_by_key(i, "pol");
+                if (!polj)
+                        continue;
+
+                r = json_variant_unhex(polj, &polj_data, &polj_size);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to decode policy hash JSON data: %m");
+
+                if (memcmp_nn(policy, policy_size, polj_data, polj_size) != 0)
+                        continue;
+
+                /* This entry matches all our expectations, now return the signature included in it */
+                sigj = json_variant_by_key(i, "sig");
+                if (!sigj)
+                        continue;
+
+                return json_variant_unbase64(sigj, ret_signature, ret_signature_size);
+        }
+
+        return log_debug_errno(SYNTHETIC_ERRNO(ENXIO), "Couldn't find signature for this PCR bank, PCR index and public key.");
+#else /* HAVE_OPENSSL */
+        return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "OpenSSL support is disabled.");
+#endif
+}
+
+/* Calculates the "name" of a public key.
+ *
+ * As specified in TPM2 spec "Part 1: Architecture", a key's "name" is its nameAlg value followed by a hash
+ * of its TPM2 public area, all properly marshalled. This allows a key's "name" to be dependent not only on
+ * the key fingerprint, but also on the TPM2-specific fields that associated with the key (i.e. all fields in
+ * TPMT_PUBLIC). Note that this means an existing key may not change any of its TPMT_PUBLIC fields, since
+ * that would also change the key name.
+ *
+ * Since we (currently) hardcode to always using SHA256 for hashing, this returns an error if the public key
+ * nameAlg is not TPM2_ALG_SHA256.
+ *
+ * This is a pure calculation, no TPM context is involved. On success the name is stored in '*ret_name' and
+ * 0 is returned. Returns -EOPNOTSUPP for an unsupported nameAlg, -ENOTRECOVERABLE if marshalling fails, or
+ * another negative error (e.g. if the TPM2 libraries cannot be loaded). */
+int tpm2_calculate_pubkey_name(const TPMT_PUBLIC *public, TPM2B_NAME *ret_name) {
+ TSS2_RC rc;
+ int r;
+
+ assert(public);
+ assert(ret_name);
+
+ r = dlopen_tpm2();
+ if (r < 0)
+ return log_debug_errno(r, "TPM2 support not installed: %m");
+
+ if (public->nameAlg != TPM2_ALG_SHA256)
+ return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "Unsupported nameAlg: 0x%x",
+ public->nameAlg);
+
+ _cleanup_free_ uint8_t *buf = NULL;
+ size_t size = 0;
+
+ buf = (uint8_t*) new(TPMT_PUBLIC, 1); /* NOTE(review): assumes the marshalled form never exceeds sizeof(TPMT_PUBLIC) */
+ if (!buf)
+ return log_oom_debug();
+
+ rc = sym_Tss2_MU_TPMT_PUBLIC_Marshal(public, buf, sizeof(TPMT_PUBLIC), &size);
+ if (rc != TSS2_RC_SUCCESS)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+ "Failed to marshal public key: %s", sym_Tss2_RC_Decode(rc));
+
+ TPM2B_DIGEST digest = {};
+ r = tpm2_digest_buffer(TPM2_ALG_SHA256, &digest, buf, size, /* extend= */ false);
+ if (r < 0)
+ return r;
+
+ TPMT_HA ha = {
+ .hashAlg = TPM2_ALG_SHA256,
+ };
+ assert(digest.size <= sizeof(ha.digest.sha256));
+ memcpy_safe(ha.digest.sha256, digest.buffer, digest.size);
+
+ TPM2B_NAME name;
+ size = 0; /* reused as marshalling offset, must start at 0 again */
+ rc = sym_Tss2_MU_TPMT_HA_Marshal(&ha, name.name, sizeof(name.name), &size);
+ if (rc != TSS2_RC_SUCCESS)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+ "Failed to marshal key name: %s", sym_Tss2_RC_Decode(rc));
+ name.size = size;
+
+ tpm2_log_debug_name(&name, "Calculated public key name");
+
+ *ret_name = name;
+
+ return 0;
+}
+
+/* Queries the "name" of a key from the TPM library.
+ *
+ * See tpm2_calculate_pubkey_name() for what a key's "name" is.
+ *
+ * 'handle' must refer to a key that is already present in the TPM, either a public key only or a
+ * public/private keypair. On success the name is returned in '*ret_name'; it is allocated by the ESYS
+ * library and ownership is passed to the caller. Returns 0 on success, -ENOTRECOVERABLE on failure. */
+static int tpm2_get_name(
+                Tpm2Context *c,
+                const Tpm2Handle *handle,
+                TPM2B_NAME **ret_name) {
+
+        _cleanup_(Esys_Freep) TPM2B_NAME *object_name = NULL;
+
+        assert(c);
+        assert(handle);
+        assert(ret_name);
+
+        TSS2_RC rc = sym_Esys_TR_GetName(c->esys_context, handle->esys_handle, &object_name);
+        if (rc != TSS2_RC_SUCCESS)
+                return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+                                       "Failed to get name of public key from TPM: %s", sym_Tss2_RC_Decode(rc));
+
+        tpm2_log_debug_name(object_name, "Object name");
+
+        *ret_name = TAKE_PTR(object_name);
+        return 0;
+}
+/* Calculates the "name" of an NV index from its public area: the marshalled TPMS_NV_PUBLIC is hashed with
+ * SHA256 and the result is marshalled as TPMT_HA (i.e. prefixed with the hash algorithm). This mirrors what
+ * tpm2_calculate_pubkey_name() does for keys.
+ *
+ * On success the name is stored in '*ret_name' and 0 is returned. Returns -EOPNOTSUPP if the NV index's
+ * nameAlg is not TPM2_ALG_SHA256, -ENOTRECOVERABLE if marshalling fails, or another negative error. */
+int tpm2_calculate_nv_index_name(const TPMS_NV_PUBLIC *nvpublic, TPM2B_NAME *ret_name) {
+ TSS2_RC rc;
+ int r;
+
+ assert(nvpublic);
+ assert(ret_name);
+
+ r = dlopen_tpm2();
+ if (r < 0)
+ return log_debug_errno(r, "TPM2 support not installed: %m");
+
+ if (nvpublic->nameAlg != TPM2_ALG_SHA256)
+ return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "Unsupported nameAlg: 0x%x",
+ nvpublic->nameAlg);
+
+ _cleanup_free_ uint8_t *buf = NULL;
+ size_t size = 0;
+
+ buf = (uint8_t*) new(TPMS_NV_PUBLIC, 1); /* NOTE(review): assumes the marshalled form never exceeds sizeof(TPMS_NV_PUBLIC) */
+ if (!buf)
+ return log_oom_debug();
+
+ rc = sym_Tss2_MU_TPMS_NV_PUBLIC_Marshal(nvpublic, buf, sizeof(TPMS_NV_PUBLIC), &size);
+ if (rc != TSS2_RC_SUCCESS)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+ "Failed to marshal NV index: %s", sym_Tss2_RC_Decode(rc));
+
+ TPM2B_DIGEST digest = {};
+ r = tpm2_digest_buffer(TPM2_ALG_SHA256, &digest, buf, size, /* extend= */ false);
+ if (r < 0)
+ return r;
+
+ TPMT_HA ha = {
+ .hashAlg = TPM2_ALG_SHA256,
+ };
+ assert(digest.size <= sizeof(ha.digest.sha256));
+ memcpy_safe(ha.digest.sha256, digest.buffer, digest.size);
+
+ TPM2B_NAME name;
+ size = 0; /* reused as marshalling offset, must start at 0 again */
+ rc = sym_Tss2_MU_TPMT_HA_Marshal(&ha, name.name, sizeof(name.name), &size);
+ if (rc != TSS2_RC_SUCCESS)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+ "Failed to marshal NV index name: %s", sym_Tss2_RC_Decode(rc));
+ name.size = size;
+
+ tpm2_log_debug_name(&name, "Calculated NV index name");
+
+ *ret_name = name;
+
+ return 0;
+}
+
+/* Extend 'digest' with the PolicyAuthValue calculated hash.
+ *
+ * This is a pure calculation: the marshalled TPM2_CC_PolicyAuthValue command code is hashed into the
+ * existing digest, which must already be SHA256_DIGEST_SIZE bytes in size (this is asserted).
+ *
+ * Returns 0 on success, -ENOTRECOVERABLE if marshalling the command code fails, or another negative
+ * error. */
+int tpm2_calculate_policy_auth_value(TPM2B_DIGEST *digest) {
+ TPM2_CC command = TPM2_CC_PolicyAuthValue;
+ TSS2_RC rc;
+ int r;
+
+ assert(digest);
+ assert(digest->size == SHA256_DIGEST_SIZE);
+
+ r = dlopen_tpm2();
+ if (r < 0)
+ return log_debug_errno(r, "TPM2 support not installed: %m");
+
+ uint8_t buf[sizeof(command)];
+ size_t offset = 0;
+
+ rc = sym_Tss2_MU_TPM2_CC_Marshal(command, buf, sizeof(buf), &offset);
+ if (rc != TSS2_RC_SUCCESS)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+ "Failed to marshal PolicyAuthValue command: %s", sym_Tss2_RC_Decode(rc));
+
+ if (offset != sizeof(command)) /* the marshalled command code must fill the buffer exactly */
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+ "Offset 0x%zx wrong after marshalling PolicyAuthValue command", offset);
+
+ r = tpm2_digest_buffer(TPM2_ALG_SHA256, digest, buf, offset, /* extend= */ true);
+ if (r < 0)
+ return r;
+
+ tpm2_log_debug_digest(digest, "PolicyAuthValue calculated digest");
+
+ return 0;
+}
+/* Submits a PolicyAuthValue policy item to the TPM for the policy session 'session', then hands
+ * 'ret_policy_digest' to tpm2_get_policy_digest() to retrieve the session's resulting policy digest.
+ *
+ * Returns -ENOTRECOVERABLE if the TPM rejects the policy item, otherwise whatever
+ * tpm2_get_policy_digest() returns. */
+int tpm2_policy_auth_value(
+ Tpm2Context *c,
+ const Tpm2Handle *session,
+ TPM2B_DIGEST **ret_policy_digest) {
+
+ TSS2_RC rc;
+
+ assert(c);
+ assert(session);
+
+ log_debug("Submitting AuthValue policy.");
+
+ rc = sym_Esys_PolicyAuthValue(
+ c->esys_context,
+ session->esys_handle,
+ ESYS_TR_NONE,
+ ESYS_TR_NONE,
+ ESYS_TR_NONE);
+ if (rc != TSS2_RC_SUCCESS)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+ "Failed to add authValue policy to TPM: %s",
+ sym_Tss2_RC_Decode(rc));
+
+ return tpm2_get_policy_digest(c, session, ret_policy_digest);
+}
+/* Extend 'digest' with the PolicyAuthorizeNV calculated hash.
+ *
+ * This is a pure calculation: the marshalled TPM2_CC_PolicyAuthorizeNV command code followed by the name of
+ * the NV index described by 'public_info' is hashed into the existing digest, which must already be
+ * SHA256_DIGEST_SIZE bytes in size (this is asserted). The name is calculated with the TPMA_NV_WRITTEN
+ * attribute set, regardless of whether the caller set it; 'public_info' itself is not modified.
+ *
+ * Returns 0 on success, -ENOTRECOVERABLE if marshalling the command code fails, or another negative
+ * error. */
+int tpm2_calculate_policy_authorize_nv(
+ const TPM2B_NV_PUBLIC *public_info,
+ TPM2B_DIGEST *digest) {
+ TPM2_CC command = TPM2_CC_PolicyAuthorizeNV;
+ TSS2_RC rc;
+ int r;
+
+ assert(public_info);
+ assert(digest);
+ assert(digest->size == SHA256_DIGEST_SIZE);
+
+ r = dlopen_tpm2();
+ if (r < 0)
+ return log_debug_errno(r, "TPM2 support not installed: %m");
+
+ uint8_t buf[sizeof(command)];
+ size_t offset = 0;
+
+ rc = sym_Tss2_MU_TPM2_CC_Marshal(command, buf, sizeof(buf), &offset);
+ if (rc != TSS2_RC_SUCCESS)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+ "Failed to marshal PolicyAuthorizeNV command: %s", sym_Tss2_RC_Decode(rc));
+
+ if (offset != sizeof(command)) /* the marshalled command code must fill the buffer exactly */
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+ "Offset 0x%zx wrong after marshalling PolicyAuthorizeNV command", offset);
+
+ TPM2B_NV_PUBLIC public_info_copy = *public_info; /* Make a copy, since we must set TPMA_NV_WRITTEN for the calculation */
+ public_info_copy.nvPublic.attributes |= TPMA_NV_WRITTEN;
+
+ TPM2B_NAME name = {};
+ r = tpm2_calculate_nv_index_name(&public_info_copy.nvPublic, &name);
+ if (r < 0)
+ return r;
+
+ struct iovec data[] = {
+ IOVEC_MAKE(buf, offset),
+ IOVEC_MAKE(name.name, name.size),
+ };
+
+ r = tpm2_digest_many(TPM2_ALG_SHA256, digest, data, ELEMENTSOF(data), /* extend= */ true);
+ if (r < 0)
+ return r;
+
+ tpm2_log_debug_digest(digest, "PolicyAuthorizeNV calculated digest");
+
+ return 0;
+}
+
+/* Submits a PolicyAuthorizeNV policy item, referencing the NV index 'nv_handle', to the TPM for the policy
+ * session 'session'. The command is authorized via the owner hierarchy with password authorization. Then
+ * hands 'ret_policy_digest' to tpm2_get_policy_digest() to retrieve the session's resulting policy digest.
+ *
+ * Returns -ENOTRECOVERABLE if the TPM rejects the policy item, otherwise whatever
+ * tpm2_get_policy_digest() returns. */
+int tpm2_policy_authorize_nv(
+                Tpm2Context *c,
+                const Tpm2Handle *session,
+                const Tpm2Handle *nv_handle,
+                TPM2B_DIGEST **ret_policy_digest) {
+
+        TSS2_RC rc;
+
+        assert(c);
+        assert(session);
+        assert(nv_handle); /* dereferenced unconditionally below */
+
+        log_debug("Submitting AuthorizeNV policy.");
+
+        rc = sym_Esys_PolicyAuthorizeNV(
+                        c->esys_context,
+                        ESYS_TR_RH_OWNER,
+                        nv_handle->esys_handle,
+                        session->esys_handle,
+                        ESYS_TR_PASSWORD,
+                        ESYS_TR_NONE,
+                        ESYS_TR_NONE);
+        if (rc != TSS2_RC_SUCCESS)
+                return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+                                       "Failed to add AuthorizeNV policy to TPM: %s",
+                                       sym_Tss2_RC_Decode(rc));
+
+        return tpm2_get_policy_digest(c, session, ret_policy_digest);
+}
+
+/* Submits a PolicyOR policy item with the 'n_branches' policy digests in 'branches' to the TPM for the
+ * policy session 'session', then hands 'ret_policy_digest' to tpm2_get_policy_digest() to retrieve the
+ * session's resulting policy digest.
+ *
+ * 'branches' may be NULL only if 'n_branches' is 0. Returns -EOPNOTSUPP if there are more branches than a
+ * TPML_DIGEST can hold, -ENOTRECOVERABLE if the TPM rejects the policy item, otherwise whatever
+ * tpm2_get_policy_digest() returns. */
+int tpm2_policy_or(
+                Tpm2Context *c,
+                const Tpm2Handle *session,
+                const TPM2B_DIGEST *branches, size_t n_branches,
+                TPM2B_DIGEST **ret_policy_digest) {
+
+        TPML_DIGEST hash_list;
+        TSS2_RC rc;
+
+        assert(c);
+        assert(session);
+        assert(branches || n_branches == 0);
+
+        if (n_branches > ELEMENTSOF(hash_list.digests))
+                return -EOPNOTSUPP;
+
+        log_debug("Submitting OR policy.");
+
+        hash_list = (TPML_DIGEST) {
+                .count = n_branches,
+        };
+
+        /* memcpy_safe() rather than memcpy(), since 'branches' may legitimately be NULL if there are no
+         * branches, and passing NULL to memcpy() is undefined behaviour even for a size of 0. */
+        memcpy_safe(hash_list.digests, branches, n_branches * sizeof(TPM2B_DIGEST));
+
+        if (DEBUG_LOGGING)
+                for (size_t i = 0; i < hash_list.count; i++) {
+                        /* hexmem() returns NULL on allocation failure, never pass that to %s directly. */
+                        _cleanup_free_ char *h = hexmem(hash_list.digests[i].buffer, hash_list.digests[i].size);
+                        log_debug("Submitting OR Branch #%zu: %s", i, strna(h));
+                }
+
+        rc = sym_Esys_PolicyOR(
+                        c->esys_context,
+                        session->esys_handle,
+                        ESYS_TR_NONE,
+                        ESYS_TR_NONE,
+                        ESYS_TR_NONE,
+                        &hash_list);
+        if (rc != TSS2_RC_SUCCESS)
+                return log_error_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+                                       "Failed to add OR policy to TPM: %s",
+                                       sym_Tss2_RC_Decode(rc));
+
+        return tpm2_get_policy_digest(c, session, ret_policy_digest);
+}
+
+/* Extend 'digest' with the PolicyOR calculated hash.
+ *
+ * This is a pure calculation. Unlike the other policy items PolicyOR does not build on the previous policy
+ * digest: the digest buffer is zeroed, and then the marshalled TPM2_CC_PolicyOR command code followed by all
+ * branch digests is hashed into it. The digest must already be SHA256_DIGEST_SIZE bytes in size (this is
+ * asserted).
+ *
+ * Returns 0 on success, -EINVAL if no branch is specified, -E2BIG if more than 8 branches are specified,
+ * -ENOTRECOVERABLE if marshalling the command code fails, or another negative error. */
+int tpm2_calculate_policy_or(const TPM2B_DIGEST *branches, size_t n_branches, TPM2B_DIGEST *digest) {
+        TPM2_CC command = TPM2_CC_PolicyOR;
+        TSS2_RC rc;
+        int r;
+
+        assert(branches || n_branches == 0);
+        assert(digest);
+        assert(digest->size == SHA256_DIGEST_SIZE);
+
+        if (n_branches == 0)
+                return -EINVAL;
+        if (n_branches == 1)
+                log_warning("PolicyOR with a single branch submitted, this is weird.");
+        if (n_branches > 8)
+                return -E2BIG;
+
+        r = dlopen_tpm2();
+        if (r < 0)
+                return log_error_errno(r, "TPM2 support not installed: %m");
+
+        uint8_t buf[sizeof(command)];
+        size_t offset = 0;
+
+        rc = sym_Tss2_MU_TPM2_CC_Marshal(command, buf, sizeof(buf), &offset);
+        if (rc != TSS2_RC_SUCCESS)
+                return log_error_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+                                       "Failed to marshal PolicyOR command: %s", sym_Tss2_RC_Decode(rc));
+
+        if (offset != sizeof(command))
+                return log_error_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+                                       "Offset 0x%zx wrong after marshalling PolicyOR command", offset);
+
+        /* One iovec for the command code, plus one per branch. */
+        _cleanup_free_ struct iovec *data = new(struct iovec, 1 + n_branches);
+        if (!data)
+                return log_oom();
+
+        data[0] = IOVEC_MAKE(buf, offset);
+        for (size_t i = 0; i < n_branches; i++) {
+                data[1 + i] = IOVEC_MAKE((void*) branches[i].buffer, branches[i].size);
+
+                if (DEBUG_LOGGING) {
+                        /* hexmem() returns NULL on allocation failure, never pass that to %s directly. */
+                        _cleanup_free_ char *h = hexmem(branches[i].buffer, branches[i].size);
+                        log_debug("OR Branch #%zu: %s", i, strna(h));
+                }
+        }
+
+        /* PolicyOR does not use the previous hash value; we must zero and then extend it. */
+        zero(digest->buffer);
+
+        r = tpm2_digest_many(TPM2_ALG_SHA256, digest, data, 1 + n_branches, /* extend= */ true);
+        if (r < 0)
+                return r;
+
+        tpm2_log_debug_digest(digest, "PolicyOR calculated digest");
+
+        return 0;
+}
+
+/* Extend 'digest' with the PolicyPCR calculated hash.
+ *
+ * The PCR values are converted into a TPML_PCR_SELECTION plus a list of value digests. The value digests
+ * are hashed together with SHA256, and 'digest' is then extended with the marshalled TPM2_CC_PolicyPCR
+ * command code, the marshalled PCR selection and that hash. 'digest' must be SHA256-sized on entry.
+ * Returns 0 on success, a negative errno-style error otherwise. */
+int tpm2_calculate_policy_pcr(
+ const Tpm2PCRValue *pcr_values,
+ size_t n_pcr_values,
+ TPM2B_DIGEST *digest) {
+
+ TPM2_CC command = TPM2_CC_PolicyPCR;
+ TSS2_RC rc;
+ int r;
+
+ assert(pcr_values || n_pcr_values == 0);
+ assert(digest);
+ assert(digest->size == SHA256_DIGEST_SIZE);
+
+ r = dlopen_tpm2();
+ if (r < 0)
+ return log_debug_errno(r, "TPM2 support not installed: %m");
+ /* Split the PCR values into a selection structure and the matching array of digests. */
+ TPML_PCR_SELECTION pcr_selection;
+ _cleanup_free_ TPM2B_DIGEST *values = NULL;
+ size_t n_values;
+ r = tpm2_tpml_pcr_selection_from_pcr_values(pcr_values, n_pcr_values, &pcr_selection, &values, &n_values);
+ if (r < 0)
+ return log_debug_errno(r, "Could not convert PCR values to TPML_PCR_SELECTION: %m");
+ /* Hash all PCR value digests into a fresh (not extended) SHA256 digest. */
+ TPM2B_DIGEST hash = {};
+ r = tpm2_digest_many_digests(TPM2_ALG_SHA256, &hash, values, n_values, /* extend= */ false);
+ if (r < 0)
+ return r;
+ /* Buffer capacity: in-memory size of the command code plus that of the PCR selection. */
+ _cleanup_free_ uint8_t *buf = NULL;
+ size_t size = 0, maxsize = sizeof(command) + sizeof(pcr_selection);
+
+ buf = malloc(maxsize);
+ if (!buf)
+ return log_oom_debug();
+ /* Both marshal calls are handed the same 'size' variable as offset, so that the selection is
+ * written after the command code and 'size' ends up as the total marshalled length. */
+ rc = sym_Tss2_MU_TPM2_CC_Marshal(command, buf, maxsize, &size);
+ if (rc != TSS2_RC_SUCCESS)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+ "Failed to marshal PolicyPCR command: %s", sym_Tss2_RC_Decode(rc));
+
+ rc = sym_Tss2_MU_TPML_PCR_SELECTION_Marshal(&pcr_selection, buf, maxsize, &size);
+ if (rc != TSS2_RC_SUCCESS)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+ "Failed to marshal PCR selection: %s", sym_Tss2_RC_Decode(rc));
+ /* Extend the caller's digest with (command code + selection), followed by the PCR values hash. */
+ struct iovec data[] = {
+ IOVEC_MAKE(buf, size),
+ IOVEC_MAKE(hash.buffer, hash.size),
+ };
+ r = tpm2_digest_many(TPM2_ALG_SHA256, digest, data, ELEMENTSOF(data), /* extend= */ true);
+ if (r < 0)
+ return r;
+
+ tpm2_log_debug_digest(digest, "PolicyPCR calculated digest");
+
+ return 0;
+}
+
+int tpm2_policy_pcr(
+                Tpm2Context *c,
+                const Tpm2Handle *session,
+                const TPML_PCR_SELECTION *pcr_selection,
+                TPM2B_DIGEST **ret_policy_digest) {
+
+        /* Submits a PolicyPCR for 'pcr_selection' to the policy session, then hands back the session's
+         * current policy digest through tpm2_get_policy_digest(). */
+
+        assert(c);
+        assert(session);
+        assert(pcr_selection);
+
+        log_debug("Submitting PCR hash policy.");
+
+        TSS2_RC status = sym_Esys_PolicyPCR(
+                        c->esys_context,
+                        session->esys_handle,
+                        ESYS_TR_NONE,
+                        ESYS_TR_NONE,
+                        ESYS_TR_NONE,
+                        /* pcrDigest= */ NULL,
+                        pcr_selection);
+        if (status == TSS2_RC_SUCCESS)
+                return tpm2_get_policy_digest(c, session, ret_policy_digest);
+
+        return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+                               "Failed to add PCR policy to TPM: %s", sym_Tss2_RC_Decode(status));
+}
+
+/* Extend 'digest' with the PolicyAuthorize calculated hash.
+ *
+ * Note that the previous contents of 'digest' do not enter the result: the buffer is zeroed before it is
+ * extended with the marshalled TPM2_CC_PolicyAuthorize command code and the calculated name of 'public'.
+ * The result is then hashed a second time, either extended with 'policy_ref' (if non-NULL) or simply
+ * rehashed. 'digest' must be SHA256-sized on entry. Returns 0 on success, a negative errno-style error
+ * otherwise. */
+int tpm2_calculate_policy_authorize(
+ const TPM2B_PUBLIC *public,
+ const TPM2B_DIGEST *policy_ref,
+ TPM2B_DIGEST *digest) {
+
+ TPM2_CC command = TPM2_CC_PolicyAuthorize;
+ TSS2_RC rc;
+ int r;
+
+ assert(public);
+ assert(digest);
+ assert(digest->size == SHA256_DIGEST_SIZE);
+
+ r = dlopen_tpm2();
+ if (r < 0)
+ return log_debug_errno(r, "TPM2 support not installed: %m");
+ /* Marshal the command code into a buffer that is exactly as large as the in-memory value. */
+ uint8_t buf[sizeof(command)];
+ size_t offset = 0;
+
+ rc = sym_Tss2_MU_TPM2_CC_Marshal(command, buf, sizeof(buf), &offset);
+ if (rc != TSS2_RC_SUCCESS)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+ "Failed to marshal PolicyAuthorize command: %s", sym_Tss2_RC_Decode(rc));
+
+ if (offset != sizeof(command))
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+ "Offset 0x%zx wrong after marshalling PolicyAuthorize command", offset);
+ /* The key enters the policy hash via its "name", which is calculated from the public area. */
+ TPM2B_NAME name = {};
+ r = tpm2_calculate_pubkey_name(&public->publicArea, &name);
+ if (r < 0)
+ return r;
+
+ /* PolicyAuthorize does not use the previous hash value; we must zero and then extend it. */
+ zero(digest->buffer);
+
+ struct iovec data[] = {
+ IOVEC_MAKE(buf, offset),
+ IOVEC_MAKE(name.name, name.size),
+ };
+ r = tpm2_digest_many(TPM2_ALG_SHA256, digest, data, ELEMENTSOF(data), /* extend= */ true);
+ if (r < 0)
+ return r;
+
+ /* PolicyAuthorize requires hashing twice; this is either an extension or rehashing. */
+ if (policy_ref)
+ r = tpm2_digest_many_digests(TPM2_ALG_SHA256, digest, policy_ref, 1, /* extend= */ true);
+ else
+ r = tpm2_digest_rehash(TPM2_ALG_SHA256, digest);
+ if (r < 0)
+ return r;
+
+ tpm2_log_debug_digest(digest, "PolicyAuthorize calculated digest");
+
+ return 0;
+}
+
+static int tpm2_policy_authorize(
+ Tpm2Context *c,
+ const Tpm2Handle *session,
+ TPML_PCR_SELECTION *pcr_selection,
+ const TPM2B_PUBLIC *public,
+ const void *fp,
+ size_t fp_size,
+ JsonVariant *signature_json,
+ TPM2B_DIGEST **ret_policy_digest) {
+ /* Submits a PolicyAuthorize bound to 'public' to the policy session. The key is loaded into the TPM
+ * as external object and its name is acquired. If 'signature_json' is set, a PolicyPCR for
+ * 'pcr_selection' is submitted first, the signature for the resulting policy digest is looked up
+ * and verified by the TPM, and the resulting ticket is used. Without 'signature_json' a static
+ * ticket (tag TPM2_ST_VERIFIED, hierarchy TPM2_RH_OWNER) is passed and no approved policy is set.
+ * The session's policy digest is finally returned via tpm2_get_policy_digest(). */
+ TSS2_RC rc;
+ int r;
+
+ assert(c);
+ assert(session);
+ assert(pcr_selection);
+ assert(public);
+ assert(fp && fp_size > 0);
+
+ log_debug("Adding PCR signature policy.");
+ /* Load the public key into the TPM; the handle is used for the signature verification below. */
+ _cleanup_(tpm2_handle_freep) Tpm2Handle *pubkey_handle = NULL;
+ r = tpm2_load_external(c, NULL, public, NULL, &pubkey_handle);
+ if (r < 0)
+ return r;
+
+ /* Acquire the "name" of what we just loaded */
+ _cleanup_(Esys_Freep) TPM2B_NAME *pubkey_name = NULL;
+ r = tpm2_get_name(c, pubkey_handle, &pubkey_name);
+ if (r < 0)
+ return r;
+
+ /* If we have a signature, proceed with verifying the PCR digest */
+ const TPMT_TK_VERIFIED *check_ticket;
+ _cleanup_(Esys_Freep) TPMT_TK_VERIFIED *check_ticket_buffer = NULL;
+ _cleanup_(Esys_Freep) TPM2B_DIGEST *approved_policy = NULL;
+ if (signature_json) {
+ r = tpm2_policy_pcr(
+ c,
+ session,
+ pcr_selection,
+ &approved_policy);
+ if (r < 0)
+ return r;
+ /* find_signature() is handed the PCR selection, the key fingerprint and the policy digest just
+ * obtained, and returns the raw signature bytes. */
+ _cleanup_free_ void *signature_raw = NULL;
+ size_t signature_size;
+
+ r = find_signature(
+ signature_json,
+ pcr_selection,
+ fp, fp_size,
+ approved_policy->buffer,
+ approved_policy->size,
+ &signature_raw,
+ &signature_size);
+ if (r < 0)
+ return r;
+
+ /* TPM2_VerifySignature() will only verify the RSA part of the RSA+SHA256 signature,
+ * hence we need to do the SHA256 part ourselves, first */
+ TPM2B_DIGEST signature_hash = *approved_policy;
+ r = tpm2_digest_rehash(TPM2_ALG_SHA256, &signature_hash);
+ if (r < 0)
+ return r;
+ /* The signature must fit into the fixed-size TPM2B_PUBLIC_KEY_RSA buffer built below. */
+ r = TPM2B_PUBLIC_KEY_RSA_CHECK_SIZE(signature_size);
+ if (r < 0)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "Signature larger than buffer.");
+
+ TPMT_SIGNATURE policy_signature = {
+ .sigAlg = TPM2_ALG_RSASSA,
+ .signature.rsassa = {
+ .hash = TPM2_ALG_SHA256,
+ .sig = TPM2B_PUBLIC_KEY_RSA_MAKE(signature_raw, signature_size),
+ },
+ };
+
+ rc = sym_Esys_VerifySignature(
+ c->esys_context,
+ pubkey_handle->esys_handle,
+ ESYS_TR_NONE,
+ ESYS_TR_NONE,
+ ESYS_TR_NONE,
+ &signature_hash,
+ &policy_signature,
+ &check_ticket_buffer);
+ if (rc != TSS2_RC_SUCCESS)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+ "Failed to validate signature in TPM: %s", sym_Tss2_RC_Decode(rc));
+
+ check_ticket = check_ticket_buffer;
+ } else {
+ /* When enrolling, we pass a NULL ticket */
+ static const TPMT_TK_VERIFIED check_ticket_null = {
+ .tag = TPM2_ST_VERIFIED,
+ .hierarchy = TPM2_RH_OWNER,
+ };
+
+ check_ticket = &check_ticket_null;
+ }
+ /* Note that 'approved_policy' is still NULL here unless a signature was supplied above. */
+ rc = sym_Esys_PolicyAuthorize(
+ c->esys_context,
+ session->esys_handle,
+ ESYS_TR_NONE,
+ ESYS_TR_NONE,
+ ESYS_TR_NONE,
+ approved_policy,
+ /* policyRef= */ &(const TPM2B_NONCE) {},
+ pubkey_name,
+ check_ticket);
+ if (rc != TSS2_RC_SUCCESS)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+ "Failed to push Authorize policy into TPM: %s", sym_Tss2_RC_Decode(rc));
+
+ return tpm2_get_policy_digest(c, session, ret_policy_digest);
+}
+
+/* Extend 'digest' with the calculated sealing policy hash.
+ *
+ * The requested sub-policies are applied in a fixed order: PolicyAuthorize (if 'public' is set) or
+ * PolicyAuthorizeNV (if 'pcrlock_policy' is set), then PolicyPCR (if any PCR values are given), then
+ * PolicyAuthValue (if 'use_pin'). Combining 'public' with 'pcrlock_policy' is refused with EOPNOTSUPP. */
+int tpm2_calculate_sealing_policy(
+                const Tpm2PCRValue *pcr_values,
+                size_t n_pcr_values,
+                const TPM2B_PUBLIC *public,
+                bool use_pin,
+                const Tpm2PCRLockPolicy *pcrlock_policy,
+                TPM2B_DIGEST *digest) {
+
+        int r;
+
+        assert(pcr_values || n_pcr_values == 0);
+        assert(digest);
+
+        if (public) {
+                /* Signed PCR policy and pcrlock cannot be combined. */
+                if (pcrlock_policy)
+                        return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Policies with both signed PCR and pcrlock are currently not supported.");
+
+                r = tpm2_calculate_policy_authorize(public, NULL, digest);
+                if (r < 0)
+                        return r;
+
+        } else if (pcrlock_policy) {
+                TPM2B_NV_PUBLIC nv_public;
+
+                r = tpm2_unmarshal_nv_public(
+                                pcrlock_policy->nv_public.iov_base,
+                                pcrlock_policy->nv_public.iov_len,
+                                &nv_public);
+                if (r < 0)
+                        return r;
+
+                r = tpm2_calculate_policy_authorize_nv(&nv_public, digest);
+                if (r < 0)
+                        return r;
+        }
+
+        if (n_pcr_values > 0) {
+                r = tpm2_calculate_policy_pcr(pcr_values, n_pcr_values, digest);
+                if (r < 0)
+                        return r;
+        }
+
+        if (!use_pin)
+                return 0;
+
+        r = tpm2_calculate_policy_auth_value(digest);
+        if (r < 0)
+                return r;
+
+        return 0;
+}
+
+static int tpm2_build_sealing_policy(
+ Tpm2Context *c,
+ const Tpm2Handle *session,
+ uint32_t hash_pcr_mask,
+ uint16_t pcr_bank,
+ const TPM2B_PUBLIC *public,
+ const void *fp,
+ size_t fp_size,
+ uint32_t pubkey_pcr_mask,
+ JsonVariant *signature_json,
+ bool use_pin,
+ const Tpm2PCRLockPolicy *pcrlock_policy,
+ TPM2B_DIGEST **ret_policy_digest) {
+ /* Submits the requested sub-policies to the policy session, in this order: PolicyAuthorize (if
+ * 'pubkey_pcr_mask' is non-zero), the pcrlock policy (if 'pcrlock_policy' is set), PolicyPCR (if
+ * 'hash_pcr_mask' is non-zero) and the auth value policy (if 'use_pin'). The resulting policy
+ * digest of the session is returned in 'ret_policy_digest'. */
+ int r;
+
+ assert(c);
+ assert(session);
+ assert(pubkey_pcr_mask == 0 || public);
+
+ log_debug("Building sealing policy.");
+ /* A zero result of the mask check is only logged at debug level; it is not treated as failure. */
+ if ((hash_pcr_mask | pubkey_pcr_mask) != 0) {
+ r = tpm2_pcr_mask_good(c, pcr_bank, hash_pcr_mask|pubkey_pcr_mask);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ log_debug("Selected TPM2 PCRs are not initialized on this system.");
+ }
+
+ if (pubkey_pcr_mask != 0 && pcrlock_policy)
+ return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Policies with both signed PCR and pcrlock are currently not supported.");
+ /* Signed PCR policy: the intermediate policy digest is not needed, hence NULL is passed. */
+ if (pubkey_pcr_mask != 0) {
+ TPML_PCR_SELECTION pcr_selection;
+ tpm2_tpml_pcr_selection_from_mask(pubkey_pcr_mask, (TPMI_ALG_HASH)pcr_bank, &pcr_selection);
+ r = tpm2_policy_authorize(c, session, &pcr_selection, public, fp, fp_size, signature_json, NULL);
+ if (r < 0)
+ return r;
+ }
+ /* pcrlock policy: submit the PCR prediction, then authorize via the deserialized NV index handle. */
+ if (pcrlock_policy) {
+ _cleanup_(tpm2_handle_freep) Tpm2Handle *nv_handle = NULL;
+
+ r = tpm2_policy_super_pcr(
+ c,
+ session,
+ &pcrlock_policy->prediction,
+ pcrlock_policy->algorithm);
+ if (r < 0)
+ return r;
+
+ r = tpm2_deserialize(
+ c,
+ pcrlock_policy->nv_handle.iov_base,
+ pcrlock_policy->nv_handle.iov_len,
+ &nv_handle);
+ if (r < 0)
+ return r;
+
+ r = tpm2_policy_authorize_nv(
+ c,
+ session,
+ nv_handle,
+ NULL);
+ if (r < 0)
+ return r;
+ }
+ /* Literal PCR policy for the PCRs selected by 'hash_pcr_mask' in bank 'pcr_bank'. */
+ if (hash_pcr_mask != 0) {
+ TPML_PCR_SELECTION pcr_selection;
+ tpm2_tpml_pcr_selection_from_mask(hash_pcr_mask, (TPMI_ALG_HASH)pcr_bank, &pcr_selection);
+ r = tpm2_policy_pcr(c, session, &pcr_selection, NULL);
+ if (r < 0)
+ return r;
+ }
+
+ if (use_pin) {
+ r = tpm2_policy_auth_value(c, session, NULL);
+ if (r < 0)
+ return r;
+ }
+ /* Only the digest after all sub-policies have been submitted is reported to the caller. */
+ r = tpm2_get_policy_digest(c, session, ret_policy_digest);
+ if (r < 0)
+ return r;
+
+ return 0;
+}
+
+#if HAVE_OPENSSL
+static const struct {
+ TPM2_ECC_CURVE tpm2_ecc_curve_id; /* curve identifier on the TPM2 side */
+ int openssl_ecc_curve_id; /* matching curve identifier (NID_xxx) on the OpenSSL side */
+} tpm2_openssl_ecc_curve_table[] = { /* one entry per curve that can be translated in either direction */
+ { TPM2_ECC_NIST_P192, NID_X9_62_prime192v1, },
+ { TPM2_ECC_NIST_P224, NID_secp224r1, },
+ { TPM2_ECC_NIST_P256, NID_X9_62_prime256v1, },
+ { TPM2_ECC_NIST_P384, NID_secp384r1, },
+ { TPM2_ECC_NIST_P521, NID_secp521r1, },
+ { TPM2_ECC_SM2_P256, NID_sm2, },
+};
+
+static int tpm2_ecc_curve_from_openssl_curve_id(int openssl_ecc_curve_id, TPM2_ECC_CURVE *ret) {
+        /* Looks up the TPM2 curve identifier for an OpenSSL curve id in tpm2_openssl_ecc_curve_table[];
+         * unknown curves are refused with EOPNOTSUPP. */
+
+        assert(ret);
+
+        for (size_t i = 0; i < ELEMENTSOF(tpm2_openssl_ecc_curve_table); i++) {
+                if (tpm2_openssl_ecc_curve_table[i].openssl_ecc_curve_id != openssl_ecc_curve_id)
+                        continue;
+
+                *ret = tpm2_openssl_ecc_curve_table[i].tpm2_ecc_curve_id;
+                return 0;
+        }
+
+        return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                               "OpenSSL ECC curve id %d not supported.", openssl_ecc_curve_id);
+}
+
+static int tpm2_ecc_curve_to_openssl_curve_id(TPM2_ECC_CURVE tpm2_ecc_curve_id, int *ret) {
+        /* Looks up the OpenSSL curve id for a TPM2 curve identifier in tpm2_openssl_ecc_curve_table[];
+         * unknown curves are refused with EOPNOTSUPP. */
+
+        assert(ret);
+
+        for (size_t i = 0; i < ELEMENTSOF(tpm2_openssl_ecc_curve_table); i++) {
+                if (tpm2_openssl_ecc_curve_table[i].tpm2_ecc_curve_id != tpm2_ecc_curve_id)
+                        continue;
+
+                *ret = tpm2_openssl_ecc_curve_table[i].openssl_ecc_curve_id;
+                return 0;
+        }
+
+        return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                               "TPM2 ECC curve %u not supported.", tpm2_ecc_curve_id);
+}
+
+#define TPM2_RSA_DEFAULT_EXPONENT UINT32_C(0x10001)
+
+int tpm2_tpm2b_public_to_openssl_pkey(const TPM2B_PUBLIC *public, EVP_PKEY **ret) {
+ int r;
+ /* Converts the public area of 'public' into an OpenSSL key object returned in 'ret'. ECC and RSA
+ * keys are handled; any other key type is refused with EOPNOTSUPP. */
+ assert(public);
+ assert(ret);
+
+ const TPMT_PUBLIC *p = &public->publicArea;
+ switch (p->type) {
+ case TPM2_ALG_ECC: {
+ int curve_id;
+ r = tpm2_ecc_curve_to_openssl_curve_id(p->parameters.eccDetail.curveID, &curve_id);
+ if (r < 0)
+ return r;
+
+ const TPMS_ECC_POINT *point = &p->unique.ecc;
+ return ecc_pkey_from_curve_x_y(
+ curve_id,
+ point->x.buffer,
+ point->x.size,
+ point->y.buffer,
+ point->y.size,
+ ret);
+ }
+ case TPM2_ALG_RSA: {
+ /* TPM specification Part 2 ("Structures") section for TPMS_RSA_PARAMS states "An exponent of
+ * zero indicates that the exponent is the default of 2^16 + 1". The exponent is converted to
+ * big-endian byte order and passed on as a 4 byte buffer. */
+ uint32_t exponent = htobe32(p->parameters.rsaDetail.exponent ?: TPM2_RSA_DEFAULT_EXPONENT);
+ return rsa_pkey_from_n_e(
+ p->unique.rsa.buffer,
+ p->unique.rsa.size,
+ &exponent,
+ sizeof(exponent),
+ ret);
+ }
+ default:
+ return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "TPM2 asymmetric algorithm 0x%" PRIx16 " not supported.", p->type);
+ }
+}
+
+/* Be careful before changing anything in this function, as the TPM key "name" is calculated using the entire
+ * TPMT_PUBLIC (after marshalling), and that "name" is used (for example) to calculate the policy hash for
+ * the Authorize policy. So we must ensure this conversion of a PEM to TPM2B_PUBLIC does not change the
+ * "name", because it would break unsealing of previously-sealed objects that used (for example)
+ * tpm2_calculate_policy_authorize(). See bug #30546.
+ *
+ * Converts an OpenSSL public key (EC or RSA) into a TPM2B_PUBLIC stored in 'ret'. The template uses
+ * nameAlg SHA256, the DECRYPT, SIGN_ENCRYPT and USERWITHAUTH object attributes and NULL symmetric/scheme
+ * algorithms. Returns 0 on success, -EOPNOTSUPP for unsupported key types or curves, -EINVAL if the RSA
+ * exponent is empty or does not fit into 32 bits, and another negative errno-style error if the key
+ * parameters cannot be extracted or are too large for the TPM structures. */
+int tpm2_tpm2b_public_from_openssl_pkey(const EVP_PKEY *pkey, TPM2B_PUBLIC *ret) {
+        int key_id, r;
+
+        assert(pkey);
+        assert(ret);
+
+        TPMT_PUBLIC public = {
+                .nameAlg = TPM2_ALG_SHA256,
+                .objectAttributes = TPMA_OBJECT_DECRYPT | TPMA_OBJECT_SIGN_ENCRYPT | TPMA_OBJECT_USERWITHAUTH,
+                .parameters.asymDetail = {
+                        .symmetric.algorithm = TPM2_ALG_NULL,
+                        .scheme.scheme = TPM2_ALG_NULL,
+                },
+        };
+
+#if OPENSSL_VERSION_MAJOR >= 3
+        key_id = EVP_PKEY_get_id(pkey);
+#else
+        key_id = EVP_PKEY_id(pkey);
+#endif
+
+        switch (key_id) {
+        case EVP_PKEY_EC: {
+                public.type = TPM2_ALG_ECC;
+
+                int curve_id;
+                _cleanup_free_ void *x = NULL, *y = NULL;
+                size_t x_size, y_size;
+                r = ecc_pkey_to_curve_x_y(pkey, &curve_id, &x, &x_size, &y, &y_size);
+                if (r < 0)
+                        return log_debug_errno(r, "Could not get ECC key curve/x/y: %m");
+
+                TPM2_ECC_CURVE curve;
+                r = tpm2_ecc_curve_from_openssl_curve_id(curve_id, &curve);
+                if (r < 0)
+                        return r;
+
+                public.parameters.eccDetail.curveID = curve;
+
+                public.parameters.eccDetail.kdf.scheme = TPM2_ALG_NULL;
+
+                r = TPM2B_ECC_PARAMETER_CHECK_SIZE(x_size);
+                if (r < 0)
+                        return log_debug_errno(r, "ECC key x size %zu too large.", x_size);
+
+                public.unique.ecc.x = TPM2B_ECC_PARAMETER_MAKE(x, x_size);
+
+                r = TPM2B_ECC_PARAMETER_CHECK_SIZE(y_size);
+                if (r < 0)
+                        return log_debug_errno(r, "ECC key y size %zu too large.", y_size);
+
+                public.unique.ecc.y = TPM2B_ECC_PARAMETER_MAKE(y, y_size);
+
+                break;
+        }
+        case EVP_PKEY_RSA: {
+                public.type = TPM2_ALG_RSA;
+
+                _cleanup_free_ void *n = NULL, *e = NULL;
+                size_t n_size, e_size;
+                r = rsa_pkey_to_n_e(pkey, &n, &n_size, &e, &e_size);
+                if (r < 0)
+                        return log_debug_errno(r, "Could not get RSA key n/e: %m");
+
+                r = TPM2B_PUBLIC_KEY_RSA_CHECK_SIZE(n_size);
+                if (r < 0)
+                        return log_debug_errno(r, "RSA key n size %zu too large.", n_size);
+
+                public.unique.rsa = TPM2B_PUBLIC_KEY_RSA_MAKE(n, n_size);
+                public.parameters.rsaDetail.keyBits = n_size * 8;
+
+                /* An empty exponent must be refused: the shift below would be by the full width of the
+                 * type (undefined behaviour), and the resulting exponent of zero would be interpreted
+                 * by the TPM as the default exponent rather than as the key's actual exponent. */
+                if (e_size == 0)
+                        return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "RSA key e is empty.");
+
+                if (sizeof(uint32_t) < e_size)
+                        return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+                                               "RSA key e size %zu too large.", e_size);
+
+                /* 'e' is a big-endian byte string of 1 to 4 bytes. Copy it to the start of the integer,
+                 * convert to host order and shift the unused low-order bytes out again. */
+                uint32_t exponent = 0;
+                memcpy(&exponent, e, e_size);
+                exponent = be32toh(exponent) >> (32 - e_size * 8);
+
+                /* TPM specification Part 2 ("Structures") section for TPMS_RSA_PARAMS states "An exponent of
+                 * zero indicates that the exponent is the default of 2^16 + 1". However, we have no reason
+                 * to special case it in our PEM->TPM2B_PUBLIC conversion, and doing so could break backwards
+                 * compatibility, so even if it is the "default" value of 0x10001, we do not set it to 0. */
+                public.parameters.rsaDetail.exponent = exponent;
+
+                break;
+        }
+        default:
+                return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                       "EVP_PKEY type %d not supported.", key_id);
+        }
+
+        *ret = (TPM2B_PUBLIC) {
+                .size = sizeof(public),
+                .publicArea = public,
+        };
+
+        return 0;
+}
+#endif
+
+int tpm2_tpm2b_public_to_fingerprint(
+                const TPM2B_PUBLIC *public,
+                void **ret_fingerprint,
+                size_t *ret_fingerprint_size) {
+
+        /* Calculates the SHA256 fingerprint of the key in 'public', by converting it to an OpenSSL key
+         * first. Without OpenSSL support this always fails with EOPNOTSUPP. */
+
+#if !HAVE_OPENSSL
+        return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "OpenSSL support is disabled.");
+#else
+        _cleanup_(EVP_PKEY_freep) EVP_PKEY *key = NULL;
+        int r;
+
+        assert(public);
+        assert(ret_fingerprint);
+        assert(ret_fingerprint_size);
+
+        r = tpm2_tpm2b_public_to_openssl_pkey(public, &key);
+        if (r < 0)
+                return r;
+
+        /* Hardcode fingerprint to SHA256 */
+        return pubkey_fingerprint(key, EVP_sha256(), ret_fingerprint, ret_fingerprint_size);
+#endif
+}
+
+int tpm2_tpm2b_public_from_pem(const void *pem, size_t pem_size, TPM2B_PUBLIC *ret) {
+        /* Parses a PEM encoded public key and converts it into a TPM2B_PUBLIC. Without OpenSSL support
+         * this always fails with EOPNOTSUPP. */
+
+#if !HAVE_OPENSSL
+        return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "OpenSSL support is disabled.");
+#else
+        _cleanup_(EVP_PKEY_freep) EVP_PKEY *key = NULL;
+        int r;
+
+        assert(pem);
+        assert(ret);
+
+        r = openssl_pkey_from_pem(pem, pem_size, &key);
+        if (r < 0)
+                return r;
+
+        return tpm2_tpm2b_public_from_openssl_pkey(key, ret);
+#endif
+}
+
+/* Marshal the private, public and (optionally) seed objects, in this order, into one newly allocated
+ * 'blob'. The private and public objects are mandatory, the seed may be NULL. The format is not a
+ * (publicly) standardized one; it is simply how we currently store the sealed object, and it is the
+ * format tpm2_unmarshal_blob() reads. On success the blob and its size are returned in 'ret_blob' and
+ * 'ret_blob_size'. */
+int tpm2_marshal_blob(
+                const TPM2B_PUBLIC *public,
+                const TPM2B_PRIVATE *private,
+                const TPM2B_ENCRYPTED_SECRET *seed,
+                void **ret_blob,
+                size_t *ret_blob_size) {
+
+        assert(public);
+        assert(private);
+        assert(ret_blob);
+        assert(ret_blob_size);
+
+        /* Size the buffer by the in-memory sizes of everything that is going to be marshalled. */
+        size_t capacity = sizeof(*private) + sizeof(*public) + (seed ? sizeof(*seed) : 0);
+
+        _cleanup_free_ void *buffer = malloc(capacity);
+        if (!buffer)
+                return log_oom_debug();
+
+        /* 'used' is the shared offset all marshal calls continue from. */
+        size_t used = 0;
+        TSS2_RC rc;
+
+        rc = sym_Tss2_MU_TPM2B_PRIVATE_Marshal(private, buffer, capacity, &used);
+        if (rc != TSS2_RC_SUCCESS)
+                return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+                                       "Failed to marshal private key: %s", sym_Tss2_RC_Decode(rc));
+
+        rc = sym_Tss2_MU_TPM2B_PUBLIC_Marshal(public, buffer, capacity, &used);
+        if (rc != TSS2_RC_SUCCESS)
+                return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+                                       "Failed to marshal public key: %s", sym_Tss2_RC_Decode(rc));
+
+        if (seed) {
+                rc = sym_Tss2_MU_TPM2B_ENCRYPTED_SECRET_Marshal(seed, buffer, capacity, &used);
+                if (rc != TSS2_RC_SUCCESS)
+                        return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+                                               "Failed to marshal encrypted seed: %s", sym_Tss2_RC_Decode(rc));
+        }
+
+        *ret_blob = TAKE_PTR(buffer);
+        *ret_blob_size = used;
+
+        return 0;
+}
+
+/* Unmarshal the 'blob' into public, private, and seed objects. The public and private objects are required
+ * in the 'blob', while the seed is optional. This is not a (publicly) standard format, this is specific to
+ * how we currently store the sealed object. This expects the 'blob' to have been created by
+ * tpm2_marshal_blob().
+ *
+ * All three output pointers must be non-NULL. If the blob carries no seed, 'ret_seed' is set to an
+ * all-zero TPM2B_ENCRYPTED_SECRET. The outputs are only written if the whole blob was parsed. */
+int tpm2_unmarshal_blob(
+ const void *blob,
+ size_t blob_size,
+ TPM2B_PUBLIC *ret_public,
+ TPM2B_PRIVATE *ret_private,
+ TPM2B_ENCRYPTED_SECRET *ret_seed) {
+
+ TSS2_RC rc;
+
+ assert(blob);
+ assert(ret_public);
+ assert(ret_private);
+ assert(ret_seed);
+ /* The private part comes first; all unmarshal calls share 'offset' as position within the blob. */
+ TPM2B_PRIVATE private = {};
+ size_t offset = 0;
+ rc = sym_Tss2_MU_TPM2B_PRIVATE_Unmarshal(blob, blob_size, &offset, &private);
+ if (rc != TSS2_RC_SUCCESS)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+ "Failed to unmarshal private key: %s", sym_Tss2_RC_Decode(rc));
+
+ TPM2B_PUBLIC public = {};
+ rc = sym_Tss2_MU_TPM2B_PUBLIC_Unmarshal(blob, blob_size, &offset, &public);
+ if (rc != TSS2_RC_SUCCESS)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+ "Failed to unmarshal public key: %s", sym_Tss2_RC_Decode(rc));
+ /* The seed is only parsed if there is data left after the public part. */
+ TPM2B_ENCRYPTED_SECRET seed = {};
+ if (blob_size > offset) {
+ rc = sym_Tss2_MU_TPM2B_ENCRYPTED_SECRET_Unmarshal(blob, blob_size, &offset, &seed);
+ if (rc != TSS2_RC_SUCCESS)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+ "Failed to unmarshal encrypted seed: %s", sym_Tss2_RC_Decode(rc));
+ }
+
+ *ret_public = public;
+ *ret_private = private;
+ *ret_seed = seed;
+
+ return 0;
+}
+
+/* Calculate a serialized handle. Once the upstream tpm2-tss library provides an api to do this, we can
+ * remove this function. The addition of this functionality in tpm2-tss may be tracked here:
+ * https://github.com/tpm2-software/tpm2-tss/issues/2575
+ *
+ * The serialization consists of, in this order: the marshalled TPM handle, the marshalled name, a
+ * marshalled 32bit resource type of 1, and the marshalled public area. On success the newly allocated
+ * buffer and its size are returned in 'ret_serialized' and 'ret_serialized_size'. */
+int tpm2_calculate_serialize(
+ TPM2_HANDLE handle,
+ const TPM2B_NAME *name,
+ const TPM2B_PUBLIC *public,
+ void **ret_serialized,
+ size_t *ret_serialized_size) {
+
+ TSS2_RC rc;
+
+ assert(name);
+ assert(public);
+ assert(ret_serialized);
+ assert(ret_serialized_size);
+ /* Buffer capacity: the sum of the in-memory sizes of the four fields that are marshalled below. */
+ size_t max_size = sizeof(TPM2_HANDLE) + sizeof(TPM2B_NAME) + sizeof(uint32_t) + sizeof(TPM2B_PUBLIC);
+ _cleanup_free_ void *serialized = malloc(max_size);
+ if (!serialized)
+ return log_oom_debug();
+ /* All marshal calls share 'serialized_size' as offset, which ends up as the total size. */
+ size_t serialized_size = 0;
+ rc = sym_Tss2_MU_TPM2_HANDLE_Marshal(handle, serialized, max_size, &serialized_size);
+ if (rc != TSS2_RC_SUCCESS)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+ "Failed to marshal tpm handle: %s", sym_Tss2_RC_Decode(rc));
+
+ rc = sym_Tss2_MU_TPM2B_NAME_Marshal(name, serialized, max_size, &serialized_size);
+ if (rc != TSS2_RC_SUCCESS)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+ "Failed to marshal name: %s", sym_Tss2_RC_Decode(rc));
+
+ /* This is defined (non-publicly) in the tpm2-tss source as IESYSC_KEY_RSRC, to a value of "1". */
+ rc = sym_Tss2_MU_UINT32_Marshal(UINT32_C(1), serialized, max_size, &serialized_size);
+ if (rc != TSS2_RC_SUCCESS)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+ "Failed to marshal esys resource id: %s", sym_Tss2_RC_Decode(rc));
+
+ rc = sym_Tss2_MU_TPM2B_PUBLIC_Marshal(public, serialized, max_size, &serialized_size);
+ if (rc != TSS2_RC_SUCCESS)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+ "Failed to marshal public: %s", sym_Tss2_RC_Decode(rc));
+
+ *ret_serialized = TAKE_PTR(serialized);
+ *ret_serialized_size = serialized_size;
+
+ return 0;
+}
+
+/* Serialize a handle into a binary object. The object can be deserialized again later (by the same TPM),
+ * even across restarts of the TPM or reboots (assuming the handle is persistent). On success the buffer
+ * and its size are returned in 'ret_serialized' and 'ret_serialized_size'. */
+int tpm2_serialize(
+                Tpm2Context *c,
+                const Tpm2Handle *handle,
+                void **ret_serialized,
+                size_t *ret_serialized_size) {
+
+        _cleanup_(Esys_Freep) unsigned char *buffer = NULL;
+        size_t buffer_size = 0;
+
+        assert(c);
+        assert(handle);
+        assert(ret_serialized);
+        assert(ret_serialized_size);
+
+        TSS2_RC status = sym_Esys_TR_Serialize(c->esys_context, handle->esys_handle, &buffer, &buffer_size);
+        if (status != TSS2_RC_SUCCESS)
+                return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+                                       "Failed to serialize: %s", sym_Tss2_RC_Decode(status));
+
+        /* Hand the buffer over to the caller. */
+        *ret_serialized_size = buffer_size;
+        *ret_serialized = TAKE_PTR(buffer);
+
+        return 0;
+}
+
+int tpm2_deserialize(
+                Tpm2Context *c,
+                const void *serialized,
+                size_t serialized_size,
+                Tpm2Handle **ret_handle) {
+
+        /* Turns a serialized handle back into a Tpm2Handle object, returned in 'ret_handle'. */
+
+        _cleanup_(tpm2_handle_freep) Tpm2Handle *deserialized = NULL;
+        int r;
+
+        assert(c);
+        assert(serialized);
+        assert(ret_handle);
+
+        r = tpm2_handle_new(c, &deserialized);
+        if (r < 0)
+                return r;
+
+        /* Since this is an existing handle in the TPM we should not implicitly flush it. */
+        deserialized->flush = false;
+
+        TSS2_RC status = sym_Esys_TR_Deserialize(
+                        c->esys_context,
+                        serialized,
+                        serialized_size,
+                        &deserialized->esys_handle);
+        if (status != TSS2_RC_SUCCESS)
+                return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+                                       "Failed to deserialize: %s", sym_Tss2_RC_Decode(status));
+
+        *ret_handle = TAKE_PTR(deserialized);
+
+        return 0;
+}
+
+#if HAVE_OPENSSL
+
+/* KDFa() as defined by the TPM spec.
+ *
+ * Derives DIV_ROUND_UP(bits, 8) bytes via kdf_kb_hmac_derive() in "COUNTER" mode, using the hash named by
+ * 'hash_alg', keyed with 'key', and with 'label' and 'context' as inputs ('context' may be NULL if
+ * 'context_len' is zero). If 'bits' is not a multiple of 8 the surplus high-order bits of the first byte
+ * are cleared. On success the newly allocated key is returned in 'ret_key' and its length in bytes in
+ * 'ret_key_len'. Returns -EOPNOTSUPP if the hash algorithm has no known name. */
+static int tpm2_kdfa(
+ TPMI_ALG_HASH hash_alg,
+ const void *key,
+ size_t key_len,
+ const char *label,
+ const void *context,
+ size_t context_len,
+ size_t bits,
+ void **ret_key,
+ size_t *ret_key_len) {
+
+ int r;
+
+ assert(key);
+ assert(label);
+ assert(context || context_len == 0);
+ assert(bits > 0);
+ assert(bits <= SIZE_MAX - 7);
+ assert(ret_key);
+ assert(ret_key_len);
+
+ log_debug("Calculating KDFa().");
+ /* Output length in bytes; a partial trailing byte is rounded up. */
+ size_t len = DIV_ROUND_UP(bits, 8);
+
+ const char *hash_alg_name = tpm2_hash_alg_to_string(hash_alg);
+ if (!hash_alg_name)
+ return -EOPNOTSUPP;
+
+ _cleanup_free_ void *buf = NULL;
+ r = kdf_kb_hmac_derive(
+ "COUNTER",
+ hash_alg_name,
+ key,
+ key_len,
+ label,
+ strlen(label),
+ context,
+ context_len,
+ /* seed= */ NULL,
+ /* seed_len= */ 0,
+ len,
+ &buf);
+ if (r < 0)
+ return r;
+
+ /* If the number of bits results in a partial byte, the TPM spec requires we zero the unrequested
+ * bits in the MSB (i.e. at index 0). From the spec Part 1 ("Architecture") section on Key
+ * Derivation Function, specifically KDFa():
+ *
+ * "The implied return from this function is a sequence of octets with a length equal to (bits + 7) /
+ * 8. If bits is not an even multiple of 8, then the returned value occupies the least significant
+ * bits of the returned octet array, and the additional, high-order bits in the 0th octet are
+ * CLEAR. The unused bits of the most significant octet (MSO) are masked off and not shifted." */
+ size_t partial = bits % 8;
+ if (partial > 0)
+ ((uint8_t*) buf)[0] &= 0xffu >> (8 - partial); /* keep only the low 'partial' bits of byte 0 */
+
+ *ret_key = TAKE_PTR(buf);
+ *ret_key_len = len;
+
+ return 0;
+}
+
+/* KDFe() as defined by the TPM spec.
+ *
+ * Derives DIV_ROUND_UP(bits, 8) bytes via kdf_ss_derive() from 'shared_secret', using the hash named by
+ * 'hash_alg' and no salt. The "info" input is the concatenation of 'label' (including its terminating
+ * NUL byte), 'context_u' and 'context_v'. On success the newly allocated key is returned in 'ret_key'
+ * and its length in bytes in 'ret_key_len'. Returns -EOPNOTSUPP if the hash algorithm has no known
+ * name.
+ *
+ * NOTE(review): no masking of a partial leading byte is done here if 'bits' is not a multiple of 8. */
+static int tpm2_kdfe(
+ TPMI_ALG_HASH hash_alg,
+ const void *shared_secret,
+ size_t shared_secret_len,
+ const char *label,
+ const void *context_u,
+ size_t context_u_size,
+ const void *context_v,
+ size_t context_v_size,
+ size_t bits,
+ void **ret_key,
+ size_t *ret_key_len) {
+
+ int r;
+
+ assert(shared_secret);
+ assert(label);
+ assert(context_u);
+ assert(context_v);
+ assert(bits > 0);
+ assert(bits <= SIZE_MAX - 7);
+ assert(ret_key);
+ assert(ret_key_len);
+
+ log_debug("Calculating KDFe().");
+
+ size_t len = DIV_ROUND_UP(bits, 8);
+
+ const char *hash_alg_name = tpm2_hash_alg_to_string(hash_alg);
+ if (!hash_alg_name)
+ return -EOPNOTSUPP;
+ /* The "+ 1" accounts for the label's NUL terminator, which is part of the info data. */
+ size_t info_len = strlen(label) + 1 + context_u_size + context_v_size;
+ _cleanup_free_ void *info = malloc(info_len);
+ if (!info)
+ return log_oom_debug();
+ /* stpcpy() returns a pointer to the NUL byte it wrote; adding 1 steps past it, so that the two
+ * contexts are appended right after the terminated label. */
+ void *end = mempcpy(mempcpy(stpcpy(info, label) + 1, context_u, context_u_size), context_v, context_v_size);
+ /* assert we copied exactly the right amount that we allocated */
+ assert(end > info && (uintptr_t) end - (uintptr_t) info == info_len);
+
+ _cleanup_free_ void *buf = NULL;
+ r = kdf_ss_derive(
+ hash_alg_name,
+ shared_secret,
+ shared_secret_len,
+ /* salt= */ NULL,
+ /* salt_size= */ 0,
+ info,
+ info_len,
+ len,
+ &buf);
+ if (r < 0)
+ return r;
+
+ *ret_key = TAKE_PTR(buf);
+ *ret_key_len = len;
+
+ return 0;
+}
+
+static int tpm2_calculate_seal_public(
+                const TPM2B_PUBLIC *parent,
+                const TPMA_OBJECT *attributes,
+                const TPM2B_DIGEST *policy,
+                const TPM2B_DIGEST *seed,
+                const void *secret,
+                size_t secret_size,
+                TPM2B_PUBLIC *ret) {
+
+        /* Calculates the public area of a keyed-hash object sealing 'secret': the "unique" field is the
+         * digest (using the parent's name algorithm) over 'seed' followed by 'secret'. 'attributes' and
+         * 'policy' are optional; without them the object attributes are 0 and the auth policy is a
+         * digest of the same size as "unique" built from a NULL buffer. */
+
+        int r;
+
+        assert(parent);
+        assert(seed);
+        assert(secret);
+        assert(ret);
+
+        log_debug("Calculating public part of sealed object.");
+
+        TPMI_ALG_HASH name_alg = parent->publicArea.nameAlg;
+
+        struct iovec pieces[] = {
+                IOVEC_MAKE((void*) seed->buffer, seed->size),
+                IOVEC_MAKE((void*) secret, secret_size),
+        };
+        TPM2B_DIGEST unique_digest;
+        r = tpm2_digest_many(name_alg, &unique_digest, pieces, ELEMENTSOF(pieces), /* extend= */ false);
+        if (r < 0)
+                return r;
+
+        TPMA_OBJECT object_attributes = attributes ? *attributes : 0;
+        TPM2B_DIGEST auth_policy = policy ? *policy : TPM2B_DIGEST_MAKE(NULL, unique_digest.size);
+
+        *ret = (TPM2B_PUBLIC) {
+                .size = sizeof(TPMT_PUBLIC),
+                .publicArea = {
+                        .type = TPM2_ALG_KEYEDHASH,
+                        .nameAlg = name_alg,
+                        .objectAttributes = object_attributes,
+                        .authPolicy = auth_policy,
+                        .parameters.keyedHashDetail.scheme.scheme = TPM2_ALG_NULL,
+                        .unique.keyedHash = unique_digest,
+                },
+        };
+
+        return 0;
+}
+
+static int tpm2_calculate_seal_private(
+ const TPM2B_PUBLIC *parent,
+ const TPM2B_NAME *name,
+ const char *pin,
+ const TPM2B_DIGEST *seed,
+ const void *secret,
+ size_t secret_size,
+ TPM2B_PRIVATE *ret) {
+ /* Calculates the TPM2B_PRIVATE of a keyed-hash object sealing 'secret' under 'parent'. The
+ * sensitive area (auth value derived from 'pin' if set, 'seed', 'secret') is marshalled and
+ * encrypted with a storage key derived via KDFa() with label "STORAGE". An HMAC over the
+ * ciphertext and 'name', keyed with a KDFa() key with label "INTEGRITY", is placed in front of
+ * the ciphertext.
+ *
+ * NOTE(review): storage_key, integrity_key and marshalled_sensitive hold secret material but are
+ * released via plain _cleanup_free_, i.e. without being erased first. */
+ TSS2_RC rc;
+ int r;
+
+ assert(parent);
+ assert(name);
+ assert(seed);
+ assert(secret);
+ assert(ret);
+
+ log_debug("Calculating private part of sealed object.");
+ /* Storage key: derived from the seed with the object name as context; as many bits as the parent's
+ * symmetric key has. */
+ _cleanup_free_ void *storage_key = NULL;
+ size_t storage_key_size;
+ r = tpm2_kdfa(parent->publicArea.nameAlg,
+ seed->buffer,
+ seed->size,
+ "STORAGE",
+ name->name,
+ name->size,
+ (size_t) parent->publicArea.parameters.asymDetail.symmetric.keyBits.sym,
+ &storage_key,
+ &storage_key_size);
+ if (r < 0)
+ return log_debug_errno(r, "Could not calculate storage key KDFa: %m");
+ /* Integrity key: as many bits as the digest of the parent's name algorithm has. */
+ r = tpm2_hash_alg_to_size(parent->publicArea.nameAlg);
+ if (r < 0)
+ return -EOPNOTSUPP;
+
+ size_t bits = (size_t) r * 8;
+
+ _cleanup_free_ void *integrity_key = NULL;
+ size_t integrity_key_size;
+ r = tpm2_kdfa(parent->publicArea.nameAlg,
+ seed->buffer,
+ seed->size,
+ "INTEGRITY",
+ /* context= */ NULL,
+ /* n_context= */ 0,
+ bits,
+ &integrity_key,
+ &integrity_key_size);
+ if (r < 0)
+ return log_debug_errno(r, "Could not calculate integrity key KDFa: %m");
+ /* Without a PIN the auth value stays empty (all zero). */
+ TPM2B_AUTH auth = {};
+ if (pin) {
+ r = tpm2_get_pin_auth(parent->publicArea.nameAlg, pin, &auth);
+ if (r < 0)
+ return r;
+ }
+
+ TPM2B_SENSITIVE sensitive = {
+ .size = sizeof(TPMT_SENSITIVE),
+ .sensitiveArea = {
+ .sensitiveType = TPM2_ALG_KEYEDHASH,
+ .authValue = auth,
+ .seedValue = *seed,
+ .sensitive.bits = TPM2B_SENSITIVE_DATA_MAKE(secret, secret_size),
+ },
+ };
+ /* The marshalling buffer is sized by the in-memory size of the structure. */
+ _cleanup_free_ void *marshalled_sensitive = malloc(sizeof(sensitive));
+ if (!marshalled_sensitive)
+ return log_oom_debug();
+
+ size_t marshalled_sensitive_size = 0;
+ rc = sym_Tss2_MU_TPM2B_SENSITIVE_Marshal(
+ &sensitive,
+ marshalled_sensitive,
+ sizeof(sensitive),
+ &marshalled_sensitive_size);
+ if (rc != TSS2_RC_SUCCESS)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+ "Failed to marshal sensitive: %s", sym_Tss2_RC_Decode(rc));
+ /* Cipher and mode are the ones declared by the parent key's symmetric parameters. */
+ const char *sym_alg = tpm2_sym_alg_to_string(parent->publicArea.parameters.asymDetail.symmetric.algorithm);
+ if (!sym_alg)
+ return -EOPNOTSUPP;
+
+ const char *sym_mode = tpm2_sym_mode_to_string(parent->publicArea.parameters.asymDetail.symmetric.mode.sym);
+ if (!sym_mode)
+ return -EOPNOTSUPP;
+ /* Encrypt the marshalled sensitive area with the storage key; no IV is passed. */
+ _cleanup_free_ void *encrypted_sensitive = NULL;
+ size_t encrypted_sensitive_size;
+ r = openssl_cipher(
+ sym_alg,
+ parent->publicArea.parameters.asymDetail.symmetric.keyBits.sym,
+ sym_mode,
+ storage_key, storage_key_size,
+ /* iv= */ NULL, /* n_iv= */ 0,
+ marshalled_sensitive, marshalled_sensitive_size,
+ &encrypted_sensitive, &encrypted_sensitive_size);
+ if (r < 0)
+ return r;
+
+ const char *hash_alg_name = tpm2_hash_alg_to_string(parent->publicArea.nameAlg);
+ if (!hash_alg_name)
+ return -EOPNOTSUPP;
+ /* Outer HMAC: covers the ciphertext followed by the object name, keyed with the integrity key. */
+ _cleanup_free_ void *hmac_buffer = NULL;
+ size_t hmac_size = 0;
+ struct iovec hmac_data[] = {
+ IOVEC_MAKE((void*) encrypted_sensitive, encrypted_sensitive_size),
+ IOVEC_MAKE((void*) name->name, name->size),
+ };
+ r = openssl_hmac_many(
+ hash_alg_name,
+ integrity_key,
+ integrity_key_size,
+ hmac_data,
+ ELEMENTSOF(hmac_data),
+ &hmac_buffer,
+ &hmac_size);
+ if (r < 0)
+ return r;
+
+ TPM2B_DIGEST outer_hmac = TPM2B_DIGEST_MAKE(hmac_buffer, hmac_size);
+ /* Layout of the private buffer: the marshalled HMAC digest, directly followed by the ciphertext. */
+ TPM2B_PRIVATE private = {};
+ size_t private_size = 0;
+ rc = sym_Tss2_MU_TPM2B_DIGEST_Marshal(
+ &outer_hmac,
+ private.buffer,
+ sizeof(private.buffer),
+ &private_size);
+ if (rc != TSS2_RC_SUCCESS)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+ "Failed to marshal digest: %s", sym_Tss2_RC_Decode(rc));
+ private.size = private_size;
+
+ assert(sizeof(private.buffer) - private.size >= encrypted_sensitive_size);
+ memcpy_safe(&private.buffer[private.size], encrypted_sensitive, encrypted_sensitive_size);
+ private.size += encrypted_sensitive_size;
+
+ *ret = private;
+
+ return 0;
+}
+
+/* Creates the seed that protects a software-calculated sealed object when the parent is an RSA key.
+ *
+ * A random seed, as large as the digest of the parent's nameAlg, is generated and then RSA-OAEP
+ * encrypted to the parent's public key, using the nameAlg hash and the label "DUPLICATE".
+ *
+ * On success returns 0 and passes ownership of both the plaintext seed (*ret_seed) and the encrypted
+ * seed (*ret_encrypted_seed) to the caller. Returns -EOPNOTSUPP if the parent's nameAlg cannot be mapped
+ * to a size or name, the result of log_oom_debug() on allocation failure, or the error of the failing
+ * helper otherwise. Nothing is written to the return parameters on failure. */
+static int tpm2_calculate_seal_rsa_seed(
+                const TPM2B_PUBLIC *parent,
+                void **ret_seed,
+                size_t *ret_seed_size,
+                void **ret_encrypted_seed,
+                size_t *ret_encrypted_seed_size) {
+
+        int r;
+
+        assert(parent);
+        assert(ret_seed);
+        assert(ret_seed_size);
+        assert(ret_encrypted_seed);
+        assert(ret_encrypted_seed_size);
+
+        log_debug("Calculating encrypted seed for RSA sealed object.");
+
+        _cleanup_(EVP_PKEY_freep) EVP_PKEY *parent_pkey = NULL;
+        r = tpm2_tpm2b_public_to_openssl_pkey(parent, &parent_pkey);
+        if (r < 0)
+                return log_debug_errno(r, "Could not convert TPM2B_PUBLIC to OpenSSL PKEY: %m");
+
+        /* The seed is as long as a digest of the parent's name algorithm. */
+        r = tpm2_hash_alg_to_size(parent->publicArea.nameAlg);
+        if (r < 0)
+                return -EOPNOTSUPP;
+
+        size_t seed_size = (size_t) r;
+
+        /* The seed is key material: make sure it is wiped before it is released if we fail below. On
+         * success ownership moves to the caller via TAKE_PTR() and nothing is erased here. */
+        _cleanup_(erase_and_freep) void *seed = malloc(seed_size);
+        if (!seed)
+                return log_oom_debug();
+
+        r = crypto_random_bytes(seed, seed_size);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to generate random seed: %m");
+
+        const char *hash_alg_name = tpm2_hash_alg_to_string(parent->publicArea.nameAlg);
+        if (!hash_alg_name)
+                return -EOPNOTSUPP;
+
+        _cleanup_free_ void *encrypted_seed = NULL;
+        size_t encrypted_seed_size;
+        r = rsa_oaep_encrypt_bytes(
+                        parent_pkey,
+                        hash_alg_name,
+                        "DUPLICATE",
+                        seed,
+                        seed_size,
+                        &encrypted_seed,
+                        &encrypted_seed_size);
+        if (r < 0)
+                return log_debug_errno(r, "Could not RSA-OAEP encrypt random seed: %m");
+
+        *ret_seed = TAKE_PTR(seed);
+        *ret_seed_size = seed_size;
+        *ret_encrypted_seed = TAKE_PTR(encrypted_seed);
+        *ret_encrypted_seed_size = encrypted_seed_size;
+
+        return 0;
+}
+
+/* Creates the seed that protects a software-calculated sealed object when the parent is an ECC key.
+ *
+ * A fresh key pair is generated on the parent's curve and an ECDH shared secret is computed between it
+ * and the parent's public key. The public point (x, y) of the fresh key, marshalled as TPMS_ECC_POINT,
+ * is what gets returned as the "encrypted seed". The plaintext seed is derived with tpm2_kdfe() from the
+ * shared secret, the label "DUPLICATE", the x coordinate of the fresh key and the x coordinate of the
+ * parent, and is as many bits long as a digest of the parent's nameAlg.
+ *
+ * On success returns 0 and passes ownership of *ret_seed and *ret_encrypted_seed to the caller. Returns
+ * a negative errno-style value on failure, in which case the return parameters are left untouched. */
+static int tpm2_calculate_seal_ecc_seed(
+                const TPM2B_PUBLIC *parent,
+                void **ret_seed,
+                size_t *ret_seed_size,
+                void **ret_encrypted_seed,
+                size_t *ret_encrypted_seed_size) {
+
+        TSS2_RC rc;
+        int r;
+
+        assert(parent);
+        assert(ret_seed);
+        assert(ret_seed_size);
+        assert(ret_encrypted_seed);
+        assert(ret_encrypted_seed_size);
+
+        log_debug("Calculating encrypted seed for ECC sealed object.");
+
+        _cleanup_(EVP_PKEY_freep) EVP_PKEY *parent_pkey = NULL;
+        r = tpm2_tpm2b_public_to_openssl_pkey(parent, &parent_pkey);
+        if (r < 0)
+                return log_debug_errno(r, "Could not convert TPM2B_PUBLIC to OpenSSL PKEY: %m");
+
+        /* Only the curve of the parent is needed here, not its coordinates. */
+        int curve_id;
+        r = ecc_pkey_to_curve_x_y(
+                        parent_pkey,
+                        &curve_id,
+                        /* ret_x= */ NULL, /* ret_x_size= */ 0,
+                        /* ret_y= */ NULL, /* ret_y_size= */ 0);
+        if (r < 0)
+                return r;
+
+        /* Fresh key pair on the same curve as the parent. */
+        _cleanup_(EVP_PKEY_freep) EVP_PKEY *pkey = NULL;
+        r = ecc_pkey_new(curve_id, &pkey);
+        if (r < 0)
+                return r;
+
+        /* The shared secret never leaves this function, and is key material: always wipe it before it is
+         * released, on the success path as well as on every error path. */
+        _cleanup_(erase_and_freep) void *shared_secret = NULL;
+        size_t shared_secret_size;
+        r = ecc_ecdh(pkey, parent_pkey, &shared_secret, &shared_secret_size);
+        if (r < 0)
+                return log_debug_errno(r, "Could not generate ECC shared secret: %m");
+
+        _cleanup_free_ void *x = NULL, *y = NULL;
+        size_t x_size, y_size;
+        r = ecc_pkey_to_curve_x_y(pkey, /* curve_id= */ NULL, &x, &x_size, &y, &y_size);
+        if (r < 0)
+                return log_debug_errno(r, "Could not get ECC x/y: %m");
+
+        r = TPM2B_ECC_PARAMETER_CHECK_SIZE(x_size);
+        if (r < 0)
+                return log_debug_errno(r, "ECC point x size %zu is too large: %m", x_size);
+
+        r = TPM2B_ECC_PARAMETER_CHECK_SIZE(y_size);
+        if (r < 0)
+                return log_debug_errno(r, "ECC point y size %zu is too large: %m", y_size);
+
+        TPMS_ECC_POINT point = {
+                .x = TPM2B_ECC_PARAMETER_MAKE(x, x_size),
+                .y = TPM2B_ECC_PARAMETER_MAKE(y, y_size),
+        };
+
+        /* The marshalled point of the fresh key is handed out as the "encrypted seed". The buffer is
+         * sized after the in-memory structure; the marshaller reports the number of bytes it actually
+         * used in encrypted_seed_size. */
+        _cleanup_free_ void *encrypted_seed = malloc(sizeof(point));
+        if (!encrypted_seed)
+                return log_oom_debug();
+
+        size_t encrypted_seed_size = 0;
+        rc = sym_Tss2_MU_TPMS_ECC_POINT_Marshal(&point, encrypted_seed, sizeof(point), &encrypted_seed_size);
+        if (rc != TSS2_RC_SUCCESS)
+                return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+                                       "Failed to marshal ECC point: %s", sym_Tss2_RC_Decode(rc));
+
+        r = tpm2_hash_alg_to_size(parent->publicArea.nameAlg);
+        if (r < 0)
+                return -EOPNOTSUPP;
+
+        /* tpm2_kdfe() takes the requested output length in bits. */
+        size_t bits = (size_t) r * 8;
+
+        _cleanup_free_ void *seed = NULL;
+        size_t seed_size;
+        r = tpm2_kdfe(parent->publicArea.nameAlg,
+                      shared_secret,
+                      shared_secret_size,
+                      "DUPLICATE",
+                      x,
+                      x_size,
+                      parent->publicArea.unique.ecc.x.buffer,
+                      parent->publicArea.unique.ecc.x.size,
+                      bits,
+                      &seed,
+                      &seed_size);
+        if (r < 0)
+                return log_debug_errno(r, "Could not calculate KDFe: %m");
+
+        *ret_seed = TAKE_PTR(seed);
+        *ret_seed_size = seed_size;
+        *ret_encrypted_seed = TAKE_PTR(encrypted_seed);
+        *ret_encrypted_seed_size = encrypted_seed_size;
+
+        return 0;
+}
+
+/* Calculates the seed for a software-calculated sealed object, dispatching on the type of the parent
+ * key: RSA parents go through tpm2_calculate_seal_rsa_seed(), ECC parents through
+ * tpm2_calculate_seal_ecc_seed(). Any other parent type is refused with -EOPNOTSUPP.
+ *
+ * On success returns 0, with the plaintext seed copied into *ret_seed and the encrypted seed copied into
+ * *ret_encrypted_seed. On failure returns a negative errno-style value and leaves both untouched. */
+static int tpm2_calculate_seal_seed(
+                const TPM2B_PUBLIC *parent,
+                TPM2B_DIGEST *ret_seed,
+                TPM2B_ENCRYPTED_SECRET *ret_encrypted_seed) {
+
+        _cleanup_free_ void *plain = NULL, *wrapped = NULL;
+        size_t plain_size, wrapped_size;
+        int r;
+
+        assert(parent);
+        assert(ret_seed);
+        assert(ret_encrypted_seed);
+
+        log_debug("Calculating encrypted seed for sealed object.");
+
+        switch (parent->publicArea.type) {
+
+        case TPM2_ALG_RSA:
+                r = tpm2_calculate_seal_rsa_seed(parent, &plain, &plain_size, &wrapped, &wrapped_size);
+                break;
+
+        case TPM2_ALG_ECC:
+                r = tpm2_calculate_seal_ecc_seed(parent, &plain, &plain_size, &wrapped, &wrapped_size);
+                break;
+
+        default:
+                return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                                       "Unsupported parent key type 0x%" PRIx16, parent->publicArea.type);
+        }
+        if (r < 0)
+                return log_debug_errno(r, "Could not calculate encrypted seed: %m");
+
+        /* The helpers hand out heap buffers; the caller wants them by value in TPM2B structures. The heap
+         * copies are released by the cleanup handlers when we return. */
+        *ret_seed = TPM2B_DIGEST_MAKE(plain, plain_size);
+        *ret_encrypted_seed = TPM2B_ENCRYPTED_SECRET_MAKE(wrapped, wrapped_size);
+
+        return 0;
+}
+
+#endif /* HAVE_OPENSSL */
+
+/* Calculates a sealed object for the given parent public key. No Tpm2Context is taken: all work is
+ * delegated to the tpm2_calculate_*() helpers and tpm2_marshal_blob(). Requires OpenSSL support; without
+ * it -EOPNOTSUPP is returned.
+ *
+ * parent_handle: handle of the parent key; 0 selects TPM2_SRK_HANDLE. Must be of persistent, NV index
+ *                or transient type (the latter triggers a warning), otherwise -EINVAL is returned.
+ * parent_public: public area of the parent key (required).
+ * attributes, policy, pin: passed through to the helpers that build the public and private parts.
+ * secret/secret_size: the data to seal. If secret is NULL a random secret of TPM2_SHA256_DIGEST_SIZE
+ *                bytes is generated instead and returned via ret_secret. Exactly one of secret and
+ *                ret_secret must be set. Secrets larger than TPM2_MAX_SEALED_DATA yield -EOVERFLOW.
+ * ret_secret/ret_secret_size: receive the generated secret and the secret size (ret_secret_size is
+ *                optional).
+ * ret_blob/ret_blob_size: receive the marshalled public + private + seed blob.
+ * ret_serialized_parent/ret_serialized_parent_size: receive the serialized parent.
+ *
+ * Returns 0 on success, negative errno-style value on failure. Return parameters are only written once
+ * every step succeeded. */
+int tpm2_calculate_seal(
+ TPM2_HANDLE parent_handle,
+ const TPM2B_PUBLIC *parent_public,
+ const TPMA_OBJECT *attributes,
+ const void *secret,
+ size_t secret_size,
+ const TPM2B_DIGEST *policy,
+ const char *pin,
+ void **ret_secret,
+ size_t *ret_secret_size,
+ void **ret_blob,
+ size_t *ret_blob_size,
+ void **ret_serialized_parent,
+ size_t *ret_serialized_parent_size) {
+
+#if HAVE_OPENSSL
+ int r;
+
+ assert(parent_public);
+ assert(secret || secret_size == 0);
+ assert(secret || ret_secret);
+ assert(!(secret && ret_secret)); /* Either provide a secret, or we create one, but not both */
+ assert(ret_blob);
+ assert(ret_blob_size);
+ assert(ret_serialized_parent);
+ assert(ret_serialized_parent_size);
+
+ log_debug("Calculating sealed object.");
+
+ /* Default to the SRK. */
+ if (parent_handle == 0)
+ parent_handle = TPM2_SRK_HANDLE;
+
+ switch (TPM2_HANDLE_TYPE(parent_handle)) {
+ case TPM2_HT_PERSISTENT:
+ case TPM2_HT_NV_INDEX:
+ break;
+ case TPM2_HT_TRANSIENT:
+ log_warning("Handle is transient, sealed secret may not be recoverable.");
+ break;
+ default:
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+ "Handle 0x%" PRIx32 " not persistent, transient, or NV.",
+ parent_handle);
+ }
+
+ /* Only set if we generate the secret ourselves; erased on release unless handed to the caller. */
+ _cleanup_(erase_and_freep) void *generated_secret = NULL;
+ if (!secret) {
+ /* No secret provided, generate a random secret. We use SHA256 digest length, though it can
+ * be up to TPM2_MAX_SEALED_DATA. The secret length is not limited to the nameAlg hash
+ * size. */
+ secret_size = TPM2_SHA256_DIGEST_SIZE;
+ generated_secret = malloc(secret_size);
+ if (!generated_secret)
+ return log_oom_debug();
+
+ r = crypto_random_bytes(generated_secret, secret_size);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to generate secret key: %m");
+
+ secret = generated_secret;
+ }
+
+ if (secret_size > TPM2_MAX_SEALED_DATA)
+ return log_debug_errno(SYNTHETIC_ERRNO(EOVERFLOW),
+ "Secret size %zu too large, limit is %d bytes.",
+ secret_size, TPM2_MAX_SEALED_DATA);
+
+ /* random_seed is the plaintext seed used by the helpers below; seed is its encrypted form, which
+ * is what ends up in the blob. */
+ TPM2B_DIGEST random_seed;
+ TPM2B_ENCRYPTED_SECRET seed;
+ r = tpm2_calculate_seal_seed(parent_public, &random_seed, &seed);
+ if (r < 0)
+ return r;
+
+ TPM2B_PUBLIC public;
+ r = tpm2_calculate_seal_public(parent_public, attributes, policy, &random_seed, secret, secret_size, &public);
+ if (r < 0)
+ return r;
+
+ /* The private part is calculated against the name of the public part, hence compute that first. */
+ TPM2B_NAME name;
+ r = tpm2_calculate_pubkey_name(&public.publicArea, &name);
+ if (r < 0)
+ return r;
+
+ TPM2B_PRIVATE private;
+ r = tpm2_calculate_seal_private(parent_public, &name, pin, &random_seed, secret, secret_size, &private);
+ if (r < 0)
+ return r;
+
+ _cleanup_free_ void *blob = NULL;
+ size_t blob_size;
+ r = tpm2_marshal_blob(&public, &private, &seed, &blob, &blob_size);
+ if (r < 0)
+ return log_debug_errno(r, "Could not create sealed blob: %m");
+
+ TPM2B_NAME parent_name;
+ r = tpm2_calculate_pubkey_name(&parent_public->publicArea, &parent_name);
+ if (r < 0)
+ return r;
+
+ _cleanup_free_ void *serialized_parent = NULL;
+ size_t serialized_parent_size;
+ r = tpm2_calculate_serialize(
+ parent_handle,
+ &parent_name,
+ parent_public,
+ &serialized_parent,
+ &serialized_parent_size);
+ if (r < 0)
+ return r;
+
+ /* ret_secret is only non-NULL if we generated the secret ourselves (see the asserts above). */
+ if (ret_secret)
+ *ret_secret = TAKE_PTR(generated_secret);
+ if (ret_secret_size)
+ *ret_secret_size = secret_size;
+ *ret_blob = TAKE_PTR(blob);
+ *ret_blob_size = blob_size;
+ *ret_serialized_parent = TAKE_PTR(serialized_parent);
+ *ret_serialized_parent_size = serialized_parent_size;
+
+ return 0;
+#else /* HAVE_OPENSSL */
+ return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "OpenSSL support is disabled.");
+#endif
+}
+
+/* Generates a random secret on the host and seals it in the TPM under a primary key.
+ *
+ * c: TPM2 context to operate on.
+ * seal_key_handle: only honoured if ret_srk_buf is set. 0 or TPM2_SRK_HANDLE selects the SRK (which is
+ *                created if necessary via tpm2_get_or_create_srk()); any other transient or persistent
+ *                handle must already exist (-ENOENT otherwise); other handle types yield -EINVAL.
+ * policy: optional policy digest to store as authPolicy of the sealed object.
+ * pin: optional PIN, converted with tpm2_get_pin_auth() into the object's userAuth.
+ * ret_secret/ret_secret_size: receive the generated secret (SHA256_DIGEST_SIZE bytes).
+ * ret_blob/ret_blob_size: receive the marshalled public + private parts of the sealed object.
+ * ret_primary_alg: optional, receives the algorithm of the primary key that was used.
+ * ret_srk_buf/ret_srk_buf_size: optional, receive a malloc()ed copy of the serialized primary handle.
+ *                If ret_srk_buf is NULL the legacy templates are used instead (ECC first, RSA as
+ *                fallback). NOTE: ret_srk_buf_size is written unconditionally whenever ret_srk_buf is
+ *                set, hence both must be passed together.
+ *
+ * Returns 0 on success, negative errno-style value on failure. */
+int tpm2_seal(Tpm2Context *c,
+ uint32_t seal_key_handle,
+ const TPM2B_DIGEST *policy,
+ const char *pin,
+ void **ret_secret,
+ size_t *ret_secret_size,
+ void **ret_blob,
+ size_t *ret_blob_size,
+ uint16_t *ret_primary_alg,
+ void **ret_srk_buf,
+ size_t *ret_srk_buf_size) {
+
+ uint16_t primary_alg = 0;
+ int r;
+
+ assert(ret_secret);
+ assert(ret_secret_size);
+ assert(ret_blob);
+ assert(ret_blob_size);
+
+ /* So here's what we do here: we connect to the TPM2 chip. It persistently contains a "seed" key that
+ * is randomized when the TPM2 is first initialized or reset and remains stable across boots. We
+ * generate a "primary" key pair derived from that (ECC if possible, RSA as fallback). Given the seed
+ * remains fixed this will result in the same key pair whenever we specify the exact same parameters
+ * for it. We then create a PCR-bound policy session, which calculates a hash on the current PCR
+ * values of the indexes we specify. We then generate a randomized key on the host (which is the key
+ * we actually enroll in the LUKS2 keyslots), which we upload into the TPM2, where it is encrypted
+ * with the "primary" key, taking the PCR policy session into account. We then download the encrypted
+ * key from the TPM2 ("sealing") and marshal it into binary form, which is ultimately placed in the
+ * LUKS2 JSON header.
+ *
+ * The TPM2 "seed" key and "primary" keys never leave the TPM2 chip (and cannot be extracted at
+ * all). The random key we enroll in LUKS2 we generate on the host using the Linux random device. It
+ * is stored in the LUKS2 JSON only in encrypted form with the "primary" key of the TPM2 chip, thus
+ * binding the unlocking to the TPM2 chip. */
+
+ usec_t start = now(CLOCK_MONOTONIC);
+
+ /* We use a keyed hash object (i.e. HMAC) to store the secret key we want to use for unlocking the
+ * LUKS2 volume with. We don't ever use it for HMAC/keyed hash operations however, we just use it
+ * because it's a key type that is universally supported and suitable for symmetric binary blobs. */
+ TPMT_PUBLIC hmac_template = {
+ .type = TPM2_ALG_KEYEDHASH,
+ .nameAlg = TPM2_ALG_SHA256,
+ .objectAttributes = TPMA_OBJECT_FIXEDTPM | TPMA_OBJECT_FIXEDPARENT,
+ .parameters.keyedHashDetail.scheme.scheme = TPM2_ALG_NULL,
+ .unique.keyedHash.size = SHA256_DIGEST_SIZE,
+ .authPolicy = policy ? *policy : TPM2B_DIGEST_MAKE(NULL, TPM2_SHA256_DIGEST_SIZE),
+ };
+
+ TPMS_SENSITIVE_CREATE hmac_sensitive = {
+ .data.size = hmac_template.unique.keyedHash.size,
+ };
+
+ /* hmac_sensitive will carry the secret (and the PIN-derived auth value): wipe it when we leave. */
+ CLEANUP_ERASE(hmac_sensitive);
+
+ if (pin) {
+ r = tpm2_get_pin_auth(TPM2_ALG_SHA256, pin, &hmac_sensitive.userAuth);
+ if (r < 0)
+ return r;
+ }
+
+ assert(sizeof(hmac_sensitive.data.buffer) >= hmac_sensitive.data.size);
+
+ /* Best effort only, failures are ignored. */
+ (void) tpm2_credit_random(c);
+
+ log_debug("Generating secret key data.");
+
+ r = crypto_random_bytes(hmac_sensitive.data.buffer, hmac_sensitive.data.size);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to generate secret key: %m");
+
+ /* Pick the primary key: either the SRK/an existing handle (if the caller can store the serialized
+ * handle), or a primary key created from one of the legacy templates. */
+ _cleanup_(tpm2_handle_freep) Tpm2Handle *primary_handle = NULL;
+ if (ret_srk_buf) {
+ _cleanup_(Esys_Freep) TPM2B_PUBLIC *primary_public = NULL;
+
+ if (IN_SET(seal_key_handle, 0, TPM2_SRK_HANDLE)) {
+ r = tpm2_get_or_create_srk(
+ c,
+ /* session= */ NULL,
+ &primary_public,
+ /* ret_name= */ NULL,
+ /* ret_qname= */ NULL,
+ &primary_handle);
+ if (r < 0)
+ return r;
+ } else if (IN_SET(TPM2_HANDLE_TYPE(seal_key_handle), TPM2_HT_TRANSIENT, TPM2_HT_PERSISTENT)) {
+ r = tpm2_index_to_handle(
+ c,
+ seal_key_handle,
+ /* session= */ NULL,
+ &primary_public,
+ /* ret_name= */ NULL,
+ /* ret_qname= */ NULL,
+ &primary_handle);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ /* We do NOT automatically create anything other than the SRK */
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOENT),
+ "No handle found at index 0x%" PRIx32, seal_key_handle);
+ } else
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+ "Seal key handle 0x%" PRIx32 " is neither transient nor persistent.",
+ seal_key_handle);
+
+ primary_alg = primary_public->publicArea.type;
+ } else {
+ if (seal_key_handle != 0)
+ log_debug("Using primary alg sealing, but seal key handle also provided; ignoring seal key handle.");
+
+ /* TODO: force all callers to provide ret_srk_buf, so we can stop sealing with the legacy templates. */
+ primary_alg = TPM2_ALG_ECC;
+
+ TPM2B_PUBLIC template = {
+ .size = sizeof(TPMT_PUBLIC),
+ };
+ r = tpm2_get_legacy_template(primary_alg, &template.publicArea);
+ if (r < 0)
+ return log_debug_errno(r, "Could not get legacy ECC template: %m");
+
+ /* Prefer ECC, fall back to RSA if the TPM does not support the ECC template. */
+ if (!tpm2_supports_tpmt_public(c, &template.publicArea)) {
+ primary_alg = TPM2_ALG_RSA;
+
+ r = tpm2_get_legacy_template(primary_alg, &template.publicArea);
+ if (r < 0)
+ return log_debug_errno(r, "Could not get legacy RSA template: %m");
+
+ if (!tpm2_supports_tpmt_public(c, &template.publicArea))
+ return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+ "TPM does not support either ECC or RSA legacy template.");
+ }
+
+ r = tpm2_create_primary(
+ c,
+ /* session= */ NULL,
+ &template,
+ /* sensitive= */ NULL,
+ /* ret_public= */ NULL,
+ &primary_handle);
+ if (r < 0)
+ return r;
+ }
+
+ _cleanup_(tpm2_handle_freep) Tpm2Handle *encryption_session = NULL;
+ r = tpm2_make_encryption_session(c, primary_handle, /* bind_key= */ NULL, &encryption_session);
+ if (r < 0)
+ return r;
+
+ _cleanup_(Esys_Freep) TPM2B_PUBLIC *public = NULL;
+ _cleanup_(Esys_Freep) TPM2B_PRIVATE *private = NULL;
+ r = tpm2_create(c, primary_handle, encryption_session, &hmac_template, &hmac_sensitive, &public, &private);
+ if (r < 0)
+ return r;
+
+ /* Heap copy of the secret for the caller; the stack copy in hmac_sensitive is wiped on return. */
+ _cleanup_(erase_and_freep) void *secret = NULL;
+ secret = memdup(hmac_sensitive.data.buffer, hmac_sensitive.data.size);
+ if (!secret)
+ return log_oom_debug();
+
+ log_debug("Marshalling private and public part of HMAC key.");
+
+ _cleanup_free_ void *blob = NULL;
+ size_t blob_size = 0;
+ r = tpm2_marshal_blob(public, private, /* seed= */ NULL, &blob, &blob_size);
+ if (r < 0)
+ return log_debug_errno(r, "Could not create sealed blob: %m");
+
+ if (DEBUG_LOGGING)
+ log_debug("Completed TPM2 key sealing in %s.", FORMAT_TIMESPAN(now(CLOCK_MONOTONIC) - start, 1));
+
+ _cleanup_free_ void *srk_buf = NULL;
+ size_t srk_buf_size = 0;
+ if (ret_srk_buf) {
+ _cleanup_(Esys_Freep) void *tmp = NULL;
+ r = tpm2_serialize(c, primary_handle, &tmp, &srk_buf_size);
+ if (r < 0)
+ return r;
+
+ /*
+ * make a copy since we don't want the caller to understand that
+ * ESYS allocated the pointer. It would make tracking what deallocator
+ * to use for srk_buf in which context a PITA.
+ */
+ srk_buf = memdup(tmp, srk_buf_size);
+ if (!srk_buf)
+ return log_oom_debug();
+
+ *ret_srk_buf = TAKE_PTR(srk_buf);
+ *ret_srk_buf_size = srk_buf_size;
+ }
+
+ /* Nothing can fail anymore from here on, hand everything over to the caller. */
+ *ret_secret = TAKE_PTR(secret);
+ *ret_secret_size = hmac_sensitive.data.size;
+ *ret_blob = TAKE_PTR(blob);
+ *ret_blob_size = blob_size;
+
+ if (ret_primary_alg)
+ *ret_primary_alg = primary_alg;
+
+ return 0;
+}
+
+/* Initial value of the retry counter in tpm2_unseal(): unsealing is restarted when the TPM reports
+ * TPM2_RC_PCR_CHANGED, until this counter has been counted down to zero. */
+#define RETRY_UNSEAL_MAX 30u
+
+/* Unseals a secret previously sealed with tpm2_seal() or calculated with tpm2_calculate_seal().
+ *
+ * c: TPM2 context to operate on.
+ * hash_pcr_mask, pcr_bank: PCRs and bank the policy is bound to; pcr_bank == UINT16_MAX requests
+ *                auto-detection via tpm2_get_best_pcr_bank().
+ * pubkey/pubkey_size, pubkey_pcr_mask, signature: optional PEM public key, its PCR mask and signature
+ *                data, passed on to tpm2_build_sealing_policy().
+ * pin: optional PIN; set as auth value on the loaded key.
+ * pcrlock_policy: optional, passed on to tpm2_build_sealing_policy().
+ * primary_alg: algorithm of the legacy primary key template; only used if srk_buf is NULL.
+ * blob/blob_size: marshalled sealed object (public, private, and optionally an encrypted seed).
+ * known_policy_hash/known_policy_hash_size: optional expected policy digest; on mismatch the attempt
+ *                is cancelled with -EPERM before the TPM is asked to unseal.
+ * srk_buf/srk_buf_size: optional serialized primary key handle.
+ * ret_secret/ret_secret_size: receive the unsealed secret.
+ *
+ * Returns 0 on success, negative errno-style value on failure: -EINVAL if neither srk_buf nor
+ * primary_alg is given, -EPERM on policy digest mismatch, -ENOTRECOVERABLE if the TPM refuses to
+ * unseal, or the error of the failing helper. */
+int tpm2_unseal(Tpm2Context *c,
+ uint32_t hash_pcr_mask,
+ uint16_t pcr_bank,
+ const void *pubkey,
+ size_t pubkey_size,
+ uint32_t pubkey_pcr_mask,
+ JsonVariant *signature,
+ const char *pin,
+ const Tpm2PCRLockPolicy *pcrlock_policy,
+ uint16_t primary_alg,
+ const void *blob,
+ size_t blob_size,
+ const void *known_policy_hash,
+ size_t known_policy_hash_size,
+ const void *srk_buf,
+ size_t srk_buf_size,
+ void **ret_secret,
+ size_t *ret_secret_size) {
+
+ TSS2_RC rc;
+ int r;
+
+ assert(blob);
+ assert(blob_size > 0);
+ assert(known_policy_hash_size == 0 || known_policy_hash);
+ assert(pubkey_size == 0 || pubkey);
+ assert(ret_secret);
+ assert(ret_secret_size);
+
+ assert(TPM2_PCR_MASK_VALID(hash_pcr_mask));
+ assert(TPM2_PCR_MASK_VALID(pubkey_pcr_mask));
+
+ /* So here's what we do here: We connect to the TPM2 chip. As we do when sealing we generate a
+ * "primary" key on the TPM2 chip, with the same parameters as well as a PCR-bound policy session.
+ * Given we pass the same parameters, this will result in the same "primary" key, and same policy
+ * hash (the latter of course, only if the PCR values didn't change in between). We unmarshal the
+ * encrypted key we stored in the LUKS2 JSON token header and upload it into the TPM2, where it is
+ * decrypted if the seed and the PCR policy were right ("unsealing"). We then download the result,
+ * and use it to unlock the LUKS2 volume. */
+
+ usec_t start = now(CLOCK_MONOTONIC);
+
+ /* seed is pre-zeroed so that seed.size == 0 can be used below to tell whether the blob carried one. */
+ TPM2B_PUBLIC public;
+ TPM2B_PRIVATE private;
+ TPM2B_ENCRYPTED_SECRET seed = {};
+ r = tpm2_unmarshal_blob(blob, blob_size, &public, &private, &seed);
+ if (r < 0)
+ return log_debug_errno(r, "Could not extract parts from blob: %m");
+
+ /* Older code did not save the pcr_bank, and unsealing needed to detect the best pcr bank to use,
+ * so we need to handle that legacy situation. */
+ if (pcr_bank == UINT16_MAX) {
+ r = tpm2_get_best_pcr_bank(c, hash_pcr_mask|pubkey_pcr_mask, &pcr_bank);
+ if (r < 0)
+ return r;
+ }
+
+ /* Get hold of the primary key: from the serialized handle if we have one, otherwise by re-creating
+ * it from the legacy template of the specified algorithm. */
+ _cleanup_(tpm2_handle_freep) Tpm2Handle *primary_handle = NULL;
+ if (srk_buf) {
+ r = tpm2_deserialize(c, srk_buf, srk_buf_size, &primary_handle);
+ if (r < 0)
+ return r;
+ } else if (primary_alg != 0) {
+ TPM2B_PUBLIC template = {
+ .size = sizeof(TPMT_PUBLIC),
+ };
+ r = tpm2_get_legacy_template(primary_alg, &template.publicArea);
+ if (r < 0)
+ return log_debug_errno(r, "Could not get legacy template: %m");
+
+ r = tpm2_create_primary(
+ c,
+ /* session= */ NULL,
+ &template,
+ /* sensitive= */ NULL,
+ /* ret_public= */ NULL,
+ &primary_handle);
+ if (r < 0)
+ return r;
+ } else
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+ "No SRK or primary alg provided.");
+
+ if (seed.size > 0) {
+ /* This is a calculated (or duplicated) sealed object, and must be imported. */
+ _cleanup_free_ TPM2B_PRIVATE *imported_private = NULL;
+ r = tpm2_import(c,
+ primary_handle,
+ /* session= */ NULL,
+ &public,
+ &private,
+ &seed,
+ /* encryption_key= */ NULL,
+ /* symmetric= */ NULL,
+ &imported_private);
+ if (r < 0)
+ return r;
+
+ /* Continue with the imported private part instead of the one from the blob. */
+ private = *imported_private;
+ }
+
+ log_debug("Loading HMAC key into TPM.");
+
+ /*
+ * Nothing sensitive on the bus, no need for encryption. Even if an attacker
+ * gives you back a different key, the session initiation will fail. In the
+ * SRK model, the tpmKey is verified. In the non-srk model, with pin, the bindKey
+ * provides protections.
+ */
+ _cleanup_(tpm2_handle_freep) Tpm2Handle *hmac_key = NULL;
+ r = tpm2_load(c, primary_handle, NULL, &public, &private, &hmac_key);
+ if (r < 0)
+ return r;
+
+ /* pubkey_tpm2b is only initialized (and only used below) if a public key was passed in. */
+ TPM2B_PUBLIC pubkey_tpm2b;
+ _cleanup_free_ void *fp = NULL;
+ size_t fp_size = 0;
+ if (pubkey) {
+ r = tpm2_tpm2b_public_from_pem(pubkey, pubkey_size, &pubkey_tpm2b);
+ if (r < 0)
+ return log_debug_errno(r, "Could not create TPMT_PUBLIC: %m");
+
+ r = tpm2_tpm2b_public_to_fingerprint(&pubkey_tpm2b, &fp, &fp_size);
+ if (r < 0)
+ return log_debug_errno(r, "Could not get key fingerprint: %m");
+ }
+
+ /*
+ * if a pin is set for the seal object, use it to bind the session
+ * key to that object. This prevents active bus interposers from
+ * faking a TPM and seeing the unsealed value. An active interposer
+ * could fake a TPM, satisfying the encrypted session, and just
+ * forward everything to the *real* TPM.
+ */
+ r = tpm2_set_auth(c, hmac_key, pin);
+ if (r < 0)
+ return r;
+
+ _cleanup_(tpm2_handle_freep) Tpm2Handle *encryption_session = NULL;
+ r = tpm2_make_encryption_session(c, primary_handle, hmac_key, &encryption_session);
+ if (r < 0)
+ return r;
+
+ /* Each iteration sets up a fresh policy session and tries to unseal. The loop is left via break on
+ * success, or via return on any error other than a PCR change while tries are left. */
+ _cleanup_(Esys_Freep) TPM2B_SENSITIVE_DATA* unsealed = NULL;
+ for (unsigned i = RETRY_UNSEAL_MAX;; i--) {
+ _cleanup_(tpm2_handle_freep) Tpm2Handle *policy_session = NULL;
+ _cleanup_(Esys_Freep) TPM2B_DIGEST *policy_digest = NULL;
+ r = tpm2_make_policy_session(
+ c,
+ primary_handle,
+ encryption_session,
+ &policy_session);
+ if (r < 0)
+ return r;
+
+ r = tpm2_build_sealing_policy(
+ c,
+ policy_session,
+ hash_pcr_mask,
+ pcr_bank,
+ pubkey ? &pubkey_tpm2b : NULL,
+ fp, fp_size,
+ pubkey_pcr_mask,
+ signature,
+ !!pin,
+ pcrlock_policy,
+ &policy_digest);
+ if (r < 0)
+ return r;
+
+ /* If we know the policy hash to expect, and it doesn't match, we can shortcut things here, and not
+ * wait until the TPM2 tells us to go away. */
+ if (known_policy_hash_size > 0 &&
+ memcmp_nn(policy_digest->buffer, policy_digest->size, known_policy_hash, known_policy_hash_size) != 0) {
+
+#if HAVE_OPENSSL
+ if (pubkey_size > 0 &&
+ pubkey_tpm2b.publicArea.type == TPM2_ALG_RSA &&
+ pubkey_tpm2b.publicArea.parameters.rsaDetail.exponent == TPM2_RSA_DEFAULT_EXPONENT) {
+ /* Due to bug #30546, if using RSA pubkey with the default exponent, we may
+ * need to set the exponent to the TPM special-case value of 0 and retry. */
+ log_debug("Policy hash mismatch, retrying with RSA pubkey exponent set to 0.");
+ pubkey_tpm2b.publicArea.parameters.rsaDetail.exponent = 0;
+ /* Happens at most once, since the exponent no longer matches the check above
+ * afterwards. Note that the loop's i-- is executed for this retry, too. */
+ continue;
+ } else
+#endif
+ return log_debug_errno(SYNTHETIC_ERRNO(EPERM),
+ "Current policy digest does not match stored policy digest, cancelling "
+ "TPM2 authentication attempt.");
+ }
+
+ log_debug("Unsealing HMAC key.");
+
+ rc = sym_Esys_Unseal(
+ c->esys_context,
+ hmac_key->esys_handle,
+ policy_session->esys_handle,
+ encryption_session->esys_handle, /* use HMAC session to enable parameter encryption */
+ ESYS_TR_NONE,
+ &unsealed);
+ if (rc == TSS2_RC_SUCCESS)
+ break;
+ /* Only a PCR change is worth retrying, and only while the retry counter is not used up. */
+ if (rc != TPM2_RC_PCR_CHANGED || i == 0)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+ "Failed to unseal HMAC key in TPM: %s", sym_Tss2_RC_Decode(rc));
+ log_debug("A PCR value changed during the TPM2 policy session, restarting HMAC key unsealing (%u tries left).", i);
+ }
+
+ /* Copy the secret out of the ESYS-allocated buffer, and wipe the latter right away, regardless
+ * of whether the copy succeeded. Only the buffer is wiped, the size field remains valid. */
+ _cleanup_(erase_and_freep) char *secret = NULL;
+ secret = memdup(unsealed->buffer, unsealed->size);
+ explicit_bzero_safe(unsealed->buffer, unsealed->size);
+ if (!secret)
+ return log_oom_debug();
+
+ if (DEBUG_LOGGING)
+ log_debug("Completed TPM2 key unsealing in %s.", FORMAT_TIMESPAN(now(CLOCK_MONOTONIC) - start, 1));
+
+ *ret_secret = TAKE_PTR(secret);
+ *ret_secret_size = unsealed->size;
+
+ return 0;
+}
+
+/* Picks a random handle from the NV index range, starting at TPM2_NV_INDEX_FIRST. The offset comes from
+ * random_u64_range(), which is passed the number of handles between TPM2_NV_INDEX_FIRST and
+ * TPM2_NV_INDEX_LAST, both ends included. */
+static TPM2_HANDLE generate_random_nv_index(void) {
+        uint64_t n_indexes = TPM2_NV_INDEX_LAST - TPM2_NV_INDEX_FIRST + 1;
+        TPM2_HANDLE offset = (TPM2_HANDLE) random_u64_range(n_indexes);
+
+        return TPM2_NV_INDEX_FIRST + offset;
+}
+
+/* Defines (allocates) an NV index sized to hold a marshalled SHA256 TPMT_HA, i.e. a policy digest.
+ *
+ * c: TPM2 context to operate on.
+ * session: optional authorization session; ESYS_TR_PASSWORD is used if NULL.
+ * requested_nv_index: the NV index to allocate, or 0 to pick random indexes until a free one is found
+ *                (a limited number of attempts is made).
+ * write_policy: optional policy digest to install as authPolicy of the NV index.
+ * pin, auth: auth value for the NV index. If auth is NULL it is derived from pin via
+ *                tpm2_get_pin_auth(); at least one of the two must be set.
+ * ret_nv_index, ret_nv_handle, ret_nv_public: optional; receive the allocated index, a handle for it
+ *                (which is not flushed when freed), and the public info it was defined with.
+ *
+ * Returns 0 on success. Returns -EEXIST if requested_nv_index is already defined, and -ENOTRECOVERABLE
+ * if the TPM refuses the request for another reason or no free index was found. */
+int tpm2_define_policy_nv_index(
+                Tpm2Context *c,
+                const Tpm2Handle *session,
+                TPM2_HANDLE requested_nv_index,
+                const TPM2B_DIGEST *write_policy,
+                const char *pin,
+                const TPM2B_AUTH *auth,
+                TPM2_HANDLE *ret_nv_index,
+                Tpm2Handle **ret_nv_handle,
+                TPM2B_NV_PUBLIC *ret_nv_public) {
+
+        /* How many (random) NV indexes we try before giving up */
+        static const unsigned n_tries_max = 25U;
+
+        _cleanup_(tpm2_handle_freep) Tpm2Handle *new_handle = NULL;
+        TSS2_RC rc;
+        int r;
+
+        assert(c);
+        assert(pin || auth);
+
+        r = tpm2_handle_new(c, &new_handle);
+        if (r < 0)
+                return r;
+
+        new_handle->flush = false; /* This is a persistent NV index, don't flush hence */
+
+        /* Only used if the caller passed a PIN rather than a ready-made auth value; wiped on return. */
+        TPM2B_AUTH _auth = {};
+        CLEANUP_ERASE(_auth);
+
+        if (!auth) {
+                r = tpm2_get_pin_auth(TPM2_ALG_SHA256, pin, &_auth);
+                if (r < 0)
+                        return r;
+
+                auth = &_auth;
+        }
+
+        for (unsigned try = 0; try < n_tries_max; try++) {
+                TPM2_HANDLE nv_index;
+
+                if (requested_nv_index != 0)
+                        nv_index = requested_nv_index;
+                else
+                        nv_index = generate_random_nv_index();
+
+                TPM2B_NV_PUBLIC public_info = {
+                        .size = sizeof_field(TPM2B_NV_PUBLIC, nvPublic),
+                        .nvPublic = {
+                                .nvIndex = nv_index,
+                                .nameAlg = TPM2_ALG_SHA256,
+                                .attributes = TPM2_NT_ORDINARY | TPMA_NV_WRITEALL | TPMA_NV_POLICYWRITE | TPMA_NV_OWNERREAD,
+                                /* Room for the hash algorithm ID followed by a SHA256 digest */
+                                .dataSize = offsetof(TPMT_HA, digest) + tpm2_hash_alg_to_size(TPM2_ALG_SHA256),
+                        },
+                };
+
+                if (write_policy)
+                        public_info.nvPublic.authPolicy = *write_policy;
+
+                rc = sym_Esys_NV_DefineSpace(
+                                c->esys_context,
+                                /* authHandle= */ ESYS_TR_RH_OWNER,
+                                /* shandle1= */ session ? session->esys_handle : ESYS_TR_PASSWORD,
+                                /* shandle2= */ ESYS_TR_NONE,
+                                /* shandle3= */ ESYS_TR_NONE,
+                                auth,
+                                &public_info,
+                                &new_handle->esys_handle);
+
+                if (rc == TSS2_RC_SUCCESS) {
+                        log_debug("NV Index 0x%" PRIx32 " successfully allocated.", nv_index);
+
+                        if (ret_nv_index)
+                                *ret_nv_index = nv_index;
+
+                        if (ret_nv_handle)
+                                *ret_nv_handle = TAKE_PTR(new_handle);
+
+                        if (ret_nv_public)
+                                *ret_nv_public = public_info;
+
+                        return 0;
+                }
+                if (rc != TPM2_RC_NV_DEFINED)
+                        return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+                                               "Failed to allocate NV index: %s", sym_Tss2_RC_Decode(rc));
+
+                /* The index is taken already. If the caller insisted on it there's no point in retrying. */
+                if (requested_nv_index != 0) {
+                        assert(nv_index == requested_nv_index);
+                        return log_debug_errno(SYNTHETIC_ERRNO(EEXIST),
+                                               "Requested NV index 0x%" PRIx32 " already taken.", requested_nv_index);
+                }
+
+                /* Print the index in hex (matching the 0x prefix), and the number of attempts that
+                 * actually remain rather than the number of the attempt that just failed. */
+                log_debug("NV index 0x%" PRIx32 " already taken, trying another one (%u tries left)",
+                          nv_index, n_tries_max - try - 1);
+        }
+
+        return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+                               "Too many attempts trying to allocate NV index: %s", sym_Tss2_RC_Decode(rc));
+}
+
+/* Writes a SHA256 policy digest into an NV index, marshalled as TPMT_HA, starting at offset 0.
+ *
+ * c: TPM2 context to operate on.
+ * policy_session: session used to authorize the write.
+ * nv_index: number of the NV index; used for logging only.
+ * nv_handle: handle of the NV index; used both as the authorization handle and as the write target.
+ * policy_digest: the digest to store; must be exactly as long as a SHA256 digest.
+ *
+ * Returns 0 on success, -EINVAL if the digest has the wrong size, and -ENOTRECOVERABLE if marshalling
+ * or the write itself fails. */
+int tpm2_write_policy_nv_index(
+                Tpm2Context *c,
+                const Tpm2Handle *policy_session,
+                TPM2_HANDLE nv_index,
+                const Tpm2Handle *nv_handle,
+                const TPM2B_DIGEST *policy_digest) {
+
+        TSS2_RC rc;
+
+        assert(c);
+        assert(policy_session);
+        assert(nv_handle);
+        assert(policy_digest);
+
+        if (policy_digest->size != tpm2_hash_alg_to_size(TPM2_ALG_SHA256))
+                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Policy to store in NV index has wrong size.");
+
+        TPMT_HA ha = {
+                .hashAlg = TPM2_ALG_SHA256,
+        };
+        assert(policy_digest->size <= sizeof_field(TPMT_HA, digest));
+        memcpy_safe(&ha.digest, policy_digest->buffer, policy_digest->size);
+
+        TPM2B_MAX_NV_BUFFER buffer = {};
+        size_t written = 0;
+        rc = sym_Tss2_MU_TPMT_HA_Marshal(&ha, buffer.buffer, sizeof(buffer.buffer), &written);
+        if (rc != TSS2_RC_SUCCESS)
+                /* Include the decoded return code, like the other marshalling failure paths do. */
+                return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+                                       "Failed to marshal policy digest: %s", sym_Tss2_RC_Decode(rc));
+
+        buffer.size = written;
+
+        rc = sym_Esys_NV_Write(
+                        c->esys_context,
+                        /* authHandle= */ nv_handle->esys_handle,
+                        /* nvIndex= */ nv_handle->esys_handle,
+                        /* shandle1= */ policy_session->esys_handle,
+                        /* shandle2= */ ESYS_TR_NONE,
+                        /* shandle3= */ ESYS_TR_NONE,
+                        &buffer,
+                        /* offset= */ 0);
+        if (rc != TSS2_RC_SUCCESS)
+                return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+                                       "Failed to write NV index: %s", sym_Tss2_RC_Decode(rc));
+
+        if (DEBUG_LOGGING) {
+                _cleanup_free_ char *h = NULL;
+                h = hexmem(policy_digest->buffer, policy_digest->size);
+                /* h may be NULL on allocation failure, hence strnull(). */
+                log_debug("Written policy digest %s to NV index 0x%" PRIx32, strnull(h), nv_index);
+        }
+
+        return 0;
+}
+
+/* Removes (undefines) an NV index, authorized against the owner hierarchy.
+ *
+ * c: TPM2 context to operate on.
+ * session: optional session; ESYS_TR_NONE is passed to ESYS if NULL.
+ * nv_index: number of the NV index; used for logging only.
+ * nv_handle: handle of the NV index to remove.
+ *
+ * Returns 0 on success, -ENOTRECOVERABLE if the TPM refuses the request. */
+int tpm2_undefine_policy_nv_index(
+                Tpm2Context *c,
+                const Tpm2Handle *session,
+                TPM2_HANDLE nv_index,
+                const Tpm2Handle *nv_handle) {
+
+        assert(c);
+        assert(nv_handle);
+
+        ESYS_TR session_handle = session ? session->esys_handle : ESYS_TR_NONE;
+
+        TSS2_RC rc = sym_Esys_NV_UndefineSpace(
+                        c->esys_context,
+                        /* authHandle= */ ESYS_TR_RH_OWNER,
+                        /* nvIndex= */ nv_handle->esys_handle,
+                        /* shandle1= */ session_handle,
+                        /* shandle2= */ ESYS_TR_NONE,
+                        /* shandle3= */ ESYS_TR_NONE);
+        if (rc == TSS2_RC_SUCCESS) {
+                log_debug("Undefined NV index 0x%x", nv_index);
+                return 0;
+        }
+
+        return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+                               "Failed to undefine NV index: %s", sym_Tss2_RC_Decode(rc));
+}
+
+/* Seals caller-supplied data in the TPM under the given primary key, and returns the marshalled public
+ * and private parts of the resulting object separately.
+ *
+ * c: TPM2 context to operate on.
+ * data: the data to seal; must be smaller than the data buffer of TPMS_SENSITIVE_CREATE, otherwise
+ *                -E2BIG is returned.
+ * primary_handle: parent key to create the sealed object under.
+ * encryption_session: session passed on to tpm2_create().
+ * policy: optional policy digest to store as authPolicy of the sealed object.
+ * ret_public, ret_private: optional; receive the marshalled public and private parts.
+ *
+ * Returns 0 on success, negative errno-style value on failure. */
+int tpm2_seal_data(
+                Tpm2Context *c,
+                const struct iovec *data,
+                const Tpm2Handle *primary_handle,
+                const Tpm2Handle *encryption_session,
+                const TPM2B_DIGEST *policy,
+                struct iovec *ret_public,
+                struct iovec *ret_private) {
+
+        int r;
+
+        assert(c);
+        assert(data);
+        assert(primary_handle);
+
+        /* Generic sibling of tpm2_seal(): no policy is implied here, and the two halves of the sealed
+         * object are not combined into one blob. tpm2_seal() is somewhat specific to the FDE usecase, and
+         * should probably be ported over to tpm2_seal_data() eventually. */
+
+        if (data->iov_len >= sizeof_field(TPMS_SENSITIVE_CREATE, data.buffer))
+                return -E2BIG;
+
+        /* As in tpm2_seal(), a keyed hash object is merely used as container for the data. */
+        TPMT_PUBLIC template = {
+                .type = TPM2_ALG_KEYEDHASH,
+                .nameAlg = TPM2_ALG_SHA256,
+                .objectAttributes = TPMA_OBJECT_FIXEDTPM | TPMA_OBJECT_FIXEDPARENT,
+                .parameters.keyedHashDetail.scheme.scheme = TPM2_ALG_NULL,
+                .unique.keyedHash.size = data->iov_len,
+                .authPolicy = policy ? *policy : TPM2B_DIGEST_MAKE(NULL, TPM2_SHA256_DIGEST_SIZE),
+        };
+
+        TPMS_SENSITIVE_CREATE sensitive = {
+                .data.size = template.unique.keyedHash.size,
+        };
+
+        /* Holds a copy of the caller's data, make sure it is wiped when we leave. */
+        CLEANUP_ERASE(sensitive);
+
+        memcpy_safe(sensitive.data.buffer, data->iov_base, data->iov_len);
+
+        _cleanup_(Esys_Freep) TPM2B_PUBLIC *sealed_public = NULL;
+        _cleanup_(Esys_Freep) TPM2B_PRIVATE *sealed_private = NULL;
+        r = tpm2_create(c, primary_handle, encryption_session, &template, &sensitive, &sealed_public, &sealed_private);
+        if (r < 0)
+                return r;
+
+        _cleanup_(iovec_done) struct iovec marshalled_public = {}, marshalled_private = {};
+
+        r = tpm2_marshal_private(sealed_private, &marshalled_private.iov_base, &marshalled_private.iov_len);
+        if (r < 0)
+                return r;
+
+        r = tpm2_marshal_public(sealed_public, &marshalled_public.iov_base, &marshalled_public.iov_len);
+        if (r < 0)
+                return r;
+
+        /* Whatever the caller is not interested in is released by the cleanup handlers. */
+        if (ret_public)
+                *ret_public = TAKE_STRUCT(marshalled_public);
+        if (ret_private)
+                *ret_private = TAKE_STRUCT(marshalled_private);
+
+        return 0;
+}
+
+/* Counterpart of tpm2_seal_data(): loads a sealed object from its marshalled public and private parts
+ * under the given primary key, and unseals it.
+ *
+ * c: TPM2 context to operate on.
+ * public_blob, private_blob: the marshalled public and private parts of the sealed object.
+ * primary_handle: parent key the object is loaded under.
+ * policy_session, encryption_session: optional sessions; ESYS_TR_NONE is passed for each one that is
+ *                NULL.
+ * ret_data: receives a newly allocated copy of the unsealed data.
+ *
+ * Returns 0 on success, -ESTALE if the TPM reports that a PCR changed while unsealing,
+ * -ENOTRECOVERABLE if unsealing fails otherwise, or the error of the failing helper. */
+int tpm2_unseal_data(
+                Tpm2Context *c,
+                const struct iovec *public_blob,
+                const struct iovec *private_blob,
+                const Tpm2Handle *primary_handle,
+                const Tpm2Handle *policy_session,
+                const Tpm2Handle *encryption_session,
+                struct iovec *ret_data) {
+
+        TSS2_RC rc;
+        int r;
+
+        assert(c);
+        assert(public_blob);
+        assert(private_blob);
+        assert(primary_handle);
+
+        TPM2B_PUBLIC sealed_public;
+        r = tpm2_unmarshal_public(public_blob->iov_base, public_blob->iov_len, &sealed_public);
+        if (r < 0)
+                return r;
+
+        TPM2B_PRIVATE sealed_private;
+        r = tpm2_unmarshal_private(private_blob->iov_base, private_blob->iov_len, &sealed_private);
+        if (r < 0)
+                return r;
+
+        _cleanup_(tpm2_handle_freep) Tpm2Handle *loaded = NULL;
+        r = tpm2_load(c, primary_handle, NULL, &sealed_public, &sealed_private, &loaded);
+        if (r < 0)
+                return r;
+
+        ESYS_TR policy_tr = policy_session ? policy_session->esys_handle : ESYS_TR_NONE;
+        ESYS_TR encryption_tr = encryption_session ? encryption_session->esys_handle : ESYS_TR_NONE;
+
+        _cleanup_(Esys_Freep) TPM2B_SENSITIVE_DATA *unsealed = NULL;
+        rc = sym_Esys_Unseal(
+                        c->esys_context,
+                        loaded->esys_handle,
+                        policy_tr,
+                        encryption_tr,
+                        ESYS_TR_NONE,
+                        &unsealed);
+        /* A PCR change is reported with a distinct error, so that callers can tell it apart. */
+        if (rc == TPM2_RC_PCR_CHANGED)
+                return log_debug_errno(SYNTHETIC_ERRNO(ESTALE),
+                                       "PCR changed while unsealing.");
+        if (rc != TSS2_RC_SUCCESS)
+                return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+                                       "Failed to unseal data: %s", sym_Tss2_RC_Decode(rc));
+
+        /* Copy the data out of the ESYS-allocated buffer and wipe the latter right away, regardless of
+         * whether the copy succeeded. */
+        void *copy = memdup(unsealed->buffer, unsealed->size);
+        size_t copy_size = unsealed->size;
+
+        explicit_bzero_safe(unsealed->buffer, unsealed->size);
+
+        if (!copy)
+                return log_oom_debug();
+
+        *ret_data = (struct iovec) {
+                .iov_base = copy,
+                .iov_len = copy_size,
+        };
+        return 0;
+}
+#endif /* HAVE_TPM2 */
+
+/* Prints a table (columns: path, device, driver) of the entries found in /sys/class/tpmrm to stdout. For
+ * each entry the "device" and "device/driver" symlinks are resolved on a best-effort basis; if one cannot
+ * be read the corresponding cell is left empty.
+ *
+ * Returns 0 on success, also when no device was found or /sys/class/tpmrm does not exist. Returns a
+ * negative errno-style value on failure, and EOPNOTSUPP when built without TPM2 support. */
+int tpm2_list_devices(void) {
+#if HAVE_TPM2
+        _cleanup_(table_unrefp) Table *t = NULL;
+        _cleanup_closedir_ DIR *d = NULL;
+        int r;
+
+        r = dlopen_tpm2();
+        if (r < 0)
+                return log_error_errno(r, "TPM2 support is not installed.");
+
+        t = table_new("path", "device", "driver");
+        if (!t)
+                return log_oom();
+
+        d = opendir("/sys/class/tpmrm");
+        if (!d) {
+                /* Capture errno once, right after the failing call, instead of reading it again after the
+                 * logging call below. */
+                r = -errno;
+
+                log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_ERR, r, "Failed to open /sys/class/tpmrm: %m");
+                if (r != -ENOENT)
+                        return r;
+
+                /* A missing directory is not an error: fall through with an empty table. */
+        } else {
+                for (;;) {
+                        _cleanup_free_ char *device_path = NULL, *device = NULL, *driver_path = NULL, *driver = NULL, *node = NULL;
+                        struct dirent *de;
+
+                        de = readdir_no_dot(d);
+                        if (!de)
+                                break;
+
+                        device_path = path_join("/sys/class/tpmrm", de->d_name, "device");
+                        if (!device_path)
+                                return log_oom();
+
+                        r = readlink_malloc(device_path, &device);
+                        if (r < 0)
+                                log_debug_errno(r, "Failed to read device symlink %s, ignoring: %m", device_path);
+                        else {
+                                /* The driver link is only looked up if the device link could be read. */
+                                driver_path = path_join(device_path, "driver");
+                                if (!driver_path)
+                                        return log_oom();
+
+                                r = readlink_malloc(driver_path, &driver);
+                                if (r < 0)
+                                        log_debug_errno(r, "Failed to read driver symlink %s, ignoring: %m", driver_path);
+                        }
+
+                        node = path_join("/dev", de->d_name);
+                        if (!node)
+                                return log_oom();
+
+                        r = table_add_many(
+                                        t,
+                                        TABLE_PATH, node,
+                                        TABLE_STRING, device ? last_path_component(device) : NULL,
+                                        TABLE_STRING, driver ? last_path_component(driver) : NULL);
+                        if (r < 0)
+                                return table_log_add_error(r);
+                }
+        }
+
+        if (table_get_rows(t) <= 1) {
+                log_info("No suitable TPM2 devices found.");
+                return 0;
+        }
+
+        r = table_print(t, stdout);
+        if (r < 0)
+                return log_error_errno(r, "Failed to show device table: %m");
+
+        return 0;
+#else
+        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                               "TPM2 not supported on this build.");
+#endif
+}
+
+/* Picks the TPM2 device to use if exactly one entry exists in /sys/class/tpmrm, and returns its device
+ * node path ("/dev/<entry name>") as a newly allocated string in *ret.
+ *
+ * Returns 0 on success. Fails with ENOTUNIQ if more than one entry exists, with ENODEV if there is none
+ * (or the directory is missing), with EOPNOTSUPP when built without TPM2 support, or with the error
+ * opendir() reported. All failures are logged at debug level only. */
+int tpm2_find_device_auto(char **ret) {
+#if HAVE_TPM2
+        _cleanup_closedir_ DIR *d = NULL;
+        int r;
+
+        r = dlopen_tpm2();
+        if (r < 0)
+                return log_debug_errno(r, "TPM2 support is not installed.");
+
+        d = opendir("/sys/class/tpmrm");
+        if (!d) {
+                /* Capture errno once, right after the failing call, instead of reading it again after the
+                 * logging call below. */
+                r = -errno;
+
+                log_debug_errno(r, "Failed to open /sys/class/tpmrm: %m");
+                if (r != -ENOENT)
+                        return r;
+
+                /* A missing directory is treated like an empty one: ENODEV below. */
+        } else {
+                _cleanup_free_ char *node = NULL;
+
+                for (;;) {
+                        struct dirent *de;
+
+                        de = readdir_no_dot(d);
+                        if (!de)
+                                break;
+
+                        /* A second entry showed up: refuse to guess which one is meant. */
+                        if (node)
+                                return log_debug_errno(SYNTHETIC_ERRNO(ENOTUNIQ),
+                                                       "More than one TPM2 (tpmrm) device found.");
+
+                        node = path_join("/dev", de->d_name);
+                        if (!node)
+                                return log_oom_debug();
+                }
+
+                if (node) {
+                        *ret = TAKE_PTR(node);
+                        return 0;
+                }
+        }
+
+        return log_debug_errno(SYNTHETIC_ERRNO(ENODEV), "No TPM2 (tpmrm) device found.");
+#else
+        return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
+                               "TPM2 not supported on this build.");
+#endif
+}
+
+#if HAVE_TPM2
+static const char* tpm2_userspace_event_type_table[_TPM2_USERSPACE_EVENT_TYPE_MAX] = { /* textual name of each Tpm2UserspaceEventType value */
+ [TPM2_EVENT_PHASE] = "phase",
+ [TPM2_EVENT_FILESYSTEM] = "filesystem",
+ [TPM2_EVENT_VOLUME_KEY] = "volume-key",
+ [TPM2_EVENT_MACHINE_ID] = "machine-id",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(tpm2_userspace_event_type, Tpm2UserspaceEventType); /* presumably generates tpm2_userspace_event_type_to_string(), which tpm2_userspace_log() uses for the "eventType" field — confirm against the macro definition */
+
+/* Returns the path of the userspace measurement log: whatever secure_getenv() yields for
+ * $SYSTEMD_MEASURE_LOG_USERSPACE, or the built-in default if that is NULL. */
+const char *tpm2_userspace_log_path(void) {
+        const char *from_env;
+
+        from_env = secure_getenv("SYSTEMD_MEASURE_LOG_USERSPACE");
+        if (from_env)
+                return from_env;
+
+        return "/run/log/systemd/tpm2-measure.log";
+}
+
+/* Returns the path of the firmware measurement log: whatever secure_getenv() yields for
+ * $SYSTEMD_MEASURE_LOG_FIRMWARE, or the built-in default if that is NULL. */
+const char *tpm2_firmware_log_path(void) {
+        const char *from_env;
+
+        from_env = secure_getenv("SYSTEMD_MEASURE_LOG_FIRMWARE");
+        if (from_env)
+                return from_env;
+
+        return "/sys/kernel/security/tpm0/binary_bios_measurements";
+}
+
+#if HAVE_OPENSSL
+/* Opens (creating it if necessary) the userspace measurement log, takes an exclusive BSD lock on it and
+ * marks it as "append in progress" by setting the sticky bit. tpm2_userspace_log() clears the bit again
+ * once it has written its record.
+ *
+ * Returns the open file descriptor on success, a negative errno-style value on failure. Failures are
+ * only logged at debug level. */
+static int tpm2_userspace_log_open(void) {
+        _cleanup_close_ int log_fd = -EBADF;
+        const char *path;
+        struct stat st;
+        int r;
+
+        path = tpm2_userspace_log_path();
+        (void) mkdir_parents(path, 0755);
+
+        /* The measurements are not strictly confidential, but the file is still created with mode 0600:
+         * it is protected by a BSD file lock, and anyone who can open the file could take that lock too,
+         * which is to be avoided for anyone but root. */
+        log_fd = open(path, O_CREAT|O_WRONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW, 0600);
+        if (log_fd < 0)
+                return log_debug_errno(errno, "Failed to open TPM log file '%s' for writing, ignoring: %m", path);
+
+        if (flock(log_fd, LOCK_EX) < 0)
+                return log_debug_errno(errno, "Failed to lock TPM log file '%s', ignoring: %m", path);
+
+        if (fstat(log_fd, &st) < 0)
+                return log_debug_errno(errno, "Failed to fstat TPM log file '%s', ignoring: %m", path);
+
+        r = stat_verify_regular(&st);
+        if (r < 0)
+                return log_debug_errno(r, "TPM log file '%s' is not regular, ignoring: %m", path);
+
+        /* The sticky bit is set while a record is being appended and removed afterwards. Finding it set
+         * on a file we just managed to lock hence means an earlier append was not completed and the file
+         * is corrupted. User xattrs would be the nicer tool for this, but tmpfs (the assumed backing file
+         * system) does not support them. */
+        if (st.st_mode & S_ISVTX)
+                return log_debug_errno(SYNTHETIC_ERRNO(ESTALE), "TPM log file '%s' aborted, ignoring.", path);
+
+        if (fchmod(log_fd, 0600 | S_ISVTX) < 0)
+                return log_debug_errno(errno, "Failed to chmod() TPM log file '%s', ignoring: %m", path);
+
+        return TAKE_FD(log_fd);
+}
+
+/* Appends one record describing a PCR extension to the userspace measurement log open on 'fd', then
+ * clears the sticky bit that tpm2_userspace_log_open() set on the file.
+ *
+ * Returns 1 if a record was written, 0 if nothing was done because fd < 0, and a negative errno-style
+ * value on failure (logged at debug level only). */
+static int tpm2_userspace_log(
+                int fd,
+                unsigned pcr_index,
+                const TPML_DIGEST_VALUES *values,
+                Tpm2UserspaceEventType event_type,
+                const char *description) {
+
+        _cleanup_(json_variant_unrefp) JsonVariant *record = NULL, *digests = NULL;
+        _cleanup_free_ char *text = NULL;
+        sd_id128_t boot_id;
+        int r;
+
+        assert(values);
+        assert(values->count > 0);
+
+        /* This maintains a local PCR measurement log, implementing a subset of the JSON flavour of the TCG
+         * Canonical Event Log Format
+         * (https://trustedcomputinggroup.org/resource/canonical-event-log-format/). It deviates from the
+         * specification in two ways:
+         *
+         * - No 'recnum' is written. It is too vaguely defined: determining the next value would require
+         *   reading through all logs (including the firmware logs) first. Hence it is left out at append
+         *   time, and a consumer is expected to add it when using the data.
+         *
+         * - Records are written as RFC 7464 application/json-seq rather than as one JSON array. With an
+         *   array every append would mean reading the whole log into memory and writing it out again,
+         *   while a strictly append-only write pattern is preferred here. (RFC 7464 is what jq --seq
+         *   consumes.) Converting the result into a proper JSON array is trivial.
+         *
+         * Converting this into the official TCG Canonical Event Log Format on read should be relatively
+         * straightforward: only a few more fields, all determinable from the full dataset, need to be
+         * added.
+         *
+         * 'content_type' is set to "systemd" to make clear who generated the record. The 'content'
+         * subobject carries various interesting fields, among them a CLOCK_BOOTTIME timestamp that allows
+         * ordering this measurement against measurements other subsystems made independently. */
+
+        if (fd < 0) /* Apparently tpm2_userspace_log_open() failed earlier, let's not complain again */
+                return 0;
+
+        /* One { hashAlg, digest } object per bank that was extended. */
+        for (size_t i = 0; i < values->count; i++) {
+                const EVP_MD *md;
+                const char *alg_name;
+
+                assert_se(alg_name = tpm2_hash_alg_to_string(values->digests[i].hashAlg));
+                assert_se(md = EVP_get_digestbyname(alg_name));
+
+                r = json_variant_append_arrayb(
+                                &digests, JSON_BUILD_OBJECT(
+                                                JSON_BUILD_PAIR_STRING("hashAlg", alg_name),
+                                                JSON_BUILD_PAIR("digest", JSON_BUILD_HEX(&values->digests[i].digest, EVP_MD_size(md)))));
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to append digest object to JSON array: %m");
+        }
+
+        assert(digests);
+
+        r = sd_id128_get_boot(&boot_id);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to acquire boot ID: %m");
+
+        r = json_build(&record, JSON_BUILD_OBJECT(
+                                       JSON_BUILD_PAIR("pcr", JSON_BUILD_UNSIGNED(pcr_index)),
+                                       JSON_BUILD_PAIR("digests", JSON_BUILD_VARIANT(digests)),
+                                       JSON_BUILD_PAIR("content_type", JSON_BUILD_STRING("systemd")),
+                                       JSON_BUILD_PAIR("content", JSON_BUILD_OBJECT(
+                                                                       JSON_BUILD_PAIR_CONDITION(description, "string", JSON_BUILD_STRING(description)),
+                                                                       JSON_BUILD_PAIR("bootId", JSON_BUILD_ID128(boot_id)),
+                                                                       JSON_BUILD_PAIR("timestamp", JSON_BUILD_UNSIGNED(now(CLOCK_BOOTTIME))),
+                                                                       JSON_BUILD_PAIR_CONDITION(event_type >= 0, "eventType", JSON_BUILD_STRING(tpm2_userspace_event_type_to_string(event_type)))))));
+        if (r < 0)
+                return log_debug_errno(r, "Failed to build log record JSON: %m");
+
+        r = json_variant_format(record, JSON_FORMAT_SEQ, &text);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to format JSON: %m");
+
+        if (lseek(fd, 0, SEEK_END) < 0)
+                return log_debug_errno(errno, "Failed to seek to end of JSON log: %m");
+
+        r = loop_write(fd, text, SIZE_MAX);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to write JSON data to log: %m");
+
+        /* Sync the data before dropping the "append in progress" marker. */
+        if (fsync(fd) < 0)
+                return log_debug_errno(errno, "Failed to sync JSON data: %m");
+
+        /* Drop S_ISVTX again: the record is complete. */
+        if (fchmod(fd, 0600) < 0)
+                return log_debug_errno(errno, "Failed to chmod() TPM log file, ignoring: %m");
+
+        r = fsync_full(fd);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to sync JSON log: %m");
+
+        return 1;
+}
+#endif
+
+/* Extends PCR 'pcr_index' in each of the given banks with a digest of 'data', and records the extension
+ * in the userspace measurement log.
+ *
+ * If a secret is passed (secret_size > 0) the value measured is the HMAC of 'data' keyed with 'secret',
+ * otherwise it is the plain hash of 'data'. A size of SIZE_MAX means "use strlen()" for either buffer.
+ *
+ * Returns 0 on success, and also if 'banks' is empty (nothing is measured then). Returns a negative
+ * errno-style value on failure, and EOPNOTSUPP when built without OpenSSL. Failure to write the log
+ * record is ignored. */
+int tpm2_extend_bytes(
+                Tpm2Context *c,
+                char **banks,
+                unsigned pcr_index,
+                const void *data,
+                size_t data_size,
+                const void *secret,
+                size_t secret_size,
+                Tpm2UserspaceEventType event_type,
+                const char *description) {
+
+#if HAVE_OPENSSL
+        _cleanup_close_ int log_fd = -EBADF;
+        TPML_DIGEST_VALUES extend = {};
+        TSS2_RC rc;
+
+        assert(c);
+        assert(data || data_size == 0);
+        assert(secret || secret_size == 0);
+
+        if (data_size == SIZE_MAX)
+                data_size = strlen(data);
+        if (secret_size == SIZE_MAX)
+                secret_size = strlen(secret);
+
+        if (pcr_index >= TPM2_PCRS_MAX)
+                return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Can't measure into unsupported PCR %u, refusing.", pcr_index);
+
+        if (strv_isempty(banks))
+                return 0;
+
+        /* Compute one digest per requested bank. */
+        STRV_FOREACH(bank, banks) {
+                const EVP_MD *md;
+                int alg;
+
+                assert_se(md = EVP_get_digestbyname(*bank));
+
+                if (extend.count >= ELEMENTSOF(extend.digests))
+                        return log_debug_errno(SYNTHETIC_ERRNO(E2BIG), "Too many banks selected.");
+
+                if ((size_t) EVP_MD_size(md) > sizeof(extend.digests[extend.count].digest))
+                        return log_debug_errno(SYNTHETIC_ERRNO(E2BIG), "Hash result too large for TPM2.");
+
+                alg = tpm2_hash_alg_from_string(EVP_MD_name(md));
+                if (alg < 0)
+                        return log_debug_errno(alg, "Can't map hash name to TPM2.");
+
+                extend.digests[extend.count].hashAlg = alg;
+
+                unsigned char *out = (unsigned char*) &extend.digests[extend.count].digest;
+
+                /* A twist: sometimes secrets are measured (e.g. the root file system volume key), but a
+                 * literal hash of the secret should not be leaked to the TPM: the wire is unprotected, and
+                 * some other subsystem might use the simple, literal hash of the same secret for an
+                 * unrelated purpose. Hence in that case an HMAC of a private, non-secret string is
+                 * measured, keyed by the secret. */
+                if (secret_size > 0) {
+                        if (!HMAC(md, secret, secret_size, data, data_size, out, NULL))
+                                return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "Failed to calculate HMAC of data to measure.");
+                } else if (EVP_Digest(data, data_size, out, NULL, md, NULL) != 1)
+                        return log_debug_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "Failed to hash data to measure.");
+
+                extend.count++;
+        }
+
+        /* The log file is opened and locked *before* the measurement is made, so that nobody can get in
+         * between measurement and log entry and change either. A failure here is tolerated:
+         * tpm2_userspace_log() below skips writing if the fd is negative. */
+        log_fd = tpm2_userspace_log_open();
+
+        rc = sym_Esys_PCR_Extend(
+                        c->esys_context,
+                        ESYS_TR_PCR0 + pcr_index,
+                        ESYS_TR_PASSWORD,
+                        ESYS_TR_NONE,
+                        ESYS_TR_NONE,
+                        &extend);
+        if (rc != TSS2_RC_SUCCESS)
+                return log_debug_errno(
+                                SYNTHETIC_ERRNO(ENOTRECOVERABLE),
+                                "Failed to measure into PCR %u: %s",
+                                pcr_index,
+                                sym_Tss2_RC_Decode(rc));
+
+        /* Finally record in the log what was just extended. */
+        (void) tpm2_userspace_log(log_fd, pcr_index, &extend, event_type, description);
+
+        return 0;
+#else /* HAVE_OPENSSL */
+        return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "OpenSSL support is disabled.");
+#endif
+}
+
+const uint16_t tpm2_hash_algorithms[] = { /* hash algorithms tracked per PCR prediction; an algorithm's position here is the index tpm2_hash_algorithm_index() returns and that is used for Tpm2PCRPredictionResult.hash[] */
+ TPM2_ALG_SHA1,
+ TPM2_ALG_SHA256,
+ TPM2_ALG_SHA384,
+ TPM2_ALG_SHA512,
+ 0, /* extra trailing entry, not counted in TPM2_N_HASH_ALGORITHMS — presumably a list terminator, confirm against users of this array */
+};
+
+assert_cc(ELEMENTSOF(tpm2_hash_algorithms) == TPM2_N_HASH_ALGORITHMS + 1); /* compile-time check: TPM2_N_HASH_ALGORITHMS entries plus the trailing 0 */
+
+/* Returns the position of 'algorithm' within the first TPM2_N_HASH_ALGORITHMS entries of
+ * tpm2_hash_algorithms[], or SIZE_MAX if it is not listed there. */
+static size_t tpm2_hash_algorithm_index(uint16_t algorithm) {
+        size_t idx = 0;
+
+        while (idx < TPM2_N_HASH_ALGORITHMS) {
+                if (tpm2_hash_algorithms[idx] == algorithm)
+                        return idx;
+
+                idx++;
+        }
+
+        return SIZE_MAX;
+}
+
+/* Returns a pointer to the digest stored in 'result' for hash algorithm 'alg'. Returns NULL if the
+ * algorithm is not one of the tracked ones, or if no digest (size 0) is stored for it. The pointer refers
+ * into 'result' itself. */
+TPM2B_DIGEST *tpm2_pcr_prediction_result_get_hash(Tpm2PCRPredictionResult *result, uint16_t alg) {
+        assert(result);
+
+        size_t idx = tpm2_hash_algorithm_index(alg);
+        if (idx == SIZE_MAX) /* Unknown algorithm */
+                return NULL;
+
+        TPM2B_DIGEST *slot = &result->hash[idx];
+
+        /* An empty slot means there's no value for this algorithm */
+        return slot->size > 0 ? slot : NULL;
+}
+
+/* Releases the per-PCR result sets of a prediction. The Tpm2PCRPrediction structure itself is not freed.
+ *
+ * Every slot is reset to what ordered_set_free() returns, following the same "x = unref(x)" idiom
+ * tpm2_pcrlock_policy_done() uses for its JSON variant, so that no dangling pointers stay behind and
+ * calling this a second time on the same object is harmless. */
+void tpm2_pcr_prediction_done(Tpm2PCRPrediction *p) {
+        assert(p);
+
+        for (uint32_t pcr = 0; pcr < TPM2_PCRS_MAX; pcr++)
+                p->results[pcr] = ordered_set_free(p->results[pcr]);
+}
+
+/* Hash function for Tpm2PCRPredictionResult: feeds the digest of every tracked algorithm, in index order,
+ * into the siphash state. */
+static void tpm2_pcr_prediction_result_hash_func(const Tpm2PCRPredictionResult *banks, struct siphash *state) {
+        assert(banks);
+
+        size_t idx = 0;
+        while (idx < TPM2_N_HASH_ALGORITHMS) {
+                siphash24_compress_safe(banks->hash[idx].buffer, banks->hash[idx].size, state);
+                idx++;
+        }
+}
+
+/* Comparison function for Tpm2PCRPredictionResult: compares the digests algorithm by algorithm, in index
+ * order, and returns the result of the first comparison that is not 0, or 0 if all digests match. */
+static int tpm2_pcr_prediction_result_compare_func(const Tpm2PCRPredictionResult *a, const Tpm2PCRPredictionResult *b) {
+        assert(a);
+        assert(b);
+
+        for (size_t idx = 0; idx < TPM2_N_HASH_ALGORITHMS; idx++) {
+                int c = memcmp_nn(a->hash[idx].buffer, a->hash[idx].size,
+                                  b->hash[idx].buffer, b->hash[idx].size);
+                if (c != 0)
+                        return c;
+        }
+
+        return 0;
+}
+
+DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR( /* Defines tpm2_pcr_prediction_result_hash_ops, the hash ops that
+ * tpm2_pcr_prediction_from_json() passes to ordered_set_ensure_put() for the per-PCR result sets.
+ * NOTE(review): the last argument is free(), presumably so that stored results are released together
+ * with the set — confirm against the macro definition.
+ */
+ tpm2_pcr_prediction_result_hash_ops,
+ Tpm2PCRPredictionResult,
+ tpm2_pcr_prediction_result_hash_func,
+ tpm2_pcr_prediction_result_compare_func,
+ Tpm2PCRPredictionResult,
+ free);
+
+/* Looks up an entry in 'set' that corresponds to 'result'. An entry matching in all algorithms is
+ * preferred; failing that, the first entry whose digest for the algorithm with index 'alg_idx' equals
+ * the one in 'result' is returned. Returns NULL if there is neither. */
+static Tpm2PCRPredictionResult *find_prediction_result_by_algorithm(OrderedSet *set, Tpm2PCRPredictionResult *result, size_t alg_idx) {
+        Tpm2PCRPredictionResult *candidate;
+
+        assert(result);
+        assert(alg_idx != SIZE_MAX);
+
+        /* Try the full match first */
+        candidate = ordered_set_get(set, result);
+        if (candidate)
+                return candidate;
+
+        /* No full match. Predictions may be "incomplete" in some algorithms, hence settle for an entry
+         * that agrees at least in the algorithm we were asked about. */
+        ORDERED_SET_FOREACH(candidate, set) {
+                if (memcmp_nn(result->hash[alg_idx].buffer, result->hash[alg_idx].size,
+                              candidate->hash[alg_idx].buffer, candidate->hash[alg_idx].size) != 0)
+                        continue;
+
+                return candidate;
+        }
+
+        return NULL;
+}
+
+/* Returns true if every entry of 'from' has a counterpart in 'in', as determined by
+ * find_prediction_result_by_algorithm(). */
+static bool prediction_results_covered(OrderedSet *from, OrderedSet *in, size_t alg_idx) {
+        Tpm2PCRPredictionResult *entry;
+
+        ORDERED_SET_FOREACH(entry, from)
+                if (!find_prediction_result_by_algorithm(in, entry, alg_idx))
+                        return false;
+
+        return true;
+}
+
+/* Checks whether two predictions are equivalent as far as hash algorithm 'algorithm' is concerned: both
+ * must cover the same PCR mask, and for each PCR every result of one must have a counterpart in the
+ * other, in both directions. Two NULL pointers (or the same object) are equal; NULL and non-NULL are not.
+ * Returns false if 'algorithm' is not one of the tracked hash algorithms. */
+bool tpm2_pcr_prediction_equal(
+                Tpm2PCRPrediction *a,
+                Tpm2PCRPrediction *b,
+                uint16_t algorithm) {
+
+        if (a == b)
+                return true;
+        if (!a || !b)
+                return false;
+
+        if (a->pcrs != b->pcrs)
+                return false;
+
+        size_t alg_idx = tpm2_hash_algorithm_index(algorithm);
+        if (alg_idx == SIZE_MAX)
+                return false;
+
+        for (uint32_t pcr = 0; pcr < TPM2_PCRS_MAX; pcr++)
+                if (!prediction_results_covered(a->results[pcr], b->results[pcr], alg_idx) ||
+                    !prediction_results_covered(b->results[pcr], a->results[pcr], alg_idx))
+                        return false;
+
+        return true;
+}
+
+/* Serializes a prediction into a JSON array with one object per PCR: { "pcr": <index>, "values": [ <hex
+ * digest>, … ] }. Only digests of hash algorithm 'algorithm' are written. PCRs that are not part of the
+ * prediction's mask, or for which no digest of that algorithm exists, are left out. If nothing is left,
+ * an empty array is returned.
+ *
+ * Returns 0 and stores the array in *ret on success, a negative errno-style value on failure. */
+int tpm2_pcr_prediction_to_json(
+                const Tpm2PCRPrediction *prediction,
+                uint16_t algorithm,
+                JsonVariant **ret) {
+
+        _cleanup_(json_variant_unrefp) JsonVariant *pcr_list = NULL;
+        int r;
+
+        assert(prediction);
+        assert(ret);
+
+        for (uint32_t pcr = 0; pcr < TPM2_PCRS_MAX; pcr++) {
+                _cleanup_(json_variant_unrefp) JsonVariant *value_list = NULL;
+                Tpm2PCRPredictionResult *result;
+
+                if (!FLAGS_SET(prediction->pcrs, UINT32_C(1) << pcr))
+                        continue;
+
+                /* Collect the hex encoded digests of all variants predicted for this PCR */
+                ORDERED_SET_FOREACH(result, prediction->results[pcr]) {
+                        TPM2B_DIGEST *digest;
+
+                        digest = tpm2_pcr_prediction_result_get_hash(result, algorithm);
+                        if (!digest)
+                                continue;
+
+                        r = json_variant_append_arrayb(
+                                        &value_list,
+                                        JSON_BUILD_HEX(digest->buffer, digest->size));
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to append hash variant to JSON array: %m");
+                }
+
+                if (!value_list) /* Nothing for this algorithm? Then skip the PCR entirely. */
+                        continue;
+
+                r = json_variant_append_arrayb(
+                                &pcr_list,
+                                JSON_BUILD_OBJECT(
+                                                JSON_BUILD_PAIR_INTEGER("pcr", pcr),
+                                                JSON_BUILD_PAIR_VARIANT("values", value_list)));
+                if (r < 0)
+                        return log_error_errno(r, "Failed to append PCR variants to JSON array: %m");
+        }
+
+        if (!pcr_list) {
+                /* Nothing was appended at all: hand back an explicit empty array */
+                r = json_variant_new_array(&pcr_list, NULL, 0);
+                if (r < 0)
+                        return r;
+        }
+
+        *ret = TAKE_PTR(pcr_list);
+        return 0;
+}
+
+/* Parses a JSON array in the format written by tpm2_pcr_prediction_to_json() and adds its contents to
+ * 'prediction': each listed PCR is added to the PCR mask, and each listed digest becomes a result for
+ * that PCR with only the slot of hash algorithm 'algorithm' filled in. Duplicate digests are silently
+ * accepted. 'algorithm' must be one of the tracked hash algorithms (asserted).
+ *
+ * Returns 0 on success, a negative errno-style value on malformed input or allocation failure. On
+ * failure 'prediction' may already have been modified. */
+int tpm2_pcr_prediction_from_json(
+                Tpm2PCRPrediction *prediction,
+                uint16_t algorithm,
+                JsonVariant *aj) {
+
+        int r;
+
+        assert(prediction);
+
+        size_t alg_index = tpm2_hash_algorithm_index(algorithm);
+        assert(alg_index < TPM2_N_HASH_ALGORITHMS);
+
+        if (!json_variant_is_array(aj))
+                return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "PCR variant array is not an array.");
+
+        JsonVariant *entry;
+        JSON_VARIANT_ARRAY_FOREACH(entry, aj) {
+                JsonVariant *index_field, *value_list;
+
+                index_field = json_variant_by_key(entry, "pcr");
+                if (!index_field)
+                        return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "PCR array entry lacks PCR index field");
+
+                if (!json_variant_is_unsigned(index_field) ||
+                    json_variant_unsigned(index_field) >= TPM2_PCRS_MAX)
+                        return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "PCR array entry PCR index is not an integer in the range 0…23");
+
+                /* Validated above, hence safe to use as shift count and array index from here on */
+                uint64_t pcr_index = json_variant_unsigned(index_field);
+
+                value_list = json_variant_by_key(entry, "values");
+                if (!value_list)
+                        return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "PCR array entry lacks values field");
+
+                if (!json_variant_is_array(value_list))
+                        return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "PCR array entry values field is not an array");
+
+                prediction->pcrs |= UINT32_C(1) << pcr_index;
+
+                JsonVariant *hex;
+                JSON_VARIANT_ARRAY_FOREACH(hex, value_list) {
+                        _cleanup_free_ void *decoded = NULL;
+                        size_t decoded_size;
+
+                        r = json_variant_unhex(hex, &decoded, &decoded_size);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to decode PCR policy array hash value");
+
+                        if (decoded_size <= 0)
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "PCR policy array hash value is zero.");
+
+                        if (decoded_size > sizeof_field(TPM2B_DIGEST, buffer))
+                                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "PCR policy array hash value is too large.");
+
+                        /* A zero-initialized result in which only the slot of our algorithm is filled in */
+                        _cleanup_free_ Tpm2PCRPredictionResult *result = new0(Tpm2PCRPredictionResult, 1);
+                        if (!result)
+                                return log_oom();
+
+                        memcpy(result->hash[alg_index].buffer, decoded, decoded_size);
+                        result->hash[alg_index].size = decoded_size;
+
+                        r = ordered_set_ensure_put(prediction->results + pcr_index, &tpm2_pcr_prediction_result_hash_ops, result);
+                        if (r == -EEXIST) /* Duplicates are fine; our copy is released by the cleanup handler */
+                                continue;
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to insert result into set: %m");
+
+                        /* The set owns the object now */
+                        TAKE_PTR(result);
+                }
+        }
+
+        return 0;
+}
+
+/* Calculates, purely in software, the policy digest that results from applying a PCR prediction on top of
+ * the digest passed in via *pcr_policy:
+ *
+ *   1. All PCRs with exactly one predicted value are folded into one combined PolicyPCR step.
+ *   2. For each PCR with two to eight predicted values, one PolicyPCR digest per value is calculated on
+ *      top of the digest reached so far, and the variants are then combined with a PolicyOR step.
+ *
+ * Only digests of hash algorithm 'algorithm' are considered. On success *pcr_policy is replaced by the
+ * resulting digest and 0 is returned; on failure a negative errno-style value is returned and *pcr_policy
+ * is left untouched.
+ *
+ * NOTE(review): if a PCR has several results but fewer than two of them carry a digest for 'algorithm',
+ * the assert_se() on the variant count below fires — confirm that callers can never construct such a
+ * prediction. */
+int tpm2_calculate_policy_super_pcr(
+                Tpm2PCRPrediction *prediction,
+                uint16_t algorithm,
+                TPM2B_DIGEST *pcr_policy) {
+
+        int r;
+
+        assert_se(prediction);
+        assert_se(pcr_policy);
+
+        /* Operate on a private copy of the caller's digest; it is only written back at the very end. */
+        TPM2B_DIGEST running_digest = *pcr_policy;
+
+        /* Step 1: gather the PCRs that have exactly one allowed value, so that a single PolicyPCR
+         * expression can cover all of them. */
+        _cleanup_free_ Tpm2PCRValue *fixed_values = NULL;
+        size_t n_fixed_values = 0;
+        for (uint32_t pcr = 0; pcr < TPM2_PCRS_MAX; pcr++) {
+                if (!FLAGS_SET(prediction->pcrs, UINT32_C(1) << pcr) ||
+                    ordered_set_size(prediction->results[pcr]) != 1)
+                        continue;
+
+                log_debug("Including PCR %" PRIu32 " in single value PolicyPCR expression", pcr);
+
+                Tpm2PCRPredictionResult *only = ASSERT_PTR(ordered_set_first(prediction->results[pcr]));
+
+                TPM2B_DIGEST *digest = tpm2_pcr_prediction_result_get_hash(only, algorithm);
+                if (!digest)
+                        continue;
+
+                if (!GREEDY_REALLOC(fixed_values, n_fixed_values + 1))
+                        return -ENOMEM;
+
+                fixed_values[n_fixed_values++] = TPM2_PCR_VALUE_MAKE(pcr, algorithm, *digest);
+        }
+
+        if (n_fixed_values > 0) {
+                /* Evolve the policy by the expected values of everything collected above. */
+                r = tpm2_calculate_policy_pcr(
+                                fixed_values,
+                                n_fixed_values,
+                                &running_digest);
+                if (r < 0)
+                        return r;
+        }
+
+        /* Step 2: the PCRs with variants, i.e. with more than one allowed value. */
+        for (uint32_t pcr = 0; pcr < TPM2_PCRS_MAX; pcr++) {
+                _cleanup_free_ TPM2B_DIGEST *variants = NULL;
+                size_t n_variants = 0;
+                Tpm2PCRPredictionResult *result;
+
+                if (!FLAGS_SET(prediction->pcrs, UINT32_C(1) << pcr))
+                        continue;
+
+                size_t n_results = ordered_set_size(prediction->results[pcr]);
+                if (n_results <= 1) /* Only PCRs with 2 or more variants matter in this loop */
+                        continue;
+
+                if (n_results > 8)
+                        return log_error_errno(SYNTHETIC_ERRNO(E2BIG), "PCR policies with more than 8 alternatives per PCR are currently not supported.");
+
+                ORDERED_SET_FOREACH(result, prediction->results[pcr]) {
+                        TPM2B_DIGEST *digest = tpm2_pcr_prediction_result_get_hash(result, algorithm);
+                        if (!digest)
+                                continue;
+
+                        /* Every variant starts out from the policy digest reached before this PCR… */
+                        TPM2B_DIGEST variant_digest = running_digest;
+
+                        /* …and is evolved by the value expected for this PCR in this variant. */
+                        r = tpm2_calculate_policy_pcr(
+                                        &TPM2_PCR_VALUE_MAKE(
+                                                        pcr,
+                                                        algorithm,
+                                                        *digest),
+                                        /* n_pcr_values= */ 1,
+                                        &variant_digest);
+                        if (r < 0)
+                                return r;
+
+                        /* Remember the variant for the OR step below */
+                        if (!GREEDY_REALLOC(variants, n_variants + 1))
+                                return log_oom();
+
+                        variants[n_variants++] = variant_digest;
+
+                        log_debug("Calculated PCR policy variant %zu for PCR %" PRIu32, n_variants, pcr);
+                }
+
+                assert_se(n_variants >= 2);
+                assert_se(n_variants <= 8);
+
+                /* Merge all variants of this PCR into a single OR policy */
+                r = tpm2_calculate_policy_or(
+                                variants,
+                                n_variants,
+                                &running_digest);
+                if (r < 0)
+                        return r;
+
+                log_debug("Combined %zu variants in OR policy.", n_variants);
+        }
+
+        *pcr_policy = running_digest;
+        return 0;
+}
+
+/* Submits to the TPM, on the given policy session, the policy steps that correspond to a PCR prediction;
+ * the same structure tpm2_calculate_policy_super_pcr() computes in software:
+ *
+ *   1. One PolicyPCR step covering all PCRs with exactly one predicted value.
+ *   2. For each PCR with two to eight predicted values, a PolicyPCR step for that PCR followed by a
+ *      PolicyOR step over the digests calculated for each of the variants.
+ *
+ * Returns 0 on success, a negative errno-style value on failure.
+ *
+ * NOTE(review): if a PCR has several results but not all of them carry a digest for 'algorithm', the
+ * assert_se() on the branch count below fires — confirm that callers can never construct such a
+ * prediction. */
+int tpm2_policy_super_pcr(
+                Tpm2Context *c,
+                const Tpm2Handle *session,
+                const Tpm2PCRPrediction *prediction,
+                uint16_t algorithm) {
+
+        int r;
+
+        assert_se(c);
+        assert_se(session);
+        assert_se(prediction);
+
+        /* The policy digest as reported back after the most recently submitted step; it is the starting
+         * point for calculating the OR branches of the next PCR. */
+        TPM2B_DIGEST last_digest = TPM2B_DIGEST_MAKE(NULL, TPM2_SHA256_DIGEST_SIZE);
+
+        /* Step 1: determine all PCRs with just a single allowed value, so that one PolicyPCR item can be
+         * synthesized for all of them together. */
+        uint32_t fixed_mask = 0;
+        for (uint32_t pcr = 0; pcr < TPM2_PCRS_MAX; pcr++) {
+                if (!FLAGS_SET(prediction->pcrs, UINT32_C(1) << pcr) ||
+                    ordered_set_size(prediction->results[pcr]) != 1)
+                        continue;
+
+                log_debug("Including PCR %" PRIu32 " in single value PolicyPCR expression", pcr);
+
+                fixed_mask |= UINT32_C(1) << pcr;
+        }
+
+        if (fixed_mask != 0) {
+                TPML_PCR_SELECTION selection;
+                tpm2_tpml_pcr_selection_from_mask(fixed_mask, algorithm, &selection);
+
+                _cleanup_free_ TPM2B_DIGEST *reported = NULL;
+                r = tpm2_policy_pcr(
+                                c,
+                                session,
+                                &selection,
+                                &reported);
+                if (r < 0)
+                        return r;
+
+                last_digest = *reported;
+        }
+
+        /* Step 2: the PCRs that have more than one allowed value */
+        for (uint32_t pcr = 0; pcr < TPM2_PCRS_MAX; pcr++) {
+                if (!FLAGS_SET(prediction->pcrs, UINT32_C(1) << pcr))
+                        continue;
+
+                size_t n_branches = ordered_set_size(prediction->results[pcr]);
+                if (n_branches < 1 || n_branches > 8)
+                        return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Number of variants per PCR not in range 1…8");
+
+                if (n_branches == 1) /* Single-value PCRs were already dealt with in step 1 */
+                        continue;
+
+                log_debug("Submitting PCR/OR policy for PCR %" PRIu32, pcr);
+
+                TPML_PCR_SELECTION selection;
+                tpm2_tpml_pcr_selection_from_mask(UINT32_C(1) << pcr, algorithm, &selection);
+
+                _cleanup_free_ TPM2B_DIGEST *reported = NULL;
+                r = tpm2_policy_pcr(
+                                c,
+                                session,
+                                &selection,
+                                &reported);
+                if (r < 0)
+                        return r;
+
+                _cleanup_free_ TPM2B_DIGEST *branches = NULL;
+                branches = new0(TPM2B_DIGEST, n_branches);
+                if (!branches)
+                        return log_oom();
+
+                Tpm2PCRPredictionResult *result;
+                size_t n_filled = 0;
+                ORDERED_SET_FOREACH(result, prediction->results[pcr]) {
+                        TPM2B_DIGEST *digest = tpm2_pcr_prediction_result_get_hash(result, algorithm);
+                        if (!digest)
+                                continue;
+
+                        /* Each branch starts from the digest reached before this PCR… */
+                        TPM2B_DIGEST branch_digest = last_digest;
+
+                        /* …and is evolved by the value expected for this PCR in this variant. */
+                        r = tpm2_calculate_policy_pcr(
+                                        &TPM2_PCR_VALUE_MAKE(
+                                                        pcr,
+                                                        algorithm,
+                                                        *digest),
+                                        /* n_pcr_values= */ 1,
+                                        &branch_digest);
+                        if (r < 0)
+                                return r;
+
+                        branches[n_filled++] = branch_digest;
+                }
+
+                assert_se(n_filled == n_branches);
+
+                /* The digest reported for the PolicyPCR step is not needed; release it so that the
+                 * variable can receive the digest of the PolicyOR step. */
+                reported = mfree(reported);
+                r = tpm2_policy_or(
+                                c,
+                                session,
+                                branches,
+                                n_branches,
+                                &reported);
+                if (r < 0)
+                        return r;
+
+                last_digest = *reported;
+        }
+
+        return 0;
+}
+
+/* Releases everything a Tpm2PCRLockPolicy owns: the prediction JSON, the parsed prediction and the
+ * binary blobs. The structure itself is not freed. */
+void tpm2_pcrlock_policy_done(Tpm2PCRLockPolicy *data) {
+        assert(data);
+
+        data->prediction_json = json_variant_unref(data->prediction_json);
+        tpm2_pcr_prediction_done(&data->prediction);
+
+        struct iovec *blobs[] = {
+                &data->nv_handle,
+                &data->nv_public,
+                &data->srk_handle,
+                &data->pin_public,
+                &data->pin_private,
+        };
+
+        for (size_t i = 0; i < ELEMENTSOF(blobs); i++)
+                iovec_done(blobs[i]);
+}
+
+/* JSON dispatch callback: parses a hash algorithm name and stores the matching TPM2 algorithm ID in the
+ * uint16_t that 'userdata' points to. Names that are unknown, or that map to an algorithm not in the
+ * table of tracked hash algorithms, are refused with EINVAL. */
+static int json_dispatch_tpm2_algorithm(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        uint16_t *algorithm = ASSERT_PTR(userdata);
+        const char *s = json_variant_string(variant);
+        int alg;
+
+        alg = tpm2_hash_alg_from_string(s);
+        if (alg < 0 || tpm2_hash_algorithm_index(alg) == SIZE_MAX)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid hash algorithm: %s", s);
+
+        *algorithm = alg;
+        return 0;
+}
+
+/* Searches for the pcrlock policy file, "pcrlock.json" unless 'path' names something else, using the
+ * search path /run/systemd, /var/lib/systemd. If 'ret_file' is non-NULL the file is opened with mode
+ * "re"; otherwise NULL is passed as mode. Output parameters are handed on to search_and_fopen_nulstr().
+ *
+ * Returns 0 on success, a negative errno-style value (logged at debug level) on failure. */
+int tpm2_pcrlock_search_file(const char *path, FILE **ret_file, char **ret_path) {
+        static const char search[] =
+                "/run/systemd\0"
+                "/var/lib/systemd\0";
+
+        const char *fn = path ?: "pcrlock.json";
+        const char *mode = ret_file ? "re" : NULL;
+        int r;
+
+        r = search_and_fopen_nulstr(fn, mode, NULL, search, ret_file, ret_path);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to find TPM2 pcrlock policy file '%s': %m", fn);
+
+        return 0;
+}
+
+/* Locates, parses and validates the pcrlock policy file and fills in *ret_policy.
+ *
+ * Returns 1 if a policy was loaded. Returns 0 if no policy file was found, in which case *ret_policy is
+ * set to an all-empty policy. Returns a negative errno-style value on any other failure, leaving
+ * *ret_policy untouched. */
+int tpm2_pcrlock_policy_load(
+                const char *path,
+                Tpm2PCRLockPolicy *ret_policy) {
+
+        _cleanup_free_ char *found_path = NULL;
+        _cleanup_fclose_ FILE *file = NULL;
+        int r;
+
+        r = tpm2_pcrlock_search_file(path, &file, &found_path);
+        if (r == -ENOENT) {
+                /* A missing policy file is not an error, it just means there's no policy */
+                *ret_policy = (Tpm2PCRLockPolicy) {};
+                return 0;
+        }
+        if (r < 0)
+                return log_error_errno(r, "Failed to load TPM2 pcrlock policy file: %m");
+
+        _cleanup_(json_variant_unrefp) JsonVariant *root = NULL;
+        r = json_parse_file(
+                        file,
+                        found_path,
+                        /* flags = */ 0,
+                        &root,
+                        /* ret_line= */ NULL,
+                        /* ret_column= */ NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to parse existing pcrlock policy file '%s': %m", found_path);
+
+        /* All fields are mandatory. The prediction ("pcrValues") is only stored as JSON variant here and
+         * converted below, once the bank ("pcrBank") is known. */
+        JsonDispatch dispatch_table[] = {
+                { "pcrBank",    JSON_VARIANT_STRING,        json_dispatch_tpm2_algorithm, offsetof(Tpm2PCRLockPolicy, algorithm),       JSON_MANDATORY },
+                { "pcrValues",  JSON_VARIANT_ARRAY,         json_dispatch_variant,        offsetof(Tpm2PCRLockPolicy, prediction_json), JSON_MANDATORY },
+                { "nvIndex",    _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint32,         offsetof(Tpm2PCRLockPolicy, nv_index),        JSON_MANDATORY },
+                { "nvHandle",   JSON_VARIANT_STRING,        json_dispatch_unbase64_iovec, offsetof(Tpm2PCRLockPolicy, nv_handle),       JSON_MANDATORY },
+                { "nvPublic",   JSON_VARIANT_STRING,        json_dispatch_unbase64_iovec, offsetof(Tpm2PCRLockPolicy, nv_public),       JSON_MANDATORY },
+                { "srkHandle",  JSON_VARIANT_STRING,        json_dispatch_unbase64_iovec, offsetof(Tpm2PCRLockPolicy, srk_handle),      JSON_MANDATORY },
+                { "pinPublic",  JSON_VARIANT_STRING,        json_dispatch_unbase64_iovec, offsetof(Tpm2PCRLockPolicy, pin_public),      JSON_MANDATORY },
+                { "pinPrivate", JSON_VARIANT_STRING,        json_dispatch_unbase64_iovec, offsetof(Tpm2PCRLockPolicy, pin_private),     JSON_MANDATORY },
+                {}
+        };
+
+        /* Parse into a local object first, so that the caller's structure is only touched on success and
+         * partial results are cleaned up automatically on failure. */
+        _cleanup_(tpm2_pcrlock_policy_done) Tpm2PCRLockPolicy loaded = {};
+
+        r = json_dispatch(root, dispatch_table, JSON_LOG, &loaded);
+        if (r < 0)
+                return r;
+
+        r = tpm2_pcr_prediction_from_json(&loaded.prediction, loaded.algorithm, loaded.prediction_json);
+        if (r < 0)
+                return r;
+
+        *ret_policy = TAKE_STRUCT(loaded);
+        return 1;
+}
+
+/* Reads a file containing a marshalled TPM2B_PUBLIC structure and stores the unmarshalled result in
+ * *ret. The file must consist of exactly one such structure; trailing bytes are refused.
+ *
+ * Returns 0 on success, a negative errno-style value on failure (EINVAL for data that cannot be
+ * unmarshalled or that has trailing garbage). */
+int tpm2_load_public_key_file(const char *path, TPM2B_PUBLIC *ret) {
+        _cleanup_free_ char *raw = NULL;
+        TPM2B_PUBLIC parsed = {};
+        size_t raw_size, consumed = 0;
+        TSS2_RC rc;
+        int r;
+
+        assert(path);
+        assert(ret);
+
+        r = dlopen_tpm2();
+        if (r < 0)
+                return log_debug_errno(r, "TPM2 support not installed: %m");
+
+        r = read_full_file(path, &raw, &raw_size);
+        if (r < 0)
+                return log_error_errno(r, "Failed to read device key from file '%s': %m", path);
+
+        rc = sym_Tss2_MU_TPM2B_PUBLIC_Unmarshal(
+                        (uint8_t*) raw,
+                        raw_size,
+                        &consumed,
+                        &parsed);
+        if (rc != TSS2_RC_SUCCESS)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Could not unmarshal public key from file.");
+
+        /* The unmarshaller reports how much it consumed; everything beyond that is unexpected */
+        assert(consumed <= raw_size);
+        if (consumed != raw_size)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Found %zu bytes of trailing garbage in public key file.",
+                                       raw_size - consumed);
+
+        *ret = parsed;
+        return 0;
+}
+#endif
+
+/* Formats a PCR bit mask as a string of decimal PCR indexes separated by "+". An empty mask yields an
+ * empty string. Returns a newly allocated string, or NULL if formatting or allocation failed. */
+char *tpm2_pcr_mask_to_string(uint32_t mask) {
+        _cleanup_free_ char *joined = NULL;
+
+        FOREACH_PCR_IN_MASK(n, mask) {
+                if (strextendf_with_separator(&joined, "+", "%d", n) < 0)
+                        return NULL;
+        }
+
+        /* Nothing was appended, i.e. no bit was set: return an empty string rather than NULL */
+        if (!joined)
+                return strdup("");
+
+        return TAKE_PTR(joined);
+}
+
+/* Converts a PCR bit mask into a JSON array listing the indexes of all set bits, in ascending order. An
+ * empty mask yields an empty array.
+ *
+ * Returns 0 and stores the array in *ret on success, a negative errno-style value on failure. */
+int tpm2_make_pcr_json_array(uint32_t pcr_mask, JsonVariant **ret) {
+        _cleanup_(json_variant_unrefp) JsonVariant *list = NULL;
+        int r;
+
+        assert(ret);
+
+        for (size_t i = 0; i < TPM2_PCRS_MAX; i++) {
+                _cleanup_(json_variant_unrefp) JsonVariant *item = NULL;
+
+                if (!FLAGS_SET(pcr_mask, UINT32_C(1) << i))
+                        continue;
+
+                r = json_variant_new_integer(&item, i);
+                if (r < 0)
+                        return r;
+
+                r = json_variant_append_array(&list, item);
+                if (r < 0)
+                        return r;
+        }
+
+        if (list) {
+                *ret = TAKE_PTR(list);
+                return 0;
+        }
+
+        /* No bit was set, hence nothing was appended: generate an explicit empty array */
+        return json_variant_new_array(ret, NULL, 0);
+}
+
+/* Converts a JSON array of PCR indexes into a bit mask; the inverse of tpm2_make_pcr_json_array().
+ * Every element must be an unsigned integer below TPM2_PCRS_MAX. 'ret' may be NULL if the caller only
+ * wants to validate the array.
+ *
+ * Returns 0 on success, EINVAL (as negative errno-style value, logged at debug level) otherwise. */
+int tpm2_parse_pcr_json_array(JsonVariant *v, uint32_t *ret) {
+        uint32_t collected = 0;
+        JsonVariant *entry;
+
+        if (!json_variant_is_array(v))
+                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "TPM2 PCR array is not a JSON array.");
+
+        JSON_VARIANT_ARRAY_FOREACH(entry, v) {
+                if (!json_variant_is_unsigned(entry))
+                        return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "TPM2 PCR is not an unsigned integer.");
+
+                uint64_t index = json_variant_unsigned(entry);
+                if (index >= TPM2_PCRS_MAX)
+                        return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "TPM2 PCR number out of range: %" PRIu64, index);
+
+                collected |= UINT32_C(1) << index;
+        }
+
+        if (ret)
+                *ret = collected;
+
+        return 0;
+}
+
+/* Assembles the "systemd-tpm2" LUKS2 token JSON object from the supplied enrollment data.
+ *
+ * Always emitted: "type", "keyslots" (the keyslot formatted as a decimal string), "tpm2-blob" (base64),
+ * "tpm2-pcrs" (array built from hash_pcr_mask), "tpm2-policy-hash" (hex), "tpm2-pin" and "tpm2_pcrlock"
+ * (booleans taken from 'flags').
+ *
+ * Conditionally emitted: "tpm2-pcr-bank" / "tpm2-primary-alg" (only if the respective *_to_string() helper
+ * returns a name), "tpm2_pubkey_pcrs" / "tpm2_pubkey" (only if pubkey_pcr_mask is non-zero), "tpm2_salt" and
+ * "tpm2_srk" (only if the respective pointer is non-NULL).
+ *
+ * 'ret' may be NULL. Returns the keyslot number on success, a negative value on failure. */
+int tpm2_make_luks2_json(
+ int keyslot,
+ uint32_t hash_pcr_mask,
+ uint16_t pcr_bank,
+ const void *pubkey,
+ size_t pubkey_size,
+ uint32_t pubkey_pcr_mask,
+ uint16_t primary_alg,
+ const void *blob,
+ size_t blob_size,
+ const void *policy_hash,
+ size_t policy_hash_size,
+ const void *salt,
+ size_t salt_size,
+ const void *srk_buf,
+ size_t srk_buf_size,
+ TPM2Flags flags,
+ JsonVariant **ret) {
+
+ _cleanup_(json_variant_unrefp) JsonVariant *v = NULL, *hmj = NULL, *pkmj = NULL;
+ _cleanup_free_ char *keyslot_as_string = NULL;
+ int r;
+
+ assert(blob || blob_size == 0);
+ assert(policy_hash || policy_hash_size == 0);
+ assert(pubkey || pubkey_size == 0);
+
+ /* The "keyslots" array carries the keyslot index as a string, not as a number. */
+ if (asprintf(&keyslot_as_string, "%i", keyslot) < 0)
+ return -ENOMEM;
+
+ r = tpm2_make_pcr_json_array(hash_pcr_mask, &hmj);
+ if (r < 0)
+ return r;
+
+ /* The public key PCR array is only needed if a public key policy is in use at all. */
+ if (pubkey_pcr_mask != 0) {
+ r = tpm2_make_pcr_json_array(pubkey_pcr_mask, &pkmj);
+ if (r < 0)
+ return r;
+ }
+
+ /* Note: We made the mistake of using "-" in the field names, which isn't particularly compatible with
+ * other programming languages. Let's not make things worse though, i.e. future additions to the JSON
+ * object should use "_" rather than "-" in field names. */
+
+ r = json_build(&v,
+ JSON_BUILD_OBJECT(
+ JSON_BUILD_PAIR("type", JSON_BUILD_CONST_STRING("systemd-tpm2")),
+ JSON_BUILD_PAIR("keyslots", JSON_BUILD_ARRAY(JSON_BUILD_STRING(keyslot_as_string))),
+ JSON_BUILD_PAIR("tpm2-blob", JSON_BUILD_BASE64(blob, blob_size)),
+ JSON_BUILD_PAIR("tpm2-pcrs", JSON_BUILD_VARIANT(hmj)),
+ JSON_BUILD_PAIR_CONDITION(!!tpm2_hash_alg_to_string(pcr_bank), "tpm2-pcr-bank", JSON_BUILD_STRING(tpm2_hash_alg_to_string(pcr_bank))),
+ JSON_BUILD_PAIR_CONDITION(!!tpm2_asym_alg_to_string(primary_alg), "tpm2-primary-alg", JSON_BUILD_STRING(tpm2_asym_alg_to_string(primary_alg))),
+ JSON_BUILD_PAIR("tpm2-policy-hash", JSON_BUILD_HEX(policy_hash, policy_hash_size)),
+ JSON_BUILD_PAIR("tpm2-pin", JSON_BUILD_BOOLEAN(flags & TPM2_FLAGS_USE_PIN)),
+ JSON_BUILD_PAIR("tpm2_pcrlock", JSON_BUILD_BOOLEAN(flags & TPM2_FLAGS_USE_PCRLOCK)),
+ JSON_BUILD_PAIR_CONDITION(pubkey_pcr_mask != 0, "tpm2_pubkey_pcrs", JSON_BUILD_VARIANT(pkmj)),
+ JSON_BUILD_PAIR_CONDITION(pubkey_pcr_mask != 0, "tpm2_pubkey", JSON_BUILD_BASE64(pubkey, pubkey_size)),
+ JSON_BUILD_PAIR_CONDITION(salt, "tpm2_salt", JSON_BUILD_BASE64(salt, salt_size)),
+ JSON_BUILD_PAIR_CONDITION(srk_buf, "tpm2_srk", JSON_BUILD_BASE64(srk_buf, srk_buf_size))));
+ if (r < 0)
+ return r;
+
+ if (ret)
+ *ret = TAKE_PTR(v);
+
+ /* On success the keyslot number (not 0) is handed back to the caller. */
+ return keyslot;
+}
+
+/* Parses a "systemd-tpm2" LUKS2 token JSON object, i.e. the inverse of tpm2_make_luks2_json().
+ *
+ * Mandatory fields: "tpm2-pcrs", "tpm2-blob" (base64) and "tpm2-policy-hash" (hex). All other fields are
+ * optional and fall back to defaults when absent: PCR bank UINT16_MAX ("pick automatically"), primary
+ * algorithm TPM2_ALG_ECC, no flags, no salt, no public key, no SRK.
+ *
+ * Every ret_* parameter may be NULL if the caller is not interested in the value. The output parameters are
+ * only written once all fields parsed successfully; on failure none of them is touched. Buffers returned
+ * via ret_pubkey, ret_blob, ret_policy_hash, ret_salt and ret_srk_buf are owned by the caller afterwards.
+ *
+ * Returns 0 on success, -EUCLEAN if the keyslot could not be extracted (only checked if ret_keyslot is
+ * non-NULL), and another negative errno-style value for malformed fields. */
+int tpm2_parse_luks2_json(
+                JsonVariant *v,
+                int *ret_keyslot,
+                uint32_t *ret_hash_pcr_mask,
+                uint16_t *ret_pcr_bank,
+                void **ret_pubkey,
+                size_t *ret_pubkey_size,
+                uint32_t *ret_pubkey_pcr_mask,
+                uint16_t *ret_primary_alg,
+                void **ret_blob,
+                size_t *ret_blob_size,
+                void **ret_policy_hash,
+                size_t *ret_policy_hash_size,
+                void **ret_salt,
+                size_t *ret_salt_size,
+                void **ret_srk_buf,
+                size_t *ret_srk_buf_size,
+                TPM2Flags *ret_flags) {
+
+        _cleanup_free_ void *blob = NULL, *policy_hash = NULL, *pubkey = NULL, *salt = NULL, *srk_buf = NULL;
+        size_t blob_size = 0, policy_hash_size = 0, pubkey_size = 0, salt_size = 0, srk_buf_size = 0;
+        uint32_t hash_pcr_mask = 0, pubkey_pcr_mask = 0;
+        uint16_t primary_alg = TPM2_ALG_ECC; /* ECC was the only supported algorithm in systemd < 250, use that as implied default, for compatibility */
+        uint16_t pcr_bank = UINT16_MAX; /* default: pick automatically */
+        int r, keyslot = -1;
+        TPM2Flags flags = 0;
+        JsonVariant *w;
+
+        assert(v);
+
+        if (ret_keyslot) {
+                keyslot = cryptsetup_get_keyslot_from_token(v);
+                if (keyslot < 0) {
+                        /* Return a recognizable error when parsing this field, so that callers can handle parsing
+                         * errors of the keyslots field gracefully, since it's not 'owned' by us, but by the LUKS2
+                         * spec */
+                        log_debug_errno(keyslot, "Failed to extract keyslot index from TPM2 JSON data token, skipping: %m");
+                        return -EUCLEAN;
+                }
+        }
+
+        w = json_variant_by_key(v, "tpm2-pcrs");
+        if (!w)
+                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "TPM2 token data lacks 'tpm2-pcrs' field.");
+
+        r = tpm2_parse_pcr_json_array(w, &hash_pcr_mask);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to parse TPM2 PCR mask: %m");
+
+        /* The bank field is optional, since it was added in systemd 250 only. Before the bank was hardcoded
+         * to SHA256. */
+        w = json_variant_by_key(v, "tpm2-pcr-bank");
+        if (w) {
+                if (!json_variant_is_string(w))
+                        return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "TPM2 PCR bank is not a string.");
+
+                r = tpm2_hash_alg_from_string(json_variant_string(w));
+                if (r < 0)
+                        return log_debug_errno(r, "TPM2 PCR bank invalid or not supported: %s", json_variant_string(w));
+
+                pcr_bank = r;
+        }
+
+        /* The primary key algorithm field is optional, since it was also added in systemd 250 only. Before
+         * the algorithm was hardcoded to ECC. */
+        w = json_variant_by_key(v, "tpm2-primary-alg");
+        if (w) {
+                if (!json_variant_is_string(w))
+                        return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "TPM2 primary key algorithm is not a string.");
+
+                r = tpm2_asym_alg_from_string(json_variant_string(w));
+                if (r < 0)
+                        return log_debug_errno(r, "TPM2 asymmetric algorithm invalid or not supported: %s", json_variant_string(w));
+
+                primary_alg = r;
+        }
+
+        w = json_variant_by_key(v, "tpm2-blob");
+        if (!w)
+                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "TPM2 token data lacks 'tpm2-blob' field.");
+
+        r = json_variant_unbase64(w, &blob, &blob_size);
+        if (r < 0)
+                return log_debug_errno(r, "Invalid base64 data in 'tpm2-blob' field.");
+
+        w = json_variant_by_key(v, "tpm2-policy-hash");
+        if (!w)
+                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "TPM2 token data lacks 'tpm2-policy-hash' field.");
+
+        /* Unlike the other binary fields the policy hash is stored hex-encoded, not base64-encoded. */
+        r = json_variant_unhex(w, &policy_hash, &policy_hash_size);
+        if (r < 0)
+                return log_debug_errno(r, "Invalid hex data in 'tpm2-policy-hash' field.");
+
+        w = json_variant_by_key(v, "tpm2-pin");
+        if (w) {
+                if (!json_variant_is_boolean(w))
+                        return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "TPM2 PIN policy is not a boolean.");
+
+                SET_FLAG(flags, TPM2_FLAGS_USE_PIN, json_variant_boolean(w));
+        }
+
+        w = json_variant_by_key(v, "tpm2_pcrlock");
+        if (w) {
+                if (!json_variant_is_boolean(w))
+                        return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "TPM2 pcrlock policy is not a boolean.");
+
+                SET_FLAG(flags, TPM2_FLAGS_USE_PCRLOCK, json_variant_boolean(w));
+        }
+
+        w = json_variant_by_key(v, "tpm2_salt");
+        if (w) {
+                r = json_variant_unbase64(w, &salt, &salt_size);
+                if (r < 0)
+                        return log_debug_errno(r, "Invalid base64 data in 'tpm2_salt' field.");
+        }
+
+        w = json_variant_by_key(v, "tpm2_pubkey_pcrs");
+        if (w) {
+                /* tpm2_parse_pcr_json_array() already logs the reason for a failure at debug level. */
+                r = tpm2_parse_pcr_json_array(w, &pubkey_pcr_mask);
+                if (r < 0)
+                        return r;
+        }
+
+        w = json_variant_by_key(v, "tpm2_pubkey");
+        if (w) {
+                r = json_variant_unbase64(w, &pubkey, &pubkey_size);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to decode PCR public key.");
+        } else if (pubkey_pcr_mask != 0)
+                /* A public key PCR policy cannot be evaluated without the public key itself. */
+                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Public key PCR mask set, but not public key included in JSON data, refusing.");
+
+        w = json_variant_by_key(v, "tpm2_srk");
+        if (w) {
+                r = json_variant_unbase64(w, &srk_buf, &srk_buf_size);
+                if (r < 0)
+                        return log_debug_errno(r, "Invalid base64 data in 'tpm2_srk' field.");
+        }
+
+        /* Everything parsed fine. Only now pass the results (and buffer ownership) to the caller, so that
+         * no output parameter is modified on failure. */
+        if (ret_keyslot)
+                *ret_keyslot = keyslot;
+        if (ret_hash_pcr_mask)
+                *ret_hash_pcr_mask = hash_pcr_mask;
+        if (ret_pcr_bank)
+                *ret_pcr_bank = pcr_bank;
+        if (ret_pubkey)
+                *ret_pubkey = TAKE_PTR(pubkey);
+        if (ret_pubkey_size)
+                *ret_pubkey_size = pubkey_size;
+        if (ret_pubkey_pcr_mask)
+                *ret_pubkey_pcr_mask = pubkey_pcr_mask;
+        if (ret_primary_alg)
+                *ret_primary_alg = primary_alg;
+        if (ret_blob)
+                *ret_blob = TAKE_PTR(blob);
+        if (ret_blob_size)
+                *ret_blob_size = blob_size;
+        if (ret_policy_hash)
+                *ret_policy_hash = TAKE_PTR(policy_hash);
+        if (ret_policy_hash_size)
+                *ret_policy_hash_size = policy_hash_size;
+        if (ret_salt)
+                *ret_salt = TAKE_PTR(salt);
+        if (ret_salt_size)
+                *ret_salt_size = salt_size;
+        if (ret_flags)
+                *ret_flags = flags;
+        if (ret_srk_buf)
+                *ret_srk_buf = TAKE_PTR(srk_buf);
+        if (ret_srk_buf_size)
+                *ret_srk_buf_size = srk_buf_size;
+
+        return 0;
+}
+
+/* Returns the digest size in bytes for the given TPM2 hash algorithm id, or -EINVAL (logged at debug level)
+ * if the id is not one of SHA1, SHA256, SHA384 or SHA512. */
+int tpm2_hash_alg_to_size(uint16_t alg) {
+        if (alg == TPM2_ALG_SHA1)
+                return 20;
+        if (alg == TPM2_ALG_SHA256)
+                return 32;
+        if (alg == TPM2_ALG_SHA384)
+                return 48;
+        if (alg == TPM2_ALG_SHA512)
+                return 64;
+
+        return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown hash algorithm id 0x%" PRIx16, alg);
+}
+
+/* Maps a TPM2 hash algorithm id to its lower-case name ("sha1", "sha256", "sha384", "sha512"). Returns NULL
+ * (after a debug log message) for any other id. The returned string is a constant and must not be freed. */
+const char *tpm2_hash_alg_to_string(uint16_t alg) {
+        static const struct {
+                uint16_t id;
+                const char *name;
+        } known[] = {
+                { TPM2_ALG_SHA1,   "sha1"   },
+                { TPM2_ALG_SHA256, "sha256" },
+                { TPM2_ALG_SHA384, "sha384" },
+                { TPM2_ALG_SHA512, "sha512" },
+        };
+
+        for (size_t i = 0; i < ELEMENTSOF(known); i++)
+                if (known[i].id == alg)
+                        return known[i].name;
+
+        log_debug("Unknown hash algorithm id 0x%" PRIx16, alg);
+        return NULL;
+}
+
+/* Maps a hash algorithm name to its TPM2 algorithm id; the comparison is done with strcaseeq_ptr().
+ * Returns the (non-negative) id on success, -EINVAL (logged at debug level) for unrecognized names. */
+int tpm2_hash_alg_from_string(const char *alg) {
+        static const struct {
+                const char *name;
+                uint16_t id;
+        } known[] = {
+                { "sha1",   TPM2_ALG_SHA1   },
+                { "sha256", TPM2_ALG_SHA256 },
+                { "sha384", TPM2_ALG_SHA384 },
+                { "sha512", TPM2_ALG_SHA512 },
+        };
+
+        for (size_t i = 0; i < ELEMENTSOF(known); i++)
+                if (strcaseeq_ptr(alg, known[i].name))
+                        return known[i].id;
+
+        return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown hash algorithm name '%s'", alg);
+}
+
+/* Maps a TPM2 asymmetric algorithm id to "ecc" or "rsa". Returns NULL (after a debug log message) for any
+ * other id. The returned string is a constant and must not be freed. */
+const char *tpm2_asym_alg_to_string(uint16_t alg) {
+        if (alg == TPM2_ALG_ECC)
+                return "ecc";
+        if (alg == TPM2_ALG_RSA)
+                return "rsa";
+
+        log_debug("Unknown asymmetric algorithm id 0x%" PRIx16, alg);
+        return NULL;
+}
+
+/* Maps an asymmetric algorithm name ("ecc" or "rsa", compared with strcaseeq_ptr()) to its TPM2 algorithm
+ * id. Returns the (non-negative) id on success, -EINVAL (logged at debug level) otherwise. */
+int tpm2_asym_alg_from_string(const char *alg) {
+        static const struct {
+                const char *name;
+                uint16_t id;
+        } known[] = {
+                { "ecc", TPM2_ALG_ECC },
+                { "rsa", TPM2_ALG_RSA },
+        };
+
+        for (size_t i = 0; i < ELEMENTSOF(known); i++)
+                if (strcaseeq_ptr(alg, known[i].name))
+                        return known[i].id;
+
+        return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown asymmetric algorithm name '%s'", alg);
+}
+
+/* Maps a TPM2 symmetric algorithm id to its name. Only AES is known, and only if built with HAVE_TPM2;
+ * everything else yields NULL after a debug log message. */
+const char *tpm2_sym_alg_to_string(uint16_t alg) {
+#if HAVE_TPM2
+        if (alg == TPM2_ALG_AES)
+                return "aes";
+#endif
+
+        log_debug("Unknown symmetric algorithm id 0x%" PRIx16, alg);
+        return NULL;
+}
+
+/* Maps a symmetric algorithm name to its TPM2 algorithm id. Only "aes" (compared with strcaseeq_ptr()) is
+ * recognized, and only if built with HAVE_TPM2; otherwise -EINVAL is returned and logged at debug level. */
+int tpm2_sym_alg_from_string(const char *alg) {
+#if HAVE_TPM2
+        bool is_aes = strcaseeq_ptr(alg, "aes");
+
+        if (is_aes)
+                return TPM2_ALG_AES;
+#endif
+
+        return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown symmetric algorithm name '%s'", alg);
+}
+
+/* Maps a TPM2 symmetric cipher mode id to its name ("ctr", "ofb", "cbc", "cfb", "ecb"). The modes are only
+ * known if built with HAVE_TPM2; unknown ids yield NULL after a debug log message. */
+const char *tpm2_sym_mode_to_string(uint16_t mode) {
+#if HAVE_TPM2
+        static const struct {
+                uint16_t id;
+                const char *name;
+        } known[] = {
+                { TPM2_ALG_CTR, "ctr" },
+                { TPM2_ALG_OFB, "ofb" },
+                { TPM2_ALG_CBC, "cbc" },
+                { TPM2_ALG_CFB, "cfb" },
+                { TPM2_ALG_ECB, "ecb" },
+        };
+
+        for (size_t i = 0; i < ELEMENTSOF(known); i++)
+                if (known[i].id == mode)
+                        return known[i].name;
+#endif
+
+        log_debug("Unknown symmetric mode id 0x%" PRIx16, mode);
+        return NULL;
+}
+
+/* Maps a symmetric cipher mode name (compared with strcaseeq_ptr()) to its TPM2 id. The names are only
+ * recognized if built with HAVE_TPM2; otherwise, and for unknown names, -EINVAL is returned and logged at
+ * debug level. */
+int tpm2_sym_mode_from_string(const char *mode) {
+#if HAVE_TPM2
+        static const struct {
+                const char *name;
+                uint16_t id;
+        } known[] = {
+                { "ctr", TPM2_ALG_CTR },
+                { "ofb", TPM2_ALG_OFB },
+                { "cbc", TPM2_ALG_CBC },
+                { "cfb", TPM2_ALG_CFB },
+                { "ecb", TPM2_ALG_ECB },
+        };
+
+        for (size_t i = 0; i < ELEMENTSOF(known); i++)
+                if (strcaseeq_ptr(mode, known[i].name))
+                        return known[i].id;
+#endif
+
+        return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown symmetric mode name '%s'", mode);
+}
+
+/* Determines which layers of TPM2 support are present and returns them as a combination of TPM2_SUPPORT_*
+ * flags: kernel subsystem and driver (via /sys/class/tpmrm, skipped if a container is detected), firmware
+ * (via efi_has_tpm2()), and, if built with HAVE_TPM2, systemd itself plus loadable TPM2 libraries. */
+Tpm2Support tpm2_support(void) {
+        Tpm2Support found = TPM2_SUPPORT_NONE;
+        int r;
+
+        if (detect_container() <= 0) {
+                /* Check if there's a /dev/tpmrm* device via sysfs. If we run in a container we likely just
+                 * got the host sysfs mounted. Since devices are generally not virtualized for containers,
+                 * let's assume containers never have a TPM, at least for now. */
+
+                r = dir_is_empty("/sys/class/tpmrm", /* ignore_hidden_or_backup= */ false);
+                if (r >= 0) {
+                        /* The directory exists, hence the subsystem is enabled. If it is empty no driver
+                         * has been loaded yet; if it is populated (r == 0) a driver is there too. */
+                        found |= TPM2_SUPPORT_SUBSYSTEM;
+
+                        if (r == 0)
+                                found |= TPM2_SUPPORT_DRIVER;
+                } else if (r != -ENOENT)
+                        log_debug_errno(r, "Unable to test whether /sys/class/tpmrm/ exists and is populated, assuming it is not: %m");
+        }
+
+        if (efi_has_tpm2())
+                found |= TPM2_SUPPORT_FIRMWARE;
+
+#if HAVE_TPM2
+        found |= TPM2_SUPPORT_SYSTEM;
+
+        if (dlopen_tpm2() >= 0)
+                found |= TPM2_SUPPORT_LIBRARIES;
+#endif
+
+        return found;
+}
+
+#if HAVE_TPM2
+/* Fills in unset (zero) hash algorithms in the array: the hash of the first entry that has one set is used
+ * as default for all entries that have none. If no entry has a hash set the array is left untouched. */
+static void tpm2_pcr_values_apply_default_hash_alg(Tpm2PCRValue *pcr_values, size_t n_pcr_values) {
+        TPMI_ALG_HASH fallback = 0;
+
+        /* Pass 1: pick up the first explicitly specified hash algorithm. */
+        FOREACH_ARRAY(candidate, pcr_values, n_pcr_values) {
+                if (candidate->hash == 0)
+                        continue;
+
+                fallback = candidate->hash;
+                break;
+        }
+
+        if (fallback == 0)
+                return;
+
+        /* Pass 2: apply it to every entry that did not specify one. */
+        FOREACH_ARRAY(entry, pcr_values, n_pcr_values) {
+                if (entry->hash == 0)
+                        entry->hash = fallback;
+        }
+}
+#endif
+
+/* The following tpm2_parse_pcr_argument*() functions all log errors, to match the behavior of system-wide
+ * parse_*_argument() functions. */
+
+/* Parse the PCR selection/value arg(s) and return a corresponding array of Tpm2PCRValue objects.
+ *
+ * The format is the same as tpm2_pcr_values_from_string(). The first provided entry with a hash algorithm
+ * set will be used as the 'default' hash algorithm. All entries with an unset hash algorithm will be updated
+ * with the 'default' hash algorithm. The resulting array will be sorted and checked for validity.
+ *
+ * This will replace *ret_pcr_values with the new array of pcr values; to append to an existing array, use
+ * tpm2_parse_pcr_argument_append(). */
+int tpm2_parse_pcr_argument(const char *arg, Tpm2PCRValue **ret_pcr_values, size_t *ret_n_pcr_values) {
+#if HAVE_TPM2
+        _cleanup_free_ Tpm2PCRValue *parsed = NULL;
+        size_t n_parsed = 0;
+        int r;
+
+        assert(arg);
+        assert(ret_pcr_values);
+        assert(ret_n_pcr_values);
+
+        r = tpm2_pcr_values_from_string(arg, &parsed, &n_parsed);
+        if (r < 0)
+                return log_error_errno(r, "Could not parse PCR values from '%s': %m", arg);
+
+        /* Normalize: propagate the default hash algorithm, then sort, and only then validate. */
+        tpm2_pcr_values_apply_default_hash_alg(parsed, n_parsed);
+        tpm2_sort_pcr_values(parsed, n_parsed);
+
+        if (!tpm2_pcr_values_valid(parsed, n_parsed))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Parsed PCR values are not valid.");
+
+        /* Success: hand ownership of the array to the caller. */
+        *ret_n_pcr_values = n_parsed;
+        *ret_pcr_values = TAKE_PTR(parsed);
+
+        return 0;
+#else
+        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "TPM2 support is disabled.");
+#endif
+}
+
+/* Same as tpm2_parse_pcr_argument(), but the pcr values array is appended to. If the provided pcr values
+ * array is not NULL, it must point to an allocated pcr values array and the provided number of pcr values
+ * must be correct.
+ *
+ * Note that 'arg' is parsed into a new array of pcr values independently of any previous pcr values,
+ * including application of the default hash algorithm. Then the two arrays are combined, the default hash
+ * algorithm check applied again (in case either the previous or current array had no default hash
+ * algorithm), and then the resulting array is sorted and rechecked for validity. */
+int tpm2_parse_pcr_argument_append(const char *arg, Tpm2PCRValue **pcr_values, size_t *n_pcr_values) {
+#if HAVE_TPM2
+        _cleanup_free_ Tpm2PCRValue *combined = NULL;
+        size_t n_combined;
+        int r;
+
+        assert(arg);
+        assert(pcr_values);
+        assert(n_pcr_values);
+
+        /* Parse the new argument on its own first; errors are logged by the callee. */
+        r = tpm2_parse_pcr_argument(arg, &combined, &n_combined);
+        if (r < 0)
+                return r;
+
+        /* If we got previous values, append them. */
+        if (*pcr_values) {
+                if (!GREEDY_REALLOC_APPEND(combined, n_combined, *pcr_values, *n_pcr_values))
+                        return log_oom();
+        }
+
+        /* Re-normalize the merged array, since either half may have lacked a default hash algorithm. */
+        tpm2_pcr_values_apply_default_hash_alg(combined, n_combined);
+        tpm2_sort_pcr_values(combined, n_combined);
+
+        if (!tpm2_pcr_values_valid(combined, n_combined))
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Parsed PCR values are not valid.");
+
+        /* Swap the merged array in; the previous array ends up in 'combined' and is freed on return. */
+        *n_pcr_values = n_combined;
+        SWAP_TWO(*pcr_values, combined);
+
+        return 0;
+#else
+        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "TPM2 support is disabled.");
+#endif
+}
+
+/* Same as tpm2_parse_pcr_argument() but converts the pcr values to a pcr mask. If more than one hash
+ * algorithm is included in the pcr values array this results in error. This retains the previous behavior of
+ * tpm2_parse_pcr_argument() of clearing the mask if 'arg' is empty, replacing the mask if it is set to
+ * UINT32_MAX, and or-ing the mask otherwise. */
+int tpm2_parse_pcr_argument_to_mask(const char *arg, uint32_t *ret_mask) {
+#if HAVE_TPM2
+        _cleanup_free_ Tpm2PCRValue *parsed = NULL;
+        size_t n_parsed, n_banks;
+        uint32_t parsed_mask;
+        int r;
+
+        assert(arg);
+        assert(ret_mask);
+
+        r = tpm2_parse_pcr_argument(arg, &parsed, &n_parsed);
+        if (r < 0)
+                return r;
+
+        /* This retains the previous behavior of clearing the mask if the arg is empty */
+        if (n_parsed == 0) {
+                *ret_mask = 0;
+                return 0;
+        }
+
+        /* A plain mask cannot express more than one hash bank, hence refuse that. */
+        r = tpm2_pcr_values_hash_count(parsed, n_parsed, &n_banks);
+        if (r < 0)
+                return log_error_errno(r, "Could not get hash count from pcr values: %m");
+        if (n_banks > 1)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Multiple PCR hash banks selected.");
+
+        r = tpm2_pcr_values_to_mask(parsed, n_parsed, parsed[0].hash, &parsed_mask);
+        if (r < 0)
+                return log_error_errno(r, "Could not get pcr values mask: %m");
+
+        /* UINT32_MAX marks a mask that has not been set yet: replace it. Otherwise merge into it. */
+        *ret_mask = *ret_mask == UINT32_MAX ? parsed_mask : (*ret_mask | parsed_mask);
+
+        return 0;
+#else
+        return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "TPM2 support is disabled.");
+#endif
+}
+
+/* Locates and parses a JSON PCR signature file, returning the parsed object in *ret. Returns 0 on success
+ * and a negative errno-style value on failure (logged at debug level). */
+int tpm2_load_pcr_signature(const char *path, JsonVariant **ret) {
+        _cleanup_strv_free_ char **dirs = NULL;
+        _cleanup_free_ char *resolved = NULL;
+        _cleanup_fclose_ FILE *file = NULL;
+        int r;
+
+        /* Tries to load a JSON PCR signature file. Takes an absolute path, a simple file name or NULL. In
+         * the latter two cases searches in /etc/, /usr/lib/, /run/, as usual. */
+
+        dirs = strv_split_nulstr(CONF_PATHS_NULSTR("systemd"));
+        if (!dirs)
+                return log_oom_debug();
+
+        if (!path) {
+                /* If no path is specified, then look for "tpm2-pcr-signature.json" automatically. Also, in
+                 * this case include /.extra/ in the search path, but only in this case, and if we run in the
+                 * initrd. We don't want to be too eager here, after all /.extra/ is untrusted territory. */
+
+                path = "tpm2-pcr-signature.json";
+
+                if (in_initrd() && strv_extend(&dirs, "/.extra") < 0)
+                        return log_oom_debug();
+        }
+
+        r = search_and_fopen(path, "re", NULL, (const char**) dirs, &file, &resolved);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to find TPM PCR signature file '%s': %m", path);
+
+        r = json_parse_file(file, resolved, 0, ret, NULL, NULL);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to parse TPM PCR signature JSON object '%s': %m", resolved);
+
+        return 0;
+}
+
+/* Locates a PCR public key file and reads its full contents into *ret_pubkey / *ret_pubkey_size. Returns 0
+ * on success and a negative errno-style value on failure (logged at debug level). */
+int tpm2_load_pcr_public_key(const char *path, void **ret_pubkey, size_t *ret_pubkey_size) {
+        _cleanup_free_ char *resolved = NULL;
+        _cleanup_fclose_ FILE *file = NULL;
+        const char *name;
+        int r;
+
+        /* Tries to load a PCR public key file. Takes an absolute path, a simple file name or NULL. In the
+         * latter two cases searches in /etc/, /usr/lib/, /run/, as usual. */
+
+        name = path ?: "tpm2-pcr-public-key.pem";
+
+        r = search_and_fopen(name, "re", NULL, (const char**) CONF_PATHS_STRV("systemd"), &file, &resolved);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to find TPM PCR public key file '%s': %m", name);
+
+        r = read_full_stream(file, (char**) ret_pubkey, ret_pubkey_size);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to load TPM PCR public key PEM file '%s': %m", resolved);
+
+        return 0;
+}
+
+#define PBKDF2_HMAC_SHA256_ITERATIONS 10000
+
+/*
+ * Implements PBKDF2 HMAC SHA256 for a derived keylen of 32
+ * bytes and for PBKDF2_HMAC_SHA256_ITERATIONS count.
+ * I found the wikipedia entry relevant and it contains links to
+ * relevant RFCs:
+ * - https://en.wikipedia.org/wiki/PBKDF2
+ * - https://www.rfc-editor.org/rfc/rfc2898#section-5.2
+ */
+int tpm2_util_pbkdf2_hmac_sha256(const void *pass,
+                    size_t passlen,
+                    const void *salt,
+                    size_t saltlen,
+                    uint8_t ret_key[static SHA256_DIGEST_SIZE]) {
+
+        /* Derives a SHA256_DIGEST_SIZE byte key from 'pass' and 'salt' and stores it in ret_key. Both
+         * 'pass' and 'salt' must be non-NULL and non-empty. Returns 0 on success, -ENOMEM if the temporary
+         * buffer cannot be allocated. */
+
+        _cleanup_(erase_and_freep) uint8_t *buffer = NULL;
+        uint8_t u[SHA256_DIGEST_SIZE];
+
+        /* 'u' carries the intermediate HMAC outputs the key is derived from. Make sure it is wiped when we
+         * leave this function, just like the heap buffer above. */
+        CLEANUP_ERASE(u);
+
+        /* To keep this simple: since the derived key length (dkLen in the docs) is the same as the hash
+         * output size, we don't need multiple blocks. Part of the algorithm is to add the block count in,
+         * but this can be hardcoded to 1. */
+        static const uint8_t block_cnt[] = { 0, 0, 0, 1 };
+
+        assert(pass);
+        assert(passlen > 0);
+        assert(salt);
+        assert(saltlen > 0);
+        assert(saltlen <= (SIZE_MAX - sizeof(block_cnt))); /* the allocation size below must not overflow */
+        assert(ret_key);
+
+        /* Build a buffer of salt + block_cnt and run hmac_sha256() over it. We do this since we don't have
+         * a context builder for HMAC_SHA256. */
+        buffer = malloc(saltlen + sizeof(block_cnt));
+        if (!buffer)
+                return -ENOMEM;
+
+        memcpy(buffer, salt, saltlen);
+        memcpy(&buffer[saltlen], block_cnt, sizeof(block_cnt));
+
+        hmac_sha256(pass, passlen, buffer, saltlen + sizeof(block_cnt), u);
+
+        /* dk needs to be an unmodified u as u gets modified in the loop */
+        memcpy(ret_key, u, SHA256_DIGEST_SIZE);
+        uint8_t *dk = ret_key;
+
+        /* The first iteration was done above, hence start counting at 1. */
+        for (size_t i = 1; i < PBKDF2_HMAC_SHA256_ITERATIONS; i++) {
+                hmac_sha256(pass, passlen, u, sizeof(u), u);
+
+                for (size_t j = 0; j < sizeof(u); j++)
+                        dk[j] ^= u[j];
+        }
+
+        return 0;
+}
+
+/* Symbolic names for the PCR indexes that have a defined purpose, indexed by the TPM2_PCR_* constants. The
+ * two macro invocations below the table generate the string lookup helpers for it. */
+static const char* const tpm2_pcr_index_table[_TPM2_PCR_INDEX_MAX_DEFINED] = {
+ [TPM2_PCR_PLATFORM_CODE] = "platform-code",
+ [TPM2_PCR_PLATFORM_CONFIG] = "platform-config",
+ [TPM2_PCR_EXTERNAL_CODE] = "external-code",
+ [TPM2_PCR_EXTERNAL_CONFIG] = "external-config",
+ [TPM2_PCR_BOOT_LOADER_CODE] = "boot-loader-code",
+ [TPM2_PCR_BOOT_LOADER_CONFIG] = "boot-loader-config",
+ [TPM2_PCR_HOST_PLATFORM] = "host-platform",
+ [TPM2_PCR_SECURE_BOOT_POLICY] = "secure-boot-policy",
+ [TPM2_PCR_KERNEL_INITRD] = "kernel-initrd",
+ [TPM2_PCR_IMA] = "ima",
+ [TPM2_PCR_KERNEL_BOOT] = "kernel-boot",
+ [TPM2_PCR_KERNEL_CONFIG] = "kernel-config",
+ [TPM2_PCR_SYSEXTS] = "sysexts",
+ [TPM2_PCR_SHIM_POLICY] = "shim-policy",
+ [TPM2_PCR_SYSTEM_IDENTITY] = "system-identity",
+ [TPM2_PCR_DEBUG] = "debug",
+ [TPM2_PCR_APPLICATION_SUPPORT] = "application-support",
+};
+
+/* NOTE(review): the fallback variant presumably also accepts plain numeric strings up to the given maximum
+ * (TPM2_PCRS_MAX - 1) — confirm against the macro definition. */
+DEFINE_STRING_TABLE_LOOKUP_FROM_STRING_WITH_FALLBACK(tpm2_pcr_index, int, TPM2_PCRS_MAX - 1);
+DEFINE_STRING_TABLE_LOOKUP_TO_STRING(tpm2_pcr_index, int);
diff --git a/src/shared/tpm2-util.h b/src/shared/tpm2-util.h
new file mode 100644
index 0000000..55d7481
--- /dev/null
+++ b/src/shared/tpm2-util.h
@@ -0,0 +1,478 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "bitfield.h"
+#include "io-util.h"
+#include "json.h"
+#include "macro.h"
+#include "openssl-util.h"
+#include "ordered-set.h"
+#include "sha256.h"
+#include "tpm2-pcr.h"
+
+/* Bit flags (each value is a distinct bit, hence they may be OR-ed together). Passed as 'flags' to
+ * tpm2_make_luks2_json() and handed back through 'ret_flags' by tpm2_parse_luks2_json(), both declared
+ * further down in this header. */
+typedef enum TPM2Flags {
+ TPM2_FLAGS_USE_PIN = 1 << 0,
+ TPM2_FLAGS_USE_PCRLOCK = 1 << 1,
+} TPM2Flags;
+
+/* As per https://trustedcomputinggroup.org/wp-content/uploads/TCG_PCClient_PFP_r1p05_v23_pub.pdf a
+ * TPM2 on a Client PC must have at least 24 PCRs. This hardcodes our expectation of 24. */
+#define TPM2_PCRS_MAX 24U
+#define TPM2_PCRS_MASK ((UINT32_C(1) << TPM2_PCRS_MAX) - 1)
+
+/* The SRK handle is defined in the Provisioning Guidance document (see above) in the table "Reserved Handles
+ * for TPM Provisioning Fundamental Elements". The SRK is useful because it is "shared", meaning it has no
+ * authValue nor authPolicy set, and thus may be used by anyone on the system to generate derived keys or
+ * seal secrets. This is useful if the TPM has an auth (password) set for the 'owner hierarchy', which would
+ * prevent users from generating primary transient keys, unless they knew the owner hierarchy auth. See
+ * the Provisioning Guidance document for more details. */
+#define TPM2_SRK_HANDLE UINT32_C(0x81000001)
+
+/* The TPM specification limits sealed data to MAX_SYM_DATA. Unfortunately, tpm2-tss incorrectly
+ * defines this value as 256; the TPM specification Part 2 ("Structures") section
+ * "TPMU_SENSITIVE_CREATE" states "For interoperability, MAX_SYM_DATA should be 128." */
+#define TPM2_MAX_SEALED_DATA UINT16_C(128)
+
+/* Returns true if 'pcr' is a PCR index within our expected range, i.e. 0..TPM2_PCRS_MAX-1. */
+static inline bool TPM2_PCR_INDEX_VALID(unsigned pcr) {
+        return pcr < TPM2_PCRS_MAX;
+}
+
+/* Returns true if 'pcr_mask' has no bits set outside of TPM2_PCRS_MASK, i.e. only refers to PCR indexes
+ * within our expected range. */
+static inline bool TPM2_PCR_MASK_VALID(uint32_t pcr_mask) {
+        return (pcr_mask & ~TPM2_PCRS_MASK) == 0;
+}
+
+#define FOREACH_PCR_IN_MASK(pcr, mask) BIT_FOREACH(pcr, mask)
+
+#define TPM2_N_HASH_ALGORITHMS 4U
+
+#if HAVE_TPM2
+
+#include <tss2/tss2_esys.h>
+#include <tss2/tss2_mu.h>
+#include <tss2/tss2_rc.h>
+
+int dlopen_tpm2(void);
+
+/* Bundles a TSS2 TCTI context and an ESYS context together with a few capability tables.
+ * Created with tpm2_context_new(); tpm2_context_ref()/tpm2_context_unref() are declared right below. */
+typedef struct {
+ unsigned n_ref; /* NOTE(review): presumably the reference counter behind tpm2_context_ref()/_unref() — confirm */
+
+ void *tcti_dl; /* NOTE(review): presumably the dlopen() handle of the TCTI module — confirm in tpm2-util.c */
+ TSS2_TCTI_CONTEXT *tcti_context;
+ ESYS_CONTEXT *esys_context;
+
+ /* Some selected cached capabilities of the TPM */
+ TPMS_ALG_PROPERTY *capability_algorithms;
+ size_t n_capability_algorithms;
+ TPMA_CC *capability_commands;
+ size_t n_capability_commands;
+ TPM2_ECC_CURVE *capability_ecc_curves;
+ size_t n_capability_ecc_curves;
+ TPML_PCR_SELECTION capability_pcrs;
+} Tpm2Context;
+
+int tpm2_context_new(const char *device, Tpm2Context **ret_context);
+Tpm2Context *tpm2_context_ref(Tpm2Context *context);
+Tpm2Context *tpm2_context_unref(Tpm2Context *context);
+DEFINE_TRIVIAL_CLEANUP_FUNC(Tpm2Context*, tpm2_context_unref);
+
+/* Pairs an ESYS_TR handle with the Tpm2Context it belongs to. Allocated with tpm2_handle_new() and
+ * released with tpm2_handle_free(), both declared below. */
+typedef struct {
+ Tpm2Context *tpm2_context;
+ ESYS_TR esys_handle;
+
+ /* NOTE(review): presumably controls whether the handle is flushed from the TPM on release — confirm
+ * in tpm2_handle_free(). */
+ bool flush;
+} Tpm2Handle;
+
+/* Initializer for a Tpm2Handle: sets context and ESYS handle; 'flush' is not listed and hence is false. */
+#define _tpm2_handle(c, h) { .tpm2_context = (c), .esys_handle = (h), }
+/* A handle object with no context whose ESYS handle is ESYS_TR_NONE. */
+static const Tpm2Handle TPM2_HANDLE_NONE = _tpm2_handle(NULL, ESYS_TR_NONE);
+
+void Esys_Freep(void *p);
+
+int tpm2_handle_new(Tpm2Context *context, Tpm2Handle **ret_handle);
+Tpm2Handle *tpm2_handle_free(Tpm2Handle *handle);
+DEFINE_TRIVIAL_CLEANUP_FUNC(Tpm2Handle*, tpm2_handle_free);
+
+/* One PCR reference: the PCR index, the hash algorithm and a digest value. */
+typedef struct {
+ unsigned index;
+ TPMI_ALG_HASH hash;
+ TPM2B_DIGEST value;
+} Tpm2PCRValue;
+
+/* Builds a Tpm2PCRValue compound literal from index 'i', hash algorithm 'h' and digest 'v'. Note that 'v'
+ * is not parenthesized in the expansion; it is placed directly after the (TPM2B_DIGEST) type name. */
+#define TPM2_PCR_VALUE_MAKE(i, h, v) \
+ (Tpm2PCRValue) { \
+ .index = (i), \
+ .hash = (h), \
+ .value = ((TPM2B_DIGEST) v), \
+ }
+
+bool tpm2_pcr_value_valid(const Tpm2PCRValue *pcr_value);
+bool tpm2_pcr_values_has_any_values(const Tpm2PCRValue *pcr_values, size_t n_pcr_values);
+bool tpm2_pcr_values_has_all_values(const Tpm2PCRValue *pcr_values, size_t n_pcr_values);
+int tpm2_pcr_value_from_string(const char *arg, Tpm2PCRValue *ret_pcr_value);
+char *tpm2_pcr_value_to_string(const Tpm2PCRValue *pcr_value);
+
+bool tpm2_pcr_values_valid(const Tpm2PCRValue *pcr_values, size_t n_pcr_values);
+void tpm2_sort_pcr_values(Tpm2PCRValue *pcr_values, size_t n_pcr_values);
+int tpm2_pcr_values_from_mask(uint32_t mask, TPMI_ALG_HASH hash, Tpm2PCRValue **ret_pcr_values, size_t *ret_n_pcr_values);
+int tpm2_pcr_values_to_mask(const Tpm2PCRValue *pcr_values, size_t n_pcr_values, TPMI_ALG_HASH hash, uint32_t *ret_mask);
+int tpm2_pcr_values_from_string(const char *arg, Tpm2PCRValue **ret_pcr_values, size_t *ret_n_pcr_values);
+char *tpm2_pcr_values_to_string(const Tpm2PCRValue *pcr_values, size_t n_pcr_values);
+int tpm2_pcr_values_hash_count(const Tpm2PCRValue *pcr_values, size_t n_pcr_values, size_t *ret_count);
+int tpm2_tpml_pcr_selection_from_pcr_values(const Tpm2PCRValue *pcr_values, size_t n_pcr_values, TPML_PCR_SELECTION *ret_selection, TPM2B_DIGEST **ret_values, size_t *ret_n_values);
+
+int tpm2_make_encryption_session(Tpm2Context *c, const Tpm2Handle *primary, const Tpm2Handle *bind_key, Tpm2Handle **ret_session);
+
+int tpm2_create_primary(Tpm2Context *c, const Tpm2Handle *session, const TPM2B_PUBLIC *template, const TPM2B_SENSITIVE_CREATE *sensitive, TPM2B_PUBLIC **ret_public, Tpm2Handle **ret_handle);
+int tpm2_create(Tpm2Context *c, const Tpm2Handle *parent, const Tpm2Handle *session, const TPMT_PUBLIC *template, const TPMS_SENSITIVE_CREATE *sensitive, TPM2B_PUBLIC **ret_public, TPM2B_PRIVATE **ret_private);
+int tpm2_create_loaded(Tpm2Context *c, const Tpm2Handle *parent, const Tpm2Handle *session, const TPMT_PUBLIC *template, const TPMS_SENSITIVE_CREATE *sensitive, TPM2B_PUBLIC **ret_public, TPM2B_PRIVATE **ret_private, Tpm2Handle **ret_handle);
+int tpm2_load(Tpm2Context *c, const Tpm2Handle *parent, const Tpm2Handle *session, const TPM2B_PUBLIC *public, const TPM2B_PRIVATE *private, Tpm2Handle **ret_handle);
+int tpm2_marshal_public(const TPM2B_PUBLIC *public, void **ret, size_t *ret_size);
+int tpm2_marshal_nv_public(const TPM2B_NV_PUBLIC *nv_public, void **ret, size_t *ret_size);
+int tpm2_unmarshal_nv_public(const void *data, size_t size, TPM2B_NV_PUBLIC *ret_nv_public);
+int tpm2_marshal_blob(const TPM2B_PUBLIC *public, const TPM2B_PRIVATE *private, const TPM2B_ENCRYPTED_SECRET *seed, void **ret_blob, size_t *ret_blob_size);
+int tpm2_unmarshal_blob(const void *blob, size_t blob_size, TPM2B_PUBLIC *ret_public, TPM2B_PRIVATE *ret_private, TPM2B_ENCRYPTED_SECRET *ret_seed);
+
+bool tpm2_supports_alg(Tpm2Context *c, TPM2_ALG_ID alg);
+bool tpm2_supports_command(Tpm2Context *c, TPM2_CC command);
+bool tpm2_supports_ecc_curve(Tpm2Context *c, TPM2_ECC_CURVE ecc_curve);
+
+bool tpm2_test_parms(Tpm2Context *c, TPMI_ALG_PUBLIC alg, const TPMU_PUBLIC_PARMS *parms);
+
+int tpm2_get_good_pcr_banks(Tpm2Context *c, uint32_t pcr_mask, TPMI_ALG_HASH **ret_banks);
+int tpm2_get_good_pcr_banks_strv(Tpm2Context *c, uint32_t pcr_mask, char ***ret);
+int tpm2_get_best_pcr_bank(Tpm2Context *c, uint32_t pcr_mask, TPMI_ALG_HASH *ret);
+
+const char *tpm2_userspace_log_path(void);
+const char *tpm2_firmware_log_path(void);
+
+/* Event type passed as 'event' to tpm2_extend_bytes(); converted to and from strings with
+ * tpm2_userspace_event_type_to_string()/tpm2_userspace_event_type_from_string(), all declared below.
+ * _TPM2_USERSPACE_EVENT_TYPE_MAX is the number of regular values, the _INVALID value equals -EINVAL. */
+typedef enum Tpm2UserspaceEventType {
+ TPM2_EVENT_PHASE,
+ TPM2_EVENT_FILESYSTEM,
+ TPM2_EVENT_VOLUME_KEY,
+ TPM2_EVENT_MACHINE_ID,
+ _TPM2_USERSPACE_EVENT_TYPE_MAX,
+ _TPM2_USERSPACE_EVENT_TYPE_INVALID = -EINVAL,
+} Tpm2UserspaceEventType;
+
+const char* tpm2_userspace_event_type_to_string(Tpm2UserspaceEventType type) _const_;
+Tpm2UserspaceEventType tpm2_userspace_event_type_from_string(const char *s) _pure_;
+
+int tpm2_extend_bytes(Tpm2Context *c, char **banks, unsigned pcr_index, const void *data, size_t data_size, const void *secret, size_t secret_size, Tpm2UserspaceEventType event, const char *description);
+
+uint32_t tpm2_tpms_pcr_selection_to_mask(const TPMS_PCR_SELECTION *s);
+void tpm2_tpms_pcr_selection_from_mask(uint32_t mask, TPMI_ALG_HASH hash, TPMS_PCR_SELECTION *ret);
+bool tpm2_tpms_pcr_selection_has_mask(const TPMS_PCR_SELECTION *s, uint32_t mask);
+void tpm2_tpms_pcr_selection_add_mask(TPMS_PCR_SELECTION *s, uint32_t mask);
+void tpm2_tpms_pcr_selection_sub_mask(TPMS_PCR_SELECTION *s, uint32_t mask);
+void tpm2_tpms_pcr_selection_add(TPMS_PCR_SELECTION *a, const TPMS_PCR_SELECTION *b);
+void tpm2_tpms_pcr_selection_sub(TPMS_PCR_SELECTION *a, const TPMS_PCR_SELECTION *b);
+void tpm2_tpms_pcr_selection_move(TPMS_PCR_SELECTION *a, TPMS_PCR_SELECTION *b);
+char *tpm2_tpms_pcr_selection_to_string(const TPMS_PCR_SELECTION *s);
+size_t tpm2_tpms_pcr_selection_weight(const TPMS_PCR_SELECTION *s);
+#define tpm2_tpms_pcr_selection_is_empty(s) (tpm2_tpms_pcr_selection_weight(s) == 0)
+
+uint32_t tpm2_tpml_pcr_selection_to_mask(const TPML_PCR_SELECTION *l, TPMI_ALG_HASH hash);
+void tpm2_tpml_pcr_selection_from_mask(uint32_t mask, TPMI_ALG_HASH hash, TPML_PCR_SELECTION *ret);
+bool tpm2_tpml_pcr_selection_has_mask(const TPML_PCR_SELECTION *l, TPMI_ALG_HASH hash, uint32_t mask);
+void tpm2_tpml_pcr_selection_add_mask(TPML_PCR_SELECTION *l, TPMI_ALG_HASH hash, uint32_t mask);
+void tpm2_tpml_pcr_selection_sub_mask(TPML_PCR_SELECTION *l, TPMI_ALG_HASH hash, uint32_t mask);
+void tpm2_tpml_pcr_selection_add_tpms_pcr_selection(TPML_PCR_SELECTION *l, const TPMS_PCR_SELECTION *s);
+void tpm2_tpml_pcr_selection_sub_tpms_pcr_selection(TPML_PCR_SELECTION *l, const TPMS_PCR_SELECTION *s);
+void tpm2_tpml_pcr_selection_add(TPML_PCR_SELECTION *a, const TPML_PCR_SELECTION *b);
+void tpm2_tpml_pcr_selection_sub(TPML_PCR_SELECTION *a, const TPML_PCR_SELECTION *b);
+char *tpm2_tpml_pcr_selection_to_string(const TPML_PCR_SELECTION *l);
+size_t tpm2_tpml_pcr_selection_weight(const TPML_PCR_SELECTION *l);
+#define tpm2_tpml_pcr_selection_is_empty(l) (tpm2_tpml_pcr_selection_weight(l) == 0)
+
+int tpm2_digest_many(TPMI_ALG_HASH alg, TPM2B_DIGEST *digest, const struct iovec data[], size_t count, bool extend);
+/* Single-buffer convenience wrapper around tpm2_digest_many(): wraps 'data'/'len' in a one-element iovec
+ * array and passes all other arguments through unchanged. */
+static inline int tpm2_digest_buffer(TPMI_ALG_HASH alg, TPM2B_DIGEST *digest, const void *data, size_t len, bool extend) {
+ return tpm2_digest_many(alg, digest, &IOVEC_MAKE((void*) data, len), 1, extend);
+}
+int tpm2_digest_many_digests(TPMI_ALG_HASH alg, TPM2B_DIGEST *digest, const TPM2B_DIGEST data[], size_t count, bool extend);
+/* Calls tpm2_digest_many() with no input data and extend=true. */
+static inline int tpm2_digest_rehash(TPMI_ALG_HASH alg, TPM2B_DIGEST *digest) {
+ return tpm2_digest_many(alg, digest, NULL, 0, true);
+}
+/* Calls tpm2_digest_many() with no input data and extend=false. */
+static inline int tpm2_digest_init(TPMI_ALG_HASH alg, TPM2B_DIGEST *digest) {
+ return tpm2_digest_many(alg, digest, NULL, 0, false);
+}
+
+void tpm2_log_debug_tpml_pcr_selection(const TPML_PCR_SELECTION *l, const char *msg);
+void tpm2_log_debug_pcr_value(const Tpm2PCRValue *pcr_value, const char *msg);
+void tpm2_log_debug_buffer(const void *buffer, size_t size, const char *msg);
+void tpm2_log_debug_digest(const TPM2B_DIGEST *digest, const char *msg);
+void tpm2_log_debug_name(const TPM2B_NAME *name, const char *msg);
+
+typedef struct Tpm2PCRPredictionResult {
+ TPM2B_DIGEST hash[TPM2_N_HASH_ALGORITHMS]; /* a hash for each potential algorithm */
+} Tpm2PCRPredictionResult;
+
+TPM2B_DIGEST *tpm2_pcr_prediction_result_get_hash(Tpm2PCRPredictionResult *result, uint16_t alg);
+
+/* A structure encapsulating a full set of PCR predictions with alternatives. This can be converted into a
+ * series of PolicyOR + PolicyPCR items for the TPM. */
+typedef struct Tpm2PCRPrediction {
+ uint32_t pcrs; /* A mask of pcrs included */
+ OrderedSet* results[TPM2_PCRS_MAX]; /* set of Tpm2PCRPredictionResult objects, one for each PCR */
+} Tpm2PCRPrediction;
+
+void tpm2_pcr_prediction_done(Tpm2PCRPrediction *p);
+
+extern const struct hash_ops tpm2_pcr_prediction_result_hash_ops;
+
+bool tpm2_pcr_prediction_equal(Tpm2PCRPrediction *a, Tpm2PCRPrediction *b, uint16_t algorithm);
+
+int tpm2_pcr_prediction_to_json(const Tpm2PCRPrediction *prediction, uint16_t algorithm, JsonVariant **ret);
+int tpm2_pcr_prediction_from_json(Tpm2PCRPrediction *prediction, uint16_t algorithm, JsonVariant *aj);
+
+/* A structure encapsulating all metadata stored for a pcrlock policy on disk */
+typedef struct Tpm2PCRLockPolicy {
+ /* The below is the fixed metadata encoding information about the NV index we store the
+ * PolicyAuthorizeNV policy in, as well as a pinned SRK, and the encrypted PIN to use for writing to
+ * the NV Index. */
+ uint16_t algorithm;
+ uint32_t nv_index;
+ struct iovec nv_handle;
+ struct iovec nv_public;
+ struct iovec srk_handle;
+ struct iovec pin_public;
+ struct iovec pin_private;
+
+ /* The below contains the current prediction whose resulting policy is stored in the NV
+ * index. Once in JSON and once in parsed form. When the policy is updated the fields below are
+ * changed, the fields above remain fixed. */
+ JsonVariant *prediction_json;
+ Tpm2PCRPrediction prediction;
+} Tpm2PCRLockPolicy;
+
+void tpm2_pcrlock_policy_done(Tpm2PCRLockPolicy *data);
+int tpm2_pcrlock_search_file(const char *path, FILE **ret_file, char **ret_path);
+int tpm2_pcrlock_policy_load(const char *path, Tpm2PCRLockPolicy *ret_policy);
+
+int tpm2_index_to_handle(Tpm2Context *c, TPM2_HANDLE index, const Tpm2Handle *session, TPM2B_PUBLIC **ret_public, TPM2B_NAME **ret_name, TPM2B_NAME **ret_qname, Tpm2Handle **ret_handle);
+int tpm2_index_from_handle(Tpm2Context *c, const Tpm2Handle *handle, TPM2_HANDLE *ret_index);
+
+int tpm2_pcr_read(Tpm2Context *c, const TPML_PCR_SELECTION *pcr_selection, Tpm2PCRValue **ret_pcr_values, size_t *ret_n_pcr_values);
+int tpm2_pcr_read_missing_values(Tpm2Context *c, Tpm2PCRValue *pcr_values, size_t n_pcr_values);
+
+int tpm2_get_pin_auth(TPMI_ALG_HASH hash, const char *pin, TPM2B_AUTH *ret_auth);
+int tpm2_set_auth(Tpm2Context *c, const Tpm2Handle *handle, const char *pin);
+int tpm2_set_auth_binary(Tpm2Context *c, const Tpm2Handle *handle, const TPM2B_AUTH *auth);
+
+int tpm2_make_policy_session(Tpm2Context *c, const Tpm2Handle *primary, const Tpm2Handle *encryption_session, Tpm2Handle **ret_session);
+
+int tpm2_policy_auth_value(Tpm2Context *c, const Tpm2Handle *session, TPM2B_DIGEST **ret_policy_digest);
+int tpm2_policy_authorize_nv(Tpm2Context *c, const Tpm2Handle *session, const Tpm2Handle *nv_handle, TPM2B_DIGEST **ret_policy_digest);
+int tpm2_policy_pcr(Tpm2Context *c, const Tpm2Handle *session, const TPML_PCR_SELECTION *pcr_selection, TPM2B_DIGEST **ret_policy_digest);
+int tpm2_policy_or(Tpm2Context *c, const Tpm2Handle *session, const TPM2B_DIGEST *branches, size_t n_branches, TPM2B_DIGEST **ret_policy_digest);
+int tpm2_policy_super_pcr(Tpm2Context *c, const Tpm2Handle *session, const Tpm2PCRPrediction *prediction, uint16_t algorithm);
+
+int tpm2_calculate_pubkey_name(const TPMT_PUBLIC *public, TPM2B_NAME *ret_name);
+int tpm2_calculate_nv_index_name(const TPMS_NV_PUBLIC *nvpublic, TPM2B_NAME *ret_name);
+
+int tpm2_calculate_policy_auth_value(TPM2B_DIGEST *digest);
+int tpm2_calculate_policy_authorize(const TPM2B_PUBLIC *public, const TPM2B_DIGEST *policy_ref, TPM2B_DIGEST *digest);
+int tpm2_calculate_policy_authorize_nv(const TPM2B_NV_PUBLIC *public, TPM2B_DIGEST *digest);
+int tpm2_calculate_policy_pcr(const Tpm2PCRValue *pcr_values, size_t n_pcr_values, TPM2B_DIGEST *digest);
+int tpm2_calculate_policy_or(const TPM2B_DIGEST *branches, size_t n_branches, TPM2B_DIGEST *digest);
+int tpm2_calculate_policy_super_pcr(Tpm2PCRPrediction *prediction, uint16_t algorithm, TPM2B_DIGEST *pcr_policy);
+int tpm2_calculate_serialize(TPM2_HANDLE handle, const TPM2B_NAME *name, const TPM2B_PUBLIC *public, void **ret_serialized, size_t *ret_serialized_size);
+int tpm2_calculate_sealing_policy(const Tpm2PCRValue *pcr_values, size_t n_pcr_values, const TPM2B_PUBLIC *public, bool use_pin, const Tpm2PCRLockPolicy *policy, TPM2B_DIGEST *digest);
+int tpm2_calculate_seal(TPM2_HANDLE parent_handle, const TPM2B_PUBLIC *parent_public, const TPMA_OBJECT *attributes, const void *secret, size_t secret_size, const TPM2B_DIGEST *policy, const char *pin, void **ret_secret, size_t *ret_secret_size, void **ret_blob, size_t *ret_blob_size, void **ret_serialized_parent, size_t *ret_serialized_parent_size);
+
+int tpm2_get_srk_template(TPMI_ALG_PUBLIC alg, TPMT_PUBLIC *ret_template);
+int tpm2_get_best_srk_template(Tpm2Context *c, TPMT_PUBLIC *ret_template);
+
+int tpm2_get_srk(Tpm2Context *c, const Tpm2Handle *session, TPM2B_PUBLIC **ret_public, TPM2B_NAME **ret_name, TPM2B_NAME **ret_qname, Tpm2Handle **ret_handle);
+int tpm2_get_or_create_srk(Tpm2Context *c, const Tpm2Handle *session, TPM2B_PUBLIC **ret_public, TPM2B_NAME **ret_name, TPM2B_NAME **ret_qname, Tpm2Handle **ret_handle);
+
+int tpm2_seal(Tpm2Context *c, uint32_t seal_key_handle, const TPM2B_DIGEST *policy, const char *pin, void **ret_secret, size_t *ret_secret_size, void **ret_blob, size_t *ret_blob_size, uint16_t *ret_primary_alg, void **ret_srk_buf, size_t *ret_srk_buf_size);
+int tpm2_unseal(Tpm2Context *c, uint32_t hash_pcr_mask, uint16_t pcr_bank, const void *pubkey, size_t pubkey_size, uint32_t pubkey_pcr_mask, JsonVariant *signature, const char *pin, const Tpm2PCRLockPolicy *pcrlock_policy, uint16_t primary_alg, const void *blob, size_t blob_size, const void *policy_hash, size_t policy_hash_size, const void *srk_buf, size_t srk_buf_size, void **ret_secret, size_t *ret_secret_size);
+
+#if HAVE_OPENSSL
+int tpm2_tpm2b_public_to_openssl_pkey(const TPM2B_PUBLIC *public, EVP_PKEY **ret);
+int tpm2_tpm2b_public_from_openssl_pkey(const EVP_PKEY *pkey, TPM2B_PUBLIC *ret);
+#endif
+
+int tpm2_tpm2b_public_from_pem(const void *pem, size_t pem_size, TPM2B_PUBLIC *ret);
+int tpm2_tpm2b_public_to_fingerprint(const TPM2B_PUBLIC *public, void **ret_fingerprint, size_t *ret_fingerprint_size);
+
+int tpm2_define_policy_nv_index(Tpm2Context *c, const Tpm2Handle *session, TPM2_HANDLE requested_nv_index, const TPM2B_DIGEST *write_policy, const char *pin, const TPM2B_AUTH *auth, TPM2_HANDLE *ret_nv_index, Tpm2Handle **ret_nv_handle, TPM2B_NV_PUBLIC *ret_nv_public);
+int tpm2_write_policy_nv_index(Tpm2Context *c, const Tpm2Handle *policy_session, TPM2_HANDLE nv_index, const Tpm2Handle *nv_handle, const TPM2B_DIGEST *policy_digest);
+int tpm2_undefine_policy_nv_index(Tpm2Context *c, const Tpm2Handle *session, TPM2_HANDLE nv_index, const Tpm2Handle *nv_handle);
+
+int tpm2_seal_data(Tpm2Context *c, const struct iovec *data, const Tpm2Handle *primary_handle, const Tpm2Handle *encryption_session, const TPM2B_DIGEST *policy, struct iovec *ret_public, struct iovec *ret_private);
+int tpm2_unseal_data(Tpm2Context *c, const struct iovec *public, const struct iovec *private, const Tpm2Handle *primary_handle, const Tpm2Handle *policy_session, const Tpm2Handle *encryption_session, struct iovec *ret_data);
+
+int tpm2_serialize(Tpm2Context *c, const Tpm2Handle *handle, void **ret_serialized, size_t *ret_serialized_size);
+int tpm2_deserialize(Tpm2Context *c, const void *serialized, size_t serialized_size, Tpm2Handle **ret_handle);
+
+int tpm2_load_public_key_file(const char *path, TPM2B_PUBLIC *ret);
+
+/* The tpm2-tss library has many structs that are simply a combination of an array (or object) and
+ * size. These macros allow easily initializing or assigning instances of such structs from an existing
+ * buffer/object and size, while also checking the size for safety with the struct buffer/object size. If the
+ * provided buffer/object is NULL, the resulting struct's buffer/object will be 0s. If the provided size is
+ * larger than the struct's buffer/object size, this results in assertion failure; to check the size, use one
+ * of the TPM2B_*_CHECK_SIZE() macros. */
+#define TPM2B_AUTH_MAKE(b, s) TPM2B_BUF_SIZE_STRUCT_MAKE(b, s, TPM2B_AUTH, buffer, size)
+#define TPM2B_DATA_MAKE(b, s) TPM2B_BUF_SIZE_STRUCT_MAKE(b, s, TPM2B_DATA, buffer, size)
+#define TPM2B_DIGEST_MAKE(b, s) TPM2B_BUF_SIZE_STRUCT_MAKE(b, s, TPM2B_DIGEST, buffer, size)
+#define TPM2B_ECC_PARAMETER_MAKE(b, s) TPM2B_BUF_SIZE_STRUCT_MAKE(b, s, TPM2B_ECC_PARAMETER, buffer, size)
+#define TPM2B_ENCRYPTED_SECRET_MAKE(b, s) TPM2B_BUF_SIZE_STRUCT_MAKE(b, s, TPM2B_ENCRYPTED_SECRET, secret, size)
+#define TPM2B_MAX_BUFFER_MAKE(b, s) TPM2B_BUF_SIZE_STRUCT_MAKE(b, s, TPM2B_MAX_BUFFER, buffer, size)
+#define TPM2B_NAME_MAKE(b, s) TPM2B_BUF_SIZE_STRUCT_MAKE(b, s, TPM2B_NAME, name, size)
+#define TPM2B_PRIVATE_MAKE(b, s) TPM2B_BUF_SIZE_STRUCT_MAKE(b, s, TPM2B_PRIVATE, buffer, size)
+#define TPM2B_PRIVATE_KEY_RSA_MAKE(b, s) TPM2B_BUF_SIZE_STRUCT_MAKE(b, s, TPM2B_PRIVATE_KEY_RSA, buffer, size)
+#define TPM2B_PUBLIC_KEY_RSA_MAKE(b, s) TPM2B_BUF_SIZE_STRUCT_MAKE(b, s, TPM2B_PUBLIC_KEY_RSA, buffer, size)
+#define TPM2B_SENSITIVE_DATA_MAKE(b, s) TPM2B_BUF_SIZE_STRUCT_MAKE(b, s, TPM2B_SENSITIVE_DATA, buffer, size)
+#define TPM2B_BUF_SIZE_STRUCT_MAKE(buf, size, struct_type, buffer_field, size_field) \
+ _TPM2B_BUF_SIZE_STRUCT_MAKE(buf, size, UNIQ, struct_type, buffer_field, size_field)
+/* Implementation of TPM2B_BUF_SIZE_STRUCT_MAKE(), as a statement expression. 'buf' and 'size' are each
+ * evaluated exactly once. The struct is initialized with only its size field set (everything else is
+ * zero), then the size is asserted to fit into the buffer field, and the data is copied in if 'buf' is
+ * non-NULL. The expression yields the struct by value. */
+#define _TPM2B_BUF_SIZE_STRUCT_MAKE(buf, size, uniq, struct_type, buffer_field, size_field) \
+ ({ \
+ typeof(buf) UNIQ_T(BUF, uniq) = (buf); \
+ typeof(size) UNIQ_T(SIZE, uniq) = (size); \
+ struct_type UNIQ_T(STRUCT, uniq) = { .size_field = UNIQ_T(SIZE, uniq), }; \
+ assert(sizeof(UNIQ_T(STRUCT, uniq).buffer_field) >= (size_t) UNIQ_T(SIZE, uniq)); \
+ if (UNIQ_T(BUF, uniq)) \
+ memcpy_safe(UNIQ_T(STRUCT, uniq).buffer_field, UNIQ_T(BUF, uniq), UNIQ_T(SIZE, uniq)); \
+ UNIQ_T(STRUCT, uniq); \
+ })
+
+/* Check if the size will fit in the TPM2B struct buffer. Returns 0 if the size will fit, otherwise this logs
+ * a debug message and returns < 0.
+ *
+ * The last argument of each wrapper is handed to sizeof_field() and hence must name the actual array member
+ * of the respective struct. For most of the structs that is 'buffer', but it is 'secret' for
+ * TPM2B_ENCRYPTED_SECRET and 'name' for TPM2B_NAME, in line with the TPM2B_*_MAKE() macros. */
+#define TPM2B_AUTH_CHECK_SIZE(s) TPM2B_BUF_SIZE_STRUCT_CHECK_SIZE(s, TPM2B_AUTH, buffer)
+#define TPM2B_DATA_CHECK_SIZE(s) TPM2B_BUF_SIZE_STRUCT_CHECK_SIZE(s, TPM2B_DATA, buffer)
+#define TPM2B_DIGEST_CHECK_SIZE(s) TPM2B_BUF_SIZE_STRUCT_CHECK_SIZE(s, TPM2B_DIGEST, buffer)
+#define TPM2B_ECC_PARAMETER_CHECK_SIZE(s) TPM2B_BUF_SIZE_STRUCT_CHECK_SIZE(s, TPM2B_ECC_PARAMETER, buffer)
+#define TPM2B_ENCRYPTED_SECRET_CHECK_SIZE(s) TPM2B_BUF_SIZE_STRUCT_CHECK_SIZE(s, TPM2B_ENCRYPTED_SECRET, secret)
+#define TPM2B_MAX_BUFFER_CHECK_SIZE(s) TPM2B_BUF_SIZE_STRUCT_CHECK_SIZE(s, TPM2B_MAX_BUFFER, buffer)
+#define TPM2B_NAME_CHECK_SIZE(s) TPM2B_BUF_SIZE_STRUCT_CHECK_SIZE(s, TPM2B_NAME, name)
+#define TPM2B_PRIVATE_CHECK_SIZE(s) TPM2B_BUF_SIZE_STRUCT_CHECK_SIZE(s, TPM2B_PRIVATE, buffer)
+#define TPM2B_PRIVATE_KEY_RSA_CHECK_SIZE(s) TPM2B_BUF_SIZE_STRUCT_CHECK_SIZE(s, TPM2B_PRIVATE_KEY_RSA, buffer)
+#define TPM2B_PUBLIC_KEY_RSA_CHECK_SIZE(s) TPM2B_BUF_SIZE_STRUCT_CHECK_SIZE(s, TPM2B_PUBLIC_KEY_RSA, buffer)
+#define TPM2B_SENSITIVE_DATA_CHECK_SIZE(s) TPM2B_BUF_SIZE_STRUCT_CHECK_SIZE(s, TPM2B_SENSITIVE_DATA, buffer)
+#define TPM2B_BUF_SIZE_STRUCT_CHECK_SIZE(size, struct_type, buffer_field) \
+        _TPM2B_BUF_SIZE_STRUCT_CHECK_SIZE(size, UNIQ, struct_type, buffer_field)
+/* Statement expression; 'size' is evaluated exactly once and compared against the size of the struct's
+ * buffer field. Yields 0 if it fits, otherwise the result of log_debug_errno(). */
+#define _TPM2B_BUF_SIZE_STRUCT_CHECK_SIZE(size, uniq, struct_type, buffer_field) \
+        ({ \
+                size_t UNIQ_T(SIZE, uniq) = (size_t) (size); \
+                size_t UNIQ_T(BUFSIZE, uniq) = sizeof_field(struct_type, buffer_field); \
+                UNIQ_T(BUFSIZE, uniq) < UNIQ_T(SIZE, uniq) ? \
+                        log_debug_errno(SYNTHETIC_ERRNO(EINVAL), \
+                                        "Size %zu larger than " #struct_type " buffer size %zu.", \
+                                        UNIQ_T(SIZE, uniq), UNIQ_T(BUFSIZE, uniq)) : \
+                        0; \
+        })
+
+#else /* HAVE_TPM2 */
+/* Built without TPM2 support: provide empty placeholder types, so that declarations outside of the
+ * HAVE_TPM2 conditional that mention them (e.g. tpm2_parse_pcr_argument() below) remain valid. */
+typedef struct {} Tpm2Context;
+typedef struct {} Tpm2Handle;
+typedef struct {} Tpm2PCRValue;
+
+/* All arguments are ignored; the result is an empty Tpm2PCRValue. */
+#define TPM2_PCR_VALUE_MAKE(i, h, v) (Tpm2PCRValue) {}
+
+/* Stub: always fails with -ENOENT and leaves the return parameters untouched. */
+static inline int tpm2_pcrlock_search_file(const char *path, FILE **ret_file, char **ret_path) {
+ return -ENOENT;
+}
+
+#endif /* HAVE_TPM2 */
+
+int tpm2_list_devices(void);
+int tpm2_find_device_auto(char **ret);
+
+int tpm2_make_pcr_json_array(uint32_t pcr_mask, JsonVariant **ret);
+int tpm2_parse_pcr_json_array(JsonVariant *v, uint32_t *ret);
+
+int tpm2_make_luks2_json(int keyslot, uint32_t hash_pcr_mask, uint16_t pcr_bank, const void *pubkey, size_t pubkey_size, uint32_t pubkey_pcr_mask, uint16_t primary_alg, const void *blob, size_t blob_size, const void *policy_hash, size_t policy_hash_size, const void *salt, size_t salt_size, const void *srk_buf, size_t srk_buf_size, TPM2Flags flags, JsonVariant **ret);
+int tpm2_parse_luks2_json(JsonVariant *v, int *ret_keyslot, uint32_t *ret_hash_pcr_mask, uint16_t *ret_pcr_bank, void **ret_pubkey, size_t *ret_pubkey_size, uint32_t *ret_pubkey_pcr_mask, uint16_t *ret_primary_alg, void **ret_blob, size_t *ret_blob_size, void **ret_policy_hash, size_t *ret_policy_hash_size, void **ret_salt, size_t *ret_salt_size, void **ret_srk_buf, size_t *ret_srk_buf_size, TPM2Flags *ret_flags);
+
+/* Default to PCR 7 only */
+#define TPM2_PCR_INDEX_DEFAULT UINT32_C(7)
+#define TPM2_PCR_MASK_DEFAULT INDEX_TO_MASK(uint32_t, TPM2_PCR_INDEX_DEFAULT)
+
+/* We want the helpers below to work also if TPM2 libs are not available, hence define the following
+ * defines if they are missing. */
+#ifndef TPM2_ALG_SHA1
+#define TPM2_ALG_SHA1 0x4
+#endif
+
+#ifndef TPM2_ALG_SHA256
+#define TPM2_ALG_SHA256 0xB
+#endif
+
+#ifndef TPM2_ALG_SHA384
+#define TPM2_ALG_SHA384 0xC
+#endif
+
+#ifndef TPM2_ALG_SHA512
+#define TPM2_ALG_SHA512 0xD
+#endif
+
+#ifndef TPM2_ALG_ECC
+#define TPM2_ALG_ECC 0x23
+#endif
+
+#ifndef TPM2_ALG_RSA
+#define TPM2_ALG_RSA 0x1
+#endif
+
+int tpm2_hash_alg_to_size(uint16_t alg);
+
+const char *tpm2_hash_alg_to_string(uint16_t alg) _const_;
+int tpm2_hash_alg_from_string(const char *alg) _pure_;
+
+const char *tpm2_asym_alg_to_string(uint16_t alg) _const_;
+int tpm2_asym_alg_from_string(const char *alg) _pure_;
+
+const char *tpm2_sym_alg_to_string(uint16_t alg) _const_;
+int tpm2_sym_alg_from_string(const char *alg) _pure_;
+
+const char *tpm2_sym_mode_to_string(uint16_t mode) _const_;
+int tpm2_sym_mode_from_string(const char *mode) _pure_;
+
+char *tpm2_pcr_mask_to_string(uint32_t mask);
+
+extern const uint16_t tpm2_hash_algorithms[];
+
+typedef struct {
+ uint32_t search_pcr_mask;
+ const char *device;
+ const char *signature_path;
+ const char *pcrlock_path;
+} systemd_tpm2_plugin_params;
+
+typedef enum Tpm2Support {
+ /* NOTE! The systemd-creds tool returns these flags 1:1 as exit status. Hence these flags are pretty
+ * much ABI! Hence, be extra careful when changing/extending these definitions. */
+ TPM2_SUPPORT_NONE = 0, /* no support */
+ TPM2_SUPPORT_FIRMWARE = 1 << 0, /* firmware reports TPM2 was used */
+ TPM2_SUPPORT_DRIVER = 1 << 1, /* the kernel has a driver loaded for it */
+ TPM2_SUPPORT_SYSTEM = 1 << 2, /* we support it ourselves */
+ TPM2_SUPPORT_SUBSYSTEM = 1 << 3, /* the kernel has the tpm subsystem enabled */
+ TPM2_SUPPORT_LIBRARIES = 1 << 4, /* we can dlopen the tpm2 libraries */
+ TPM2_SUPPORT_FULL = TPM2_SUPPORT_FIRMWARE|TPM2_SUPPORT_DRIVER|TPM2_SUPPORT_SYSTEM|TPM2_SUPPORT_SUBSYSTEM|TPM2_SUPPORT_LIBRARIES,
+} Tpm2Support;
+
+Tpm2Support tpm2_support(void);
+
+int tpm2_parse_pcr_argument(const char *arg, Tpm2PCRValue **ret_pcr_values, size_t *ret_n_pcr_values);
+int tpm2_parse_pcr_argument_append(const char *arg, Tpm2PCRValue **ret_pcr_values, size_t *ret_n_pcr_values);
+int tpm2_parse_pcr_argument_to_mask(const char *arg, uint32_t *mask);
+
+int tpm2_load_pcr_signature(const char *path, JsonVariant **ret);
+int tpm2_load_pcr_public_key(const char *path, void **ret_pubkey, size_t *ret_pubkey_size);
+
+int tpm2_util_pbkdf2_hmac_sha256(const void *pass,
+ size_t passlen,
+ const void *salt,
+ size_t saltlen,
+ uint8_t res[static SHA256_DIGEST_SIZE]);
+
+enum {
+ /* Additional defines for the PCR index naming enum from "fundamental/tpm2-pcr.h" */
+ _TPM2_PCR_INDEX_MAX_DEFINED = TPM2_PCRS_MAX,
+ _TPM2_PCR_INDEX_INVALID = -EINVAL,
+};
+
+int tpm2_pcr_index_from_string(const char *s) _pure_;
+const char *tpm2_pcr_index_to_string(int pcr) _const_;
diff --git a/src/shared/udev-util.c b/src/shared/udev-util.c
new file mode 100644
index 0000000..cf28ba8
--- /dev/null
+++ b/src/shared/udev-util.c
@@ -0,0 +1,439 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <ctype.h>
+#include <errno.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "device-nodes.h"
+#include "device-private.h"
+#include "device-util.h"
+#include "env-file.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "id128-util.h"
+#include "log.h"
+#include "macro.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "signal-util.h"
+#include "stat-util.h"
+#include "string-util.h"
+#include "udev-util.h"
+#include "utf8.h"
+
+/* Applies a log level given as configuration text, e.g. debug, "debug" or 'debug'.
+ *
+ * One pair of matching surrounding quotes (both single or both double) is stripped first. The
+ * stripping happens in place, hence the caller's buffer may be modified. A string that isempty()
+ * considers empty is accepted and leaves the log level untouched (0 is returned); otherwise the
+ * result of log_set_max_level_from_string() is passed through. */
+int udev_set_max_log_level(char *str) {
+        if (isempty(str))
+                return 0;
+
+        size_t len = strlen(str);
+        char quote = str[0];
+
+        /* Only unquote if the very same quote character opens and closes the string. */
+        if (len >= 2 && (quote == '"' || quote == '\'') && str[len - 1] == quote) {
+                str[len - 1] = '\0';
+                str++;
+        }
+
+        /* Setting the level explicitly here is what regulates the code in libudev/ and udev/. */
+        return log_set_max_level_from_string(str);
+}
+
+/* Reads the "udev_log" setting from /etc/udev/udev.conf and applies it as maximum log level.
+ *
+ * A missing configuration file is not an error. Any other failure to parse the file is returned
+ * as negative errno-style value. An unusable log level only produces a warning and is otherwise
+ * ignored, i.e. 0 is returned in that case too. */
+int udev_parse_config(void) {
+        _cleanup_free_ char *log_val = NULL;
+        int r;
+
+        r = parse_env_file(NULL, "/etc/udev/udev.conf", "udev_log", &log_val);
+        if (r < 0)
+                return r == -ENOENT ? 0 : r;
+
+        r = udev_set_max_log_level(log_val);
+        if (r < 0)
+                log_syntax(NULL, LOG_WARNING, "/etc/udev/udev.conf", 0, r,
+                           "Failed to set udev log level '%s', ignoring: %m", log_val);
+
+        return 0;
+}
+
+struct DeviceMonitorData { /* State shared between the waiting function and device_monitor_handler() */
+ const char *sysname; /* if set, an event matches when its sysname equals this string */
+ const char *devlink; /* if set, an event matches when one of its devlinks, or its devname, is path_equal() to this */
+ sd_device *device; /* NULL until a match; then a reference taken by the handler, dropped by device_monitor_data_free() unless taken over by the waiter */
+};
+
+/* Cleanup helper for struct DeviceMonitorData: releases the device reference the monitor handler
+ * may have stored. The structure itself is not freed; the only user in this file keeps it on the
+ * stack. */
+static void device_monitor_data_free(struct DeviceMonitorData *d) {
+        assert(d);
+
+        (void) sd_device_unref(d->device);
+}
+
+/* Device monitor callback: checks whether the received uevent belongs to the device we are waiting
+ * for (as described by the struct DeviceMonitorData passed as userdata). On a match a reference to
+ * the device is stored in the data structure and the event loop is asked to exit with code 0.
+ * Events that do not match are ignored by returning 0. */
+static int device_monitor_handler(sd_device_monitor *monitor, sd_device *device, void *userdata) {
+        struct DeviceMonitorData *d = ASSERT_PTR(userdata);
+        const char *name;
+
+        assert(device);
+        assert(d->sysname || d->devlink);
+        assert(!d->device);
+
+        /* REMOVE events are skipped: we wait for initialization, not for de-initialization. Since the
+         * kernel recycles device names and the udev queue is fully asynchronous, a REMOVE event may
+         * well stem from an earlier use of the same name, and we cannot tell use cycles apart.
+         *
+         * Should the REMOVE event belong to the use cycle we do care about, we will miss it here. That
+         * is acceptable: the wait loop has a timeout, hence the failure is eventually reported as
+         * -ETIMEDOUT, which is good enough for an error path that should rarely be taken.
+         *
+         * No other event type needs special treatment. REMOVE is the only "negative" one; for all
+         * others the device exists and is registered in the udev database, hence seeing the event
+         * means the device can be considered initialized. */
+        if (device_for_action(device, SD_DEVICE_REMOVE))
+                return 0;
+
+        /* First criterion: the kernel name of the device. */
+        if (d->sysname &&
+            sd_device_get_sysname(device, &name) >= 0 &&
+            streq(name, d->sysname))
+                goto found;
+
+        /* Second criterion: any of the symlinks, or the device node itself. */
+        if (d->devlink) {
+                const char *node;
+
+                FOREACH_DEVICE_DEVLINK(device, link)
+                        if (path_equal(link, d->devlink))
+                                goto found;
+
+                if (sd_device_get_devname(device, &node) >= 0 &&
+                    path_equal(node, d->devlink))
+                        goto found;
+        }
+
+        return 0;
+
+found:
+        d->device = sd_device_ref(device);
+        return sd_event_exit(sd_device_monitor_get_event(monitor), 0);
+}
+
+static int device_wait_for_initialization_internal(
+ sd_device *_device,
+ const char *devlink,
+ const char *subsystem,
+ usec_t timeout_usec,
+ sd_device **ret) {
+/* Waits until a device is initialized. The device is identified either directly via '_device', or
+ * via the device node/symlink path 'devlink' (in which case 'subsystem' must be given too, see the
+ * assert() below). Returns 0 on success and, if 'ret' is non-NULL, stores a device reference in
+ * '*ret'; returns a negative errno-style value on failure. Unless 'timeout_usec' is USEC_INFINITY
+ * a timer carrying -ETIMEDOUT is armed before waiting. */
+ _cleanup_(sd_device_monitor_unrefp) sd_device_monitor *monitor = NULL;
+ _cleanup_(sd_event_unrefp) sd_event *event = NULL;
+ /* Ensure that if !_device && devlink, device gets unrefd on errors since it will be new */
+ _cleanup_(sd_device_unrefp) sd_device *device = sd_device_ref(_device);
+ _cleanup_(device_monitor_data_free) struct DeviceMonitorData data = {
+ .devlink = devlink,
+ };
+ int r;
+
+ assert(device || (subsystem && devlink));
+
+ /* Devlink might already exist, if it does get the device to use the sysname filtering */
+ if (!device && devlink) {
+ r = sd_device_new_from_devname(&device, devlink);
+ if (r < 0 && !ERRNO_IS_DEVICE_ABSENT(r)) /* an absent device is fine, we are going to wait for it */
+ return log_error_errno(r, "Failed to create sd-device object from %s: %m", devlink);
+ }
+
+ if (device) {
+ if (sd_device_get_is_initialized(device) > 0) { /* fast path: nothing to wait for */
+ if (ret)
+ *ret = sd_device_ref(device);
+ return 0;
+ }
+ /* We need either the sysname or the devlink for filtering */
+ assert_se(sd_device_get_sysname(device, &data.sysname) >= 0 || devlink);
+ }
+
+ /* Wait until the device is initialized, so that we can get access to the ID_PATH property */
+
+ r = sd_event_new(&event);
+ if (r < 0)
+ return log_error_errno(r, "Failed to get default event: %m");
+
+ r = sd_device_monitor_new(&monitor);
+ if (r < 0)
+ return log_error_errno(r, "Failed to acquire monitor: %m");
+/* If the caller gave no subsystem, try to derive one from the device; a device without subsystem
+ * (-ENOENT) simply means no subsystem filter is installed. */
+ if (device && !subsystem) {
+ r = sd_device_get_subsystem(device, &subsystem);
+ if (r < 0 && r != -ENOENT)
+ return log_device_error_errno(device, r, "Failed to get subsystem: %m");
+ }
+
+ if (subsystem) {
+ r = sd_device_monitor_filter_add_match_subsystem_devtype(monitor, subsystem, NULL);
+ if (r < 0)
+ return log_error_errno(r, "Failed to add %s subsystem match to monitor: %m", subsystem);
+ }
+/* Give the monitor a descriptive name. This is best-effort only: a failing strjoin() or a failing
+ * sd_device_monitor_set_description() is deliberately ignored. */
+ _cleanup_free_ char *desc = NULL;
+ const char *sysname = NULL;
+ if (device)
+ (void) sd_device_get_sysname(device, &sysname);
+
+ desc = strjoin(sysname ?: subsystem, devlink ? ":" : ":initialization", devlink);
+ if (desc)
+ (void) sd_device_monitor_set_description(monitor, desc);
+
+ r = sd_device_monitor_attach_event(monitor, event);
+ if (r < 0)
+ return log_error_errno(r, "Failed to attach event to device monitor: %m");
+
+ r = sd_device_monitor_start(monitor, device_monitor_handler, &data);
+ if (r < 0)
+ return log_error_errno(r, "Failed to start device monitor: %m");
+/* The timer is registered without callback and with -ETIMEDOUT as userdata. NOTE(review): presumably
+ * sd-event then exits the loop with that value when the timer elapses — confirm in the sd-event docs. */
+ if (timeout_usec != USEC_INFINITY) {
+ r = sd_event_add_time_relative(
+ event, NULL,
+ CLOCK_MONOTONIC, timeout_usec, 0,
+ NULL, INT_TO_PTR(-ETIMEDOUT));
+ if (r < 0)
+ return log_error_errno(r, "Failed to add timeout event source: %m");
+ }
+
+ /* Check again, maybe things changed. Udev will re-read the db if the device wasn't initialized yet. */
+ if (!device && devlink) {
+ r = sd_device_new_from_devname(&device, devlink);
+ if (r < 0 && !ERRNO_IS_DEVICE_ABSENT(r))
+ return log_error_errno(r, "Failed to create sd-device object from %s: %m", devlink);
+ }
+ if (device && sd_device_get_is_initialized(device) > 0) {
+ if (ret)
+ *ret = sd_device_ref(device);
+ return 0;
+ }
+/* Block until device_monitor_handler() requests the exit (code 0), or until the loop ends otherwise;
+ * a negative result is reported as failure. */
+ r = sd_event_loop(event);
+ if (r < 0)
+ return log_error_errno(r, "Failed to wait for device to be initialized: %m");
+/* The handler stored the matching device in data.device; hand that reference over to the caller. */
+ if (ret)
+ *ret = TAKE_PTR(data.device);
+ return 0;
+}
+
+/* Waits for an already known device object to become initialized. 'subsystem' may be given to
+ * restrict the monitor; the device node/symlink based matching is not used in this variant. */
+int device_wait_for_initialization(
+                sd_device *device,
+                const char *subsystem,
+                usec_t timeout_usec,
+                sd_device **ret) {
+
+        return device_wait_for_initialization_internal(
+                        device,
+                        /* devlink = */ NULL,
+                        subsystem,
+                        timeout_usec,
+                        ret);
+}
+
+/* Waits for the device behind the device node or symlink 'devlink' to become initialized. No device
+ * object is passed in; it is looked up by path (and may not exist yet when the call is made). */
+int device_wait_for_devlink(
+                const char *devlink,
+                const char *subsystem,
+                usec_t timeout_usec,
+                sd_device **ret) {
+
+        return device_wait_for_initialization_internal(
+                        /* _device = */ NULL,
+                        devlink,
+                        subsystem,
+                        timeout_usec,
+                        ret);
+}
+
+/* Tells whether the device carries the ID_RENAMING property. Returns true if the property exists,
+ * false if it does not, and a negative errno-style value if the lookup failed for another reason. */
+int device_is_renaming(sd_device *dev) {
+        assert(dev);
+
+        int r = sd_device_get_property_value(dev, "ID_RENAMING", NULL);
+        if (r >= 0)
+                return true;
+
+        return r == -ENOENT ? false : r;
+}
+
+/* Returns true only if 'a' is a valid (non-negative) action and the action recorded for 'dev' is
+ * the same one. A device whose action cannot be queried never matches. */
+bool device_for_action(sd_device *dev, sd_device_action_t a) {
+        sd_device_action_t current;
+
+        assert(dev);
+
+        /* Short-circuiting keeps the device from being queried at all for an invalid 'a'. */
+        return a >= 0 &&
+                sd_device_get_action(dev, &current) >= 0 &&
+                current == a;
+}
+
+/* Logs a debug line describing a uevent: sequence number, action and, if there is one, the trigger
+ * UUID. 'str' is an optional prefix. Nothing is queried or logged unless debug logging is on. */
+void log_device_uevent(sd_device *device, const char *str) {
+        if (!DEBUG_LOGGING)
+                return;
+
+        /* Each field keeps its neutral default if the corresponding getter fails. */
+        uint64_t seqnum = 0;
+        sd_device_action_t action = _SD_DEVICE_ACTION_INVALID;
+        sd_id128_t trigger_uuid = SD_ID128_NULL;
+
+        (void) sd_device_get_seqnum(device, &seqnum);
+        (void) sd_device_get_action(device, &action);
+        (void) sd_device_get_trigger_uuid(device, &trigger_uuid);
+
+        bool have_uuid = !sd_id128_is_null(trigger_uuid);
+
+        log_device_debug(device, "%s%s(SEQNUM=%"PRIu64", ACTION=%s%s%s)",
+                         strempty(str), isempty(str) ? "" : " ",
+                         seqnum, strna(device_action_to_string(action)),
+                         have_uuid ? ", UUID=" : "",
+                         have_uuid ? SD_ID128_TO_UUID_STRING(trigger_uuid) : "");
+}
+
+/* Copies 'str' to 'to', dropping leading and trailing whitespace and collapsing every inner run of
+ * whitespace into a single '_'. Returns the number of bytes stored in 'to', not counting the
+ * terminating NUL. */
+size_t udev_replace_whitespace(const char *str, char *to, size_t len) {
+        bool is_space = false;
+        size_t i, j;
+
+        assert(str);
+        assert(to);
+
+        /* Copy from 'str' to 'to', while removing all leading and trailing whitespace, and replacing
+         * each run of consecutive whitespace with a single underscore. The chars from 'str' are copied
+         * up to the \0 at the end of the string, or at most 'len' chars. This appends \0 to 'to', at
+         * the end of the copied characters.
+         *
+         * If 'len' chars are copied into 'to', the final \0 is placed at len+1 (i.e. 'to[len] = \0'),
+         * so the 'to' buffer must have at least len+1 chars available.
+         *
+         * Note this may be called with 'str' == 'to', i.e. to replace whitespace in-place in a buffer.
+         * This function can handle that situation.
+         *
+         * Note that only 'len' characters are read from 'str'. */
+
+        /* Skip the leading whitespace with an explicitly bounded scan. strspn() would keep reading
+         * beyond 'len' if the first 'len' bytes are all whitespace, which violates the promise made
+         * above. The result is the same: if the scan hits 'len' the copy loop below does not run. */
+        for (i = 0; i < len && str[i] != '\0' && strchr(WHITESPACE, str[i]); i++)
+                ;
+
+        for (j = 0; j < len && i < len && str[i] != '\0'; i++) {
+                /* The <ctype.h> functions are only defined for EOF and values representable as unsigned
+                 * char. A plain (possibly signed) char holding a byte >= 0x80 must hence be cast. */
+                if (isspace((unsigned char) str[i])) {
+                        is_space = true;
+                        continue;
+                }
+
+                if (is_space) {
+                        /* Room is needed for both the '_' and the character following it. */
+                        if (j + 1 >= len)
+                                break;
+
+                        to[j++] = '_';
+                        is_space = false;
+                }
+                to[j++] = str[i];
+        }
+
+        to[j] = '\0';
+        return j;
+}
+
+/* Sanitizes 'str' in place so that it only consists of characters acceptable in device node names.
+ *
+ * Left untouched are: characters accepted by allow_listed_char_for_devnode() (which is passed the
+ * optional 'allow' list), "\x" hex escape prefixes, and valid multi-byte UTF-8 sequences. If
+ * 'allow' contains a space, any other whitespace character is turned into a plain space. All
+ * remaining bytes are replaced by '_'.
+ *
+ * Returns the number of bytes that were replaced. */
+size_t udev_replace_chars(char *str, const char *allow) {
+        size_t i = 0, replaced = 0;
+
+        assert(str);
+
+        /* allow chars in allow list, plain ascii, hex-escaping and valid utf8. */
+
+        while (str[i] != '\0') {
+                int len;
+
+                if (allow_listed_char_for_devnode(str[i], allow)) {
+                        i++;
+                        continue;
+                }
+
+                /* accept hex encoding */
+                if (str[i] == '\\' && str[i+1] == 'x') {
+                        i += 2;
+                        continue;
+                }
+
+                /* accept valid utf8 */
+                len = utf8_encoded_valid_unichar(str + i, SIZE_MAX);
+                if (len > 1) {
+                        i += len;
+                        continue;
+                }
+
+                /* if space is allowed, replace whitespace with ordinary space.
+                 *
+                 * Bytes reaching this point may well be >= 0x80 (they were just rejected as UTF-8), and
+                 * isspace() is only defined for EOF and unsigned char values, hence the cast. */
+                if (isspace((unsigned char) str[i]) && allow && strchr(allow, ' ')) {
+                        str[i] = ' ';
+                        i++;
+                        replaced++;
+                        continue;
+                }
+
+                /* everything else is replaced with '_' */
+                str[i] = '_';
+                i++;
+                replaced++;
+        }
+        return replaced;
+}
+
+/* Checks for /run/udev/queue. Returns true if the file does not exist, false if it exists, and a
+ * negative errno value if access() failed for any reason other than ENOENT. */
+int udev_queue_is_empty(void) {
+        if (access("/run/udev/queue", F_OK) >= 0)
+                return false;
+
+        if (errno == ENOENT)
+                return true;
+
+        return -errno;
+}
+
+/* Tells whether udev can be expected to be around. The answer is determined on the first call and
+ * served from a static cache afterwards. */
+bool udev_available(void) {
+        static int cache = -1;
+
+        /* systemd-udevd is only started if /sys is writable (ConditionPathIsReadWrite=/sys in
+         * systemd-udevd.service), and our container interface
+         * (http://systemd.io/CONTAINER_INTERFACE/) requires /sys to be mounted read-only in
+         * containers. Hence a read-only /sys is taken as "no udev". Anything else, including a failed
+         * check, counts as "available". */
+
+        if (cache < 0)
+                cache = path_is_read_only_fs("/sys/") <= 0;
+
+        return cache;
+}
+
+/* Looks up a vendor string for the device: ID_VENDOR_FROM_DATABASE is preferred, ID_VENDOR is the
+ * fallback. Only a missing property (-ENOENT) triggers the fallback; any other result of the first
+ * lookup, success or error, is returned right away. Returns -ENOENT if neither property is set. */
+int device_get_vendor_string(sd_device *device, const char **ret) {
+        int r;
+
+        assert(device);
+
+        r = sd_device_get_property_value(device, "ID_VENDOR_FROM_DATABASE", ret);
+        if (r != -ENOENT)
+                return r;
+
+        return sd_device_get_property_value(device, "ID_VENDOR", ret);
+}
+
+/* Looks up a model string for the device: ID_MODEL_FROM_DATABASE is preferred, ID_MODEL is the
+ * fallback. Only a missing property (-ENOENT) triggers the fallback; any other result of the first
+ * lookup, success or error, is returned right away. Returns -ENOENT if neither property is set. */
+int device_get_model_string(sd_device *device, const char **ret) {
+        int r;
+
+        assert(device);
+
+        r = sd_device_get_property_value(device, "ID_MODEL_FROM_DATABASE", ret);
+        if (r != -ENOENT)
+                return r;
+
+        return sd_device_get_property_value(device, "ID_MODEL", ret);
+}
+
+/* Returns the value of property 'prop' in '*ret'. The device itself is asked first; only if it does
+ * not have the property, 'extra_props' is consulted with 'prop' as key.
+ *
+ * Returns 1 on success, -ENOENT if the property is found in neither place, and any other negative
+ * errno-style value the device lookup reported. '*ret' is only written on success. */
+int device_get_property_value_with_fallback(
+                sd_device *device,
+                const char *prop,
+                Hashmap *extra_props,
+                const char **ret) {
+
+        const char *v;
+        int r;
+
+        assert(device);
+        assert(prop);
+        assert(ret);
+
+        r = sd_device_get_property_value(device, prop, &v);
+        if (r == -ENOENT) {
+                v = hashmap_get(extra_props, prop);
+                if (!v)
+                        return -ENOENT;
+        } else if (r < 0)
+                return r;
+
+        *ret = v;
+        return 1;
+}
diff --git a/src/shared/udev-util.h b/src/shared/udev-util.h
new file mode 100644
index 0000000..651d335
--- /dev/null
+++ b/src/shared/udev-util.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-device.h"
+
+#include "hashmap.h"
+#include "time-util.h"
+
+int udev_set_max_log_level(char *str);
+int udev_parse_config(void);
+/* Both wait functions return 0 once the device is initialized and, if 'ret' is non-NULL, store a
+ * device reference in '*ret'; on failure a negative errno-style value is returned. */
+int device_wait_for_initialization(sd_device *device, const char *subsystem, usec_t timeout_usec, sd_device **ret);
+int device_wait_for_devlink(const char *path, const char *subsystem, usec_t timeout_usec, sd_device **ret);
+int device_is_renaming(sd_device *dev);
+/* True only if 'action' is non-negative and equal to the action reported for 'dev'. */
+bool device_for_action(sd_device *dev, sd_device_action_t action);
+/* Logs SEQNUM, ACTION and (if set) the trigger UUID at debug level; does nothing unless debug logging is enabled. */
+void log_device_uevent(sd_device *device, const char *str);
+/* udev_replace_whitespace(): 'to' needs room for len+1 bytes; returns the number of bytes stored, not
+ * counting the trailing NUL. udev_replace_chars(): edits 'str' in place, returns the number of replaced bytes. */
+size_t udev_replace_whitespace(const char *str, char *to, size_t len);
+size_t udev_replace_chars(char *str, const char *allow);
+/* True if /run/udev/queue does not exist, false if it does, negative errno on other access() failures. */
+int udev_queue_is_empty(void);
+/* False only if /sys/ is found to be a read-only file system; the result is cached after the first call. */
+bool udev_available(void);
+/* Both prefer the *_FROM_DATABASE property and fall back to ID_VENDOR resp. ID_MODEL; -ENOENT if neither is set. */
+int device_get_vendor_string(sd_device *device, const char **ret);
+int device_get_model_string(sd_device *device, const char **ret);
+/* Returns 1 and sets '*ret' if 'prop' is found on the device or, failing that, in 'extra_props'; -ENOENT if in neither. */
+int device_get_property_value_with_fallback(
+ sd_device *device,
+ const char *prop,
+ Hashmap *extra_props,
+ const char **ret);
diff --git a/src/shared/user-record-nss.c b/src/shared/user-record-nss.c
new file mode 100644
index 0000000..414a493
--- /dev/null
+++ b/src/shared/user-record-nss.c
@@ -0,0 +1,529 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "errno-util.h"
+#include "format-util.h"
+#include "libcrypt-util.h"
+#include "strv.h"
+#include "user-record-nss.h"
+#include "user-util.h"
+#include "utf8.h"
+
+/* Assigns 'value' to 'field' if 'condition' holds, and 'fallback' otherwise. 'value' is only
+ * evaluated when 'condition' is true, which allows it to dereference a pointer that 'condition'
+ * has checked for NULL. */
+#define SET_IF(field, condition, value, fallback) \
+        (field) = ((condition) ? (value) : (fallback))
+
+/* Passes 's' through if it is a valid UTF-8 string; returns NULL for NULL or invalid input. */
+static const char* utf8_only(const char *s) {
+        if (!s || !utf8_is_valid(s))
+                return NULL;
+
+        return s;
+}
+
+/* Appends those entries of 'src' that are valid UTF-8 to '*dst', silently skipping all others.
+ * 'filter_duplicates' is passed on to strv_extend_strv(). Returns 0 without touching '*dst' if no
+ * entry qualifies, -ENOMEM if the temporary array cannot be allocated, and the result of
+ * strv_extend_strv() otherwise. */
+static int strv_extend_strv_utf8_only(char ***dst, char **src, bool filter_duplicates) {
+        _cleanup_free_ char **valid = NULL;
+        size_t n_src, n_valid = 0;
+
+        /* Build a shallow copy of 'src' that holds only the valid strings. The strings themselves
+         * still belong to 'src', which is why only the array is freed on return. */
+        n_src = strv_length(src);
+        valid = new(char*, n_src + 1);
+        if (!valid)
+                return -ENOMEM;
+
+        for (size_t i = 0; i < n_src; i++) {
+                if (!utf8_is_valid(src[i]))
+                        continue;
+
+                valid[n_valid++] = src[i];
+        }
+
+        if (n_valid == 0)
+                return 0;
+
+        valid[n_valid] = NULL;
+        return strv_extend_strv(dst, valid, filter_duplicates);
+}
+
+int nss_passwd_to_user_record(
+ const struct passwd *pwd,
+ const struct spwd *spwd,
+ UserRecord **ret) {
+/* Builds a UserRecord from an NSS passwd entry and an optional shadow entry ('spwd' may be NULL).
+ * Returns 0 on success, handing the record over via '*ret' if 'ret' is non-NULL. Returns -EINVAL
+ * if the user name is empty or the shadow entry is for a different user name, -ENOMEM on
+ * allocation failure, or whatever error free_and_strdup()/json_build() report. */
+ _cleanup_(user_record_unrefp) UserRecord *hr = NULL;
+ int r;
+
+ assert(pwd);
+
+ if (isempty(pwd->pw_name))
+ return -EINVAL;
+
+ if (spwd && !streq_ptr(spwd->sp_namp, pwd->pw_name))
+ return -EINVAL;
+
+ hr = user_record_new();
+ if (!hr)
+ return -ENOMEM;
+
+ r = free_and_strdup(&hr->user_name, pwd->pw_name);
+ if (r < 0)
+ return r;
+
+ /* Some bad NSS modules synthesize GECOS fields with embedded ":" or "\n" characters, which are not
+ * something we can output in /etc/passwd compatible format, since these are record separators
+ * there. We normally refuse that, but we need to maintain compatibility with arbitrary NSS modules,
+ * hence let's do what glibc does: mangle the data to fit the format. */
+ if (isempty(pwd->pw_gecos) || streq_ptr(pwd->pw_gecos, hr->user_name)) /* a GECOS field equal to the user name carries no information */
+ hr->real_name = mfree(hr->real_name);
+ else if (valid_gecos(pwd->pw_gecos)) {
+ r = free_and_strdup(&hr->real_name, pwd->pw_gecos);
+ if (r < 0)
+ return r;
+ } else {
+ _cleanup_free_ char *mangled = NULL;
+
+ mangled = mangle_gecos(pwd->pw_gecos);
+ if (!mangled)
+ return -ENOMEM;
+
+ free_and_replace(hr->real_name, mangled);
+ }
+/* Home directory and shell are only taken over if non-empty and valid UTF-8; otherwise NULL is passed on. */
+ r = free_and_strdup(&hr->home_directory, utf8_only(empty_to_null(pwd->pw_dir)));
+ if (r < 0)
+ return r;
+
+ r = free_and_strdup(&hr->shell, utf8_only(empty_to_null(pwd->pw_shell)));
+ if (r < 0)
+ return r;
+
+ hr->uid = pwd->pw_uid;
+ hr->gid = pwd->pw_gid;
+
+ if (spwd &&
+ looks_like_hashed_password(utf8_only(spwd->sp_pwdp))) { /* Ignore locked, disabled, and mojibake passwords */
+ strv_free_erase(hr->hashed_password);
+ hr->hashed_password = strv_new(spwd->sp_pwdp);
+ if (!hr->hashed_password)
+ return -ENOMEM;
+ } else
+ hr->hashed_password = strv_free_erase(hr->hashed_password);
+
+ /* shadow-utils suggests using "chage -E 0" (or -E 1, depending on which man page you check)
+ * for locking a whole account, hence check for that. Note that it also defines a way to lock
+ * just a password instead of the whole account, but that's mostly pointless in times of
+ * password-less authorization, hence let's not bother. */
+
+ SET_IF(hr->locked,
+ spwd && spwd->sp_expire >= 0,
+ spwd->sp_expire <= 1, -1);
+/* The shadow fields below are multiplied by USEC_PER_DAY to obtain microseconds. Values that are not
+ * set (not positive) or too large for the multiplication become UINT64_MAX; the JSON code further
+ * down leaves such fields out. */
+ SET_IF(hr->not_after_usec,
+ spwd && spwd->sp_expire > 1 && (uint64_t) spwd->sp_expire < (UINT64_MAX-1)/USEC_PER_DAY,
+ spwd->sp_expire * USEC_PER_DAY, UINT64_MAX);
+
+ SET_IF(hr->password_change_now,
+ spwd && spwd->sp_lstchg >= 0,
+ spwd->sp_lstchg == 0, -1);
+
+ SET_IF(hr->last_password_change_usec,
+ spwd && spwd->sp_lstchg > 0 && (uint64_t) spwd->sp_lstchg <= (UINT64_MAX-1)/USEC_PER_DAY,
+ spwd->sp_lstchg * USEC_PER_DAY, UINT64_MAX);
+
+ SET_IF(hr->password_change_min_usec,
+ spwd && spwd->sp_min > 0 && (uint64_t) spwd->sp_min <= (UINT64_MAX-1)/USEC_PER_DAY,
+ spwd->sp_min * USEC_PER_DAY, UINT64_MAX);
+
+ SET_IF(hr->password_change_max_usec,
+ spwd && spwd->sp_max > 0 && (uint64_t) spwd->sp_max <= (UINT64_MAX-1)/USEC_PER_DAY,
+ spwd->sp_max * USEC_PER_DAY, UINT64_MAX);
+
+ SET_IF(hr->password_change_warn_usec,
+ spwd && spwd->sp_warn > 0 && (uint64_t) spwd->sp_warn <= (UINT64_MAX-1)/USEC_PER_DAY,
+ spwd->sp_warn * USEC_PER_DAY, UINT64_MAX);
+
+ SET_IF(hr->password_change_inactive_usec,
+ spwd && spwd->sp_inact > 0 && (uint64_t) spwd->sp_inact <= (UINT64_MAX-1)/USEC_PER_DAY,
+ spwd->sp_inact * USEC_PER_DAY, UINT64_MAX);
+/* Regenerate the JSON representation from the fields set above. Optional fields are only emitted if
+ * they carry a value (non-NULL, >= 0, or != UINT64_MAX, respectively). */
+ hr->json = json_variant_unref(hr->json);
+ r = json_build(&hr->json, JSON_BUILD_OBJECT(
+ JSON_BUILD_PAIR("userName", JSON_BUILD_STRING(hr->user_name)),
+ JSON_BUILD_PAIR("uid", JSON_BUILD_UNSIGNED(hr->uid)),
+ JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(hr->gid)),
+ JSON_BUILD_PAIR_CONDITION(hr->real_name, "realName", JSON_BUILD_STRING(hr->real_name)),
+ JSON_BUILD_PAIR_CONDITION(hr->home_directory, "homeDirectory", JSON_BUILD_STRING(hr->home_directory)),
+ JSON_BUILD_PAIR_CONDITION(hr->shell, "shell", JSON_BUILD_STRING(hr->shell)),
+ JSON_BUILD_PAIR_CONDITION(!strv_isempty(hr->hashed_password), "privileged", JSON_BUILD_OBJECT(JSON_BUILD_PAIR("hashedPassword", JSON_BUILD_STRV(hr->hashed_password)))),
+ JSON_BUILD_PAIR_CONDITION(hr->locked >= 0, "locked", JSON_BUILD_BOOLEAN(hr->locked)),
+ JSON_BUILD_PAIR_CONDITION(hr->not_after_usec != UINT64_MAX, "notAfterUSec", JSON_BUILD_UNSIGNED(hr->not_after_usec)),
+ JSON_BUILD_PAIR_CONDITION(hr->password_change_now >= 0, "passwordChangeNow", JSON_BUILD_BOOLEAN(hr->password_change_now)),
+ JSON_BUILD_PAIR_CONDITION(hr->last_password_change_usec != UINT64_MAX, "lastPasswordChangeUSec", JSON_BUILD_UNSIGNED(hr->last_password_change_usec)),
+ JSON_BUILD_PAIR_CONDITION(hr->password_change_min_usec != UINT64_MAX, "passwordChangeMinUSec", JSON_BUILD_UNSIGNED(hr->password_change_min_usec)),
+ JSON_BUILD_PAIR_CONDITION(hr->password_change_max_usec != UINT64_MAX, "passwordChangeMaxUSec", JSON_BUILD_UNSIGNED(hr->password_change_max_usec)),
+ JSON_BUILD_PAIR_CONDITION(hr->password_change_warn_usec != UINT64_MAX, "passwordChangeWarnUSec", JSON_BUILD_UNSIGNED(hr->password_change_warn_usec)),
+ JSON_BUILD_PAIR_CONDITION(hr->password_change_inactive_usec != UINT64_MAX, "passwordChangeInactiveUSec", JSON_BUILD_UNSIGNED(hr->password_change_inactive_usec))));
+
+ if (r < 0)
+ return r;
+/* The privileged part is only marked present if a hashed password was taken over from the shadow entry. */
+ hr->mask = USER_RECORD_REGULAR |
+ (!strv_isempty(hr->hashed_password) ? USER_RECORD_PRIVILEGED : 0);
+
+ if (ret)
+ *ret = TAKE_PTR(hr);
+ return 0;
+}
+
+/* Looks up the shadow entry that belongs to the passwd entry 'pwd' (by user name).
+ *
+ * On success 0 is returned, the entry is copied to '*ret_spwd', and the buffer that was handed to
+ * getspnam_r() is passed to the caller via '*ret_buffer'; the caller has to free it. Returns -ESRCH
+ * if there is no such entry, -ENOMEM on allocation failure, -ERANGE if the buffer cannot be grown
+ * any further, and the negated error reported by getspnam_r() otherwise. */
+int nss_spwd_for_passwd(const struct passwd *pwd, struct spwd *ret_spwd, char **ret_buffer) {
+        int r;
+
+        assert(pwd);
+        assert(ret_spwd);
+        assert(ret_buffer);
+
+        /* Start out with 4K and double the buffer for as long as getspnam_r() asks for more. The
+         * buffer of a failed attempt is released automatically at the end of each iteration. */
+        for (size_t buflen = 4096;; buflen *= 2) {
+                _cleanup_free_ char *buf = malloc(buflen);
+                struct spwd spwd, *result;
+
+                if (!buf)
+                        return -ENOMEM;
+
+                r = getspnam_r(pwd->pw_name, &spwd, buf, buflen, &result);
+                if (r == 0) {
+                        if (!result)
+                                return -ESRCH;
+
+                        *ret_spwd = *result;
+                        *ret_buffer = TAKE_PTR(buf);
+                        return 0;
+                }
+                if (r < 0)
+                        return -EIO; /* Weird, this should not return negative! */
+                if (r != ERANGE)
+                        return -r;
+
+                if (buflen > SIZE_MAX / 2)
+                        return -ERANGE;
+        }
+}
+
+/* Looks up user 'name' via getpwnam_r() and converts the result into a UserRecord.
+ *
+ * If 'with_shadow' is true the shadow entry is looked up as well; a failure to do so is logged at
+ * debug level and otherwise ignored. The record is marked incomplete if shadow data was not
+ * requested, or if it could not be read because of missing privileges.
+ *
+ * Returns 0 on success, -ESRCH if there is no such user, or another negative errno-style value. */
+int nss_user_record_by_name(
+                const char *name,
+                bool with_shadow,
+                UserRecord **ret) {
+
+        _cleanup_free_ char *buf = NULL, *sbuf = NULL;
+        struct spwd spwd, *sresult = NULL;
+        struct passwd pwd, *result;
+        bool incomplete = false;
+        int r;
+
+        assert(name);
+
+        /* Retry with a doubled buffer for as long as the lookup reports ERANGE. */
+        for (size_t buflen = 4096;; buflen *= 2) {
+                buf = malloc(buflen);
+                if (!buf)
+                        return -ENOMEM;
+
+                r = getpwnam_r(name, &pwd, buf, buflen, &result);
+                if (r == 0)
+                        break;
+                if (r < 0)
+                        return log_debug_errno(SYNTHETIC_ERRNO(EIO), "getpwnam_r() returned a negative value");
+                if (r != ERANGE)
+                        return -r;
+                if (buflen > SIZE_MAX / 2)
+                        return -ERANGE;
+
+                buf = mfree(buf);
+        }
+
+        /* A successful call without result means: no such user. */
+        if (!result)
+                return -ESRCH;
+
+        if (!with_shadow)
+                incomplete = true;
+        else {
+                r = nss_spwd_for_passwd(result, &spwd, &sbuf);
+                if (r >= 0)
+                        sresult = &spwd;
+                else {
+                        log_debug_errno(r, "Failed to do shadow lookup for user %s, ignoring: %m", name);
+                        incomplete = ERRNO_IS_PRIVILEGE(r);
+                }
+        }
+
+        r = nss_passwd_to_user_record(result, sresult, ret);
+        if (r < 0)
+                return r;
+
+        if (ret)
+                (*ret)->incomplete = incomplete;
+        return 0;
+}
+
+/* Looks up the user with the given UID via getpwuid_r() and converts the result into a UserRecord.
+ *
+ * If 'with_shadow' is true the shadow entry is looked up as well; a failure to do so is logged at
+ * debug level and otherwise ignored. The record is marked incomplete if shadow data was not
+ * requested, or if it could not be read because of missing privileges.
+ *
+ * Returns 0 on success, -ESRCH if there is no such user, or another negative errno-style value. */
+int nss_user_record_by_uid(
+                uid_t uid,
+                bool with_shadow,
+                UserRecord **ret) {
+
+        _cleanup_free_ char *buf = NULL, *sbuf = NULL;
+        struct spwd spwd, *sresult = NULL;
+        struct passwd pwd, *result;
+        bool incomplete = false;
+        int r;
+
+        /* Retry with a doubled buffer for as long as the lookup reports ERANGE. */
+        for (size_t buflen = 4096;; buflen *= 2) {
+                buf = malloc(buflen);
+                if (!buf)
+                        return -ENOMEM;
+
+                r = getpwuid_r(uid, &pwd, buf, buflen, &result);
+                if (r == 0)
+                        break;
+                if (r < 0)
+                        return log_debug_errno(SYNTHETIC_ERRNO(EIO), "getpwuid_r() returned a negative value");
+                if (r != ERANGE)
+                        return -r;
+                if (buflen > SIZE_MAX / 2)
+                        return -ERANGE;
+
+                buf = mfree(buf);
+        }
+
+        /* A successful call without result means: no such user. */
+        if (!result)
+                return -ESRCH;
+
+        if (!with_shadow)
+                incomplete = true;
+        else {
+                r = nss_spwd_for_passwd(result, &spwd, &sbuf);
+                if (r >= 0)
+                        sresult = &spwd;
+                else {
+                        log_debug_errno(r, "Failed to do shadow lookup for UID " UID_FMT ", ignoring: %m", uid);
+                        incomplete = ERRNO_IS_PRIVILEGE(r);
+                }
+        }
+
+        r = nss_passwd_to_user_record(result, sresult, ret);
+        if (r < 0)
+                return r;
+
+        if (ret)
+                (*ret)->incomplete = incomplete;
+        return 0;
+}
+
+/* Builds a GroupRecord from an NSS group entry and an optional shadow group entry ('sgrp' may be
+ * NULL).
+ *
+ * Member and administrator names that are not valid UTF-8 are dropped. Members listed in the shadow
+ * entry are merged into the member list without creating duplicates. A password is only taken over
+ * from the shadow entry, and only if it looks like a hashed password.
+ *
+ * Returns 0 on success, handing the record over via '*ret' if 'ret' is non-NULL. Returns -EINVAL if
+ * the group name is empty or the shadow entry is for a different group name, -ENOMEM on allocation
+ * failure, or the error reported by json_build(). */
+int nss_group_to_group_record(
+                const struct group *grp,
+                const struct sgrp *sgrp,
+                GroupRecord **ret) {
+
+        _cleanup_(group_record_unrefp) GroupRecord *rec = NULL;
+        int r;
+
+        assert(grp);
+
+        if (isempty(grp->gr_name))
+                return -EINVAL;
+
+        /* The shadow entry must describe the very same group. */
+        if (sgrp && !streq_ptr(sgrp->sg_namp, grp->gr_name))
+                return -EINVAL;
+
+        rec = group_record_new();
+        if (!rec)
+                return -ENOMEM;
+
+        rec->group_name = strdup(grp->gr_name);
+        if (!rec->group_name)
+                return -ENOMEM;
+
+        r = strv_extend_strv_utf8_only(&rec->members, grp->gr_mem, false);
+        if (r < 0)
+                return r;
+
+        rec->gid = grp->gr_gid;
+
+        if (sgrp) {
+                if (looks_like_hashed_password(utf8_only(sgrp->sg_passwd))) {
+                        rec->hashed_password = strv_new(sgrp->sg_passwd);
+                        if (!rec->hashed_password)
+                                return -ENOMEM;
+                }
+
+                /* Merge, since these members may already be listed in the regular group entry. */
+                r = strv_extend_strv_utf8_only(&rec->members, sgrp->sg_mem, true);
+                if (r < 0)
+                        return r;
+
+                r = strv_extend_strv_utf8_only(&rec->administrators, sgrp->sg_adm, false);
+                if (r < 0)
+                        return r;
+        }
+
+        r = json_build(&rec->json, JSON_BUILD_OBJECT(
+                                       JSON_BUILD_PAIR("groupName", JSON_BUILD_STRING(rec->group_name)),
+                                       JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(rec->gid)),
+                                       JSON_BUILD_PAIR_CONDITION(!strv_isempty(rec->members), "members", JSON_BUILD_STRV(rec->members)),
+                                       JSON_BUILD_PAIR_CONDITION(!strv_isempty(rec->hashed_password), "privileged", JSON_BUILD_OBJECT(JSON_BUILD_PAIR("hashedPassword", JSON_BUILD_STRV(rec->hashed_password)))),
+                                       JSON_BUILD_PAIR_CONDITION(!strv_isempty(rec->administrators), "administrators", JSON_BUILD_STRV(rec->administrators))));
+        if (r < 0)
+                return r;
+
+        /* The privileged part only exists if a hashed password was taken over. */
+        rec->mask = USER_RECORD_REGULAR |
+                (!strv_isempty(rec->hashed_password) ? USER_RECORD_PRIVILEGED : 0);
+
+        if (ret)
+                *ret = TAKE_PTR(rec);
+        return 0;
+}
+
+/* Looks up the shadow group entry that belongs to the group entry 'grp' (by group name).
+ *
+ * On success 0 is returned, the entry is copied to '*ret_sgrp', and the buffer that was handed to
+ * getsgnam_r() is passed to the caller via '*ret_buffer'; the caller has to free it. Returns -ESRCH
+ * if there is no such entry, -ENOMEM on allocation failure, -ERANGE if the buffer cannot be grown
+ * any further, and the negated error reported by getsgnam_r() otherwise. */
+int nss_sgrp_for_group(const struct group *grp, struct sgrp *ret_sgrp, char **ret_buffer) {
+        int r;
+
+        assert(grp);
+        assert(ret_sgrp);
+        assert(ret_buffer);
+
+        /* Start out with 4K and double the buffer for as long as getsgnam_r() asks for more. The
+         * buffer of a failed attempt is released automatically at the end of each iteration. */
+        for (size_t buflen = 4096;; buflen *= 2) {
+                _cleanup_free_ char *buf = malloc(buflen);
+                struct sgrp sgrp, *result;
+
+                if (!buf)
+                        return -ENOMEM;
+
+                r = getsgnam_r(grp->gr_name, &sgrp, buf, buflen, &result);
+                if (r == 0) {
+                        if (!result)
+                                return -ESRCH;
+
+                        *ret_sgrp = *result;
+                        *ret_buffer = TAKE_PTR(buf);
+                        return 0;
+                }
+                if (r < 0)
+                        return -EIO; /* Weird, this should not return negative! */
+                if (r != ERANGE)
+                        return -r;
+
+                if (buflen > SIZE_MAX / 2)
+                        return -ERANGE;
+        }
+}
+
+/* Looks up group 'name' via getgrnam_r() and converts the result into a GroupRecord.
+ *
+ * If 'with_shadow' is true the shadow group entry is looked up as well; a failure to do so is
+ * logged at debug level and otherwise ignored. The record is marked incomplete if shadow data was
+ * not requested, or if it could not be read because of missing privileges.
+ *
+ * Returns 0 on success, -ESRCH if there is no such group, or another negative errno-style value. */
+int nss_group_record_by_name(
+                const char *name,
+                bool with_shadow,
+                GroupRecord **ret) {
+
+        _cleanup_free_ char *buf = NULL, *sbuf = NULL;
+        struct sgrp sgrp, *sresult = NULL;
+        struct group grp, *result;
+        bool incomplete = false;
+        int r;
+
+        assert(name);
+
+        /* Retry with a doubled buffer for as long as the lookup reports ERANGE. */
+        for (size_t buflen = 4096;; buflen *= 2) {
+                buf = malloc(buflen);
+                if (!buf)
+                        return -ENOMEM;
+
+                r = getgrnam_r(name, &grp, buf, buflen, &result);
+                if (r == 0)
+                        break;
+                if (r < 0)
+                        return log_debug_errno(SYNTHETIC_ERRNO(EIO), "getgrnam_r() returned a negative value");
+                if (r != ERANGE)
+                        return -r;
+                if (buflen > SIZE_MAX / 2)
+                        return -ERANGE;
+
+                buf = mfree(buf);
+        }
+
+        /* A successful call without result means: no such group. */
+        if (!result)
+                return -ESRCH;
+
+        if (!with_shadow)
+                incomplete = true;
+        else {
+                r = nss_sgrp_for_group(result, &sgrp, &sbuf);
+                if (r >= 0)
+                        sresult = &sgrp;
+                else {
+                        log_debug_errno(r, "Failed to do shadow lookup for group %s, ignoring: %m", result->gr_name);
+                        incomplete = ERRNO_IS_PRIVILEGE(r);
+                }
+        }
+
+        r = nss_group_to_group_record(result, sresult, ret);
+        if (r < 0)
+                return r;
+
+        if (ret)
+                (*ret)->incomplete = incomplete;
+        return 0;
+}
+
+/* Looks up the group with the given GID via getgrgid_r() and converts the result into a GroupRecord.
+ *
+ * If 'with_shadow' is true the shadow group entry is looked up as well; a failure to do so is
+ * logged at debug level and otherwise ignored. The record is marked incomplete if shadow data was
+ * not requested, or if it could not be read because of missing privileges.
+ *
+ * Returns 0 on success, -ESRCH if there is no such group, or another negative errno-style value. */
+int nss_group_record_by_gid(
+                gid_t gid,
+                bool with_shadow,
+                GroupRecord **ret) {
+
+        _cleanup_free_ char *buf = NULL, *sbuf = NULL;
+        struct sgrp sgrp, *sresult = NULL;
+        struct group grp, *result;
+        bool incomplete = false;
+        int r;
+
+        /* Retry with a doubled buffer for as long as the lookup reports ERANGE. */
+        for (size_t buflen = 4096;; buflen *= 2) {
+                buf = malloc(buflen);
+                if (!buf)
+                        return -ENOMEM;
+
+                r = getgrgid_r(gid, &grp, buf, buflen, &result);
+                if (r == 0)
+                        break;
+                if (r < 0)
+                        return log_debug_errno(SYNTHETIC_ERRNO(EIO), "getgrgid_r() returned a negative value");
+                if (r != ERANGE)
+                        return -r;
+                if (buflen > SIZE_MAX / 2)
+                        return -ERANGE;
+
+                buf = mfree(buf);
+        }
+
+        /* A successful call without result means: no such group. */
+        if (!result)
+                return -ESRCH;
+
+        if (!with_shadow)
+                incomplete = true;
+        else {
+                r = nss_sgrp_for_group(result, &sgrp, &sbuf);
+                if (r >= 0)
+                        sresult = &sgrp;
+                else {
+                        log_debug_errno(r, "Failed to do shadow lookup for group %s, ignoring: %m", result->gr_name);
+                        incomplete = ERRNO_IS_PRIVILEGE(r);
+                }
+        }
+
+        r = nss_group_to_group_record(result, sresult, ret);
+        if (r < 0)
+                return r;
+
+        if (ret)
+                (*ret)->incomplete = incomplete;
+        return 0;
+}
diff --git a/src/shared/user-record-nss.h b/src/shared/user-record-nss.h
new file mode 100644
index 0000000..22ab04d
--- /dev/null
+++ b/src/shared/user-record-nss.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <grp.h>
+#include <gshadow.h>
+#include <pwd.h>
+#include <shadow.h>
+
+#include "group-record.h"
+#include "user-record.h"
+
+/* Synthesize UserRecord and GroupRecord objects from NSS data */
+
+int nss_passwd_to_user_record(const struct passwd *pwd, const struct spwd *spwd, UserRecord **ret);
+int nss_spwd_for_passwd(const struct passwd *pwd, struct spwd *ret_spwd, char **ret_buffer);
+/* The two lookups below return -ESRCH if NSS knows no such user. The resulting record is marked
+ * 'incomplete' if shadow data was not requested or could not be read for lack of privileges. */
+int nss_user_record_by_name(const char *name, bool with_shadow, UserRecord **ret);
+int nss_user_record_by_uid(uid_t uid, bool with_shadow, UserRecord **ret);
+/* nss_sgrp_for_group() and nss_spwd_for_passwd() hand the lookup buffer to the caller via '*ret_buffer';
+ * the caller must free it. NOTE(review): the strings in the returned struct presumably point into
+ * that buffer, so keep it around for as long as the struct is used — confirm in getsgnam_r(3). */
+int nss_group_to_group_record(const struct group *grp, const struct sgrp *sgrp, GroupRecord **ret);
+int nss_sgrp_for_group(const struct group *grp, struct sgrp *ret_sgrp, char **ret_buffer);
+/* Group counterparts of the user lookups above, with the same -ESRCH and 'incomplete' semantics. */
+int nss_group_record_by_name(const char *name, bool with_shadow, GroupRecord **ret);
+int nss_group_record_by_gid(gid_t gid, bool with_shadow, GroupRecord **ret);
diff --git a/src/shared/user-record-show.c b/src/shared/user-record-show.c
new file mode 100644
index 0000000..28fa7a8
--- /dev/null
+++ b/src/shared/user-record-show.c
@@ -0,0 +1,601 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "cap-list.h"
+#include "format-util.h"
+#include "fs-util.h"
+#include "process-util.h"
+#include "rlimit-util.h"
+#include "strv.h"
+#include "terminal-util.h"
+#include "user-record-show.h"
+#include "user-util.h"
+#include "userdb.h"
+
+/* Returns the ANSI color sequence used to render the given user record state, or NULL if the state has no
+ * dedicated color. The state string must not be NULL. */
+const char *user_record_state_color(const char *state) {
+        if (streq(state, "active"))
+                return ansi_highlight_green();
+
+        if (STR_IN_SET(state, "locked", "dirty"))
+                return ansi_highlight_yellow();
+
+        if (STR_IN_SET(state, "unfixated", "absent"))
+                return ansi_grey();
+
+        return NULL;
+}
+
+/* Prints a human-readable, multi-line description of the user record to stdout, one "Label: value" line per
+ * property that is set.
+ *
+ * hr                   — the record to show.
+ * show_full_group_info — if true, the primary group name is resolved via groupdb_by_gid() and the
+ *                        auxiliary groups are listed via membershipdb_by_user(); otherwise only the
+ *                        numeric GID is shown.
+ *
+ * Lookup failures are reported inline in the output rather than returned to the caller. */
+void user_record_show(UserRecord *hr, bool show_full_group_info) {
+        const char *hd, *ip, *shell;
+        UserStorage storage;
+        usec_t t;
+        size_t k;
+        int r, b;
+
+        printf(" User name: %s\n",
+               user_record_user_name_and_realm(hr));
+
+        if (hr->state) {
+                const char *color;
+
+                color = user_record_state_color(hr->state);
+
+                printf(" State: %s%s%s\n",
+                       strempty(color), hr->state, color ? ansi_normal() : "");
+        }
+
+        printf(" Disposition: %s\n", user_disposition_to_string(user_record_disposition(hr)));
+
+        if (hr->last_change_usec != USEC_INFINITY) {
+                printf(" Last Change: %s\n", FORMAT_TIMESTAMP(hr->last_change_usec));
+
+                if (hr->last_change_usec > now(CLOCK_REALTIME))
+                        printf(" %sModification time lies in the future, system clock wrong?%s\n",
+                               ansi_highlight_yellow(), ansi_normal());
+        }
+
+        if (hr->last_password_change_usec != USEC_INFINITY &&
+            hr->last_password_change_usec != hr->last_change_usec)
+                printf(" Last Passw.: %s\n", FORMAT_TIMESTAMP(hr->last_password_change_usec));
+
+        /* Summarize whether logging in is currently possible */
+        r = user_record_test_blocked(hr);
+        switch (r) {
+
+        case -ENOLCK:
+                printf(" Login OK: %sno%s (record is locked)\n", ansi_highlight_red(), ansi_normal());
+                break;
+
+        case -EL2HLT:
+                printf(" Login OK: %sno%s (record not valid yet)\n", ansi_highlight_red(), ansi_normal());
+                break;
+
+        case -EL3HLT:
+                printf(" Login OK: %sno%s (record not valid anymore)\n", ansi_highlight_red(), ansi_normal());
+                break;
+
+        case -ESTALE:
+        default: {
+                usec_t y;
+
+                /* -ESTALE is not treated as blocking here; any other error is shown verbatim */
+                if (r < 0 && r != -ESTALE) {
+                        errno = -r;
+                        printf(" Login OK: %sno%s (%m)\n", ansi_highlight_red(), ansi_normal());
+                        break;
+                }
+
+                if (is_nologin_shell(user_record_shell(hr))) {
+                        printf(" Login OK: %sno%s (nologin shell)\n", ansi_highlight_red(), ansi_normal());
+                        break;
+                }
+
+                y = user_record_ratelimit_next_try(hr);
+                if (y != USEC_INFINITY && y > now(CLOCK_REALTIME)) {
+                        printf(" Login OK: %sno%s (ratelimit)\n", ansi_highlight_red(), ansi_normal());
+                        break;
+                }
+
+                printf(" Login OK: %syes%s\n", ansi_highlight_green(), ansi_normal());
+                break;
+        }}
+
+        /* Summarize the state of the password(s) */
+        r = user_record_test_password_change_required(hr);
+        switch (r) {
+
+        case -EKEYREVOKED:
+                printf(" Password OK: %schange now%s\n", ansi_highlight_yellow(), ansi_normal());
+                break;
+
+        case -EOWNERDEAD:
+                printf(" Password OK: %sexpired%s (change now!)\n", ansi_highlight_yellow(), ansi_normal());
+                break;
+
+        case -EKEYREJECTED:
+                printf(" Password OK: %sexpired%s (for good)\n", ansi_highlight_red(), ansi_normal());
+                break;
+
+        case -EKEYEXPIRED:
+                printf(" Password OK: %sexpires soon%s\n", ansi_highlight_yellow(), ansi_normal());
+                break;
+
+        case -ENETDOWN:
+                printf(" Password OK: %sno timestamp%s\n", ansi_highlight_red(), ansi_normal());
+                break;
+
+        case -EROFS:
+                printf(" Password OK: %schange not permitted%s\n", ansi_highlight_yellow(), ansi_normal());
+                break;
+
+        case -ESTALE:
+                printf(" Password OK: %slast password change in future%s\n", ansi_highlight_yellow(), ansi_normal());
+                break;
+
+        default:
+                if (r < 0) {
+                        errno = -r;
+                        printf(" Password OK: %sno%s (%m)\n", ansi_highlight_yellow(), ansi_normal());
+                        break;
+                }
+
+                if (strv_isempty(hr->hashed_password)) {
+                        if (hr->incomplete) /* Record might be incomplete, due to privs */
+                                break;
+                        printf(" Password OK: %sno%s (none set)\n", ansi_highlight(), ansi_normal());
+                        break;
+                }
+                if (strv_contains(hr->hashed_password, "")) {
+                        printf(" Password OK: %sno%s (empty set)\n", ansi_highlight_red(), ansi_normal());
+                        break;
+                }
+                bool has_valid_passwords = false;
+                STRV_FOREACH(p, hr->hashed_password)
+                        if (!hashed_password_is_locked_or_invalid(*p)) {
+                                has_valid_passwords = true;
+                                break;
+                        }
+                if (has_valid_passwords)
+                        printf(" Password OK: %syes%s\n", ansi_highlight_green(), ansi_normal());
+                else
+                        printf(" Password OK: %sno%s (locked)\n", ansi_highlight(), ansi_normal());
+        }
+        if (uid_is_valid(hr->uid))
+                printf(" UID: " UID_FMT "\n", hr->uid);
+        if (gid_is_valid(hr->gid)) {
+                if (show_full_group_info) {
+                        _cleanup_(group_record_unrefp) GroupRecord *gr = NULL;
+
+                        r = groupdb_by_gid(hr->gid, 0, &gr);
+                        if (r < 0) {
+                                errno = -r;
+                                printf(" GID: " GID_FMT " (unresolvable: %m)\n", hr->gid);
+                        } else
+                                printf(" GID: " GID_FMT " (%s)\n", hr->gid, gr->group_name);
+                } else
+                        printf(" GID: " GID_FMT "\n", hr->gid);
+        } else if (uid_is_valid(hr->uid)) /* Show UID as GID if not separately configured */
+                printf(" GID: " GID_FMT "\n", (gid_t) hr->uid);
+
+        if (show_full_group_info) {
+                _cleanup_(userdb_iterator_freep) UserDBIterator *iterator = NULL;
+
+                r = membershipdb_by_user(hr->user_name, 0, &iterator);
+                if (r < 0) {
+                        errno = -r;
+                        printf(" Aux. Groups: (can't acquire: %m)\n");
+                } else {
+                        const char *prefix = " Aux. Groups:";
+
+                        for (;;) {
+                                _cleanup_free_ char *group = NULL;
+
+                                r = membershipdb_iterator_get(iterator, NULL, &group);
+                                if (r == -ESRCH) /* end of iteration */
+                                        break;
+                                if (r < 0) {
+                                        errno = -r;
+                                        printf("%s (can't iterate: %m)\n", prefix);
+                                        break;
+                                }
+
+                                printf("%s %s\n", prefix, group);
+                                prefix = " ";
+                        }
+                }
+        }
+
+        if (hr->real_name && !streq(hr->real_name, hr->user_name))
+                printf(" Real Name: %s\n", hr->real_name);
+
+        hd = user_record_home_directory(hr);
+        if (hd)
+                printf(" Directory: %s\n", hd);
+
+        storage = user_record_storage(hr);
+        if (storage >= 0) /* Let's be political, and clarify which storage we like, and which we don't. About CIFS we don't complain. */
+                printf(" Storage: %s%s\n", user_storage_to_string(storage),
+                       storage == USER_LUKS ? " (strong encryption)" :
+                       storage == USER_FSCRYPT ? " (weak encryption)" :
+                       IN_SET(storage, USER_DIRECTORY, USER_SUBVOLUME) ? " (no encryption)" : "");
+
+        ip = user_record_image_path(hr);
+        if (ip && !streq_ptr(ip, hd))
+                printf(" Image Path: %s\n", ip);
+
+        b = user_record_removable(hr);
+        if (b >= 0)
+                printf(" Removable: %s\n", yes_no(b));
+
+        shell = user_record_shell(hr);
+        if (shell)
+                printf(" Shell: %s\n", shell);
+
+        if (hr->email_address)
+                printf(" Email: %s\n", hr->email_address);
+        if (hr->location)
+                printf(" Location: %s\n", hr->location);
+        if (hr->password_hint)
+                printf(" Passw. Hint: %s\n", hr->password_hint);
+        if (hr->icon_name)
+                printf(" Icon Name: %s\n", hr->icon_name);
+
+        if (hr->time_zone)
+                printf(" Time Zone: %s\n", hr->time_zone);
+
+        if (hr->preferred_language)
+                printf(" Language: %s\n", hr->preferred_language);
+
+        if (!strv_isempty(hr->environment))
+                STRV_FOREACH(i, hr->environment) {
+                        printf(i == hr->environment ?
+                               " Environment: %s\n" :
+                               " %s\n", *i);
+                }
+
+        if (hr->locked >= 0)
+                printf(" Locked: %s\n", yes_no(hr->locked));
+
+        if (hr->not_before_usec != UINT64_MAX)
+                printf(" Not Before: %s\n", FORMAT_TIMESTAMP(hr->not_before_usec));
+
+        if (hr->not_after_usec != UINT64_MAX)
+                printf(" Not After: %s\n", FORMAT_TIMESTAMP(hr->not_after_usec));
+
+        if (hr->umask != MODE_INVALID)
+                printf(" UMask: 0%03o\n", hr->umask);
+
+        if (nice_is_valid(hr->nice_level))
+                printf(" Nice: %i\n", hr->nice_level);
+
+        for (int j = 0; j < _RLIMIT_MAX; j++) {
+                if (hr->rlimits[j])
+                        printf(" Limit: RLIMIT_%s=%" PRIu64 ":%" PRIu64 "\n",
+                               rlimit_to_string(j), (uint64_t) hr->rlimits[j]->rlim_cur, (uint64_t) hr->rlimits[j]->rlim_max);
+        }
+
+        if (hr->tasks_max != UINT64_MAX)
+                printf(" Tasks Max: %" PRIu64 "\n", hr->tasks_max);
+
+        if (hr->memory_high != UINT64_MAX)
+                printf(" Memory High: %s\n", FORMAT_BYTES(hr->memory_high));
+
+        if (hr->memory_max != UINT64_MAX)
+                printf(" Memory Max: %s\n", FORMAT_BYTES(hr->memory_max));
+
+        if (hr->cpu_weight == CGROUP_WEIGHT_IDLE)
+                printf(" CPU Weight: %s\n", "idle");
+        else if (hr->cpu_weight != UINT64_MAX)
+                printf(" CPU Weight: %" PRIu64 "\n", hr->cpu_weight);
+
+        if (hr->io_weight != UINT64_MAX)
+                printf(" IO Weight: %" PRIu64 "\n", hr->io_weight);
+
+        if (hr->access_mode != MODE_INVALID)
+                printf(" Access Mode: 0%03o\n", user_record_access_mode(hr));
+
+        uint64_t caps = user_record_capability_bounding_set(hr);
+        if (caps != UINT64_MAX) {
+                _cleanup_free_ char *scaps = NULL;
+
+                (void) capability_set_to_string_negative(caps, &scaps);
+                printf(" Bound. Caps: %s\n", strna(scaps));
+        }
+
+        caps = user_record_capability_ambient_set(hr);
+        if (caps != UINT64_MAX) {
+                _cleanup_free_ char *scaps = NULL;
+
+                (void) capability_set_to_string(caps, &scaps);
+                printf("Ambient Caps: %s\n", strna(scaps));
+        }
+
+        if (storage == USER_LUKS) {
+                printf("LUKS Discard: online=%s offline=%s\n", yes_no(user_record_luks_discard(hr)), yes_no(user_record_luks_offline_discard(hr)));
+
+                if (!sd_id128_is_null(hr->luks_uuid))
+                        printf(" LUKS UUID: " SD_ID128_UUID_FORMAT_STR "\n", SD_ID128_FORMAT_VAL(hr->luks_uuid));
+                if (!sd_id128_is_null(hr->partition_uuid))
+                        printf(" Part UUID: " SD_ID128_UUID_FORMAT_STR "\n", SD_ID128_FORMAT_VAL(hr->partition_uuid));
+                if (!sd_id128_is_null(hr->file_system_uuid))
+                        printf(" FS UUID: " SD_ID128_UUID_FORMAT_STR "\n", SD_ID128_FORMAT_VAL(hr->file_system_uuid));
+
+                if (hr->file_system_type)
+                        printf(" File System: %s\n", user_record_file_system_type(hr));
+
+                if (hr->luks_extra_mount_options)
+                        printf("LUKS MntOpts: %s\n", hr->luks_extra_mount_options);
+
+                if (hr->luks_cipher)
+                        printf(" LUKS Cipher: %s\n", hr->luks_cipher);
+                if (hr->luks_cipher_mode)
+                        printf(" Cipher Mode: %s\n", hr->luks_cipher_mode);
+                if (hr->luks_volume_key_size != UINT64_MAX)
+                        printf(" Volume Key: %" PRIu64 "bit\n", hr->luks_volume_key_size * 8);
+
+                if (hr->luks_pbkdf_type)
+                        printf(" PBKDF Type: %s\n", hr->luks_pbkdf_type);
+                if (hr->luks_pbkdf_hash_algorithm)
+                        printf(" PBKDF Hash: %s\n", hr->luks_pbkdf_hash_algorithm);
+                if (hr->luks_pbkdf_force_iterations != UINT64_MAX)
+                        printf(" PBKDF Iters: %" PRIu64 "\n", hr->luks_pbkdf_force_iterations);
+                if (hr->luks_pbkdf_time_cost_usec != UINT64_MAX)
+                        printf(" PBKDF Time: %s\n", FORMAT_TIMESPAN(hr->luks_pbkdf_time_cost_usec, 0));
+                if (hr->luks_pbkdf_memory_cost != UINT64_MAX)
+                        printf(" PBKDF Bytes: %s\n", FORMAT_BYTES(hr->luks_pbkdf_memory_cost));
+
+                if (hr->luks_pbkdf_parallel_threads != UINT64_MAX)
+                        printf("PBKDF Thread: %" PRIu64 "\n", hr->luks_pbkdf_parallel_threads);
+                if (hr->luks_sector_size != UINT64_MAX)
+                        printf(" Sector Size: %" PRIu64 "\n", hr->luks_sector_size);
+
+        } else if (storage == USER_CIFS) {
+
+                if (hr->cifs_service)
+                        printf("CIFS Service: %s\n", hr->cifs_service);
+
+                if (hr->cifs_extra_mount_options)
+                        printf("CIFS MntOpts: %s\n", hr->cifs_extra_mount_options);
+        }
+
+        if (hr->cifs_user_name)
+                printf(" CIFS User: %s\n", user_record_cifs_user_name(hr));
+        if (hr->cifs_domain)
+                printf(" CIFS Domain: %s\n", hr->cifs_domain);
+
+        if (storage != USER_CLASSIC)
+                printf(" Mount Flags: %s %s %s\n",
+                       hr->nosuid ? "nosuid" : "suid",
+                       hr->nodev ? "nodev" : "dev",
+                       hr->noexec ? "noexec" : "exec");
+
+        if (hr->skeleton_directory)
+                printf(" Skel. Dir.: %s\n", user_record_skeleton_directory(hr));
+
+        if (hr->disk_size != UINT64_MAX)
+                printf(" Disk Size: %s\n", FORMAT_BYTES(hr->disk_size));
+
+        if (hr->disk_usage != UINT64_MAX) {
+                /* Only show a percentage if the total size is known and non-zero, so that we never divide by
+                 * zero. */
+                if (hr->disk_size != UINT64_MAX && hr->disk_size > 0) {
+                        unsigned permille;
+
+                        permille = (unsigned) DIV_ROUND_UP(hr->disk_usage * 1000U, hr->disk_size); /* Round up! */
+                        printf(" Disk Usage: %s (= %u.%01u%%)\n",
+                               FORMAT_BYTES(hr->disk_usage),
+                               permille / 10, permille % 10);
+                } else
+                        printf(" Disk Usage: %s\n", FORMAT_BYTES(hr->disk_usage));
+        }
+
+        if (hr->disk_free != UINT64_MAX) {
+                /* Same as above: no percentage if the total size is unknown or zero */
+                if (hr->disk_size != UINT64_MAX && hr->disk_size > 0) {
+                        const char *color_on, *color_off;
+                        unsigned permille;
+
+                        permille = (unsigned) ((hr->disk_free * 1000U) / hr->disk_size); /* Round down! */
+
+                        /* Color the output red or yellow if we are below 10% resp. 25% free. Because 10% and
+                         * 25% can be a lot of space still, let's additionally make some absolute
+                         * restrictions: 1G and 2G */
+                        if (permille <= 100U &&
+                            hr->disk_free < 1024U*1024U*1024U /* 1G */) {
+                                color_on = ansi_highlight_red();
+                                color_off = ansi_normal();
+                        } else if (permille <= 250U &&
+                                   hr->disk_free < 2U*1024U*1024U*1024U /* 2G */) {
+                                color_on = ansi_highlight_yellow();
+                                color_off = ansi_normal();
+                        } else
+                                color_on = color_off = "";
+
+                        printf(" Disk Free: %s%s (= %u.%01u%%)%s\n",
+                               color_on,
+                               FORMAT_BYTES(hr->disk_free),
+                               permille / 10, permille % 10,
+                               color_off);
+                } else
+                        printf(" Disk Free: %s\n", FORMAT_BYTES(hr->disk_free));
+        }
+
+        if (hr->disk_floor != UINT64_MAX)
+                printf(" Disk Floor: %s\n", FORMAT_BYTES(hr->disk_floor));
+
+        if (hr->disk_ceiling != UINT64_MAX)
+                printf("Disk Ceiling: %s\n", FORMAT_BYTES(hr->disk_ceiling));
+
+        if (hr->good_authentication_counter != UINT64_MAX)
+                printf(" Good Auth.: %" PRIu64 "\n", hr->good_authentication_counter);
+
+        if (hr->last_good_authentication_usec != UINT64_MAX)
+                printf(" Last Good: %s\n", FORMAT_TIMESTAMP(hr->last_good_authentication_usec));
+
+        if (hr->bad_authentication_counter != UINT64_MAX)
+                printf(" Bad Auth.: %" PRIu64 "\n", hr->bad_authentication_counter);
+
+        if (hr->last_bad_authentication_usec != UINT64_MAX)
+                printf(" Last Bad: %s\n", FORMAT_TIMESTAMP(hr->last_bad_authentication_usec));
+
+        t = user_record_ratelimit_next_try(hr);
+        if (t != USEC_INFINITY) {
+                usec_t n = now(CLOCK_REALTIME);
+
+                if (t <= n)
+                        printf(" Next Try: anytime\n");
+                else
+                        printf(" Next Try: %sin %s%s\n",
+                               ansi_highlight_red(),
+                               FORMAT_TIMESPAN(t - n, USEC_PER_SEC),
+                               ansi_normal());
+        }
+
+        if (storage != USER_CLASSIC)
+                printf(" Auth. Limit: %" PRIu64 " attempts per %s\n", user_record_ratelimit_burst(hr),
+                       FORMAT_TIMESPAN(user_record_ratelimit_interval_usec(hr), 0));
+
+        if (hr->enforce_password_policy >= 0)
+                printf(" Passwd Pol.: %s\n", yes_no(hr->enforce_password_policy));
+
+        if (hr->password_change_min_usec != UINT64_MAX ||
+            hr->password_change_max_usec != UINT64_MAX ||
+            hr->password_change_warn_usec != UINT64_MAX ||
+            hr->password_change_inactive_usec != UINT64_MAX) {
+
+                /* All configured password change intervals go on a single line */
+                printf(" Passwd Chg.:");
+
+                if (hr->password_change_min_usec != UINT64_MAX) {
+                        printf(" min %s", FORMAT_TIMESPAN(hr->password_change_min_usec, 0));
+
+                        if (hr->password_change_max_usec != UINT64_MAX)
+                                printf(" …");
+                }
+
+                if (hr->password_change_max_usec != UINT64_MAX)
+                        printf(" max %s", FORMAT_TIMESPAN(hr->password_change_max_usec, 0));
+
+                if (hr->password_change_warn_usec != UINT64_MAX)
+                        printf("/warn %s", FORMAT_TIMESPAN(hr->password_change_warn_usec, 0));
+
+                if (hr->password_change_inactive_usec != UINT64_MAX)
+                        printf("/inactive %s", FORMAT_TIMESPAN(hr->password_change_inactive_usec, 0));
+
+                printf("\n");
+        }
+
+        if (hr->password_change_now >= 0)
+                printf("Pas. Ch. Now: %s\n", yes_no(hr->password_change_now));
+
+        if (hr->drop_caches >= 0 || user_record_drop_caches(hr))
+                printf(" Drop Caches: %s\n", yes_no(user_record_drop_caches(hr)));
+
+        if (hr->auto_resize_mode >= 0)
+                printf(" Auto Resize: %s\n", auto_resize_mode_to_string(user_record_auto_resize_mode(hr)));
+
+        if (hr->rebalance_weight != REBALANCE_WEIGHT_UNSET) {
+                uint64_t rb;
+
+                rb = user_record_rebalance_weight(hr);
+                if (rb == REBALANCE_WEIGHT_OFF)
+                        printf(" Rebalance: off\n");
+                else
+                        printf(" Rebalance: weight %" PRIu64 "\n", rb);
+        }
+
+        if (!strv_isempty(hr->ssh_authorized_keys))
+                printf("SSH Pub. Key: %zu\n", strv_length(hr->ssh_authorized_keys));
+
+        if (!strv_isempty(hr->pkcs11_token_uri))
+                STRV_FOREACH(i, hr->pkcs11_token_uri)
+                        printf(i == hr->pkcs11_token_uri ?
+                               "PKCS11 Token: %s\n" :
+                               " %s\n", *i);
+
+        if (hr->n_fido2_hmac_credential > 0)
+                printf(" FIDO2 Token: %zu\n", hr->n_fido2_hmac_credential);
+
+        if (!strv_isempty(hr->recovery_key_type))
+                printf("Recovery Key: %zu\n", strv_length(hr->recovery_key_type));
+
+        k = strv_length(hr->hashed_password);
+        if (k == 0)
+                printf(" Passwords: %snone%s\n",
+                       user_record_disposition(hr) == USER_REGULAR ? ansi_highlight_yellow() : ansi_normal(), ansi_normal());
+        else
+                printf(" Passwords: %zu\n", k);
+
+        if (hr->signed_locally >= 0)
+                printf(" Local Sig.: %s\n", yes_no(hr->signed_locally));
+
+        if (hr->stop_delay_usec != UINT64_MAX)
+                printf(" Stop Delay: %s\n", FORMAT_TIMESPAN(hr->stop_delay_usec, 0));
+
+        if (hr->auto_login >= 0)
+                printf("Autom. Login: %s\n", yes_no(hr->auto_login));
+
+        if (hr->kill_processes >= 0)
+                printf(" Kill Proc.: %s\n", yes_no(hr->kill_processes));
+
+        if (hr->service)
+                printf(" Service: %s\n", hr->service);
+}
+
+/* Prints a human-readable, multi-line description of the group record to stdout.
+ *
+ * gr                  — the record to show.
+ * show_full_user_info — if true, the member list is acquired via membershipdb_by_group(); otherwise the
+ *                       members stored in the record itself are listed.
+ *
+ * Lookup failures are reported inline in the output rather than returned to the caller. */
+void group_record_show(GroupRecord *gr, bool show_full_user_info) {
+        int r;
+
+        printf(" Group name: %s\n",
+               group_record_group_name_and_realm(gr));
+
+        printf(" Disposition: %s\n", user_disposition_to_string(group_record_disposition(gr)));
+
+        if (gr->last_change_usec != USEC_INFINITY)
+                printf(" Last Change: %s\n", FORMAT_TIMESTAMP(gr->last_change_usec));
+
+        if (gid_is_valid(gr->gid))
+                printf(" GID: " GID_FMT "\n", gr->gid);
+
+        if (show_full_user_info) {
+                _cleanup_(userdb_iterator_freep) UserDBIterator *iterator = NULL;
+
+                r = membershipdb_by_group(gr->group_name, 0, &iterator);
+                if (r < 0) {
+                        errno = -r;
+                        /* Terminate the line, so that the following fields don't end up glued to it */
+                        printf(" Members: (can't acquire: %m)\n");
+                } else {
+                        const char *prefix = " Members:";
+
+                        for (;;) {
+                                _cleanup_free_ char *user = NULL;
+
+                                r = membershipdb_iterator_get(iterator, &user, NULL);
+                                if (r == -ESRCH) /* end of iteration */
+                                        break;
+                                if (r < 0) {
+                                        errno = -r;
+                                        printf("%s (can't iterate: %m)\n", prefix);
+                                        break;
+                                }
+
+                                printf("%s %s\n", prefix, user);
+                                prefix = " ";
+                        }
+                }
+        } else {
+                const char *prefix = " Members:";
+
+                STRV_FOREACH(i, gr->members) {
+                        printf("%s %s\n", prefix, *i);
+                        prefix = " ";
+                }
+        }
+
+        if (!strv_isempty(gr->administrators)) {
+                const char *prefix = " Admins:";
+
+                STRV_FOREACH(i, gr->administrators) {
+                        printf("%s %s\n", prefix, *i);
+                        prefix = " ";
+                }
+        }
+
+        if (gr->description && !streq(gr->description, gr->group_name))
+                printf(" Description: %s\n", gr->description);
+
+        if (!strv_isempty(gr->hashed_password))
+                printf(" Passwords: %zu\n", strv_length(gr->hashed_password));
+
+        if (gr->service)
+                printf(" Service: %s\n", gr->service);
+}
diff --git a/src/shared/user-record-show.h b/src/shared/user-record-show.h
new file mode 100644
index 0000000..dcef065
--- /dev/null
+++ b/src/shared/user-record-show.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "user-record.h"
+#include "group-record.h"
+
+/* Returns the ANSI color sequence to use when displaying the given record state, or NULL if the state has no
+ * dedicated color. */
+const char *user_record_state_color(const char *state);
+
+/* Print a human-readable description of a user or group record to stdout. If the boolean is true,
+ * additional group/membership information is looked up via the user database and shown as well. */
+void user_record_show(UserRecord *hr, bool show_full_group_info);
+void group_record_show(GroupRecord *gr, bool show_full_user_info);
diff --git a/src/shared/user-record.c b/src/shared/user-record.c
new file mode 100644
index 0000000..3fe3e80
--- /dev/null
+++ b/src/shared/user-record.c
@@ -0,0 +1,2319 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/mount.h>
+
+#include "cap-list.h"
+#include "cgroup-util.h"
+#include "dns-domain.h"
+#include "env-util.h"
+#include "fs-util.h"
+#include "glyph-util.h"
+#include "hexdecoct.h"
+#include "hostname-util.h"
+#include "memory-util.h"
+#include "path-util.h"
+#include "pkcs11-util.h"
+#include "rlimit-util.h"
+#include "string-table.h"
+#include "strv.h"
+#include "uid-alloc-range.h"
+#include "user-record.h"
+#include "user-util.h"
+
+#define DEFAULT_RATELIMIT_BURST 30
+#define DEFAULT_RATELIMIT_INTERVAL_USEC (1*USEC_PER_MINUTE)
+
+/* Allocates a new user record holding a single reference. Every field that has an explicit "unset" marker is
+ * initialized to it; all remaining fields are zero-initialized by the compound literal. Returns NULL if the
+ * allocation fails. */
+UserRecord* user_record_new(void) {
+        UserRecord *record = new(UserRecord, 1);
+
+        if (!record)
+                return NULL;
+
+        *record = (UserRecord) {
+                .n_ref = 1,
+
+                /* Identity, validity and general settings */
+                .disposition = _USER_DISPOSITION_INVALID,
+                .last_change_usec = UINT64_MAX,
+                .last_password_change_usec = UINT64_MAX,
+                .umask = MODE_INVALID,
+                .nice_level = INT_MAX,
+                .not_before_usec = UINT64_MAX,
+                .not_after_usec = UINT64_MAX,
+                .locked = -1,
+                .storage = _USER_STORAGE_INVALID,
+                .access_mode = MODE_INVALID,
+                .disk_size = UINT64_MAX,
+                .disk_size_relative = UINT64_MAX,
+
+                /* Resource control */
+                .tasks_max = UINT64_MAX,
+                .memory_high = UINT64_MAX,
+                .memory_max = UINT64_MAX,
+                .cpu_weight = UINT64_MAX,
+                .io_weight = UINT64_MAX,
+
+                .uid = UID_INVALID,
+                .gid = GID_INVALID,
+
+                /* Mount flags that are on unless configured otherwise */
+                .nodev = true,
+                .nosuid = true,
+
+                /* LUKS storage parameters */
+                .luks_discard = -1,
+                .luks_offline_discard = -1,
+                .luks_volume_key_size = UINT64_MAX,
+                .luks_pbkdf_force_iterations = UINT64_MAX,
+                .luks_pbkdf_time_cost_usec = UINT64_MAX,
+                .luks_pbkdf_memory_cost = UINT64_MAX,
+                .luks_pbkdf_parallel_threads = UINT64_MAX,
+                .luks_sector_size = UINT64_MAX,
+
+                /* Disk statistics */
+                .disk_usage = UINT64_MAX,
+                .disk_free = UINT64_MAX,
+                .disk_ceiling = UINT64_MAX,
+                .disk_floor = UINT64_MAX,
+
+                .signed_locally = -1,
+
+                /* Authentication counters and rate limiting */
+                .good_authentication_counter = UINT64_MAX,
+                .bad_authentication_counter = UINT64_MAX,
+                .last_good_authentication_usec = UINT64_MAX,
+                .last_bad_authentication_usec = UINT64_MAX,
+                .ratelimit_begin_usec = UINT64_MAX,
+                .ratelimit_count = UINT64_MAX,
+                .ratelimit_interval_usec = UINT64_MAX,
+                .ratelimit_burst = UINT64_MAX,
+
+                .removable = -1,
+                .enforce_password_policy = -1,
+                .auto_login = -1,
+                .stop_delay_usec = UINT64_MAX,
+                .kill_processes = -1,
+
+                /* Password aging */
+                .password_change_min_usec = UINT64_MAX,
+                .password_change_max_usec = UINT64_MAX,
+                .password_change_warn_usec = UINT64_MAX,
+                .password_change_inactive_usec = UINT64_MAX,
+                .password_change_now = -1,
+
+                /* Security token policy */
+                .pkcs11_protected_authentication_path_permitted = -1,
+                .fido2_user_presence_permitted = -1,
+                .fido2_user_verification_permitted = -1,
+
+                .drop_caches = -1,
+                .auto_resize_mode = _AUTO_RESIZE_MODE_INVALID,
+                .rebalance_weight = REBALANCE_WEIGHT_UNSET,
+        };
+
+        return record;
+}
+
+/* Releases the members of a PKCS#11 encrypted key entry (not the entry itself). The key data and hashed
+ * password go through erase_and_free() rather than plain free(). NULL is accepted and ignored. */
+static void pkcs11_encrypted_key_done(Pkcs11EncryptedKey *k) {
+        if (k) {
+                free(k->uri);
+                erase_and_free(k->data);
+                erase_and_free(k->hashed_password);
+        }
+}
+
+/* Releases the members of a FIDO2 HMAC credential entry (not the entry itself). NULL is accepted and
+ * ignored. */
+static void fido2_hmac_credential_done(Fido2HmacCredential *c) {
+        if (c)
+                free(c->id);
+}
+
+/* Releases the members of a FIDO2 HMAC salt entry (not the entry itself): the embedded credential, plus the
+ * salt and hashed password, which go through erase_and_free(). NULL is accepted and ignored. */
+static void fido2_hmac_salt_done(Fido2HmacSalt *s) {
+        if (s) {
+                fido2_hmac_credential_done(&s->credential);
+                erase_and_free(s->salt);
+                erase_and_free(s->hashed_password);
+        }
+}
+
+/* Releases the members of a recovery key entry (not the entry itself); the hashed password goes through
+ * erase_and_free(). NULL is accepted and ignored. */
+static void recovery_key_done(RecoveryKey *k) {
+        if (k) {
+                free(k->type);
+                erase_and_free(k->hashed_password);
+        }
+}
+
+/* Destroys a user record: releases every owned string, string list and array, drops the reference on the
+ * JSON variant, and finally frees the record itself. Fields that may carry secrets are released through the
+ * erasing variants of the free helpers. Accepts NULL. Always returns NULL, so that callers can reset their
+ * pointer in one go. */
+static UserRecord* user_record_free(UserRecord *h) {
+        if (!h)
+                return NULL;
+
+        free(h->user_name);
+        free(h->realm);
+        free(h->user_name_and_realm_auto);
+        free(h->real_name);
+        free(h->email_address);
+        erase_and_free(h->password_hint);
+        free(h->location);
+        free(h->icon_name);
+
+        free(h->shell);
+
+        strv_free(h->environment);
+        free(h->time_zone);
+        free(h->preferred_language);
+        rlimit_free_all(h->rlimits);
+
+        free(h->skeleton_directory);
+
+        strv_free_erase(h->hashed_password);
+        strv_free_erase(h->ssh_authorized_keys);
+        strv_free_erase(h->password);
+        strv_free_erase(h->token_pin);
+
+        free(h->cifs_service);
+        free(h->cifs_user_name);
+        free(h->cifs_domain);
+        free(h->cifs_extra_mount_options);
+
+        free(h->image_path);
+        free(h->image_path_auto);
+        free(h->home_directory);
+        free(h->home_directory_auto);
+
+        strv_free(h->member_of);
+        strv_free(h->capability_bounding_set);
+        strv_free(h->capability_ambient_set);
+
+        free(h->file_system_type);
+        free(h->luks_cipher);
+        free(h->luks_cipher_mode);
+        free(h->luks_pbkdf_hash_algorithm);
+        free(h->luks_pbkdf_type);
+        free(h->luks_extra_mount_options);
+
+        free(h->state);
+        free(h->service);
+
+        /* For each of the arrays below: first release what the individual entries own, then the array
+         * itself. */
+        strv_free(h->pkcs11_token_uri);
+        for (size_t i = 0; i < h->n_pkcs11_encrypted_key; i++)
+                pkcs11_encrypted_key_done(h->pkcs11_encrypted_key + i);
+        free(h->pkcs11_encrypted_key);
+
+        for (size_t i = 0; i < h->n_fido2_hmac_credential; i++)
+                fido2_hmac_credential_done(h->fido2_hmac_credential + i);
+        free(h->fido2_hmac_credential);
+
+        for (size_t i = 0; i < h->n_fido2_hmac_salt; i++)
+                fido2_hmac_salt_done(h->fido2_hmac_salt + i);
+        free(h->fido2_hmac_salt);
+
+        strv_free(h->recovery_key_type);
+        for (size_t i = 0; i < h->n_recovery_key; i++)
+                recovery_key_done(h->recovery_key + i);
+        free(h->recovery_key);
+
+        json_variant_unref(h->json);
+
+        return mfree(h);
+}
+
+DEFINE_TRIVIAL_REF_UNREF_FUNC(UserRecord, user_record, user_record_free);
+
+/* JSON dispatch callback for a realm field. userdata points to the char* to update. JSON null resets the
+ * string to NULL; otherwise the value must be a string that dns_name_is_valid() accepts, which is then
+ * copied. Returns 0 on success, a negative errno-style value otherwise. */
+int json_dispatch_realm(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        char **realm = userdata;
+        const char *domain;
+        int valid, q;
+
+        if (json_variant_is_null(variant)) {
+                *realm = mfree(*realm);
+                return 0;
+        }
+
+        if (!json_variant_is_string(variant))
+                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a string.", strna(name));
+
+        domain = json_variant_string(variant);
+
+        valid = dns_name_is_valid(domain);
+        if (valid < 0)
+                return json_log(variant, flags, valid, "Failed to check if JSON field '%s' is a valid DNS domain.", strna(name));
+        if (valid == 0)
+                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a valid DNS domain.", strna(name));
+
+        q = free_and_strdup(realm, domain);
+        if (q < 0)
+                return json_log(variant, flags, q, "Failed to allocate string: %m");
+
+        return 0;
+}
+
+/* JSON dispatch callback for a GECOS-style field. userdata points to the char* to update. JSON null resets
+ * the string to NULL. A string that valid_gecos() accepts is copied as-is; any other string is logged at
+ * debug level and stored in the form returned by mangle_gecos(). Returns 0 on success, a negative
+ * errno-style value otherwise. */
+int json_dispatch_gecos(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        char **gecos = userdata;
+        const char *text;
+
+        if (json_variant_is_null(variant)) {
+                *gecos = mfree(*gecos);
+                return 0;
+        }
+
+        if (!json_variant_is_string(variant))
+                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a string.", strna(name));
+
+        text = json_variant_string(variant);
+
+        if (!valid_gecos(text)) {
+                _cleanup_free_ char *mangled = NULL;
+
+                json_log(variant, flags|JSON_DEBUG, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a valid GECOS compatible string, mangling.", strna(name));
+
+                mangled = mangle_gecos(text);
+                if (!mangled)
+                        return json_log_oom(variant, flags);
+
+                free_and_replace(*gecos, mangled);
+                return 0;
+        }
+
+        if (free_and_strdup(gecos, text) < 0)
+                return json_log_oom(variant, flags);
+
+        return 0;
+}
+
+/* JSON dispatch callback for a nice level. userdata points to the int to update. JSON null resets the value
+ * to INT_MAX. Otherwise the value must be an integer in the range PRIO_MIN…PRIO_MAX-1. Returns 0 on
+ * success, a negative errno-style value otherwise. */
+static int json_dispatch_nice(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        int *nl = userdata;
+        int64_t m;
+
+        if (json_variant_is_null(variant)) {
+                *nl = INT_MAX;
+                return 0;
+        }
+
+        if (!json_variant_is_integer(variant))
+                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an integer.", strna(name));
+
+        m = json_variant_integer(variant);
+        if (m < PRIO_MIN || m >= PRIO_MAX)
+                return json_log(variant, flags, SYNTHETIC_ERRNO(ERANGE), "JSON field '%s' is not a valid nice level.", strna(name));
+
+        *nl = m;
+        return 0;
+}
+
+/* Parses one resource limit value ("cur" or "max") into the rlim_t that userdata points to. JSON null maps
+ * to RLIM_INFINITY. Otherwise the value must be an unsigned integer that fits into rlim_t without
+ * truncation and is not the RLIM_INFINITY bit pattern itself. Returns 0 on success, a negative errno-style
+ * value otherwise. */
+static int json_dispatch_rlimit_value(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        rlim_t *ret = userdata;
+        uint64_t w;
+
+        if (json_variant_is_null(variant)) {
+                *ret = RLIM_INFINITY;
+                return 0;
+        }
+
+        if (!json_variant_is_unsigned(variant))
+                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "Resource limit value '%s' is not an unsigned integer.", name);
+
+        w = json_variant_unsigned(variant);
+
+        /* Round-trip through rlim_t, so that truncation is detected where rlim_t is narrower than 64 bit */
+        if (w == RLIM_INFINITY || (uint64_t) (rlim_t) w != w)
+                return json_log(variant, flags, SYNTHETIC_ERRNO(ERANGE), "Resource limit value '%s' is out of range.", name);
+
+        *ret = (rlim_t) w;
+        return 0;
+}
+
+/* JSON dispatch callback for the resource limits object. userdata points to an array of struct rlimit
+ * pointers indexed by resource number. JSON null frees all entries. Otherwise the value must be an object
+ * whose keys are "RLIMIT_<name>" and whose values are objects consisting of exactly a "cur" and a "max"
+ * field. Entries are updated in place or allocated as needed. Returns 0 on success, a negative errno-style
+ * value otherwise; entries processed before a failing one stay modified. */
+static int json_dispatch_rlimits(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        struct rlimit** limits = userdata;
+        JsonVariant *value;
+        const char *key;
+        int r;
+
+        assert_se(limits);
+
+        if (json_variant_is_null(variant)) {
+                rlimit_free_all(limits);
+                return 0;
+        }
+
+        if (!json_variant_is_object(variant))
+                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an object.", strna(name));
+
+        JSON_VARIANT_OBJECT_FOREACH(key, value, variant) {
+                JsonVariant *jcur, *jmax;
+                struct rlimit rl;
+                const char *p;
+                int l;
+
+                p = startswith(key, "RLIMIT_");
+                if (!p)
+                        l = -SYNTHETIC_ERRNO(EINVAL);
+                else
+                        l = rlimit_from_string(p);
+                if (l < 0)
+                        return json_log(variant, flags, l, "Resource limit '%s' not known.", key);
+
+                if (!json_variant_is_object(value))
+                        return json_log(value, flags, SYNTHETIC_ERRNO(EINVAL), "Resource limit '%s' has invalid value.", key);
+
+                /* NOTE(review): two fields are expected, yet the count is compared against 4 — presumably
+                 * json_variant_elements() counts keys and values separately for objects; confirm. */
+                if (json_variant_elements(value) != 4)
+                        return json_log(value, flags, SYNTHETIC_ERRNO(EINVAL), "Resource limit '%s' value does not have two fields as expected.", key);
+
+                jcur = json_variant_by_key(value, "cur");
+                if (!jcur)
+                        return json_log(value, flags, SYNTHETIC_ERRNO(EINVAL), "Resource limit '%s' lacks 'cur' field.", key);
+                r = json_dispatch_rlimit_value("cur", jcur, flags, &rl.rlim_cur);
+                if (r < 0)
+                        return r;
+
+                jmax = json_variant_by_key(value, "max");
+                if (!jmax)
+                        return json_log(value, flags, SYNTHETIC_ERRNO(EINVAL), "Resource limit '%s' lacks 'max' field.", key);
+                r = json_dispatch_rlimit_value("max", jmax, flags, &rl.rlim_max);
+                if (r < 0)
+                        return r;
+
+                if (limits[l])
+                        *(limits[l]) = rl;
+                else {
+                        limits[l] = newdup(struct rlimit, &rl, 1);
+                        if (!limits[l])
+                                /* Report OOM through the JSON logging helper, so that the dispatch flags
+                                 * are honoured like for every other error in this function */
+                                return json_log_oom(variant, flags);
+                }
+        }
+
+        return 0;
+}
+
+/* JSON dispatch callback for a field that may hold either a plain file name or a normalized path. userdata
+ * points to the char* to update and must not be NULL. JSON null resets the string to NULL. Returns 0 on
+ * success, a negative errno-style value otherwise. */
+static int json_dispatch_filename_or_path(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        char **dest = ASSERT_PTR(userdata);
+        const char *text;
+        int q;
+
+        if (json_variant_is_null(variant)) {
+                *dest = mfree(*dest);
+                return 0;
+        }
+
+        if (!json_variant_is_string(variant))
+                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a string.", strna(name));
+
+        text = json_variant_string(variant);
+
+        /* Either form is fine; refuse only if the string is neither */
+        if (!(filename_is_valid(text) || path_is_normalized(text)))
+                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a valid file name or normalized path.", strna(name));
+
+        q = free_and_strdup(dest, text);
+        if (q < 0)
+                return json_log(variant, flags, q, "Failed to allocate string: %m");
+
+        return 0;
+}
+
+/* JSON dispatch callback for a file system path. userdata points to the char* to update. JSON null resets
+ * the string to NULL; otherwise the value must be a string holding a normalized, absolute path, which is
+ * then copied. Returns 0 on success, a negative errno-style value otherwise. */
+static int json_dispatch_path(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        char **dest = userdata;
+        const char *path;
+        int q;
+
+        if (json_variant_is_null(variant)) {
+                *dest = mfree(*dest);
+                return 0;
+        }
+
+        if (!json_variant_is_string(variant))
+                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a string.", strna(name));
+
+        path = json_variant_string(variant);
+
+        if (!path_is_normalized(path))
+                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a normalized file system path.", strna(name));
+
+        if (!path_is_absolute(path))
+                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an absolute file system path.", strna(name));
+
+        q = free_and_strdup(dest, path);
+        if (q < 0)
+                return json_log(variant, flags, q, "Failed to allocate string: %m");
+
+        return 0;
+}
+
+/* JSON dispatch callback for a home directory path. userdata points to the char* to update. JSON null
+ * resets the string to NULL; otherwise the value must be a string that valid_home() accepts, which is then
+ * copied. Returns 0 on success, a negative errno-style value otherwise. */
+static int json_dispatch_home_directory(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        char **dest = userdata;
+        const char *home;
+        int q;
+
+        if (json_variant_is_null(variant)) {
+                *dest = mfree(*dest);
+                return 0;
+        }
+
+        if (!json_variant_is_string(variant))
+                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a string.", strna(name));
+
+        home = json_variant_string(variant);
+
+        if (!valid_home(home))
+                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a valid home directory path.", strna(name));
+
+        q = free_and_strdup(dest, home);
+        if (q < 0)
+                return json_log(variant, flags, q, "Failed to allocate string: %m");
+
+        return 0;
+}
+
+/* JSON dispatch callback for an image path. userdata points to the char* to update. JSON null resets the
+ * string to NULL; otherwise the value must be a string holding a valid, absolute path that is neither empty
+ * nor the root directory, which is then copied. Returns 0 on success, a negative errno-style value
+ * otherwise. */
+static int json_dispatch_image_path(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+        char **dest = userdata;
+        const char *image;
+        bool acceptable;
+        int q;
+
+        if (json_variant_is_null(variant)) {
+                *dest = mfree(*dest);
+                return 0;
+        }
+
+        if (!json_variant_is_string(variant))
+                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a string.", strna(name));
+
+        image = json_variant_string(variant);
+
+        acceptable = !empty_or_root(image) && path_is_valid(image) && path_is_absolute(image);
+        if (!acceptable)
+                return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a valid image path.", strna(name));
+
+        q = free_and_strdup(dest, image);
+        if (q < 0)
+                return json_log(variant, flags, q, "Failed to allocate string: %m");
+
+        return 0;
+}
+
+ /* Dispatch callback for umask fields. Null maps to MODE_INVALID; otherwise an unsigned number no
+  * larger than 0777 is expected and stored in the mode_t userdata points to. */
+ static int json_dispatch_umask(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+         mode_t *mode = userdata;
+         uint64_t value;
+
+         if (json_variant_is_null(variant)) {
+                 *mode = MODE_INVALID;
+                 return 0;
+         }
+
+         if (!json_variant_is_unsigned(variant))
+                 return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a number.", strna(name));
+
+         value = json_variant_unsigned(variant);
+         if (value <= 0777) {
+                 *mode = (mode_t) value;
+                 return 0;
+         }
+
+         return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL),
+                         "JSON field '%s' outside of valid range 0%s0777.",
+                         strna(name), special_glyph(SPECIAL_GLYPH_ELLIPSIS));
+ }
+
+ /* Dispatch callback for access mode fields. Null yields MODE_INVALID, an unsigned number up to 07777
+  * is stored as-is, anything else is refused with -EINVAL. */
+ static int json_dispatch_access_mode(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+         mode_t *out = userdata;
+
+         if (json_variant_is_null(variant)) {
+                 *out = MODE_INVALID;
+                 return 0;
+         }
+
+         if (json_variant_is_unsigned(variant)) {
+                 uint64_t bits = json_variant_unsigned(variant);
+
+                 if (bits > 07777)
+                         return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL),
+                                         "JSON field '%s' outside of valid range 0%s07777.",
+                                         strna(name), special_glyph(SPECIAL_GLYPH_ELLIPSIS));
+
+                 *out = (mode_t) bits;
+                 return 0;
+         }
+
+         return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a number.", strna(name));
+ }
+
+ /* Dispatch callback for environment blocks. Null frees the destination list. Otherwise the value has
+  * to be an array of strings that each pass env_assignment_is_valid(); they are collected through
+  * strv_env_replace_strdup() into a temporary list which replaces the destination only after the
+  * whole array has been accepted. */
+ static int json_dispatch_environment(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+         _cleanup_strv_free_ char **env = NULL;
+         char ***dest = userdata;
+         JsonVariant *entry;
+
+         if (json_variant_is_null(variant)) {
+                 *dest = strv_free(*dest);
+                 return 0;
+         }
+
+         if (!json_variant_is_array(variant))
+                 return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an array.", strna(name));
+
+         JSON_VARIANT_ARRAY_FOREACH(entry, variant) {
+                 const char *assignment;
+
+                 if (!json_variant_is_string(entry))
+                         return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an array of strings.", strna(name));
+
+                 assert_se(assignment = json_variant_string(entry));
+
+                 if (!env_assignment_is_valid(assignment))
+                         return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an array of environment variables.", strna(name));
+
+                 if (strv_env_replace_strdup(&env, assignment) < 0)
+                         return json_log_oom(variant, flags);
+         }
+
+         return strv_free_and_replace(*dest, env);
+ }
+
+ /* Dispatch callback for user disposition fields. Null stores _USER_DISPOSITION_INVALID; a string is
+  * translated with user_disposition_from_string() and its negative result is passed on as the error. */
+ int json_dispatch_user_disposition(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+         UserDisposition *out = userdata, parsed;
+         const char *text;
+
+         if (json_variant_is_null(variant)) {
+                 *out = _USER_DISPOSITION_INVALID;
+                 return 0;
+         }
+
+         if (!json_variant_is_string(variant))
+                 return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a string.", strna(name));
+
+         text = json_variant_string(variant);
+         parsed = user_disposition_from_string(text);
+         if (parsed < 0)
+                 return json_log(variant, flags, parsed, "Disposition type '%s' not known.", text);
+
+         *out = parsed;
+         return 0;
+ }
+
+ /* Dispatch callback for storage type fields. Null resets the destination to _USER_STORAGE_INVALID;
+  * strings go through user_storage_from_string(), whose negative result becomes the reported error. */
+ static int json_dispatch_storage(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+         UserStorage *out = userdata, parsed;
+
+         if (json_variant_is_null(variant)) {
+                 *out = _USER_STORAGE_INVALID;
+                 return 0;
+         }
+
+         if (json_variant_is_string(variant)) {
+                 const char *text = json_variant_string(variant);
+
+                 parsed = user_storage_from_string(text);
+                 if (parsed < 0)
+                         return json_log(variant, flags, parsed, "Storage type '%s' not known.", text);
+
+                 *out = parsed;
+                 return 0;
+         }
+
+         return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a string.", strna(name));
+ }
+
+ /* Dispatch callback shared by the tasksMax/memoryHigh/memoryMax style limits. Null stores UINT64_MAX;
+  * otherwise an unsigned value is required, and both 0 and UINT64_MAX are refused with -ERANGE. */
+ static int json_dispatch_tasks_or_memory_max(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+         uint64_t *limit = userdata;
+         uint64_t value;
+
+         if (json_variant_is_null(variant)) {
+                 *limit = UINT64_MAX;
+                 return 0;
+         }
+
+         if (!json_variant_is_unsigned(variant))
+                 return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an integer.", strna(name));
+
+         value = json_variant_unsigned(variant);
+
+         /* For an unsigned 64-bit value only the two extremes fall outside 1…UINT64_MAX-1. */
+         if (value == 0 || value == UINT64_MAX)
+                 return json_log(variant, flags, SYNTHETIC_ERRNO(ERANGE),
+                                 "JSON field '%s' is not in valid range %" PRIu64 "%s%" PRIu64 ".",
+                                 strna(name), (uint64_t) 1, special_glyph(SPECIAL_GLYPH_ELLIPSIS), UINT64_MAX-1);
+
+         *limit = value;
+         return 0;
+ }
+
+ /* Dispatch callback for cgroup weight fields (used for "cpuWeight" and "ioWeight").
+  *
+  * Null stores UINT64_MAX in the uint64_t userdata points to. Otherwise the value must be an unsigned
+  * integer within CGROUP_WEIGHT_MIN…CGROUP_WEIGHT_MAX, both bounds included, matching the range the
+  * error message advertises. Returns 0 on success, -EINVAL for a non-integer and -ERANGE for a value
+  * outside the range. */
+ static int json_dispatch_weight(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+         uint64_t *weight = userdata, k;
+
+         if (json_variant_is_null(variant)) {
+                 *weight = UINT64_MAX;
+                 return 0;
+         }
+
+         if (!json_variant_is_unsigned(variant))
+                 return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an integer.", strna(name));
+
+         k = json_variant_unsigned(variant);
+
+         /* The bounds themselves are valid weights; only values strictly outside are refused. */
+         if (k < CGROUP_WEIGHT_MIN || k > CGROUP_WEIGHT_MAX)
+                 return json_log(variant, flags, SYNTHETIC_ERRNO(ERANGE),
+                                 "JSON field '%s' is not in valid range %" PRIu64 "%s%" PRIu64 ".",
+                                 strna(name), (uint64_t) CGROUP_WEIGHT_MIN,
+                                 special_glyph(SPECIAL_GLYPH_ELLIPSIS), (uint64_t) CGROUP_WEIGHT_MAX);
+
+         *weight = k;
+         return 0;
+ }
+
+ /* Dispatch callback for lists of user/group names. Every array element must be a string accepted by
+  * valid_user_group_name() (relaxed checking if JSON_RELAX is set in flags). The names are gathered in
+  * a temporary list first and then merged into the list at userdata via strv_extend_strv(). */
+ int json_dispatch_user_group_list(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+         _cleanup_strv_free_ char **collected = NULL;
+         char ***dest = userdata;
+         JsonVariant *entry;
+         int r;
+
+         if (!json_variant_is_array(variant))
+                 return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an array of strings.", strna(name));
+
+         JSON_VARIANT_ARRAY_FOREACH(entry, variant) {
+                 const char *member;
+
+                 if (!json_variant_is_string(entry))
+                         return json_log(entry, flags, SYNTHETIC_ERRNO(EINVAL), "JSON array element is not a string.");
+
+                 member = json_variant_string(entry);
+                 if (!valid_user_group_name(member, FLAGS_SET(flags, JSON_RELAX) ? VALID_USER_RELAX : 0))
+                         return json_log(entry, flags, SYNTHETIC_ERRNO(EINVAL), "JSON array element is not a valid user/group name: %s", member);
+
+                 r = strv_extend(&collected, member);
+                 if (r < 0)
+                         return json_log(entry, flags, r, "Failed to append array element: %m");
+         }
+
+         r = strv_extend_strv(dest, collected, true);
+         return r < 0 ? json_log(variant, flags, r, "Failed to merge user/group arrays: %m") : 0;
+ }
+
+ /* Dispatch callback that parses the given JSON variant with secret_dispatch_table below. The
+  * offsets in the table are relative to UserRecord, and userdata is handed through to
+  * json_dispatch() unchanged. Returns whatever json_dispatch() returns. */
+ static int dispatch_secret(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+
+ static const JsonDispatch secret_dispatch_table[] = {
+ { "password", _JSON_VARIANT_TYPE_INVALID, json_dispatch_strv, offsetof(UserRecord, password), 0 },
+ /* "tokenPin" and "pkcs11Pin" write to the same UserRecord field (token_pin). */
+ { "tokenPin", _JSON_VARIANT_TYPE_INVALID, json_dispatch_strv, offsetof(UserRecord, token_pin), 0 },
+ { "pkcs11Pin", /* legacy alias */ _JSON_VARIANT_TYPE_INVALID, json_dispatch_strv, offsetof(UserRecord, token_pin), 0 },
+ { "pkcs11ProtectedAuthenticationPathPermitted", JSON_VARIANT_BOOLEAN, json_dispatch_tristate, offsetof(UserRecord, pkcs11_protected_authentication_path_permitted), 0 },
+ { "fido2UserPresencePermitted", JSON_VARIANT_BOOLEAN, json_dispatch_tristate, offsetof(UserRecord, fido2_user_presence_permitted), 0 },
+ { "fido2UserVerificationPermitted", JSON_VARIANT_BOOLEAN, json_dispatch_tristate, offsetof(UserRecord, fido2_user_verification_permitted), 0 },
+ {},
+ };
+
+ return json_dispatch(variant, secret_dispatch_table, flags, userdata);
+ }
+
+ /* Dispatch callback for a single PKCS#11 URI. Null frees the destination string; a string is copied
+  * into it if pkcs11_uri_valid() approves of it. */
+ static int dispatch_pkcs11_uri(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+         char **dest = userdata;
+         const char *uri;
+         int r;
+
+         if (json_variant_is_null(variant)) {
+                 *dest = mfree(*dest);
+                 return 0;
+         }
+
+         if (!json_variant_is_string(variant))
+                 return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a string.", strna(name));
+
+         uri = json_variant_string(variant);
+         if (!pkcs11_uri_valid(uri))
+                 return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a valid RFC7512 PKCS#11 URI.", strna(name));
+
+         r = free_and_strdup(dest, uri);
+         return r < 0 ? json_log(variant, flags, r, "Failed to allocate string: %m") : 0;
+ }
+
+ /* Dispatch callback for "pkcs11TokenUri": accepts null, a single URI string, or an array of URI
+  * strings.
+  *
+  * Null frees the string list userdata points to. A single string is turned into a one-element list.
+  * Every URI has to pass pkcs11_uri_valid(). The destination list is replaced only after the complete
+  * input has been accepted. Returns 0 on success, -EINVAL on malformed input, and the result of
+  * json_log_oom() on allocation failure, so that out-of-memory logging follows the dispatch flags
+  * like all other messages emitted here. */
+ static int dispatch_pkcs11_uri_array(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+         _cleanup_strv_free_ char **z = NULL;
+         char ***l = userdata;
+         JsonVariant *e;
+         int r;
+
+         if (json_variant_is_null(variant)) {
+                 *l = strv_free(*l);
+                 return 0;
+         }
+
+         if (json_variant_is_string(variant)) {
+                 const char *n;
+
+                 n = json_variant_string(variant);
+                 if (!pkcs11_uri_valid(n))
+                         return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a valid RFC7512 PKCS#11 URI.", strna(name));
+
+                 z = strv_new(n);
+                 if (!z)
+                         return json_log_oom(variant, flags);
+
+         } else {
+
+                 if (!json_variant_is_array(variant))
+                         return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a string or array of strings.", strna(name));
+
+                 JSON_VARIANT_ARRAY_FOREACH(e, variant) {
+                         const char *n;
+
+                         if (!json_variant_is_string(e))
+                                 return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), "JSON array element is not a string.");
+
+                         n = json_variant_string(e);
+                         if (!pkcs11_uri_valid(n))
+                                 return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON array element in '%s' is not a valid RFC7512 PKCS#11 URI: %s", strna(name), n);
+
+                         r = strv_extend(&z, n);
+                         if (r < 0)
+                                 return json_log_oom(variant, flags);
+                 }
+         }
+
+         strv_free_and_replace(*l, z);
+         return 0;
+ }
+
+ /* Dispatch callback for the "data" member of a PKCS#11 encrypted key. Null erases and frees the
+  * stored blob; a string is decoded with unbase64mem() and replaces the old blob, which is erased
+  * before being freed. */
+ static int dispatch_pkcs11_key_data(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+         Pkcs11EncryptedKey *key = userdata;
+         size_t decoded_size;
+         void *decoded;
+         int r;
+
+         if (json_variant_is_null(variant)) {
+                 key->data = erase_and_free(key->data);
+                 key->size = 0;
+                 return 0;
+         }
+
+         if (!json_variant_is_string(variant))
+                 return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a string.", strna(name));
+
+         r = unbase64mem(json_variant_string(variant), SIZE_MAX, &decoded, &decoded_size);
+         if (r < 0)
+                 return json_log(variant, flags, r, "Failed to decode encrypted PKCS#11 key: %m");
+
+         /* Wipe the previous blob before dropping it, then take over the freshly decoded one. */
+         erase_and_free(key->data);
+         key->size = decoded_size;
+         key->data = decoded;
+
+         return 0;
+ }
+
+ /* Dispatch callback for "pkcs11EncryptedKey": an array of objects, each carrying the mandatory
+  * members "uri", "data" and "hashedPassword".
+  *
+  * For every element the UserRecord's pkcs11_encrypted_key array is grown by one slot, the slot is
+  * zero-initialized and filled through json_dispatch(). The element counter is bumped only after the
+  * slot was parsed successfully; on failure the half-filled slot is released with
+  * pkcs11_encrypted_key_done(). Allocation failures are reported through json_log_oom() so that they
+  * follow the dispatch flags like every other message here. */
+ static int dispatch_pkcs11_key(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+         UserRecord *h = userdata;
+         JsonVariant *e;
+         int r;
+
+         if (!json_variant_is_array(variant))
+                 return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an array of objects.", strna(name));
+
+         JSON_VARIANT_ARRAY_FOREACH(e, variant) {
+                 Pkcs11EncryptedKey *array, *k;
+
+                 static const JsonDispatch pkcs11_key_dispatch_table[] = {
+                         { "uri",            JSON_VARIANT_STRING, dispatch_pkcs11_uri,      offsetof(Pkcs11EncryptedKey, uri),             JSON_MANDATORY },
+                         { "data",           JSON_VARIANT_STRING, dispatch_pkcs11_key_data, 0,                                             JSON_MANDATORY },
+                         { "hashedPassword", JSON_VARIANT_STRING, json_dispatch_string,     offsetof(Pkcs11EncryptedKey, hashed_password), JSON_MANDATORY },
+                         {},
+                 };
+
+                 if (!json_variant_is_object(e))
+                         return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), "JSON array element is not an object.");
+
+                 array = reallocarray(h->pkcs11_encrypted_key, h->n_pkcs11_encrypted_key + 1, sizeof(Pkcs11EncryptedKey));
+                 if (!array)
+                         return json_log_oom(variant, flags);
+
+                 /* Store the (possibly moved) array right away, the old pointer must not be used anymore. */
+                 h->pkcs11_encrypted_key = array;
+                 k = h->pkcs11_encrypted_key + h->n_pkcs11_encrypted_key;
+                 *k = (Pkcs11EncryptedKey) {};
+
+                 r = json_dispatch(e, pkcs11_key_dispatch_table, flags, k);
+                 if (r < 0) {
+                         pkcs11_encrypted_key_done(k);
+                         return r;
+                 }
+
+                 h->n_pkcs11_encrypted_key++;
+         }
+
+         return 0;
+ }
+
+ /* Dispatch callback for a Base64 encoded FIDO2 credential ID. Null frees the stored ID and resets its
+  * size; a string is decoded with unbase64mem() and takes the place of the previous ID. */
+ static int dispatch_fido2_hmac_credential(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+         Fido2HmacCredential *cred = userdata;
+         size_t decoded_size;
+         void *decoded;
+         int r;
+
+         if (json_variant_is_null(variant)) {
+                 cred->id = mfree(cred->id);
+                 cred->size = 0;
+                 return 0;
+         }
+
+         if (!json_variant_is_string(variant))
+                 return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a string.", strna(name));
+
+         r = unbase64mem(json_variant_string(variant), SIZE_MAX, &decoded, &decoded_size);
+         if (r < 0)
+                 return json_log(variant, flags, r, "Failed to decode FIDO2 credential ID: %m");
+
+         cred->size = decoded_size;
+         free_and_replace(cred->id, decoded);
+
+         return 0;
+ }
+
+ /* Dispatch callback for "fido2HmacCredential": an array of Base64 encoded credential IDs that are
+  * appended to the UserRecord's fido2_hmac_credential array.
+  *
+  * The pointer returned by reallocarray() is stored in the record immediately. If it were kept only in
+  * a local variable, a later decoding failure would leave the record pointing at the old block, which
+  * reallocarray() may already have released, and would leak the new one. The element counter is
+  * advanced only once the decoded ID has been placed into the new slot. Allocation failures are
+  * reported through json_log_oom() so that they follow the dispatch flags. */
+ static int dispatch_fido2_hmac_credential_array(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+         UserRecord *h = userdata;
+         JsonVariant *e;
+         int r;
+
+         if (!json_variant_is_array(variant))
+                 return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an array of strings.", strna(name));
+
+         JSON_VARIANT_ARRAY_FOREACH(e, variant) {
+                 Fido2HmacCredential *array;
+                 size_t l;
+                 void *b;
+
+                 if (!json_variant_is_string(e))
+                         return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), "JSON array element is not a string.");
+
+                 array = reallocarray(h->fido2_hmac_credential, h->n_fido2_hmac_credential + 1, sizeof(Fido2HmacCredential));
+                 if (!array)
+                         return json_log_oom(variant, flags);
+
+                 /* Publish the new block before anything else can fail. The extra slot stays unused
+                  * (and uncounted) if decoding below does not succeed. */
+                 h->fido2_hmac_credential = array;
+
+                 r = unbase64mem(json_variant_string(e), SIZE_MAX, &b, &l);
+                 if (r < 0)
+                         return json_log(variant, flags, r, "Failed to decode FIDO2 credential ID: %m");
+
+                 h->fido2_hmac_credential[h->n_fido2_hmac_credential++] = (Fido2HmacCredential) {
+                         .id = b,
+                         .size = l,
+                 };
+         }
+
+         return 0;
+ }
+
+ /* Dispatch callback for the "salt" member of a FIDO2 HMAC salt object. Null erases and frees the
+  * stored salt; a string is decoded with unbase64mem() and replaces the old salt, which is erased
+  * before being freed. */
+ static int dispatch_fido2_hmac_salt_value(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+         Fido2HmacSalt *entry = userdata;
+         size_t decoded_size;
+         void *decoded;
+         int r;
+
+         if (json_variant_is_null(variant)) {
+                 entry->salt = erase_and_free(entry->salt);
+                 entry->salt_size = 0;
+                 return 0;
+         }
+
+         if (!json_variant_is_string(variant))
+                 return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a string.", strna(name));
+
+         r = unbase64mem(json_variant_string(variant), SIZE_MAX, &decoded, &decoded_size);
+         if (r < 0)
+                 return json_log(variant, flags, r, "Failed to decode FIDO2 salt: %m");
+
+         /* Wipe the previous salt before dropping it, then take over the freshly decoded one. */
+         erase_and_free(entry->salt);
+         entry->salt_size = decoded_size;
+         entry->salt = decoded;
+
+         return 0;
+ }
+
+ /* Dispatch callback for "fido2HmacSalt": an array of objects with the mandatory members
+  * "credential", "salt" and "hashedPassword" and the optional boolean members "up", "uv" and
+  * "clientPin".
+  *
+  * For every element the UserRecord's fido2_hmac_salt array is grown by one slot. The slot starts out
+  * with uv/up/client_pin set to -1 and is then filled through json_dispatch(). The element counter is
+  * bumped only after successful parsing; on failure the half-filled slot is released with
+  * fido2_hmac_salt_done(). Allocation failures are reported through json_log_oom() so that they
+  * follow the dispatch flags like every other message here. */
+ static int dispatch_fido2_hmac_salt(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+         UserRecord *h = userdata;
+         JsonVariant *e;
+         int r;
+
+         if (!json_variant_is_array(variant))
+                 return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an array of objects.", strna(name));
+
+         JSON_VARIANT_ARRAY_FOREACH(e, variant) {
+                 Fido2HmacSalt *array, *k;
+
+                 static const JsonDispatch fido2_hmac_salt_dispatch_table[] = {
+                         { "credential",     JSON_VARIANT_STRING,  dispatch_fido2_hmac_credential, offsetof(Fido2HmacSalt, credential),      JSON_MANDATORY },
+                         { "salt",           JSON_VARIANT_STRING,  dispatch_fido2_hmac_salt_value, 0,                                        JSON_MANDATORY },
+                         { "hashedPassword", JSON_VARIANT_STRING,  json_dispatch_string,           offsetof(Fido2HmacSalt, hashed_password), JSON_MANDATORY },
+                         { "up",             JSON_VARIANT_BOOLEAN, json_dispatch_tristate,         offsetof(Fido2HmacSalt, up),              0              },
+                         { "uv",             JSON_VARIANT_BOOLEAN, json_dispatch_tristate,         offsetof(Fido2HmacSalt, uv),              0              },
+                         { "clientPin",      JSON_VARIANT_BOOLEAN, json_dispatch_tristate,         offsetof(Fido2HmacSalt, client_pin),      0              },
+                         {},
+                 };
+
+                 if (!json_variant_is_object(e))
+                         return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), "JSON array element is not an object.");
+
+                 array = reallocarray(h->fido2_hmac_salt, h->n_fido2_hmac_salt + 1, sizeof(Fido2HmacSalt));
+                 if (!array)
+                         return json_log_oom(variant, flags);
+
+                 /* Store the (possibly moved) array right away, the old pointer must not be used anymore. */
+                 h->fido2_hmac_salt = array;
+                 k = h->fido2_hmac_salt + h->n_fido2_hmac_salt;
+                 *k = (Fido2HmacSalt) {
+                         .uv = -1,
+                         .up = -1,
+                         .client_pin = -1,
+                 };
+
+                 r = json_dispatch(e, fido2_hmac_salt_dispatch_table, flags, k);
+                 if (r < 0) {
+                         fido2_hmac_salt_done(k);
+                         return r;
+                 }
+
+                 h->n_fido2_hmac_salt++;
+         }
+
+         return 0;
+ }
+
+ /* Dispatch callback for "recoveryKey": an array of objects with the mandatory members "type" and
+  * "hashedPassword".
+  *
+  * For every element the UserRecord's recovery_key array is grown by one slot, the slot is
+  * zero-initialized and filled through json_dispatch(). The element counter is bumped only after
+  * successful parsing; on failure the half-filled slot is released with recovery_key_done().
+  * Allocation failures are reported through json_log_oom() so that they follow the dispatch flags
+  * like every other message here. */
+ static int dispatch_recovery_key(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+         UserRecord *h = userdata;
+         JsonVariant *e;
+         int r;
+
+         if (!json_variant_is_array(variant))
+                 return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an array of objects.", strna(name));
+
+         JSON_VARIANT_ARRAY_FOREACH(e, variant) {
+                 RecoveryKey *array, *k;
+
+                 static const JsonDispatch recovery_key_dispatch_table[] = {
+                         /* NOTE(review): offset 0 presumably addresses the leading "type" member of
+                          * RecoveryKey; confirm against the struct definition. */
+                         { "type",           JSON_VARIANT_STRING, json_dispatch_string, 0,                                      JSON_MANDATORY },
+                         { "hashedPassword", JSON_VARIANT_STRING, json_dispatch_string, offsetof(RecoveryKey, hashed_password), JSON_MANDATORY },
+                         {},
+                 };
+
+                 if (!json_variant_is_object(e))
+                         return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), "JSON array element is not an object.");
+
+                 array = reallocarray(h->recovery_key, h->n_recovery_key + 1, sizeof(RecoveryKey));
+                 if (!array)
+                         return json_log_oom(variant, flags);
+
+                 /* Store the (possibly moved) array right away, the old pointer must not be used anymore. */
+                 h->recovery_key = array;
+                 k = h->recovery_key + h->n_recovery_key;
+                 *k = (RecoveryKey) {};
+
+                 r = json_dispatch(e, recovery_key_dispatch_table, flags, k);
+                 if (r < 0) {
+                         recovery_key_done(k);
+                         return r;
+                 }
+
+                 h->n_recovery_key++;
+         }
+
+         return 0;
+ }
+
+ /* Dispatch callback for "autoResizeMode". Accepts null (stored as _AUTO_RESIZE_MODE_INVALID), a
+  * boolean (true selects AUTO_RESIZE_SHRINK_AND_GROW, false AUTO_RESIZE_OFF) or a string understood by
+  * auto_resize_mode_from_string(). */
+ static int dispatch_auto_resize_mode(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+         AutoResizeMode *mode = userdata, parsed;
+
+         assert_se(mode);
+
+         if (json_variant_is_null(variant))
+                 parsed = _AUTO_RESIZE_MODE_INVALID;
+         else if (json_variant_is_boolean(variant))
+                 parsed = json_variant_boolean(variant) ? AUTO_RESIZE_SHRINK_AND_GROW : AUTO_RESIZE_OFF;
+         else if (json_variant_is_string(variant)) {
+                 parsed = auto_resize_mode_from_string(json_variant_string(variant));
+                 if (parsed < 0)
+                         return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a valid automatic resize mode.", strna(name));
+         } else
+                 return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not a string, boolean or null.", strna(name));
+
+         *mode = parsed;
+         return 0;
+ }
+
+ /* Dispatch callback for "rebalanceWeight". Accepts null (REBALANCE_WEIGHT_UNSET), a boolean (true
+  * selects REBALANCE_WEIGHT_DEFAULT, false REBALANCE_WEIGHT_OFF) or an unsigned integer. Integers
+  * within REBALANCE_WEIGHT_MIN…REBALANCE_WEIGHT_MAX are stored verbatim, 0 is stored as
+  * REBALANCE_WEIGHT_OFF, and everything else is refused with -ERANGE. */
+ static int dispatch_rebalance_weight(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+         uint64_t *rebalance_weight = userdata;
+         uintmax_t requested;
+
+         assert_se(rebalance_weight);
+
+         if (json_variant_is_null(variant)) {
+                 *rebalance_weight = REBALANCE_WEIGHT_UNSET;
+                 return 0;
+         }
+
+         if (json_variant_is_boolean(variant)) {
+                 if (json_variant_boolean(variant))
+                         *rebalance_weight = REBALANCE_WEIGHT_DEFAULT;
+                 else
+                         *rebalance_weight = REBALANCE_WEIGHT_OFF;
+                 return 0;
+         }
+
+         if (!json_variant_is_unsigned(variant))
+                 return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an unsigned integer, boolean or null.", strna(name));
+
+         requested = json_variant_unsigned(variant);
+
+         /* The range check deliberately comes before the zero check. */
+         if (requested >= REBALANCE_WEIGHT_MIN && requested <= REBALANCE_WEIGHT_MAX) {
+                 *rebalance_weight = (uint64_t) requested;
+                 return 0;
+         }
+
+         if (requested == 0) {
+                 *rebalance_weight = REBALANCE_WEIGHT_OFF;
+                 return 0;
+         }
+
+         return json_log(variant, flags, SYNTHETIC_ERRNO(ERANGE),
+                         "Rebalance weight is out of valid range %" PRIu64 "%s%" PRIu64 ".",
+                         REBALANCE_WEIGHT_MIN, special_glyph(SPECIAL_GLYPH_ELLIPSIS), REBALANCE_WEIGHT_MAX);
+ }
+
+ /* Dispatch callback that parses the given JSON variant with privileged_dispatch_table below. Scalar
+  * and string-list fields are written at offsets relative to UserRecord; the three array fields are
+  * handled by dedicated callbacks that are registered with offset 0. userdata is handed through to
+  * json_dispatch() unchanged. Returns whatever json_dispatch() returns. */
+ static int dispatch_privileged(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+
+ static const JsonDispatch privileged_dispatch_table[] = {
+ { "passwordHint", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, password_hint), 0 },
+ { "hashedPassword", _JSON_VARIANT_TYPE_INVALID, json_dispatch_strv, offsetof(UserRecord, hashed_password), JSON_SAFE },
+ { "sshAuthorizedKeys", _JSON_VARIANT_TYPE_INVALID, json_dispatch_strv, offsetof(UserRecord, ssh_authorized_keys), 0 },
+ { "pkcs11EncryptedKey", JSON_VARIANT_ARRAY, dispatch_pkcs11_key, 0, 0 },
+ { "fido2HmacSalt", JSON_VARIANT_ARRAY, dispatch_fido2_hmac_salt, 0, 0 },
+ { "recoveryKey", JSON_VARIANT_ARRAY, dispatch_recovery_key, 0, 0 },
+ {},
+ };
+
+ return json_dispatch(variant, privileged_dispatch_table, flags, userdata);
+ }
+
+ /* Dispatch callback for the binding section. The section is an object keyed by machine ID; only the
+  * entry stored under the local machine's ID is parsed, into the UserRecord passed as userdata. A
+  * missing variant or a missing entry for this machine is not an error. */
+ static int dispatch_binding(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+
+         static const JsonDispatch binding_dispatch_table[] = {
+                 { "imagePath",         JSON_VARIANT_STRING,        json_dispatch_image_path,     offsetof(UserRecord, image_path),           0         },
+                 { "homeDirectory",     JSON_VARIANT_STRING,        json_dispatch_home_directory, offsetof(UserRecord, home_directory),       0         },
+                 { "partitionUuid",     JSON_VARIANT_STRING,        json_dispatch_id128,          offsetof(UserRecord, partition_uuid),       0         },
+                 { "luksUuid",          JSON_VARIANT_STRING,        json_dispatch_id128,          offsetof(UserRecord, luks_uuid),            0         },
+                 { "fileSystemUuid",    JSON_VARIANT_STRING,        json_dispatch_id128,          offsetof(UserRecord, file_system_uuid),     0         },
+                 { "uid",               JSON_VARIANT_UNSIGNED,      json_dispatch_uid_gid,        offsetof(UserRecord, uid),                  0         },
+                 { "gid",               JSON_VARIANT_UNSIGNED,      json_dispatch_uid_gid,        offsetof(UserRecord, gid),                  0         },
+                 { "storage",           JSON_VARIANT_STRING,        json_dispatch_storage,        offsetof(UserRecord, storage),              0         },
+                 { "fileSystemType",    JSON_VARIANT_STRING,        json_dispatch_string,         offsetof(UserRecord, file_system_type),     JSON_SAFE },
+                 { "luksCipher",        JSON_VARIANT_STRING,        json_dispatch_string,         offsetof(UserRecord, luks_cipher),          JSON_SAFE },
+                 { "luksCipherMode",    JSON_VARIANT_STRING,        json_dispatch_string,         offsetof(UserRecord, luks_cipher_mode),     JSON_SAFE },
+                 { "luksVolumeKeySize", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64,         offsetof(UserRecord, luks_volume_key_size), 0         },
+                 {},
+         };
+
+         sd_id128_t machine_id;
+         JsonVariant *section;
+         int r;
+
+         if (!variant)
+                 return 0;
+
+         if (!json_variant_is_object(variant))
+                 return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an object.", strna(name));
+
+         r = sd_id128_get_machine(&machine_id);
+         if (r < 0)
+                 return json_log(variant, flags, r, "Failed to determine machine ID: %m");
+
+         /* Look up the sub-object filed under our own machine ID; nothing to do if there is none. */
+         section = json_variant_by_key(variant, SD_ID128_TO_STRING(machine_id));
+         if (section)
+                 return json_dispatch(section, binding_dispatch_table, flags, userdata);
+
+         return 0;
+ }
+
+ /* Checks whether the given JSON value names the local machine ID. The value may be a single ID string
+  * or an array of ID strings. Malformed IDs and values of other types are logged and treated as "no
+  * match". Returns a positive value on match, 0 on no match, and a negative errno-style value if the
+  * local machine ID cannot be acquired. */
+ int per_machine_id_match(JsonVariant *ids, JsonDispatchFlags flags) {
+         sd_id128_t local_id, candidate;
+         JsonVariant *entry;
+         int r;
+
+         r = sd_id128_get_machine(&local_id);
+         if (r < 0)
+                 return json_log(ids, flags, r, "Failed to acquire machine ID: %m");
+
+         if (json_variant_is_string(ids)) {
+                 const char *text = json_variant_string(ids);
+
+                 r = sd_id128_from_string(text, &candidate);
+                 if (r >= 0)
+                         return sd_id128_equal(local_id, candidate);
+
+                 json_log(ids, flags, r, "%s is not a valid machine ID, ignoring: %m", text);
+                 return 0;
+         }
+
+         if (!json_variant_is_array(ids)) {
+                 json_log(ids, flags, 0, "Machine ID is not a string or array of strings, ignoring: %m");
+                 return false;
+         }
+
+         JSON_VARIANT_ARRAY_FOREACH(entry, ids) {
+
+                 if (!json_variant_is_string(entry)) {
+                         json_log(entry, flags, 0, "Machine ID is not a string, ignoring: %m");
+                         continue;
+                 }
+
+                 r = sd_id128_from_string(json_variant_string(entry), &candidate);
+                 if (r < 0) {
+                         json_log(entry, flags, r, "%s is not a valid machine ID, ignoring: %m", json_variant_string(entry));
+                         continue;
+                 }
+
+                 if (sd_id128_equal(local_id, candidate))
+                         return true;
+         }
+
+         return false;
+ }
+
+ /* Checks whether the given JSON value names the local hostname, as obtained from
+  * gethostname_strict(). The value may be a single hostname string or an array of hostname strings.
+  *
+  * Returns a positive value on match and 0 on no match. An unset hostname (-ENXIO) is logged and
+  * treated as "no match"; any other failure to obtain the hostname is returned as a negative
+  * errno-style value. Array elements that are not strings are logged and skipped, and a value that
+  * is neither a string nor an array is logged and treated as "no match". */
+ int per_machine_hostname_match(JsonVariant *hns, JsonDispatchFlags flags) {
+         _cleanup_free_ char *hn = NULL;
+         int r;
+
+         r = gethostname_strict(&hn);
+         if (r == -ENXIO) {
+                 json_log(hns, flags, r, "No hostname set, not matching perMachine hostname record: %m");
+                 return false;
+         }
+         if (r < 0)
+                 return json_log(hns, flags, r, "Failed to acquire hostname: %m");
+
+         if (json_variant_is_string(hns))
+                 return streq(json_variant_string(hns), hn);
+
+         if (json_variant_is_array(hns)) {
+                 JsonVariant *e;
+
+                 JSON_VARIANT_ARRAY_FOREACH(e, hns) {
+
+                         if (!json_variant_is_string(e)) {
+                                 json_log(e, flags, 0, "Hostname is not a string, ignoring: %m");
+                                 continue;
+                         }
+
+                         /* Compare the current element, not the enclosing array, which is not a
+                          * string at all. */
+                         if (streq(json_variant_string(e), hn))
+                                 return true;
+                 }
+
+                 return false;
+         }
+
+         json_log(hns, flags, 0, "Hostname is not a string or array of strings, ignoring: %m");
+         return false;
+ }
+
+ /* Dispatch callback for the per-machine section: an array of objects, each of which may carry
+  * "matchMachineId" and/or "matchHostname" plus a set of settings. Every object that matches the local
+  * machine is parsed with per_machine_dispatch_table into the object passed as userdata (table
+  * offsets are relative to UserRecord). Objects that do not match are skipped. A NULL variant is
+  * accepted and does nothing. Returns 0 on success or the first negative error encountered. */
+ static int dispatch_per_machine(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+
+ static const JsonDispatch per_machine_dispatch_table[] = {
+ /* The two match keys have no callback here; they are evaluated by hand in the loop below.
+  * NOTE(review): presumably they are listed so that json_dispatch() does not treat them as
+  * unknown fields; confirm against json_dispatch(). */
+ { "matchMachineId", _JSON_VARIANT_TYPE_INVALID, NULL, 0, 0 },
+ { "matchHostname", _JSON_VARIANT_TYPE_INVALID, NULL, 0, 0 },
+ { "iconName", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, icon_name), JSON_SAFE },
+ { "location", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, location), 0 },
+ { "shell", JSON_VARIANT_STRING, json_dispatch_filename_or_path, offsetof(UserRecord, shell), 0 },
+ { "umask", JSON_VARIANT_UNSIGNED, json_dispatch_umask, offsetof(UserRecord, umask), 0 },
+ { "environment", JSON_VARIANT_ARRAY, json_dispatch_environment, offsetof(UserRecord, environment), 0 },
+ { "timeZone", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, time_zone), JSON_SAFE },
+ { "preferredLanguage", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, preferred_language), JSON_SAFE },
+ { "niceLevel", _JSON_VARIANT_TYPE_INVALID, json_dispatch_nice, offsetof(UserRecord, nice_level), 0 },
+ { "resourceLimits", _JSON_VARIANT_TYPE_INVALID, json_dispatch_rlimits, offsetof(UserRecord, rlimits), 0 },
+ { "locked", JSON_VARIANT_BOOLEAN, json_dispatch_tristate, offsetof(UserRecord, locked), 0 },
+ { "notBeforeUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, not_before_usec), 0 },
+ { "notAfterUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, not_after_usec), 0 },
+ { "storage", JSON_VARIANT_STRING, json_dispatch_storage, offsetof(UserRecord, storage), 0 },
+ { "diskSize", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, disk_size), 0 },
+ { "diskSizeRelative", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, disk_size_relative), 0 },
+ { "skeletonDirectory", JSON_VARIANT_STRING, json_dispatch_path, offsetof(UserRecord, skeleton_directory), 0 },
+ { "accessMode", JSON_VARIANT_UNSIGNED, json_dispatch_access_mode, offsetof(UserRecord, access_mode), 0 },
+ { "tasksMax", JSON_VARIANT_UNSIGNED, json_dispatch_tasks_or_memory_max, offsetof(UserRecord, tasks_max), 0 },
+ { "memoryHigh", JSON_VARIANT_UNSIGNED, json_dispatch_tasks_or_memory_max, offsetof(UserRecord, memory_high), 0 },
+ { "memoryMax", JSON_VARIANT_UNSIGNED, json_dispatch_tasks_or_memory_max, offsetof(UserRecord, memory_max), 0 },
+ { "cpuWeight", JSON_VARIANT_UNSIGNED, json_dispatch_weight, offsetof(UserRecord, cpu_weight), 0 },
+ { "ioWeight", JSON_VARIANT_UNSIGNED, json_dispatch_weight, offsetof(UserRecord, io_weight), 0 },
+ { "mountNoDevices", JSON_VARIANT_BOOLEAN, json_dispatch_boolean, offsetof(UserRecord, nodev), 0 },
+ { "mountNoSuid", JSON_VARIANT_BOOLEAN, json_dispatch_boolean, offsetof(UserRecord, nosuid), 0 },
+ { "mountNoExecute", JSON_VARIANT_BOOLEAN, json_dispatch_boolean, offsetof(UserRecord, noexec), 0 },
+ { "cifsDomain", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, cifs_domain), JSON_SAFE },
+ { "cifsUserName", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, cifs_user_name), JSON_SAFE },
+ { "cifsService", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, cifs_service), JSON_SAFE },
+ { "cifsExtraMountOptions", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, cifs_extra_mount_options), 0 },
+ { "imagePath", JSON_VARIANT_STRING, json_dispatch_path, offsetof(UserRecord, image_path), 0 },
+ { "uid", JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid, offsetof(UserRecord, uid), 0 },
+ { "gid", JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid, offsetof(UserRecord, gid), 0 },
+ { "memberOf", JSON_VARIANT_ARRAY, json_dispatch_user_group_list, offsetof(UserRecord, member_of), JSON_RELAX},
+ { "capabilityBoundingSet", JSON_VARIANT_ARRAY, json_dispatch_strv, offsetof(UserRecord, capability_bounding_set), JSON_SAFE },
+ { "capabilityAmbientSet", JSON_VARIANT_ARRAY, json_dispatch_strv, offsetof(UserRecord, capability_ambient_set), JSON_SAFE },
+ { "fileSystemType", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, file_system_type), JSON_SAFE },
+ { "partitionUuid", JSON_VARIANT_STRING, json_dispatch_id128, offsetof(UserRecord, partition_uuid), 0 },
+ { "luksUuid", JSON_VARIANT_STRING, json_dispatch_id128, offsetof(UserRecord, luks_uuid), 0 },
+ { "fileSystemUuid", JSON_VARIANT_STRING, json_dispatch_id128, offsetof(UserRecord, file_system_uuid), 0 },
+ { "luksDiscard", _JSON_VARIANT_TYPE_INVALID, json_dispatch_tristate, offsetof(UserRecord, luks_discard), 0, },
+ { "luksOfflineDiscard", _JSON_VARIANT_TYPE_INVALID, json_dispatch_tristate, offsetof(UserRecord, luks_offline_discard), 0, },
+ { "luksCipher", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, luks_cipher), JSON_SAFE },
+ { "luksCipherMode", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, luks_cipher_mode), JSON_SAFE },
+ { "luksVolumeKeySize", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, luks_volume_key_size), 0 },
+ { "luksPbkdfHashAlgorithm", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, luks_pbkdf_hash_algorithm), JSON_SAFE },
+ { "luksPbkdfType", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, luks_pbkdf_type), JSON_SAFE },
+ { "luksPbkdfForceIterations", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, luks_pbkdf_force_iterations), 0 },
+ { "luksPbkdfTimeCostUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, luks_pbkdf_time_cost_usec), 0 },
+ { "luksPbkdfMemoryCost", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, luks_pbkdf_memory_cost), 0 },
+ { "luksPbkdfParallelThreads", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, luks_pbkdf_parallel_threads), 0 },
+ { "luksSectorSize", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, luks_sector_size), 0 },
+ { "luksExtraMountOptions", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, luks_extra_mount_options), 0 },
+ { "dropCaches", JSON_VARIANT_BOOLEAN, json_dispatch_tristate, offsetof(UserRecord, drop_caches), 0 },
+ { "autoResizeMode", _JSON_VARIANT_TYPE_INVALID, dispatch_auto_resize_mode, offsetof(UserRecord, auto_resize_mode), 0 },
+ { "rebalanceWeight", _JSON_VARIANT_TYPE_INVALID, dispatch_rebalance_weight, offsetof(UserRecord, rebalance_weight), 0 },
+ { "rateLimitIntervalUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, ratelimit_interval_usec), 0 },
+ { "rateLimitBurst", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, ratelimit_burst), 0 },
+ { "enforcePasswordPolicy", JSON_VARIANT_BOOLEAN, json_dispatch_tristate, offsetof(UserRecord, enforce_password_policy), 0 },
+ { "autoLogin", JSON_VARIANT_BOOLEAN, json_dispatch_tristate, offsetof(UserRecord, auto_login), 0 },
+ { "stopDelayUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, stop_delay_usec), 0 },
+ { "killProcesses", JSON_VARIANT_BOOLEAN, json_dispatch_tristate, offsetof(UserRecord, kill_processes), 0 },
+ { "passwordChangeMinUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, password_change_min_usec), 0 },
+ { "passwordChangeMaxUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, password_change_max_usec), 0 },
+ { "passwordChangeWarnUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, password_change_warn_usec), 0 },
+ { "passwordChangeInactiveUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, password_change_inactive_usec), 0 },
+ { "passwordChangeNow", JSON_VARIANT_BOOLEAN, json_dispatch_tristate, offsetof(UserRecord, password_change_now), 0 },
+ { "pkcs11TokenUri", JSON_VARIANT_ARRAY, dispatch_pkcs11_uri_array, offsetof(UserRecord, pkcs11_token_uri), 0 },
+ { "fido2HmacCredential", JSON_VARIANT_ARRAY, dispatch_fido2_hmac_credential_array, 0, 0 },
+ {},
+ };
+
+ JsonVariant *e;
+ int r;
+
+ if (!variant)
+ return 0;
+
+ if (!json_variant_is_array(variant))
+ return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an array.", strna(name));
+
+ JSON_VARIANT_ARRAY_FOREACH(e, variant) {
+ bool matching = false;
+ JsonVariant *m;
+
+ if (!json_variant_is_object(e))
+ return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an array of objects.", strna(name));
+
+ /* First try to match by machine ID. */
+ m = json_variant_by_key(e, "matchMachineId");
+ if (m) {
+ r = per_machine_id_match(m, flags);
+ if (r < 0)
+ return r;
+
+ matching = r > 0;
+ }
+
+ /* The hostname is consulted only if the machine ID did not already match. */
+ if (!matching) {
+ m = json_variant_by_key(e, "matchHostname");
+ if (m) {
+ r = per_machine_hostname_match(m, flags);
+ if (r < 0)
+ return r;
+
+ matching = r > 0;
+ }
+ }
+
+ /* Entries that match neither way (or carry no match key at all) are skipped. */
+ if (!matching)
+ continue;
+
+ r = json_dispatch(e, per_machine_dispatch_table, flags, userdata);
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+ }
+
+ /* Parses the "status" section of a user record. The section is an object keyed by machine ID; only the
+  * entry whose key matches the local machine ID is dispatched into the UserRecord passed as userdata.
+  * A missing section or a missing entry for this machine is not an error. */
+ static int dispatch_status(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) {
+
+         static const JsonDispatch status_dispatch_table[] = {
+                 { "diskUsage", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, disk_usage), 0 },
+                 { "diskFree", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, disk_free), 0 },
+                 { "diskSize", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, disk_size), 0 },
+                 { "diskCeiling", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, disk_ceiling), 0 },
+                 { "diskFloor", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, disk_floor), 0 },
+                 { "state", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, state), JSON_SAFE },
+                 { "service", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, service), JSON_SAFE },
+                 { "signedLocally", _JSON_VARIANT_TYPE_INVALID, json_dispatch_tristate, offsetof(UserRecord, signed_locally), 0 },
+                 { "goodAuthenticationCounter", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, good_authentication_counter), 0 },
+                 { "badAuthenticationCounter", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, bad_authentication_counter), 0 },
+                 { "lastGoodAuthenticationUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, last_good_authentication_usec), 0 },
+                 { "lastBadAuthenticationUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, last_bad_authentication_usec), 0 },
+                 { "rateLimitBeginUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, ratelimit_begin_usec), 0 },
+                 { "rateLimitCount", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, ratelimit_count), 0 },
+                 { "removable", JSON_VARIANT_BOOLEAN, json_dispatch_boolean, offsetof(UserRecord, removable), 0 },
+                 { "accessMode", JSON_VARIANT_UNSIGNED, json_dispatch_access_mode, offsetof(UserRecord, access_mode), 0 },
+                 { "fileSystemType", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, file_system_type), JSON_SAFE },
+                 {},
+         };
+
+         JsonVariant *entry;
+         sd_id128_t machine_id;
+         int r;
+
+         /* No "status" section at all? Then there's nothing to do. */
+         if (!variant)
+                 return 0;
+
+         if (!json_variant_is_object(variant))
+                 return json_log(variant, flags, SYNTHETIC_ERRNO(EINVAL), "JSON field '%s' is not an object.", strna(name));
+
+         r = sd_id128_get_machine(&machine_id);
+         if (r < 0)
+                 return json_log(variant, flags, r, "Failed to determine machine ID: %m");
+
+         /* Pick the sub-object that is keyed by our own machine ID, if there is one. */
+         entry = json_variant_by_key(variant, SD_ID128_TO_STRING(machine_id));
+
+         return entry ? json_dispatch(entry, status_dispatch_table, flags, userdata) : 0;
+ }
+
+ /* Builds the default image path for a home of the given storage type below the home root directory.
+  *
+  * Returns 1 and a newly allocated, simplified path in *ret for LUKS (".home" suffix) and for
+  * directory/subvolume/fscrypt storage (".homedir" suffix). Returns 0 and sets *ret to NULL for any other
+  * storage type. Returns -ENOMEM if the path could not be allocated. */
+ int user_record_build_image_path(UserStorage storage, const char *user_name_and_realm, char **ret) {
+         const char *extension;
+         char *joined;
+
+         assert(storage >= 0);
+         assert(user_name_and_realm);
+         assert(ret);
+
+         switch (storage) {
+
+         case USER_LUKS:
+                 extension = ".home";
+                 break;
+
+         case USER_DIRECTORY:
+         case USER_SUBVOLUME:
+         case USER_FSCRYPT:
+                 extension = ".homedir";
+                 break;
+
+         default:
+                 /* No default image path for this kind of storage */
+                 *ret = NULL;
+                 return 0;
+         }
+
+         joined = strjoin(get_home_root(), "/", user_name_and_realm, extension);
+         if (!joined)
+                 return -ENOMEM;
+
+         *ret = path_simplify(joined);
+         return 1;
+ }
+
+ /* Fills in the automatically derived fields of a freshly parsed record: the joined "user@realm" string,
+  * and, for users of regular disposition only, the default home directory and default image path.
+  * Records lacking the regular section are left untouched. Returns 0 on success, or the result of the
+  * JSON logging helpers on failure. */
+ static int user_record_augment(UserRecord *h, JsonDispatchFlags json_flags) {
+         int r;
+
+         assert(h);
+
+         if (!FLAGS_SET(h->mask, USER_RECORD_REGULAR))
+                 return 0;
+
+         assert(h->user_name);
+
+         if (h->realm && !h->user_name_and_realm_auto) {
+                 char *joined = strjoin(h->user_name, "@", h->realm);
+                 if (!joined)
+                         return json_log_oom(h->json, json_flags);
+
+                 h->user_name_and_realm_auto = joined;
+         }
+
+         /* The remaining automatisms only make sense for regular users, skip them for everyone else */
+         if (user_record_disposition(h) != USER_REGULAR)
+                 return 0;
+
+         if (!(h->home_directory || h->home_directory_auto)) {
+                 char *dir = path_join(get_home_root(), h->user_name);
+                 if (!dir)
+                         return json_log_oom(h->json, json_flags);
+
+                 h->home_directory_auto = dir;
+         }
+
+         /* An image path is already known, explicitly or automatically? Then we are done. */
+         if (h->image_path || h->image_path_auto)
+                 return 0;
+
+         r = user_record_build_image_path(user_record_storage(h), user_record_user_name_and_realm(h), &h->image_path_auto);
+         if (r < 0)
+                 return json_log(h->json, json_flags, r, "Failed to determine default image path: %m");
+
+         return 0;
+ }
+
+ /* Validates the sections of a user or group record against the specified load flags and optionally
+  * strips sections from it.
+  *
+  * v            the JSON record to check; must be an object
+  * load_flags   combination of allow/require/strip masks plus auxiliary flags; must not contain raw
+  *              UserRecordMask bits
+  * ret_variant  receives a new reference: either a filtered/rebuilt copy, or v itself if nothing was
+  *              stripped
+  * ret_mask     receives the mask of sections that were found in the record
+  *
+  * Returns 0 on success. On failure returns whatever json_log() returns for the respective error
+  * (EBADMSG for records violating the allow/require constraints or not being an object, EINVAL for
+  * nonsensical flag combinations, or the error of the failed JSON operation). */
+ int user_group_record_mangle(
+                 JsonVariant *v,
+                 UserRecordLoadFlags load_flags,
+                 JsonVariant **ret_variant,
+                 UserRecordMask *ret_mask) {
+
+         static const struct {
+                 UserRecordMask mask;
+                 const char *name;
+         } mask_field[] = {
+                 { USER_RECORD_PRIVILEGED, "privileged" },
+                 { USER_RECORD_SECRET, "secret" },
+                 { USER_RECORD_BINDING, "binding" },
+                 { USER_RECORD_PER_MACHINE, "perMachine" },
+                 { USER_RECORD_STATUS, "status" },
+                 { USER_RECORD_SIGNATURE, "signature" },
+         };
+
+         JsonDispatchFlags json_flags = USER_RECORD_LOAD_FLAGS_TO_JSON_DISPATCH_FLAGS(load_flags);
+         _cleanup_(json_variant_unrefp) JsonVariant *w = NULL;
+         JsonVariant *array[ELEMENTSOF(mask_field) * 2]; /* key/value pairs of the sections we retain */
+         size_t n_retain = 0;
+         UserRecordMask m = 0;
+         int r;
+
+         assert((load_flags & _USER_RECORD_MASK_MAX) == 0); /* detect mistakes when accidentally passing
+                                                             * UserRecordMask bit masks as UserRecordLoadFlags
+                                                             * value */
+
+         assert(v);
+         assert(ret_variant);
+         assert(ret_mask);
+
+         /* Note that this function is shared with the group record parser, hence we try to be generic in our
+          * log message wording here, to cover both cases. */
+
+         if (!json_variant_is_object(v))
+                 return json_log(v, json_flags, SYNTHETIC_ERRNO(EBADMSG), "Record is not a JSON object, refusing.");
+
+         if (USER_RECORD_ALLOW_MASK(load_flags) == 0) /* allow nothing? */
+                 return json_log(v, json_flags, SYNTHETIC_ERRNO(EINVAL), "Nothing allowed in record, refusing.");
+
+         if (USER_RECORD_STRIP_MASK(load_flags) == _USER_RECORD_MASK_MAX) /* strip everything? */
+                 return json_log(v, json_flags, SYNTHETIC_ERRNO(EINVAL), "Stripping everything from record, refusing.");
+
+         /* Check if we have the special sections and if they match our flags set */
+         for (size_t i = 0; i < ELEMENTSOF(mask_field); i++) {
+                 JsonVariant *e, *k;
+
+                 if (FLAGS_SET(USER_RECORD_STRIP_MASK(load_flags), mask_field[i].mask)) {
+                         /* Work on a private reference, so that the caller's variant stays untouched */
+                         if (!w)
+                                 w = json_variant_ref(v);
+
+                         r = json_variant_filter(&w, STRV_MAKE(mask_field[i].name));
+                         if (r < 0)
+                                 return json_log(w, json_flags, r, "Failed to remove field from variant: %m");
+
+                         continue;
+                 }
+
+                 e = json_variant_by_key_full(v, mask_field[i].name, &k);
+                 if (e) {
+                         if (!FLAGS_SET(USER_RECORD_ALLOW_MASK(load_flags), mask_field[i].mask))
+                                 return json_log(e, json_flags, SYNTHETIC_ERRNO(EBADMSG), "Record contains '%s' field, which is not allowed.", mask_field[i].name);
+
+                         if (FLAGS_SET(load_flags, USER_RECORD_STRIP_REGULAR)) {
+                                 array[n_retain++] = k;
+                                 array[n_retain++] = e;
+                         }
+
+                         m |= mask_field[i].mask;
+                 } else {
+                         if (FLAGS_SET(USER_RECORD_REQUIRE_MASK(load_flags), mask_field[i].mask))
+                                 return json_log(v, json_flags, SYNTHETIC_ERRNO(EBADMSG), "Record lacks '%s' field, which is required.", mask_field[i].name);
+                 }
+         }
+
+         if (FLAGS_SET(load_flags, USER_RECORD_STRIP_REGULAR)) {
+                 /* If we are supposed to strip regular items, then let's instead just allocate a new object
+                  * with just the stuff we need. */
+
+                 w = json_variant_unref(w);
+                 r = json_variant_new_object(&w, array, n_retain);
+                 if (r < 0)
+                         return json_log(v, json_flags, r, "Failed to allocate new object: %m");
+         } else {
+                 /* And now check if there's anything else in the record. Object elements are laid out as
+                  * alternating key/value entries, hence step by two to visit the keys only. */
+                 for (size_t i = 0; i < json_variant_elements(v); i += 2) {
+                         const char *f;
+                         bool special = false;
+
+                         assert_se(f = json_variant_string(json_variant_by_index(v, i)));
+
+                         for (size_t j = 0; j < ELEMENTSOF(mask_field); j++)
+                                 if (streq(f, mask_field[j].name)) { /* already covered in the loop above */
+                                         special = true;
+                                         break; /* names in the table are unique, no need to look further */
+                                 }
+
+                         if (!special) {
+                                 if ((load_flags & (USER_RECORD_ALLOW_REGULAR|USER_RECORD_REQUIRE_REGULAR)) == 0)
+                                         return json_log(v, json_flags, SYNTHETIC_ERRNO(EBADMSG), "Record contains '%s' field, which is not allowed.", f);
+
+                                 /* One regular field is enough to know the regular section is present */
+                                 m |= USER_RECORD_REGULAR;
+                                 break;
+                         }
+                 }
+         }
+
+         if (FLAGS_SET(load_flags, USER_RECORD_REQUIRE_REGULAR) && !FLAGS_SET(m, USER_RECORD_REGULAR))
+                 return json_log(v, json_flags, SYNTHETIC_ERRNO(EBADMSG), "Record lacks basic identity fields, which are required.");
+
+         if (!FLAGS_SET(load_flags, USER_RECORD_EMPTY_OK) && m == 0)
+                 return json_log(v, json_flags, SYNTHETIC_ERRNO(EBADMSG), "Record is empty.");
+
+         /* Hand out the mangled copy if we created one, otherwise a new reference to the original */
+         if (w)
+                 *ret_variant = TAKE_PTR(w);
+         else
+                 *ret_variant = json_variant_ref(v);
+
+         *ret_mask = m;
+         return 0;
+ }
+
+ /* Loads the JSON record v into the (so far empty) UserRecord h.
+  *
+  * The record is first validated/stripped according to load_flags via user_group_record_mangle(), then
+  * the top-level fields are dispatched into h. Afterwards the "perMachine", "binding" and "status"
+  * sections are applied, in that order, so that they override the generic settings. Finally the
+  * automatically derived fields are filled in via user_record_augment().
+  *
+  * Returns 0 on success, or the (negative) error of the first step that failed. h must not have a JSON
+  * record attached yet. */
+ int user_record_load(UserRecord *h, JsonVariant *v, UserRecordLoadFlags load_flags) {
+
+ static const JsonDispatch user_dispatch_table[] = {
+ { "userName", JSON_VARIANT_STRING, json_dispatch_user_group_name, offsetof(UserRecord, user_name), JSON_RELAX},
+ { "realm", JSON_VARIANT_STRING, json_dispatch_realm, offsetof(UserRecord, realm), 0 },
+ { "realName", JSON_VARIANT_STRING, json_dispatch_gecos, offsetof(UserRecord, real_name), 0 },
+ { "emailAddress", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, email_address), JSON_SAFE },
+ { "iconName", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, icon_name), JSON_SAFE },
+ { "location", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, location), 0 },
+ { "disposition", JSON_VARIANT_STRING, json_dispatch_user_disposition, offsetof(UserRecord, disposition), 0 },
+ { "lastChangeUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, last_change_usec), 0 },
+ { "lastPasswordChangeUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, last_password_change_usec), 0 },
+ { "shell", JSON_VARIANT_STRING, json_dispatch_filename_or_path, offsetof(UserRecord, shell), 0 },
+ { "umask", JSON_VARIANT_UNSIGNED, json_dispatch_umask, offsetof(UserRecord, umask), 0 },
+ { "environment", JSON_VARIANT_ARRAY, json_dispatch_environment, offsetof(UserRecord, environment), 0 },
+ { "timeZone", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, time_zone), JSON_SAFE },
+ { "preferredLanguage", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, preferred_language), JSON_SAFE },
+ { "niceLevel", _JSON_VARIANT_TYPE_INVALID, json_dispatch_nice, offsetof(UserRecord, nice_level), 0 },
+ { "resourceLimits", _JSON_VARIANT_TYPE_INVALID, json_dispatch_rlimits, offsetof(UserRecord, rlimits), 0 },
+ { "locked", JSON_VARIANT_BOOLEAN, json_dispatch_tristate, offsetof(UserRecord, locked), 0 },
+ { "notBeforeUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, not_before_usec), 0 },
+ { "notAfterUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, not_after_usec), 0 },
+ { "storage", JSON_VARIANT_STRING, json_dispatch_storage, offsetof(UserRecord, storage), 0 },
+ { "diskSize", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, disk_size), 0 },
+ { "diskSizeRelative", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, disk_size_relative), 0 },
+ { "skeletonDirectory", JSON_VARIANT_STRING, json_dispatch_path, offsetof(UserRecord, skeleton_directory), 0 },
+ { "accessMode", JSON_VARIANT_UNSIGNED, json_dispatch_access_mode, offsetof(UserRecord, access_mode), 0 },
+ { "tasksMax", JSON_VARIANT_UNSIGNED, json_dispatch_tasks_or_memory_max, offsetof(UserRecord, tasks_max), 0 },
+ { "memoryHigh", JSON_VARIANT_UNSIGNED, json_dispatch_tasks_or_memory_max, offsetof(UserRecord, memory_high), 0 },
+ { "memoryMax", JSON_VARIANT_UNSIGNED, json_dispatch_tasks_or_memory_max, offsetof(UserRecord, memory_max), 0 },
+ { "cpuWeight", JSON_VARIANT_UNSIGNED, json_dispatch_weight, offsetof(UserRecord, cpu_weight), 0 },
+ { "ioWeight", JSON_VARIANT_UNSIGNED, json_dispatch_weight, offsetof(UserRecord, io_weight), 0 },
+ { "mountNoDevices", JSON_VARIANT_BOOLEAN, json_dispatch_boolean, offsetof(UserRecord, nodev), 0 },
+ { "mountNoSuid", JSON_VARIANT_BOOLEAN, json_dispatch_boolean, offsetof(UserRecord, nosuid), 0 },
+ { "mountNoExecute", JSON_VARIANT_BOOLEAN, json_dispatch_boolean, offsetof(UserRecord, noexec), 0 },
+ { "cifsDomain", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, cifs_domain), JSON_SAFE },
+ { "cifsUserName", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, cifs_user_name), JSON_SAFE },
+ { "cifsService", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, cifs_service), JSON_SAFE },
+ { "cifsExtraMountOptions", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, cifs_extra_mount_options), 0 },
+ { "imagePath", JSON_VARIANT_STRING, json_dispatch_path, offsetof(UserRecord, image_path), 0 },
+ { "homeDirectory", JSON_VARIANT_STRING, json_dispatch_home_directory, offsetof(UserRecord, home_directory), 0 },
+ { "uid", JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid, offsetof(UserRecord, uid), 0 },
+ { "gid", JSON_VARIANT_UNSIGNED, json_dispatch_uid_gid, offsetof(UserRecord, gid), 0 },
+ { "memberOf", JSON_VARIANT_ARRAY, json_dispatch_user_group_list, offsetof(UserRecord, member_of), JSON_RELAX},
+ { "capabilityBoundingSet", JSON_VARIANT_ARRAY, json_dispatch_strv, offsetof(UserRecord, capability_bounding_set), JSON_SAFE },
+ { "capabilityAmbientSet", JSON_VARIANT_ARRAY, json_dispatch_strv, offsetof(UserRecord, capability_ambient_set), JSON_SAFE },
+ { "fileSystemType", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, file_system_type), JSON_SAFE },
+ { "partitionUuid", JSON_VARIANT_STRING, json_dispatch_id128, offsetof(UserRecord, partition_uuid), 0 },
+ { "luksUuid", JSON_VARIANT_STRING, json_dispatch_id128, offsetof(UserRecord, luks_uuid), 0 },
+ { "fileSystemUuid", JSON_VARIANT_STRING, json_dispatch_id128, offsetof(UserRecord, file_system_uuid), 0 },
+ { "luksDiscard", _JSON_VARIANT_TYPE_INVALID, json_dispatch_tristate, offsetof(UserRecord, luks_discard), 0 },
+ { "luksOfflineDiscard", _JSON_VARIANT_TYPE_INVALID, json_dispatch_tristate, offsetof(UserRecord, luks_offline_discard), 0 },
+ { "luksCipher", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, luks_cipher), JSON_SAFE },
+ { "luksCipherMode", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, luks_cipher_mode), JSON_SAFE },
+ { "luksVolumeKeySize", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, luks_volume_key_size), 0 },
+ { "luksPbkdfHashAlgorithm", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, luks_pbkdf_hash_algorithm), JSON_SAFE },
+ { "luksPbkdfType", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, luks_pbkdf_type), JSON_SAFE },
+ { "luksPbkdfForceIterations", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, luks_pbkdf_force_iterations), 0 },
+ { "luksPbkdfTimeCostUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, luks_pbkdf_time_cost_usec), 0 },
+ { "luksPbkdfMemoryCost", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, luks_pbkdf_memory_cost), 0 },
+ { "luksPbkdfParallelThreads", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, luks_pbkdf_parallel_threads), 0 },
+ { "luksSectorSize", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, luks_sector_size), 0 },
+ { "luksExtraMountOptions", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, luks_extra_mount_options), 0 },
+ { "dropCaches", JSON_VARIANT_BOOLEAN, json_dispatch_tristate, offsetof(UserRecord, drop_caches), 0 },
+ { "autoResizeMode", _JSON_VARIANT_TYPE_INVALID, dispatch_auto_resize_mode, offsetof(UserRecord, auto_resize_mode), 0 },
+ { "rebalanceWeight", _JSON_VARIANT_TYPE_INVALID, dispatch_rebalance_weight, offsetof(UserRecord, rebalance_weight), 0 },
+ { "service", JSON_VARIANT_STRING, json_dispatch_string, offsetof(UserRecord, service), JSON_SAFE },
+ { "rateLimitIntervalUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, ratelimit_interval_usec), 0 },
+ { "rateLimitBurst", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, ratelimit_burst), 0 },
+ { "enforcePasswordPolicy", JSON_VARIANT_BOOLEAN, json_dispatch_tristate, offsetof(UserRecord, enforce_password_policy), 0 },
+ { "autoLogin", JSON_VARIANT_BOOLEAN, json_dispatch_tristate, offsetof(UserRecord, auto_login), 0 },
+ { "stopDelayUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, stop_delay_usec), 0 },
+ { "killProcesses", JSON_VARIANT_BOOLEAN, json_dispatch_tristate, offsetof(UserRecord, kill_processes), 0 },
+ { "passwordChangeMinUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, password_change_min_usec), 0 },
+ { "passwordChangeMaxUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, password_change_max_usec), 0 },
+ { "passwordChangeWarnUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, password_change_warn_usec), 0 },
+ { "passwordChangeInactiveUSec", _JSON_VARIANT_TYPE_INVALID, json_dispatch_uint64, offsetof(UserRecord, password_change_inactive_usec), 0 },
+ { "passwordChangeNow", JSON_VARIANT_BOOLEAN, json_dispatch_tristate, offsetof(UserRecord, password_change_now), 0 },
+ { "pkcs11TokenUri", JSON_VARIANT_ARRAY, dispatch_pkcs11_uri_array, offsetof(UserRecord, pkcs11_token_uri), 0 },
+ { "fido2HmacCredential", JSON_VARIANT_ARRAY, dispatch_fido2_hmac_credential_array, 0, 0 },
+ { "recoveryKeyType", JSON_VARIANT_ARRAY, json_dispatch_strv, offsetof(UserRecord, recovery_key_type), 0 },
+
+ { "secret", JSON_VARIANT_OBJECT, dispatch_secret, 0, 0 },
+ { "privileged", JSON_VARIANT_OBJECT, dispatch_privileged, 0, 0 },
+
+ /* Ignore the perMachine, binding, status stuff here, and process it later, so that it overrides whatever is set above */
+ { "perMachine", JSON_VARIANT_ARRAY, NULL, 0, 0 },
+ { "binding", JSON_VARIANT_OBJECT, NULL, 0, 0 },
+ { "status", JSON_VARIANT_OBJECT, NULL, 0, 0 },
+
+ /* Ignore 'signature', we check it with explicit accessors instead */
+ { "signature", JSON_VARIANT_ARRAY, NULL, 0, 0 },
+ {},
+ };
+
+ JsonDispatchFlags json_flags = USER_RECORD_LOAD_FLAGS_TO_JSON_DISPATCH_FLAGS(load_flags);
+ int r;
+
+ assert(h);
+ assert(!h->json);
+
+ /* Note that this call will leave a half-initialized record around on failure! */
+
+ /* Validate the record against the load flags; this stores the (possibly stripped) JSON object and the
+  * mask of sections found directly in the record. */
+ r = user_group_record_mangle(v, load_flags, &h->json, &h->mask);
+ if (r < 0)
+ return r;
+
+ /* Parse the stored (mangled) object rather than v, so that stripped sections are not seen. */
+ r = json_dispatch(h->json, user_dispatch_table, json_flags, h);
+ if (r < 0)
+ return r;
+
+ /* During the parsing operation above we ignored the 'perMachine', 'binding' and 'status' fields,
+ * since we want them to override the global options. Let's process them now. */
+
+ r = dispatch_per_machine("perMachine", json_variant_by_key(h->json, "perMachine"), json_flags, h);
+ if (r < 0)
+ return r;
+
+ r = dispatch_binding("binding", json_variant_by_key(h->json, "binding"), json_flags, h);
+ if (r < 0)
+ return r;
+
+ r = dispatch_status("status", json_variant_by_key(h->json, "status"), json_flags, h);
+ if (r < 0)
+ return r;
+
+ /* A record that carries the regular section must name its user. */
+ if (FLAGS_SET(h->mask, USER_RECORD_REGULAR) && !h->user_name)
+ return json_log(h->json, json_flags, SYNTHETIC_ERRNO(EINVAL), "User name field missing, refusing.");
+
+ /* Fill in the automatically derived fields (user@realm, default home directory, default image path). */
+ r = user_record_augment(h, json_flags);
+ if (r < 0)
+ return r;
+
+ return 0;
+ }
+
+ /* Builds a JSON object from the variadic json_buildv() arguments and loads it as a full user record.
+  * On success returns 0 and hands the new record to the caller via *ret; on failure returns a negative
+  * error and leaves *ret untouched. */
+ int user_record_build(UserRecord **ret, ...) {
+         _cleanup_(json_variant_unrefp) JsonVariant *variant = NULL;
+         _cleanup_(user_record_unrefp) UserRecord *record = NULL;
+         va_list args;
+         int r;
+
+         assert(ret);
+
+         /* First, turn the variadic arguments into a JSON variant */
+         va_start(args, ret);
+         r = json_buildv(&variant, args);
+         va_end(args);
+         if (r < 0)
+                 return r;
+
+         /* Then, parse that variant into a fresh record object */
+         record = user_record_new();
+         if (!record)
+                 return -ENOMEM;
+
+         r = user_record_load(record, variant, USER_RECORD_LOAD_FULL);
+         if (r < 0)
+                 return r;
+
+         *ret = TAKE_PTR(record);
+         return 0;
+ }
+
+ /* Returns the "user@realm" string if one was precomputed, otherwise the plain user name. */
+ const char *user_record_user_name_and_realm(UserRecord *h) {
+         assert(h);
+
+         if (!h->user_name_and_realm_auto) {
+                 /* Without the pre-initialized joined string there cannot be a realm */
+                 assert(!h->realm);
+                 return h->user_name;
+         }
+
+         return h->user_name_and_realm_auto;
+ }
+
+ /* Returns the storage type of the record, defaulting to USER_CLASSIC if none is configured. */
+ UserStorage user_record_storage(UserRecord *h) {
+         assert(h);
+
+         if (h->storage < 0)
+                 return USER_CLASSIC;
+
+         return h->storage;
+ }
+
+ /* Returns the configured file system type, or "btrfs" if none is set. */
+ const char *user_record_file_system_type(UserRecord *h) {
+         assert(h);
+
+         if (h->file_system_type)
+                 return h->file_system_type;
+
+         return "btrfs";
+ }
+
+ /* Returns the configured skeleton directory, or "/etc/skel" if none is set. */
+ const char *user_record_skeleton_directory(UserRecord *h) {
+         assert(h);
+
+         if (h->skeleton_directory)
+                 return h->skeleton_directory;
+
+         return "/etc/skel";
+ }
+
+ /* Returns the configured access mode, or 0700 if none is set. */
+ mode_t user_record_access_mode(UserRecord *h) {
+         assert(h);
+
+         if (h->access_mode == MODE_INVALID)
+                 return 0700;
+
+         return h->access_mode;
+ }
+
+ /* Returns the home directory of the user: the explicitly configured one, else the automatically
+  * derived one, else "/root" for the root user and "/" for everyone else. Never returns NULL. */
+ const char* user_record_home_directory(UserRecord *h) {
+         const char *dir;
+
+         assert(h);
+
+         dir = h->home_directory ?: h->home_directory_auto;
+         if (dir)
+                 return dir;
+
+         /* The root user is special, hence be special about it */
+         return streq_ptr(h->user_name, "root") ? "/root" : "/";
+ }
+
+ /* Returns the image path of the home: the explicitly configured one, else the automatically derived
+  * one. If neither is set, falls back to the home directory for classic, directory, subvolume and
+  * fscrypt storage, and returns NULL for any other storage type. */
+ const char *user_record_image_path(UserRecord *h) {
+         const char *path;
+
+         assert(h);
+
+         path = h->image_path ?: h->image_path_auto;
+         if (path)
+                 return path;
+
+         switch (user_record_storage(h)) {
+
+         case USER_CLASSIC:
+         case USER_DIRECTORY:
+         case USER_SUBVOLUME:
+         case USER_FSCRYPT:
+                 return user_record_home_directory(h);
+
+         default:
+                 return NULL;
+         }
+ }
+
+ /* Returns the configured CIFS user name, falling back to the regular user name. */
+ const char *user_record_cifs_user_name(UserRecord *h) {
+         assert(h);
+
+         if (h->cifs_user_name)
+                 return h->cifs_user_name;
+
+         return h->user_name;
+ }
+
+ /* Translates the nosuid/noexec/nodev booleans of the record into the matching MS_* mount flags. */
+ unsigned long user_record_mount_flags(UserRecord *h) {
+         unsigned long result = 0;
+
+         assert(h);
+
+         if (h->nosuid)
+                 result |= MS_NOSUID;
+         if (h->noexec)
+                 result |= MS_NOEXEC;
+         if (h->nodev)
+                 result |= MS_NODEV;
+
+         return result;
+ }
+
+ /* Returns the shell of the user: the configured one if set, "/bin/sh" for root, DEFAULT_USER_SHELL for
+  * users of regular disposition, and NOLOGIN for everyone else. */
+ const char *user_record_shell(UserRecord *h) {
+         assert(h);
+
+         if (h->shell)
+                 return h->shell;
+
+         if (streq_ptr(h->user_name, "root"))
+                 return "/bin/sh";
+
+         return user_record_disposition(h) == USER_REGULAR ? DEFAULT_USER_SHELL : NOLOGIN;
+ }
+
+ /* Returns the real name of the user, falling back to the user name if none is set. */
+ const char *user_record_real_name(UserRecord *h) {
+         assert(h);
+
+         if (h->real_name)
+                 return h->real_name;
+
+         return h->user_name;
+ }
+
+ /* Returns whether discard shall be used while the LUKS home is active. An explicit setting wins;
+  * otherwise discard is enabled only if the image path lies below /dev/. */
+ bool user_record_luks_discard(UserRecord *h) {
+         const char *image;
+
+         assert(h);
+
+         if (h->luks_discard >= 0)
+                 return h->luks_discard;
+
+         image = user_record_image_path(h);
+         if (!image)
+                 return false;
+
+         /* By default use discard when we are backed by a real block device, but not when we are backed by
+          * a loopback file. We want to optimize for SSD and flash storage, but discarding inside a file on a
+          * regular file system means thin provisioning, and we shouldn't do that willy-nilly, since we'd
+          * risk EIO later on if the disk space backing our file system turns out not to be available. */
+
+         return !!path_startswith(image, "/dev/");
+ }
+
+ /* Returns whether discard shall be used while the LUKS home is inactive. An explicit setting wins;
+  * without an image path the answer is false; for images below /dev/ the online discard setting is
+  * used; for everything else the answer is true. */
+ bool user_record_luks_offline_discard(UserRecord *h) {
+         const char *image;
+
+         assert(h);
+
+         if (h->luks_offline_discard >= 0)
+                 return h->luks_offline_discard;
+
+         /* Discarding while logged out is generally a good idea, except when we operate directly on
+          * physical media, in which case we simply follow the online discard mode. */
+
+         image = user_record_image_path(h);
+         if (!image)
+                 return false;
+
+         return path_startswith(image, "/dev/") ? user_record_luks_discard(h) : true;
+ }
+
+ /* Returns the configured LUKS cipher, or "aes" if none is set. */
+ const char *user_record_luks_cipher(UserRecord *h) {
+         assert(h);
+
+         if (h->luks_cipher)
+                 return h->luks_cipher;
+
+         return "aes";
+ }
+
+ /* Returns the configured LUKS cipher mode, or "xts-plain64" if none is set. */
+ const char *user_record_luks_cipher_mode(UserRecord *h) {
+         assert(h);
+
+         if (h->luks_cipher_mode)
+                 return h->luks_cipher_mode;
+
+         return "xts-plain64";
+ }
+
+ /* Returns the LUKS volume key size in bytes: 256 bit if unset, otherwise the configured value capped
+  * at SIZE_MAX. */
+ uint64_t user_record_luks_volume_key_size(UserRecord *h) {
+         assert(h);
+
+         /* The value returned here can be cast to size_t without loss, which is what libcryptsetup expects */
+
+         if (h->luks_volume_key_size != UINT64_MAX)
+                 return MIN(h->luks_volume_key_size, SIZE_MAX);
+
+         return 256 / 8;
+ }
+
+ /* Returns the configured LUKS PBKDF type, or "argon2id" if none is set. */
+ const char* user_record_luks_pbkdf_type(UserRecord *h) {
+         assert(h);
+
+         if (h->luks_pbkdf_type)
+                 return h->luks_pbkdf_type;
+
+         return "argon2id";
+ }
+
+ /* Returns the forced number of PBKDF iterations: UINT64_MAX if unset, otherwise the configured value
+  * clamped to the range 1…UINT32_MAX. */
+ uint64_t user_record_luks_pbkdf_force_iterations(UserRecord *h) {
+         assert(h);
+
+         /* Anything explicitly configured is clamped to the number of iterations libcryptsetup accepts */
+         if (h->luks_pbkdf_force_iterations != UINT64_MAX)
+                 return CLAMP(h->luks_pbkdf_force_iterations, 1U, UINT32_MAX);
+
+         /* The default "benchmark" mode is propagated as itself */
+         return UINT64_MAX;
+ }
+
+ /* Returns the PBKDF time cost in µs, rounded up to full milliseconds and capped at UINT32_MAX
+  * milliseconds. Defaults to 500ms if unset. */
+ uint64_t user_record_luks_pbkdf_time_cost_usec(UserRecord *h) {
+         uint64_t msec;
+
+         assert(h);
+
+         /* The returned value has ms granularity, since that's what libcryptsetup expects */
+
+         if (h->luks_pbkdf_time_cost_usec == UINT64_MAX)
+                 return 500 * USEC_PER_MSEC; /* We default to 500ms, in contrast to libcryptsetup's 2s, which is just awfully slow on every login */
+
+         msec = MIN(DIV_ROUND_UP(h->luks_pbkdf_time_cost_usec, USEC_PER_MSEC), UINT32_MAX);
+         return msec * USEC_PER_MSEC;
+ }
+
+ /* Returns the PBKDF memory cost in bytes, rounded up to full kilobytes and capped at UINT32_MAX
+  * kilobytes. If unset, defaults to 0 for "pbkdf2" and to 64M for every other PBKDF type. */
+ uint64_t user_record_luks_pbkdf_memory_cost(UserRecord *h) {
+         uint64_t kbytes;
+
+         assert(h);
+
+         /* The returned value has kb granularity, since that's what libcryptsetup expects */
+         if (h->luks_pbkdf_memory_cost == UINT64_MAX) {
+                 if (streq(user_record_luks_pbkdf_type(h), "pbkdf2"))
+                         return 0; /* doesn't apply for simple pbkdf2 */
+
+                 return 64*1024*1024; /* We default to 64M, since this should work on smaller systems too */
+         }
+
+         kbytes = MIN(DIV_ROUND_UP(h->luks_pbkdf_memory_cost, 1024), UINT32_MAX);
+         return kbytes * 1024;
+ }
+
+ /* Returns the number of parallel PBKDF threads, capped at UINT32_MAX. If unset, defaults to 0 for
+  * "pbkdf2" and to 1 for every other PBKDF type. */
+ uint64_t user_record_luks_pbkdf_parallel_threads(UserRecord *h) {
+         assert(h);
+
+         if (h->luks_pbkdf_parallel_threads != UINT64_MAX)
+                 return MIN(h->luks_pbkdf_parallel_threads, UINT32_MAX);
+
+         if (streq(user_record_luks_pbkdf_type(h), "pbkdf2"))
+                 return 0; /* doesn't apply for simple pbkdf2 */
+
+         return 1; /* We default to 1, since this should work on smaller systems too */
+ }
+
+ /* Returns the LUKS sector size to use: 512 if unset, otherwise the configured value rounded down to a
+  * power of two and clamped to the range 512…4096. A configured value of 0 yields 512. */
+ uint64_t user_record_luks_sector_size(UserRecord *h) {
+         assert(h);
+
+         if (h->luks_sector_size == UINT64_MAX)
+                 return 512;
+
+         /* Allow up to 4K due to dm-crypt support and 4K alignment by the homed LUKS backend */
+
+         /* Handle the bounds explicitly before counting leading zeroes: this keeps 0 away from the
+          * builtin (whose result is undefined for 0) and makes the clamp independent of the shift. */
+         if (h->luks_sector_size < 512U)
+                 return 512;
+         if (h->luks_sector_size >= 4096U)
+                 return 4096;
+
+         /* Round down to the next power of two. Use the 'long long' variant of the builtin, since the
+          * field is 64-bit wide while 'unsigned long' may be narrower than that. */
+         return UINT64_C(1) << (63 - __builtin_clzll(h->luks_sector_size));
+ }
+
+ /* Returns the configured PBKDF hash algorithm, or "sha512" if none is set. */
+ const char *user_record_luks_pbkdf_hash_algorithm(UserRecord *h) {
+         assert(h);
+
+         if (h->luks_pbkdf_hash_algorithm)
+                 return h->luks_pbkdf_hash_algorithm;
+
+         return "sha512";
+ }
+
+ /* Returns the GID of the user, falling back to the numeric value of the UID if no valid GID is set. */
+ gid_t user_record_gid(UserRecord *h) {
+         assert(h);
+
+         return gid_is_valid(h->gid) ? h->gid : (gid_t) h->uid;
+ }
+
+ /* Returns the disposition of the user. An explicitly configured disposition wins; otherwise it is
+  * derived from the UID, and _USER_DISPOSITION_INVALID is returned if the UID is not valid. */
+ UserDisposition user_record_disposition(UserRecord *h) {
+         uid_t uid;
+
+         assert(h);
+
+         if (h->disposition >= 0)
+                 return h->disposition;
+
+         /* Nothing declared, hence derive the disposition from the UID */
+         uid = h->uid;
+
+         if (!uid_is_valid(uid))
+                 return _USER_DISPOSITION_INVALID;
+
+         if (uid == 0 || uid == UID_NOBODY)
+                 return USER_INTRINSIC;
+         else if (uid_is_system(uid))
+                 return USER_SYSTEM;
+         else if (uid_is_dynamic(uid))
+                 return USER_DYNAMIC;
+         else if (uid_is_container(uid))
+                 return USER_CONTAINER;
+         else if (uid > INT32_MAX)
+                 return USER_RESERVED;
+
+         return USER_REGULAR;
+ }
+
+ /* Returns whether the home of the user is on removable storage.
+  *
+  * > 0  removable
+  *   0  not removable
+  * < 0  cannot decide (record uses classic storage and carries no explicit setting)
+  *
+  * An explicit "removable" setting in the record always wins. */
+ int user_record_removable(UserRecord *h) {
+         UserStorage storage;
+         const char *ip;
+
+         assert(h);
+
+         if (h->removable >= 0)
+                 return h->removable;
+
+         /* Refuse to decide for classic records. Note that user_record_storage() maps an unset storage
+          * type to USER_CLASSIC, hence a single check covers both cases. */
+         storage = user_record_storage(h);
+         if (storage == USER_CLASSIC)
+                 return -1;
+
+         /* For now consider only LUKS home directories with a reference by path as removable */
+         if (storage != USER_LUKS)
+                 return false;
+
+         /* The image path may be unset for LUKS records, hence check for NULL before looking at the
+          * prefix, like the LUKS discard accessors do. */
+         ip = user_record_image_path(h);
+         if (!ip)
+                 return false;
+
+         return !!path_startswith(ip, "/dev/");
+ }
+
+ /* Returns the rate limit interval in µs, or DEFAULT_RATELIMIT_INTERVAL_USEC if none is configured. */
+ uint64_t user_record_ratelimit_interval_usec(UserRecord *h) {
+         assert(h);
+
+         return h->ratelimit_interval_usec != UINT64_MAX ? h->ratelimit_interval_usec : DEFAULT_RATELIMIT_INTERVAL_USEC;
+ }
+
+ /* Returns the rate limit burst count, or DEFAULT_RATELIMIT_BURST if none is configured. */
+ uint64_t user_record_ratelimit_burst(UserRecord *h) {
+         assert(h);
+
+         return h->ratelimit_burst != UINT64_MAX ? h->ratelimit_burst : DEFAULT_RATELIMIT_BURST;
+ }
+
+ /* Returns true if the record carries at least one thing the user can authenticate against: a PKCS#11
+  * encrypted key, a FIDO2 HMAC salt, or a hashed password. */
+ bool user_record_can_authenticate(UserRecord *h) {
+         assert(h);
+
+         return h->n_pkcs11_encrypted_key > 0 ||
+                 h->n_fido2_hmac_salt > 0 ||
+                 !strv_isempty(h->hashed_password);
+ }
+
+ /* Returns whether caches shall be dropped. An explicit setting wins; otherwise this is enabled for
+  * fscrypt storage only. */
+ bool user_record_drop_caches(UserRecord *h) {
+         assert(h);
+
+         if (h->drop_caches < 0)
+                 /* Nothing configured: drop caches by default on fscrypt, not otherwise. */
+                 return user_record_storage(h) == USER_FSCRYPT;
+
+         return h->drop_caches;
+ }
+
+ /* Returns the automatic resize mode. An explicit setting wins; otherwise LUKS storage defaults to
+  * shrink-and-grow and every other storage type to off. */
+ AutoResizeMode user_record_auto_resize_mode(UserRecord *h) {
+         assert(h);
+
+         if (h->auto_resize_mode >= 0)
+                 return h->auto_resize_mode;
+
+         if (user_record_storage(h) == USER_LUKS)
+                 return AUTO_RESIZE_SHRINK_AND_GROW;
+
+         return AUTO_RESIZE_OFF;
+ }
+
+ /* Returns the rebalance weight, or REBALANCE_WEIGHT_DEFAULT if none is configured. */
+ uint64_t user_record_rebalance_weight(UserRecord *h) {
+         assert(h);
+
+         return h->rebalance_weight != REBALANCE_WEIGHT_UNSET ? h->rebalance_weight : REBALANCE_WEIGHT_DEFAULT;
+ }
+
+ /* Converts a list of capability names into a capability bit mask. Names that cannot be resolved are
+  * logged at debug level and otherwise ignored. */
+ static uint64_t parse_caps_strv(char **l) {
+         uint64_t mask = 0;
+
+         STRV_FOREACH(s, l) {
+                 int cap;
+
+                 cap = capability_from_name(*s);
+                 if (cap >= 0) {
+                         mask |= UINT64_C(1) << cap;
+                         continue;
+                 }
+
+                 log_debug_errno(cap, "Don't know capability '%s', ignoring: %m", *s);
+         }
+
+         return mask;
+ }
+
+ /* Returns the capability bounding set of the user as bit mask. Note that UINT64_MAX is returned if no
+  * bounding set is configured at all. */
+ uint64_t user_record_capability_bounding_set(UserRecord *h) {
+         assert(h);
+
+         return h->capability_bounding_set ? parse_caps_strv(h->capability_bounding_set) : UINT64_MAX;
+ }
+
+ /* Returns the capability ambient set of the user as bit mask, restricted to what the bounding set
+  * permits. Note that UINT64_MAX is returned if no ambient set is configured at all. */
+ uint64_t user_record_capability_ambient_set(UserRecord *h) {
+         uint64_t ambient, bounding;
+
+         assert(h);
+
+         if (!h->capability_ambient_set)
+                 return UINT64_MAX;
+
+         ambient = parse_caps_strv(h->capability_ambient_set);
+         bounding = user_record_capability_bounding_set(h);
+
+         return ambient & bounding;
+ }
+
+ /* Calculates when it is possible to log in next. Returns:
+  *
+  *    UINT64_MAX → Nothing known
+  *    0          → Right away
+  *    Any other  → Next time in CLOCK_REALTIME in usec (which could be in the past)
+  */
+ uint64_t user_record_ratelimit_next_try(UserRecord *h) {
+         assert(h);
+
+         /* Without both a window start and a counter we know nothing */
+         if (h->ratelimit_begin_usec == UINT64_MAX || h->ratelimit_count == UINT64_MAX)
+                 return UINT64_MAX;
+
+         /* If the rate limit window starts in the future the local clock is probably incorrect. Let's not
+          * refuse login then. */
+         if (now(CLOCK_REALTIME) < h->ratelimit_begin_usec)
+                 return UINT64_MAX;
+
+         /* Burst not exhausted yet? Then the next attempt may happen right away. */
+         if (user_record_ratelimit_burst(h) > h->ratelimit_count)
+                 return 0;
+
+         return usec_add(h->ratelimit_begin_usec, user_record_ratelimit_interval_usec(h));
+ }
+
+ /* Returns true if the JSON data of the two records is equal. */
+ bool user_record_equal(UserRecord *a, UserRecord *b) {
+         JsonVariant *left, *right;
+
+         assert(a);
+         assert(b);
+
+         /* The JSON data is assumed to be kept up-to-date whenever a record is modified, hence comparing
+          * the JSON data is all we need to do. */
+         left = a->json;
+         right = b->json;
+
+         return json_variant_equal(left, right);
+ }
+
+ /* Returns true if both records carry the regular section and agree on user name and realm. */
+ bool user_record_compatible(UserRecord *a, UserRecord *b) {
+         assert(a);
+         assert(b);
+
+         /* We can't really decide if either record lacks the regular section, hence consider them
+          * incompatible in that case. */
+         if ((a->mask & b->mask & USER_RECORD_REGULAR) == 0)
+                 return false;
+
+         if (!streq_ptr(a->user_name, b->user_name))
+                 return false;
+
+         return streq_ptr(a->realm, b->realm);
+ }
+
+ /* Orders two records by their last change timestamp. Returns 0 if the timestamps are identical, a
+  * negative value if a is older than b, a positive value if a is newer than b. A record without
+  * timestamp is considered older than any record with one. */
+ int user_record_compare_last_change(UserRecord *a, UserRecord *b) {
+         uint64_t ta, tb;
+
+         assert(a);
+         assert(b);
+
+         ta = a->last_change_usec;
+         tb = b->last_change_usec;
+
+         if (ta == tb)
+                 return 0;
+
+         /* A record that has a timestamp always counts as newer than one that has none */
+         if (ta == UINT64_MAX)
+                 return -1;
+         if (tb == UINT64_MAX)
+                 return 1;
+
+         return CMP(ta, tb);
+ }
+
+ /* Creates a new record by loading the JSON data of h again with the specified load flags. Returns 0
+  * and the new record in *ret on success, a negative error otherwise. */
+ int user_record_clone(UserRecord *h, UserRecordLoadFlags flags, UserRecord **ret) {
+         _cleanup_(user_record_unrefp) UserRecord *copy = NULL;
+         int r;
+
+         assert(h);
+         assert(ret);
+
+         copy = user_record_new();
+         if (!copy)
+                 return -ENOMEM;
+
+         /* Parse the very same JSON object once more, this time subject to the caller's flags */
+         r = user_record_load(copy, h->json, flags);
+         if (r < 0)
+                 return r;
+
+         *ret = TAKE_PTR(copy);
+         return 0;
+ }
+
+ /* Compares two records while ignoring every section that is not listed in the specified mask.
+  * Returns > 0 if the records are equal in that sense, 0 if they are not, and a negative error if
+  * creating a stripped copy failed. */
+ int user_record_masked_equal(UserRecord *a, UserRecord *b, UserRecordMask mask) {
+         _cleanup_(user_record_unrefp) UserRecord *stripped_a = NULL, *stripped_b = NULL;
+         int r;
+
+         assert(a);
+         assert(b);
+
+         /* A record that carries sections outside of the mask is replaced by a copy that has those
+          * sections stripped. Records that already fit the mask are compared as they are. */
+
+         if ((a->mask & ~mask) != 0) {
+                 r = user_record_clone(
+                                 a,
+                                 USER_RECORD_ALLOW(mask) | USER_RECORD_STRIP(~mask & _USER_RECORD_MASK_MAX) | USER_RECORD_PERMISSIVE,
+                                 &stripped_a);
+                 if (r < 0)
+                         return r;
+
+                 a = stripped_a;
+         }
+
+         if ((b->mask & ~mask) != 0) {
+                 r = user_record_clone(
+                                 b,
+                                 USER_RECORD_ALLOW(mask) | USER_RECORD_STRIP(~mask & _USER_RECORD_MASK_MAX) | USER_RECORD_PERMISSIVE,
+                                 &stripped_b);
+                 if (r < 0)
+                         return r;
+
+                 b = stripped_b;
+         }
+
+         return user_record_equal(a, b);
+ }
+
+int user_record_test_blocked(UserRecord *h) {
+        usec_t now_usec;
+
+        /* Tests whether access to the specified user is currently permitted. Returns 0 if so, and
+         * otherwise one of:
+         *
+         *          -ENOLCK: Record is blocked
+         *          -EL2HLT: Record is not valid yet
+         *          -EL3HLT: Record is not valid anymore
+         *          -ESTALE: Record is from the future
+         *
+         * The checks are done in the order listed, the first one that hits wins. */
+
+        assert(h);
+
+        if (h->locked > 0)
+                return -ENOLCK;
+
+        now_usec = now(CLOCK_REALTIME);
+
+        if (h->not_before_usec != UINT64_MAX)
+                if (h->not_before_usec > now_usec)
+                        return -EL2HLT;
+
+        if (h->not_after_usec != UINT64_MAX)
+                if (h->not_after_usec < now_usec)
+                        return -EL3HLT;
+
+        /* Complain during log-ins when the record is from the future */
+        if (h->last_change_usec != UINT64_MAX)
+                if (h->last_change_usec > now_usec)
+                        return -ESTALE;
+
+        return 0;
+}
+
+int user_record_test_password_change_required(UserRecord *h) {
+        bool change_permitted;
+        usec_t n;
+
+        assert(h);
+
+        /* Checks whether the user must change the password when logging in. Returns:
+         *
+         *      -EKEYREVOKED: Change password now because admin said so
+         *       -EOWNERDEAD: Change password now because it expired
+         *     -EKEYREJECTED: Password is expired, no changing is allowed
+         *      -EKEYEXPIRED: Password is about to expire, warn user
+         *         -ENETDOWN: Record has expiration info but no password change timestamp
+         *            -EROFS: No password change required nor permitted
+         *           -ESTALE: RTC likely incorrect, last password change is in the future
+         *                 0: No password change required, but permitted
+         *
+         * All time fields use UINT64_MAX as "unset" marker. -ENETDOWN is returned both when a minimum
+         * and when a maximum password age is configured while the last change timestamp is unset. */
+
+        /* If a password change request has been set explicitly, it overrides everything */
+        if (h->password_change_now > 0)
+                return -EKEYREVOKED;
+
+        n = now(CLOCK_REALTIME);
+
+        /* Password change in the future? Then our RTC is likely incorrect */
+        if (h->last_password_change_usec != UINT64_MAX &&
+            h->last_password_change_usec > n &&
+            (h->password_change_min_usec != UINT64_MAX ||
+             h->password_change_max_usec != UINT64_MAX ||
+             h->password_change_inactive_usec != UINT64_MAX))
+                return -ESTALE;
+
+        /* Then, let's check if password changing is currently allowed at all */
+        if (h->password_change_min_usec != UINT64_MAX) {
+
+                /* Expiry configured but no password change timestamp known? */
+                if (h->last_password_change_usec == UINT64_MAX)
+                        return -ENETDOWN;
+
+                /* If adding the minimum age to the timestamp would overflow, changing is never permitted */
+                if (h->password_change_min_usec >= UINT64_MAX - h->last_password_change_usec)
+                        change_permitted = false;
+                else
+                        change_permitted = n >= h->last_password_change_usec + h->password_change_min_usec;
+
+        } else
+                change_permitted = true;
+
+        /* Let's check whether the password has expired. */
+        if (h->password_change_max_usec != UINT64_MAX) {
+
+                /* Expiry configured but no password change timestamp known? This must be tested before
+                 * the overflow guard below: with an unset (UINT64_MAX) timestamp that guard is always
+                 * true, which previously made this check unreachable and let such records pass. */
+                if (h->last_password_change_usec == UINT64_MAX)
+                        return -ENETDOWN;
+
+                /* Only evaluate the expiry if the deadline can be calculated without overflow */
+                if (h->password_change_max_usec < UINT64_MAX - h->last_password_change_usec) {
+                        uint64_t change_before;
+
+                        /* Password is in inactive phase? */
+                        if (h->password_change_inactive_usec != UINT64_MAX &&
+                            h->password_change_inactive_usec < UINT64_MAX - h->password_change_max_usec) {
+                                usec_t added;
+
+                                added = h->password_change_inactive_usec + h->password_change_max_usec;
+                                if (added < UINT64_MAX - h->last_password_change_usec &&
+                                    n >= h->last_password_change_usec + added)
+                                        return -EKEYREJECTED;
+                        }
+
+                        /* Password needs to be changed now? */
+                        change_before = h->last_password_change_usec + h->password_change_max_usec;
+                        if (n >= change_before)
+                                return change_permitted ? -EOWNERDEAD : -EKEYREJECTED;
+
+                        /* Warn user? (If the warning period is longer than the time until expiry we are
+                         * always within it, which also avoids an underflow in the subtraction.) */
+                        if (h->password_change_warn_usec != UINT64_MAX &&
+                            (change_before < h->password_change_warn_usec ||
+                             n >= change_before - h->password_change_warn_usec))
+                                return change_permitted ? -EKEYEXPIRED : -EROFS;
+                }
+        }
+
+        /* No password changing necessary */
+        return change_permitted ? 0 : -EROFS;
+}
+
+/* String names for the UserStorage enum values, indexed by enum value. The
+ * DEFINE_STRING_TABLE_LOOKUP() invocation below presumably generates the user_storage_to_string() and
+ * user_storage_from_string() functions declared in user-record.h from this table (confirm against the
+ * macro definition). */
+static const char* const user_storage_table[_USER_STORAGE_MAX] = {
+ [USER_CLASSIC] = "classic",
+ [USER_LUKS] = "luks",
+ [USER_DIRECTORY] = "directory",
+ [USER_SUBVOLUME] = "subvolume",
+ [USER_FSCRYPT] = "fscrypt",
+ [USER_CIFS] = "cifs",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(user_storage, UserStorage);
+
+/* String names for the UserDisposition enum values, indexed by enum value. The
+ * DEFINE_STRING_TABLE_LOOKUP() invocation below presumably generates the user_disposition_to_string()
+ * and user_disposition_from_string() functions declared in user-record.h from this table (confirm
+ * against the macro definition). */
+static const char* const user_disposition_table[_USER_DISPOSITION_MAX] = {
+ [USER_INTRINSIC] = "intrinsic",
+ [USER_SYSTEM] = "system",
+ [USER_DYNAMIC] = "dynamic",
+ [USER_REGULAR] = "regular",
+ [USER_CONTAINER] = "container",
+ [USER_RESERVED] = "reserved",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(user_disposition, UserDisposition);
+
+/* String names for the AutoResizeMode enum values, indexed by enum value. The
+ * DEFINE_STRING_TABLE_LOOKUP() invocation below presumably generates the auto_resize_mode_to_string()
+ * and auto_resize_mode_from_string() functions declared in user-record.h from this table (confirm
+ * against the macro definition). */
+static const char* const auto_resize_mode_table[_AUTO_RESIZE_MODE_MAX] = {
+ [AUTO_RESIZE_OFF] = "off",
+ [AUTO_RESIZE_GROW] = "grow",
+ [AUTO_RESIZE_SHRINK_AND_GROW] = "shrink-and-grow",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(auto_resize_mode, AutoResizeMode);
diff --git a/src/shared/user-record.h b/src/shared/user-record.h
new file mode 100644
index 0000000..298dc24
--- /dev/null
+++ b/src/shared/user-record.h
@@ -0,0 +1,450 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <inttypes.h>
+#include <sys/types.h>
+
+#include "sd-id128.h"
+
+#include "json.h"
+#include "missing_resource.h"
+#include "time-util.h"
+
+/* Classification of a user account, see the per-value comments. _USER_DISPOSITION_MAX is the number of
+ * regular values (it is used as the size of the string table in user-record.c), and
+ * _USER_DISPOSITION_INVALID is a negative marker value outside of that range. */
+typedef enum UserDisposition {
+ USER_INTRINSIC, /* root and nobody */
+ USER_SYSTEM, /* statically allocated users for system services */
+ USER_DYNAMIC, /* dynamically allocated users for system services */
+ USER_REGULAR, /* regular (typically human users) */
+ USER_CONTAINER, /* UID ranges allocated for container uses */
+ USER_RESERVED, /* Range above 2^31 */
+ _USER_DISPOSITION_MAX,
+ _USER_DISPOSITION_INVALID = -EINVAL,
+} UserDisposition;
+
+/* The storage mechanism of a home directory. Note that the enum tag is "UserHomeStorage" while the
+ * typedef name, which is what the rest of this header uses, is "UserStorage". _USER_STORAGE_MAX is the
+ * number of regular values, _USER_STORAGE_INVALID is a negative marker value outside of that range. */
+typedef enum UserHomeStorage {
+ USER_CLASSIC,
+ USER_LUKS,
+ USER_DIRECTORY, /* A directory, and a .identity file in it, which USER_CLASSIC lacks */
+ USER_SUBVOLUME,
+ USER_FSCRYPT,
+ USER_CIFS,
+ _USER_STORAGE_MAX,
+ _USER_STORAGE_INVALID = -EINVAL,
+} UserStorage;
+
+typedef enum UserRecordMask {
+ /* The various sections an identity record may have, as bit mask */
+ USER_RECORD_REGULAR = 1U << 0,
+ USER_RECORD_SECRET = 1U << 1,
+ USER_RECORD_PRIVILEGED = 1U << 2,
+ USER_RECORD_PER_MACHINE = 1U << 3,
+ USER_RECORD_BINDING = 1U << 4,
+ USER_RECORD_STATUS = 1U << 5,
+ USER_RECORD_SIGNATURE = 1U << 6,
+ /* All seven section bits above OR'ed together */
+ _USER_RECORD_MASK_MAX = (1U << 7)-1
+} UserRecordMask;
+
+typedef enum UserRecordLoadFlags {
+ /* A set of flags used while loading a user record from JSON data. We leave the lower 7 bits free,
+ * just as a safety precaution so that we can detect borked conversions between UserRecordMask and
+ * UserRecordLoadFlags. The section bits of UserRecordMask appear three times in here, each time
+ * shifted by a different amount: by 7 for "require", by 14 for "allow" and by 21 for "strip". */
+
+ /* What to require */
+ USER_RECORD_REQUIRE_REGULAR = USER_RECORD_REGULAR << 7,
+ USER_RECORD_REQUIRE_SECRET = USER_RECORD_SECRET << 7,
+ USER_RECORD_REQUIRE_PRIVILEGED = USER_RECORD_PRIVILEGED << 7,
+ USER_RECORD_REQUIRE_PER_MACHINE = USER_RECORD_PER_MACHINE << 7,
+ USER_RECORD_REQUIRE_BINDING = USER_RECORD_BINDING << 7,
+ USER_RECORD_REQUIRE_STATUS = USER_RECORD_STATUS << 7,
+ USER_RECORD_REQUIRE_SIGNATURE = USER_RECORD_SIGNATURE << 7,
+
+ /* What to allow */
+ USER_RECORD_ALLOW_REGULAR = USER_RECORD_REGULAR << 14,
+ USER_RECORD_ALLOW_SECRET = USER_RECORD_SECRET << 14,
+ USER_RECORD_ALLOW_PRIVILEGED = USER_RECORD_PRIVILEGED << 14,
+ USER_RECORD_ALLOW_PER_MACHINE = USER_RECORD_PER_MACHINE << 14,
+ USER_RECORD_ALLOW_BINDING = USER_RECORD_BINDING << 14,
+ USER_RECORD_ALLOW_STATUS = USER_RECORD_STATUS << 14,
+ USER_RECORD_ALLOW_SIGNATURE = USER_RECORD_SIGNATURE << 14,
+
+ /* What to strip */
+ USER_RECORD_STRIP_REGULAR = USER_RECORD_REGULAR << 21,
+ USER_RECORD_STRIP_SECRET = USER_RECORD_SECRET << 21,
+ USER_RECORD_STRIP_PRIVILEGED = USER_RECORD_PRIVILEGED << 21,
+ USER_RECORD_STRIP_PER_MACHINE = USER_RECORD_PER_MACHINE << 21,
+ USER_RECORD_STRIP_BINDING = USER_RECORD_BINDING << 21,
+ USER_RECORD_STRIP_STATUS = USER_RECORD_STATUS << 21,
+ USER_RECORD_STRIP_SIGNATURE = USER_RECORD_SIGNATURE << 21,
+
+ /* Some special combinations that deserve explicit names */
+ USER_RECORD_LOAD_FULL = USER_RECORD_REQUIRE_REGULAR |
+ USER_RECORD_ALLOW_SECRET |
+ USER_RECORD_ALLOW_PRIVILEGED |
+ USER_RECORD_ALLOW_PER_MACHINE |
+ USER_RECORD_ALLOW_BINDING |
+ USER_RECORD_ALLOW_STATUS |
+ USER_RECORD_ALLOW_SIGNATURE,
+
+ USER_RECORD_LOAD_REFUSE_SECRET = USER_RECORD_REQUIRE_REGULAR |
+ USER_RECORD_ALLOW_PRIVILEGED |
+ USER_RECORD_ALLOW_PER_MACHINE |
+ USER_RECORD_ALLOW_BINDING |
+ USER_RECORD_ALLOW_STATUS |
+ USER_RECORD_ALLOW_SIGNATURE,
+
+ USER_RECORD_LOAD_MASK_SECRET = USER_RECORD_REQUIRE_REGULAR |
+ USER_RECORD_ALLOW_PRIVILEGED |
+ USER_RECORD_ALLOW_PER_MACHINE |
+ USER_RECORD_ALLOW_BINDING |
+ USER_RECORD_ALLOW_STATUS |
+ USER_RECORD_ALLOW_SIGNATURE |
+ USER_RECORD_STRIP_SECRET,
+
+ USER_RECORD_EXTRACT_SECRET = USER_RECORD_REQUIRE_SECRET |
+ USER_RECORD_STRIP_REGULAR |
+ USER_RECORD_STRIP_PRIVILEGED |
+ USER_RECORD_STRIP_PER_MACHINE |
+ USER_RECORD_STRIP_BINDING |
+ USER_RECORD_STRIP_STATUS |
+ USER_RECORD_STRIP_SIGNATURE,
+
+ USER_RECORD_LOAD_SIGNABLE = USER_RECORD_REQUIRE_REGULAR |
+ USER_RECORD_ALLOW_PRIVILEGED |
+ USER_RECORD_ALLOW_PER_MACHINE,
+
+ USER_RECORD_EXTRACT_SIGNABLE = USER_RECORD_LOAD_SIGNABLE |
+ USER_RECORD_STRIP_SECRET |
+ USER_RECORD_STRIP_BINDING |
+ USER_RECORD_STRIP_STATUS |
+ USER_RECORD_STRIP_SIGNATURE,
+
+ USER_RECORD_LOAD_EMBEDDED = USER_RECORD_REQUIRE_REGULAR |
+ USER_RECORD_ALLOW_PRIVILEGED |
+ USER_RECORD_ALLOW_PER_MACHINE |
+ USER_RECORD_ALLOW_SIGNATURE,
+
+ USER_RECORD_EXTRACT_EMBEDDED = USER_RECORD_LOAD_EMBEDDED |
+ USER_RECORD_STRIP_SECRET |
+ USER_RECORD_STRIP_BINDING |
+ USER_RECORD_STRIP_STATUS,
+
+ /* Whether to log about loader errors beyond LOG_DEBUG */
+ USER_RECORD_LOG = 1U << 28,
+
+ /* Whether to ignore errors and load what we can */
+ USER_RECORD_PERMISSIVE = 1U << 29,
+
+ /* Whether an empty record is OK */
+ USER_RECORD_EMPTY_OK = 1U << 30,
+} UserRecordLoadFlags;
+
+static inline UserRecordLoadFlags USER_RECORD_REQUIRE(UserRecordMask m) {
+        /* Turns a section mask into the matching combination of USER_RECORD_REQUIRE_xyz load flags. The
+         * mask may only contain valid section bits. */
+        assert(!(m & ~_USER_RECORD_MASK_MAX));
+
+        return (UserRecordLoadFlags) (m << 7);
+}
+
+static inline UserRecordLoadFlags USER_RECORD_ALLOW(UserRecordMask m) {
+        /* Turns a section mask into the matching combination of USER_RECORD_ALLOW_xyz load flags. The
+         * mask may only contain valid section bits. */
+        assert(!(m & ~_USER_RECORD_MASK_MAX));
+
+        return (UserRecordLoadFlags) (m << 14);
+}
+
+static inline UserRecordLoadFlags USER_RECORD_STRIP(UserRecordMask m) {
+        /* Turns a section mask into the matching combination of USER_RECORD_STRIP_xyz load flags. The
+         * mask may only contain valid section bits. */
+        assert(!(m & ~_USER_RECORD_MASK_MAX));
+
+        return (UserRecordLoadFlags) (m << 21);
+}
+
+static inline UserRecordMask USER_RECORD_REQUIRE_MASK(UserRecordLoadFlags f) {
+        /* Extracts the sections marked as required from the load flags, as section mask */
+        return _USER_RECORD_MASK_MAX & (f >> 7);
+}
+
+static inline UserRecordMask USER_RECORD_ALLOW_MASK(UserRecordLoadFlags f) {
+        UserRecordMask required, allowed;
+
+        /* Extracts the sections marked as allowed from the load flags, as section mask. Every section
+         * that is required is implicitly allowed too, hence the result includes those as well. */
+        required = USER_RECORD_REQUIRE_MASK(f);
+        allowed = (f >> 14) & _USER_RECORD_MASK_MAX;
+
+        return allowed | required;
+}
+
+static inline UserRecordMask USER_RECORD_STRIP_MASK(UserRecordLoadFlags f) {
+        /* Extracts the sections marked for stripping from the load flags, as section mask */
+        return _USER_RECORD_MASK_MAX & (f >> 21);
+}
+
+static inline JsonDispatchFlags USER_RECORD_LOAD_FLAGS_TO_JSON_DISPATCH_FLAGS(UserRecordLoadFlags flags) {
+        JsonDispatchFlags result = 0;
+
+        /* Maps the two load flags that have a JSON dispatcher counterpart onto it: USER_RECORD_LOG
+         * becomes JSON_LOG and USER_RECORD_PERMISSIVE becomes JSON_PERMISSIVE. All other load flags are
+         * ignored. */
+
+        if (FLAGS_SET(flags, USER_RECORD_LOG))
+                result |= JSON_LOG;
+
+        if (FLAGS_SET(flags, USER_RECORD_PERMISSIVE))
+                result |= JSON_PERMISSIVE;
+
+        return result;
+}
+
+/* One PKCS#11 encrypted key entry of a user record; UserRecord keeps an array of these in its
+ * pkcs11_encrypted_key/n_pkcs11_encrypted_key fields. */
+typedef struct Pkcs11EncryptedKey {
+ /* The encrypted passphrase, which can be decrypted with the private key indicated below */
+ void *data;
+ size_t size;
+
+ /* Where to find the private key to decrypt the encrypted passphrase above */
+ char *uri;
+
+ /* What to test the decrypted passphrase against to allow access (classic UNIX password hash). Note
+ * that the decrypted passphrase is also used for unlocking LUKS and fscrypt, and if the account is
+ * backed by LUKS or fscrypt the hashed password is only an additional layer of authentication, not
+ * the only. */
+ char *hashed_password;
+} Pkcs11EncryptedKey;
+
+/* A FIDO2 credential ID, stored as a binary blob plus its size. Used both standalone (see the
+ * fido2_hmac_credential array in UserRecord) and embedded in Fido2HmacSalt. */
+typedef struct Fido2HmacCredential {
+ void *id;
+ size_t size;
+} Fido2HmacCredential;
+
+/* One FIDO2 HMAC salt entry of a user record; UserRecord keeps an array of these in its
+ * fido2_hmac_salt/n_fido2_hmac_salt fields. */
+typedef struct Fido2HmacSalt {
+ /* The FIDO2 Credential ID to use */
+ Fido2HmacCredential credential;
+
+ /* The FIDO2 salt value */
+ void *salt;
+ size_t salt_size;
+
+ /* What to test the hashed salt value against, usually UNIX password hash here. */
+ char *hashed_password;
+
+ /* Whether the 'up', 'uv', 'clientPin' features are enabled. */
+ int uv, up, client_pin;
+} Fido2HmacSalt;
+
+/* One recovery key entry of a user record; UserRecord keeps an array of these in its
+ * recovery_key/n_recovery_key fields. */
+typedef struct RecoveryKey {
+ /* The type of recovery key, must be "modhex64" right now */
+ char *type;
+
+ /* A UNIX password hash of the normalized form of modhex64 */
+ char *hashed_password;
+} RecoveryKey;
+
+/* When to automatically resize a home area, see the per-value comments. _AUTO_RESIZE_MODE_MAX is the
+ * number of regular values, _AUTO_RESIZE_MODE_INVALID is a negative marker value outside of that
+ * range. */
+typedef enum AutoResizeMode {
+ AUTO_RESIZE_OFF, /* no automatic grow/shrink */
+ AUTO_RESIZE_GROW, /* grow at login */
+ AUTO_RESIZE_SHRINK_AND_GROW, /* shrink at logout + grow at login */
+ _AUTO_RESIZE_MODE_MAX,
+ _AUTO_RESIZE_MODE_INVALID = -EINVAL,
+} AutoResizeMode;
+
+/* Named rebalance weight values, all 64-bit unsigned. NOTE(review): presumably these are the values
+ * used for the rebalance_weight field of UserRecord, with REBALANCE_WEIGHT_UNSET (UINT64_MAX) meaning
+ * "not configured" — confirm against the users in user-record.c. */
+#define REBALANCE_WEIGHT_OFF UINT64_C(0)
+#define REBALANCE_WEIGHT_DEFAULT UINT64_C(100)
+#define REBALANCE_WEIGHT_BACKING UINT64_C(20)
+#define REBALANCE_WEIGHT_MIN UINT64_C(1)
+#define REBALANCE_WEIGHT_MAX UINT64_C(10000)
+#define REBALANCE_WEIGHT_UNSET UINT64_MAX
+
+/* In-memory representation of a user record. The timestamp and time span fields that are tested in
+ * user-record.c (last_change_usec, last_password_change_usec, not_before_usec, not_after_usec,
+ * password_change_*_usec) use UINT64_MAX to indicate "unset". Objects are created with
+ * user_record_new() and released with user_record_unref(). */
+typedef struct UserRecord {
+ /* The following three fields are not part of the JSON record */
+ unsigned n_ref;
+ UserRecordMask mask;
+ bool incomplete; /* incomplete due to security restrictions. */
+
+ char *user_name;
+ char *realm;
+ char *user_name_and_realm_auto; /* the user_name field concatenated with '@' and the realm, if the latter is defined */
+ char *real_name;
+ char *email_address;
+ char *password_hint;
+ char *icon_name;
+ char *location;
+
+ UserDisposition disposition;
+ uint64_t last_change_usec;
+ uint64_t last_password_change_usec;
+
+ char *shell;
+ mode_t umask;
+ char **environment;
+ char *time_zone;
+ char *preferred_language;
+ int nice_level;
+ struct rlimit *rlimits[_RLIMIT_MAX];
+
+ int locked; /* prohibit activation in general */
+ uint64_t not_before_usec; /* prohibit activation before this unix time */
+ uint64_t not_after_usec; /* prohibit activation after this unix time */
+
+ UserStorage storage;
+ uint64_t disk_size;
+ uint64_t disk_size_relative; /* Disk size, relative to the free bytes of the medium, normalized to UINT32_MAX = 100% */
+ char *skeleton_directory;
+ mode_t access_mode;
+ AutoResizeMode auto_resize_mode;
+ uint64_t rebalance_weight;
+
+ uint64_t tasks_max;
+ uint64_t memory_high;
+ uint64_t memory_max;
+ uint64_t cpu_weight;
+ uint64_t io_weight;
+
+ bool nosuid;
+ bool nodev;
+ bool noexec;
+
+ char **hashed_password;
+ char **ssh_authorized_keys;
+ char **password;
+ char **token_pin;
+
+ char *cifs_domain;
+ char *cifs_user_name;
+ char *cifs_service;
+ char *cifs_extra_mount_options;
+
+ char *image_path;
+ char *image_path_auto; /* when none is configured explicitly, this is where we place the implicit image */
+ char *home_directory;
+ char *home_directory_auto; /* when none is set explicitly, this is where we place the implicit home directory */
+
+ uid_t uid;
+ gid_t gid;
+
+ char **member_of;
+
+ char *file_system_type;
+ sd_id128_t partition_uuid;
+ sd_id128_t luks_uuid;
+ sd_id128_t file_system_uuid;
+
+ int luks_discard;
+ int luks_offline_discard;
+ char *luks_cipher;
+ char *luks_cipher_mode;
+ uint64_t luks_volume_key_size;
+ char *luks_pbkdf_hash_algorithm;
+ char *luks_pbkdf_type;
+ uint64_t luks_pbkdf_force_iterations;
+ uint64_t luks_pbkdf_time_cost_usec;
+ uint64_t luks_pbkdf_memory_cost;
+ uint64_t luks_pbkdf_parallel_threads;
+ uint64_t luks_sector_size;
+ char *luks_extra_mount_options;
+
+ uint64_t disk_usage;
+ uint64_t disk_free;
+ uint64_t disk_ceiling;
+ uint64_t disk_floor;
+
+ char *state;
+ char *service;
+ int signed_locally;
+
+ uint64_t good_authentication_counter;
+ uint64_t bad_authentication_counter;
+ uint64_t last_good_authentication_usec;
+ uint64_t last_bad_authentication_usec;
+
+ uint64_t ratelimit_begin_usec;
+ uint64_t ratelimit_count;
+ uint64_t ratelimit_interval_usec;
+ uint64_t ratelimit_burst;
+
+ int removable;
+ int enforce_password_policy;
+ int auto_login;
+ int drop_caches;
+
+ uint64_t stop_delay_usec; /* How long to leave systemd --user around on log-out */
+ int kill_processes; /* Whether to kill user processes forcibly on log-out */
+
+ /* The following exist mostly so that we can cover the full /etc/shadow set of fields */
+ uint64_t password_change_min_usec; /* maps to .sp_min */
+ uint64_t password_change_max_usec; /* maps to .sp_max */
+ uint64_t password_change_warn_usec; /* maps to .sp_warn */
+ uint64_t password_change_inactive_usec; /* maps to .sp_inact */
+ int password_change_now; /* Require a password change immediately on next login (.sp_lstchg = 0) */
+
+ char **pkcs11_token_uri;
+ Pkcs11EncryptedKey *pkcs11_encrypted_key;
+ size_t n_pkcs11_encrypted_key;
+ int pkcs11_protected_authentication_path_permitted;
+
+ Fido2HmacCredential *fido2_hmac_credential;
+ size_t n_fido2_hmac_credential;
+ Fido2HmacSalt *fido2_hmac_salt;
+ size_t n_fido2_hmac_salt;
+ int fido2_user_presence_permitted;
+ int fido2_user_verification_permitted;
+
+ char **recovery_key_type;
+ RecoveryKey *recovery_key;
+ size_t n_recovery_key;
+
+ char **capability_bounding_set;
+ char **capability_ambient_set;
+
+ /* The JSON data of the record. user_record_equal() compares two records solely by this field,
+ * and user_record_clone() loads the copy from it. */
+ JsonVariant *json;
+} UserRecord;
+
+UserRecord* user_record_new(void);
+UserRecord* user_record_ref(UserRecord *h);
+UserRecord* user_record_unref(UserRecord *h);
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(UserRecord*, user_record_unref);
+
+int user_record_load(UserRecord *h, JsonVariant *v, UserRecordLoadFlags flags);
+int user_record_build(UserRecord **ret, ...);
+
+const char *user_record_user_name_and_realm(UserRecord *h);
+UserStorage user_record_storage(UserRecord *h);
+const char *user_record_file_system_type(UserRecord *h);
+const char *user_record_skeleton_directory(UserRecord *h);
+mode_t user_record_access_mode(UserRecord *h);
+const char *user_record_home_directory(UserRecord *h);
+const char *user_record_image_path(UserRecord *h);
+unsigned long user_record_mount_flags(UserRecord *h);
+const char *user_record_cifs_user_name(UserRecord *h);
+const char *user_record_shell(UserRecord *h);
+const char *user_record_real_name(UserRecord *h);
+bool user_record_luks_discard(UserRecord *h);
+bool user_record_luks_offline_discard(UserRecord *h);
+const char *user_record_luks_cipher(UserRecord *h);
+const char *user_record_luks_cipher_mode(UserRecord *h);
+uint64_t user_record_luks_volume_key_size(UserRecord *h);
+const char* user_record_luks_pbkdf_type(UserRecord *h);
+uint64_t user_record_luks_pbkdf_force_iterations(UserRecord *h);
+usec_t user_record_luks_pbkdf_time_cost_usec(UserRecord *h);
+uint64_t user_record_luks_pbkdf_memory_cost(UserRecord *h);
+uint64_t user_record_luks_pbkdf_parallel_threads(UserRecord *h);
+uint64_t user_record_luks_sector_size(UserRecord *h);
+const char *user_record_luks_pbkdf_hash_algorithm(UserRecord *h);
+gid_t user_record_gid(UserRecord *h);
+UserDisposition user_record_disposition(UserRecord *h);
+int user_record_removable(UserRecord *h);
+usec_t user_record_ratelimit_interval_usec(UserRecord *h);
+uint64_t user_record_ratelimit_burst(UserRecord *h);
+bool user_record_can_authenticate(UserRecord *h);
+bool user_record_drop_caches(UserRecord *h);
+AutoResizeMode user_record_auto_resize_mode(UserRecord *h);
+uint64_t user_record_rebalance_weight(UserRecord *h);
+uint64_t user_record_capability_bounding_set(UserRecord *h);
+uint64_t user_record_capability_ambient_set(UserRecord *h);
+
+int user_record_build_image_path(UserStorage storage, const char *user_name_and_realm, char **ret);
+
+bool user_record_equal(UserRecord *a, UserRecord *b);
+bool user_record_compatible(UserRecord *a, UserRecord *b);
+int user_record_compare_last_change(UserRecord *a, UserRecord *b);
+
+usec_t user_record_ratelimit_next_try(UserRecord *h);
+
+int user_record_clone(UserRecord *h, UserRecordLoadFlags flags, UserRecord **ret);
+int user_record_masked_equal(UserRecord *a, UserRecord *b, UserRecordMask mask);
+
+int user_record_test_blocked(UserRecord *h);
+int user_record_test_password_change_required(UserRecord *h);
+
+/* The following are used by group-record.c, that's why we export them here */
+int json_dispatch_realm(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata);
+int json_dispatch_gecos(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata);
+int json_dispatch_user_group_list(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata);
+int json_dispatch_user_disposition(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata);
+
+int per_machine_id_match(JsonVariant *ids, JsonDispatchFlags flags);
+int per_machine_hostname_match(JsonVariant *hns, JsonDispatchFlags flags);
+int user_group_record_mangle(JsonVariant *v, UserRecordLoadFlags load_flags, JsonVariant **ret_variant, UserRecordMask *ret_mask);
+
+const char* user_storage_to_string(UserStorage t) _const_;
+UserStorage user_storage_from_string(const char *s) _pure_;
+
+const char* user_disposition_to_string(UserDisposition t) _const_;
+UserDisposition user_disposition_from_string(const char *s) _pure_;
+
+const char* auto_resize_mode_to_string(AutoResizeMode m) _const_;
+AutoResizeMode auto_resize_mode_from_string(const char *s) _pure_;
diff --git a/src/shared/userdb-dropin.c b/src/shared/userdb-dropin.c
new file mode 100644
index 0000000..a2d48fa
--- /dev/null
+++ b/src/shared/userdb-dropin.c
@@ -0,0 +1,304 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "errno-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "format-util.h"
+#include "path-util.h"
+#include "stdio-util.h"
+#include "user-util.h"
+#include "userdb-dropin.h"
+
+static int load_user(
+                FILE *f,
+                const char *path,
+                const char *name,
+                uid_t uid,
+                UserDBFlags flags,
+                UserRecord **ret) {
+
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+        _cleanup_(user_record_unrefp) UserRecord *u = NULL;
+        UserRecordLoadFlags load_flags;
+        bool have_privileged = false;
+        int r;
+
+        /* Parses the user record from the already opened drop-in file f (path is its file name, and may
+         * be NULL), optionally merges in the privileged companion file, and validates the result against
+         * the expected name and/or UID, if specified. Returns -EINVAL if the record doesn't match. The
+         * record is marked incomplete if the privileged section could not be taken into account. If ret
+         * is NULL the record is only checked, but not returned. */
+
+        assert(f);
+
+        r = json_parse_file(f, path, 0, &v, NULL, NULL);
+        if (r < 0)
+                return r;
+
+        if (!FLAGS_SET(flags, USERDB_SUPPRESS_SHADOW) && path && (name || uid_is_valid(uid))) {
+                _cleanup_(json_variant_unrefp) JsonVariant *privileged_v = NULL;
+                _cleanup_free_ char *d = NULL, *j = NULL;
+
+                /* Let's load the "privileged" section from a companion file. But only if
+                 * USERDB_SUPPRESS_SHADOW is not set. After all, the privileged section kinda takes the
+                 * role of the data from the shadow file, hence it makes sense to use the same flag here.
+                 *
+                 * The general assumption is that whoever provides these records makes the .user file
+                 * world-readable, but the .user-privileged file readable to root and the assigned UID
+                 * only. But we won't verify that here, as it would be too late. */
+
+                r = path_extract_directory(path, &d);
+                if (r < 0)
+                        return r;
+
+                if (name) {
+                        j = strjoin(d, "/", name, ".user-privileged");
+                        if (!j)
+                                return -ENOMEM;
+                } else {
+                        assert(uid_is_valid(uid));
+                        if (asprintf(&j, "%s/" UID_FMT ".user-privileged", d, uid) < 0)
+                                return -ENOMEM;
+                }
+
+                r = json_parse_file(NULL, j, JSON_PARSE_SENSITIVE, &privileged_v, NULL, NULL);
+                if (r >= 0) {
+                        r = json_variant_merge_object(&v, privileged_v);
+                        if (r < 0)
+                                return r;
+
+                        have_privileged = true;
+                } else if (r == -ENOENT)
+                        have_privileged = true; /* if the privileged file doesn't exist, we are complete */
+                else if (!ERRNO_IS_NEG_PRIVILEGE(r))
+                        return r;
+
+                /* Otherwise we lack the privileges to read the file, and have_privileged remains false */
+        }
+
+        u = user_record_new();
+        if (!u)
+                return -ENOMEM;
+
+        load_flags =
+                USER_RECORD_REQUIRE_REGULAR|
+                USER_RECORD_ALLOW_PER_MACHINE|
+                USER_RECORD_ALLOW_BINDING|
+                USER_RECORD_ALLOW_SIGNATURE|
+                USER_RECORD_PERMISSIVE;
+        if (have_privileged)
+                load_flags |= USER_RECORD_ALLOW_PRIVILEGED;
+
+        r = user_record_load(u, v, load_flags);
+        if (r < 0)
+                return r;
+
+        /* The record must actually describe the user we were asked to look up */
+        if (name && !streq_ptr(name, u->user_name))
+                return -EINVAL;
+
+        if (uid_is_valid(uid) && uid != u->uid)
+                return -EINVAL;
+
+        u->incomplete = !have_privileged;
+
+        if (ret)
+                *ret = TAKE_PTR(u);
+
+        return 0;
+}
+
+int dropin_user_record_by_name(const char *name, const char *path, UserDBFlags flags, UserRecord **ret) {
+        _cleanup_free_ char *found_path = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        int r;
+
+        /* Loads the drop-in user record for the specified user name. If path is specified that file is
+         * opened directly, otherwise a file "<name>.user" is searched for in the userdb drop-in
+         * directories. Returns -ESRCH if there's no such file. */
+
+        assert(name);
+
+        if (!path) {
+                const char *fn = strjoina(name, ".user");
+
+                /* Doesn't qualify as valid filename? Then it's definitely not provided as a drop-in */
+                if (!filename_is_valid(fn))
+                        return -ESRCH;
+
+                r = search_and_fopen_nulstr(fn, "re", NULL, USERDB_DROPIN_DIR_NULSTR("userdb"), &f, &found_path);
+                if (r < 0)
+                        return r == -ENOENT ? -ESRCH : r;
+
+                path = found_path;
+        } else {
+                f = fopen(path, "re");
+                if (!f) {
+                        /* We generally want ESRCH to indicate no such user */
+                        if (errno == ENOENT)
+                                return -ESRCH;
+
+                        return -errno;
+                }
+        }
+
+        return load_user(f, path, name, UID_INVALID, flags, ret);
+}
+
+int dropin_user_record_by_uid(uid_t uid, const char *path, UserDBFlags flags, UserRecord **ret) {
+        _cleanup_free_ char *found_path = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        int r;
+
+        /* Loads the drop-in user record for the specified UID. If path is specified that file is opened
+         * directly, otherwise a file "<uid>.user" is searched for in the userdb drop-in directories.
+         * Returns -ESRCH if there's no such file. */
+
+        assert(uid_is_valid(uid));
+
+        if (!path) {
+                char fn[DECIMAL_STR_MAX(uid_t) + STRLEN(".user") + 1];
+
+                /* Note that we don't bother to validate this as a filename, as this is generated from a
+                 * decimal integer, i.e. is definitely OK as a filename */
+                xsprintf(fn, UID_FMT ".user", uid);
+
+                r = search_and_fopen_nulstr(fn, "re", NULL, USERDB_DROPIN_DIR_NULSTR("userdb"), &f, &found_path);
+                if (r < 0)
+                        return r == -ENOENT ? -ESRCH : r;
+
+                path = found_path;
+        } else {
+                f = fopen(path, "re");
+                if (!f) {
+                        if (errno == ENOENT)
+                                return -ESRCH;
+
+                        return -errno;
+                }
+        }
+
+        return load_user(f, path, NULL, uid, flags, ret);
+}
+
+static int load_group(
+                FILE *f,
+                const char *path,
+                const char *name,
+                gid_t gid,
+                UserDBFlags flags,
+                GroupRecord **ret) {
+
+        _cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
+        _cleanup_(group_record_unrefp) GroupRecord *g = NULL;
+        UserRecordLoadFlags load_flags;
+        bool have_privileged = false;
+        int r;
+
+        /* Parses the group record from the already opened drop-in file f (path is its file name, and
+         * may be NULL), optionally merges in the privileged companion file, and validates the result
+         * against the expected name and/or GID, if specified. Returns -EINVAL if the record doesn't
+         * match. The record is marked incomplete if the privileged section could not be taken into
+         * account. If ret is NULL the record is only checked, but not returned. */
+
+        assert(f);
+
+        r = json_parse_file(f, path, 0, &v, NULL, NULL);
+        if (r < 0)
+                return r;
+
+        if (!FLAGS_SET(flags, USERDB_SUPPRESS_SHADOW) && path && (name || gid_is_valid(gid))) {
+                _cleanup_(json_variant_unrefp) JsonVariant *privileged_v = NULL;
+                _cleanup_free_ char *d = NULL, *j = NULL;
+
+                /* Load the "privileged" section from the .group-privileged companion file placed next to
+                 * the main file, unless USERDB_SUPPRESS_SHADOW is set. */
+
+                r = path_extract_directory(path, &d);
+                if (r < 0)
+                        return r;
+
+                if (name) {
+                        j = strjoin(d, "/", name, ".group-privileged");
+                        if (!j)
+                                return -ENOMEM;
+                } else {
+                        assert(gid_is_valid(gid));
+                        if (asprintf(&j, "%s/" GID_FMT ".group-privileged", d, gid) < 0)
+                                return -ENOMEM;
+                }
+
+                r = json_parse_file(NULL, j, JSON_PARSE_SENSITIVE, &privileged_v, NULL, NULL);
+                if (r >= 0) {
+                        r = json_variant_merge_object(&v, privileged_v);
+                        if (r < 0)
+                                return r;
+
+                        have_privileged = true;
+                } else if (r == -ENOENT)
+                        have_privileged = true; /* if the privileged file doesn't exist, we are complete */
+                else if (!ERRNO_IS_NEG_PRIVILEGE(r))
+                        return r;
+
+                /* Otherwise we lack the privileges to read the file, and have_privileged remains false */
+        }
+
+        g = group_record_new();
+        if (!g)
+                return -ENOMEM;
+
+        load_flags =
+                USER_RECORD_REQUIRE_REGULAR|
+                USER_RECORD_ALLOW_PER_MACHINE|
+                USER_RECORD_ALLOW_BINDING|
+                USER_RECORD_ALLOW_SIGNATURE|
+                USER_RECORD_PERMISSIVE;
+        if (have_privileged)
+                load_flags |= USER_RECORD_ALLOW_PRIVILEGED;
+
+        r = group_record_load(g, v, load_flags);
+        if (r < 0)
+                return r;
+
+        /* The record must actually describe the group we were asked to look up */
+        if (name && !streq_ptr(name, g->group_name))
+                return -EINVAL;
+
+        if (gid_is_valid(gid) && gid != g->gid)
+                return -EINVAL;
+
+        g->incomplete = !have_privileged;
+
+        if (ret)
+                *ret = TAKE_PTR(g);
+
+        return 0;
+}
+
+int dropin_group_record_by_name(const char *name, const char *path, UserDBFlags flags, GroupRecord **ret) {
+        _cleanup_free_ char *found_path = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        int r;
+
+        /* Loads the drop-in group record for the specified group name. If path is specified that file
+         * is opened directly, otherwise a file "<name>.group" is searched for in the userdb drop-in
+         * directories. Returns -ESRCH if there's no such file. */
+
+        assert(name);
+
+        if (!path) {
+                const char *fn = strjoina(name, ".group");
+
+                /* Doesn't qualify as valid filename? Then it's definitely not provided as a drop-in */
+                if (!filename_is_valid(fn))
+                        return -ESRCH;
+
+                r = search_and_fopen_nulstr(fn, "re", NULL, USERDB_DROPIN_DIR_NULSTR("userdb"), &f, &found_path);
+                if (r < 0)
+                        return r == -ENOENT ? -ESRCH : r;
+
+                path = found_path;
+        } else {
+                f = fopen(path, "re");
+                if (!f) {
+                        if (errno == ENOENT)
+                                return -ESRCH;
+
+                        return -errno;
+                }
+        }
+
+        return load_group(f, path, name, GID_INVALID, flags, ret);
+}
+
+int dropin_group_record_by_gid(gid_t gid, const char *path, UserDBFlags flags, GroupRecord **ret) {
+        _cleanup_free_ char *found_path = NULL;
+        _cleanup_fclose_ FILE *f = NULL;
+        int r;
+
+        /* Loads the drop-in group record for the specified GID. If path is specified that file is
+         * opened directly, otherwise a file "<gid>.group" is searched for in the userdb drop-in
+         * directories. Returns -ESRCH if there's no such file. */
+
+        assert(gid_is_valid(gid));
+
+        if (!path) {
+                char fn[DECIMAL_STR_MAX(gid_t) + STRLEN(".group") + 1];
+
+                xsprintf(fn, GID_FMT ".group", gid);
+
+                r = search_and_fopen_nulstr(fn, "re", NULL, USERDB_DROPIN_DIR_NULSTR("userdb"), &f, &found_path);
+                if (r < 0)
+                        return r == -ENOENT ? -ESRCH : r;
+
+                path = found_path;
+        } else {
+                f = fopen(path, "re");
+                if (!f) {
+                        if (errno == ENOENT)
+                                return -ESRCH;
+
+                        return -errno;
+                }
+        }
+
+        return load_group(f, path, NULL, gid, flags, ret);
+}
diff --git a/src/shared/userdb-dropin.h b/src/shared/userdb-dropin.h
new file mode 100644
index 0000000..3bd1b9c
--- /dev/null
+++ b/src/shared/userdb-dropin.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "constants.h"
+#include "group-record.h"
+#include "user-record.h"
+#include "userdb.h"
+
+/* Expands to a string literal listing the drop-in directories for the subdirectory name n, each entry
+ * terminated by a NUL byte: /etc/, /run/, /run/host/, /usr/local/lib/ and /usr/lib/, in this order. n
+ * must be a string literal, as the result is built by literal concatenation.
+ *
+ * This could be put together with CONF_PATHS_NULSTR, with the exception of the /run/host/ part in the
+ * middle, which we use here, but not otherwise. */
+#define USERDB_DROPIN_DIR_NULSTR(n) \
+ "/etc/" n "\0" \
+ "/run/" n "\0" \
+ "/run/host/" n "\0" \
+ "/usr/local/lib/" n "\0" \
+ "/usr/lib/" n "\0"
+
+int dropin_user_record_by_name(const char *name, const char *path, UserDBFlags flags, UserRecord **ret);
+int dropin_user_record_by_uid(uid_t uid, const char *path, UserDBFlags flags, UserRecord **ret);
+
+int dropin_group_record_by_name(const char *name, const char *path, UserDBFlags flags, GroupRecord **ret);
+int dropin_group_record_by_gid(gid_t gid, const char *path, UserDBFlags flags, GroupRecord **ret);
diff --git a/src/shared/userdb.c b/src/shared/userdb.c
new file mode 100644
index 0000000..f60d48a
--- /dev/null
+++ b/src/shared/userdb.c
@@ -0,0 +1,1465 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <sys/auxv.h>
+
+#include "conf-files.h"
+#include "dirent-util.h"
+#include "dlfcn-util.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "format-util.h"
+#include "missing_syscall.h"
+#include "parse-util.h"
+#include "set.h"
+#include "socket-util.h"
+#include "strv.h"
+#include "user-record-nss.h"
+#include "user-util.h"
+#include "userdb-dropin.h"
+#include "userdb.h"
+#include "varlink.h"
+
+ /* Hash operations for UserDBIterator.links: keys are hashed/compared with trivial_hash_func() and
+  * trivial_compare_func(), and varlink_unref() is registered as value destructor. */
+ DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR(link_hash_ops, void, trivial_hash_func, trivial_compare_func, Varlink, varlink_unref);
+
+ /* Selects what kind of entry a UserDBIterator looks up. */
+ typedef enum LookupWhat {
+ LOOKUP_USER, /* user records */
+ LOOKUP_GROUP, /* group records */
+ LOOKUP_MEMBERSHIP, /* user name/group name membership pairs */
+ _LOOKUP_WHAT_MAX, /* upper bound, used for range checks in userdb_iterator_new() */
+ } LookupWhat;
+
+ /* State of one lookup or enumeration: the varlink queries still in flight, the client-side fallbacks (NSS,
+  * drop-in files, synthesized records) still to be consulted, and the most recently received result. */
+ struct UserDBIterator {
+ LookupWhat what; /* kind of entry this iterator looks up */
+ UserDBFlags flags; /* flags passed to userdb_iterator_new() */
+ Set *links; /* Varlink connections that still have a query pending */
+ bool nss_covered:1; /* a service we connected to already handles NSS */
+ bool nss_iterating:1; /* setpwent()/setgrent() was called, endpwent()/endgrent() is still due */
+ bool dropin_covered:1; /* a service we connected to already handles drop-in records */
+ bool synthesize_root:1; /* no "root" entry seen yet, it may still have to be synthesized */
+ bool synthesize_nobody:1; /* no "nobody" entry seen yet, it may still have to be synthesized */
+ bool nss_systemd_blocked:1; /* userdb_block_nss_systemd(true) was called, undone in userdb_iterator_free() */
+ char **dropins; /* drop-in files found via conf_files_list_nulstr() */
+ size_t current_dropin; /* index of the next entry of .dropins to look at */
+ int error; /* positive errno-style code recorded by userdb_on_query_reply(), 0 if none */
+ unsigned n_found; /* number of entries found so far */
+ sd_event *event; /* event loop the links are attached to, allocated by userdb_connect() when first needed */
+ UserRecord *found_user; /* when .what == LOOKUP_USER */
+ GroupRecord *found_group; /* when .what == LOOKUP_GROUP */
+
+ char *found_user_name, *found_group_name; /* when .what == LOOKUP_MEMBERSHIP */
+ char **members_of_group; /* copy of the NSS group's member list, set up by membershipdb_by_group() */
+ size_t index_members_of_group; /* index into .members_of_group, initialized to 0 */
+ char *filter_user_name, *filter_group_name; /* copy of the name passed to membershipdb_by_user()/membershipdb_by_group() */
+ };
+
+ /* Releases an iterator and everything it owns: pending varlink links, the drop-in list, any result that was
+  * not picked up yet, an open NSS enumeration, the event loop and (if taken) the nss-systemd block.
+  * Accepts NULL. Always returns NULL. */
+ UserDBIterator* userdb_iterator_free(UserDBIterator *iterator) {
+         if (!iterator)
+                 return NULL;
+
+         set_free(iterator->links);
+         strv_free(iterator->dropins);
+
+         /* Drop the per-lookup-type state and close the NSS enumeration matching the lookup type */
+         if (iterator->what == LOOKUP_USER) {
+                 user_record_unref(iterator->found_user);
+
+                 if (iterator->nss_iterating)
+                         endpwent();
+
+         } else if (iterator->what == LOOKUP_GROUP) {
+                 group_record_unref(iterator->found_group);
+
+                 if (iterator->nss_iterating)
+                         endgrent();
+
+         } else if (iterator->what == LOOKUP_MEMBERSHIP) {
+                 free(iterator->found_user_name);
+                 free(iterator->found_group_name);
+                 strv_free(iterator->members_of_group);
+                 free(iterator->filter_user_name);
+                 free(iterator->filter_group_name);
+
+                 if (iterator->nss_iterating)
+                         endgrent();
+
+         } else
+                 assert_not_reached();
+
+         sd_event_unref(iterator->event);
+
+         /* Undo the block taken in userdb_iterator_block_nss_systemd() */
+         if (iterator->nss_systemd_blocked)
+                 assert_se(userdb_block_nss_systemd(false) >= 0);
+
+         return mfree(iterator);
+ }
+
+ /* Allocates a fresh iterator for the given lookup type. All fields start out zeroed, except that root and
+  * nobody synthesis is enabled unless USERDB_DONT_SYNTHESIZE is set. Returns NULL on allocation failure. */
+ static UserDBIterator* userdb_iterator_new(LookupWhat what, UserDBFlags flags) {
+         bool synthesize = !FLAGS_SET(flags, USERDB_DONT_SYNTHESIZE);
+         UserDBIterator *it;
+
+         assert(what >= 0);
+         assert(what < _LOOKUP_WHAT_MAX);
+
+         it = new(UserDBIterator, 1);
+         if (!it)
+                 return NULL;
+
+         *it = (UserDBIterator) {
+                 .what = what,
+                 .flags = flags,
+                 .synthesize_root = synthesize,
+                 .synthesize_nobody = synthesize,
+         };
+
+         return it;
+ }
+
+ /* Calls userdb_block_nss_systemd(true) once per iterator and remembers that this was done, so that
+  * userdb_iterator_free() can undo it. Returns 1 if the block was newly taken, 0 if it was already in
+  * place, negative on failure. */
+ static int userdb_iterator_block_nss_systemd(UserDBIterator *iterator) {
+         int r;
+
+         assert(iterator);
+
+         if (!iterator->nss_systemd_blocked) {
+                 r = userdb_block_nss_systemd(true);
+                 if (r < 0)
+                         return r;
+
+                 iterator->nss_systemd_blocked = true;
+                 return 1;
+         }
+
+         return 0;
+ }
+
+ /* Destination for JSON dispatching of a GetUserRecord/GetGroupRecord reply in userdb_on_query_reply(). */
+ struct user_group_data {
+ JsonVariant *record; /* the "record" field of the reply */
+ bool incomplete; /* the "incomplete" field of the reply */
+ };
+
+ /* Cleanup helper for struct user_group_data: drops the reference to the record. */
+ static void user_group_data_done(struct user_group_data *d) {
+ json_variant_unref(d->record);
+ }
+
+ /* Destination for JSON dispatching of a GetMemberships reply in userdb_on_query_reply(). */
+ struct membership_data {
+ char *user_name; /* the "userName" field of the reply */
+ char *group_name; /* the "groupName" field of the reply */
+ };
+
+ /* Cleanup helper for struct membership_data: frees both strings. */
+ static void membership_data_done(struct membership_data *d) {
+ free(d->user_name);
+ free(d->group_name);
+ }
+
+ /* Varlink reply callback for all queries started via userdb_connect().
+  *
+  * 'userdata' is the UserDBIterator the link belongs to. A successful reply is parsed according to
+  * iterator->what and stored in the matching found_* field(s) of the iterator, from where userdb_process()
+  * picks it up. If the reply is flagged VARLINK_REPLY_CONTINUES the link is kept for further replies.
+  * Otherwise (final reply, error reply, or a reply we cannot parse) the link is removed from iterator->links
+  * and released, and the outcome is recorded in iterator->error as a positive errno-style code.
+  *
+  * Always returns 0. */
+ static int userdb_on_query_reply(
+                 Varlink *link,
+                 JsonVariant *parameters,
+                 const char *error_id,
+                 VarlinkReplyFlags flags,
+                 void *userdata) {
+
+         UserDBIterator *iterator = ASSERT_PTR(userdata);
+         int r;
+
+         if (error_id) {
+                 log_debug("Got lookup error: %s", error_id);
+
+                 /* Translate the varlink error into an errno-style code */
+                 if (STR_IN_SET(error_id,
+                                "io.systemd.UserDatabase.NoRecordFound",
+                                "io.systemd.UserDatabase.ConflictingRecordFound"))
+                         r = -ESRCH;
+                 else if (streq(error_id, "io.systemd.UserDatabase.ServiceNotAvailable"))
+                         r = -EHOSTDOWN;
+                 else if (streq(error_id, "io.systemd.UserDatabase.EnumerationNotSupported"))
+                         r = -EOPNOTSUPP;
+                 else if (streq(error_id, VARLINK_ERROR_TIMEOUT))
+                         r = -ETIMEDOUT;
+                 else
+                         r = -EIO;
+
+                 goto finish;
+         }
+
+         switch (iterator->what) {
+
+         case LOOKUP_USER: {
+                 _cleanup_(user_group_data_done) struct user_group_data user_data = {};
+
+                 static const JsonDispatch dispatch_table[] = {
+                         { "record",     _JSON_VARIANT_TYPE_INVALID, json_dispatch_variant, offsetof(struct user_group_data, record),     0 },
+                         { "incomplete", JSON_VARIANT_BOOLEAN,       json_dispatch_boolean, offsetof(struct user_group_data, incomplete), 0 },
+                         {}
+                 };
+                 _cleanup_(user_record_unrefp) UserRecord *hr = NULL;
+
+                 /* The previous result must have been picked up before the next reply is processed */
+                 assert_se(!iterator->found_user);
+
+                 r = json_dispatch(parameters, dispatch_table, 0, &user_data);
+                 if (r < 0)
+                         goto finish;
+
+                 if (!user_data.record) {
+                         r = log_debug_errno(SYNTHETIC_ERRNO(EIO), "Reply is missing record key");
+                         goto finish;
+                 }
+
+                 hr = user_record_new();
+                 if (!hr) {
+                         r = -ENOMEM;
+                         goto finish;
+                 }
+
+                 r = user_record_load(hr, user_data.record, USER_RECORD_LOAD_REFUSE_SECRET|USER_RECORD_PERMISSIVE);
+                 if (r < 0)
+                         goto finish;
+
+                 if (!hr->service) {
+                         r = log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "User record does not carry service information, refusing.");
+                         goto finish;
+                 }
+
+                 hr->incomplete = user_data.incomplete;
+
+                 /* We match the root user by the name since the name is our primary key. We match the nobody
+                  * user by UID though, since the name might differ on OSes */
+                 if (streq_ptr(hr->user_name, "root"))
+                         iterator->synthesize_root = false;
+                 if (hr->uid == UID_NOBODY)
+                         iterator->synthesize_nobody = false;
+
+                 iterator->found_user = TAKE_PTR(hr);
+                 iterator->n_found++;
+
+                 /* More stuff coming? then let's just exit cleanly here */
+                 if (FLAGS_SET(flags, VARLINK_REPLY_CONTINUES))
+                         return 0;
+
+                 /* Otherwise, let's remove this link and exit cleanly then */
+                 r = 0;
+                 goto finish;
+         }
+
+         case LOOKUP_GROUP: {
+                 _cleanup_(user_group_data_done) struct user_group_data group_data = {};
+
+                 static const JsonDispatch dispatch_table[] = {
+                         { "record",     _JSON_VARIANT_TYPE_INVALID, json_dispatch_variant, offsetof(struct user_group_data, record),     0 },
+                         { "incomplete", JSON_VARIANT_BOOLEAN,       json_dispatch_boolean, offsetof(struct user_group_data, incomplete), 0 },
+                         {}
+                 };
+                 _cleanup_(group_record_unrefp) GroupRecord *g = NULL;
+
+                 assert_se(!iterator->found_group);
+
+                 r = json_dispatch(parameters, dispatch_table, 0, &group_data);
+                 if (r < 0)
+                         goto finish;
+
+                 if (!group_data.record) {
+                         r = log_debug_errno(SYNTHETIC_ERRNO(EIO), "Reply is missing record key");
+                         goto finish;
+                 }
+
+                 g = group_record_new();
+                 if (!g) {
+                         r = -ENOMEM;
+                         goto finish;
+                 }
+
+                 r = group_record_load(g, group_data.record, USER_RECORD_LOAD_REFUSE_SECRET|USER_RECORD_PERMISSIVE);
+                 if (r < 0)
+                         goto finish;
+
+                 if (!g->service) {
+                         r = log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Group record does not carry service information, refusing.");
+                         goto finish;
+                 }
+
+                 g->incomplete = group_data.incomplete;
+
+                 /* Same matching rules as for users: root by name, nobody by numeric ID */
+                 if (streq_ptr(g->group_name, "root"))
+                         iterator->synthesize_root = false;
+                 if (g->gid == GID_NOBODY)
+                         iterator->synthesize_nobody = false;
+
+                 iterator->found_group = TAKE_PTR(g);
+                 iterator->n_found++;
+
+                 if (FLAGS_SET(flags, VARLINK_REPLY_CONTINUES))
+                         return 0;
+
+                 r = 0;
+                 goto finish;
+         }
+
+         case LOOKUP_MEMBERSHIP: {
+                 _cleanup_(membership_data_done) struct membership_data membership_data = {};
+
+                 static const JsonDispatch dispatch_table[] = {
+                         { "userName",  JSON_VARIANT_STRING, json_dispatch_user_group_name, offsetof(struct membership_data, user_name),  JSON_RELAX },
+                         { "groupName", JSON_VARIANT_STRING, json_dispatch_user_group_name, offsetof(struct membership_data, group_name), JSON_RELAX },
+                         {}
+                 };
+
+                 assert(!iterator->found_user_name);
+                 assert(!iterator->found_group_name);
+
+                 r = json_dispatch(parameters, dispatch_table, 0, &membership_data);
+                 if (r < 0)
+                         goto finish;
+
+                 /* Refuse replies that carry only one half of the pair. userdb_process() only hands out a
+                  * membership when both names are set, hence a half-filled result would never be consumed
+                  * and the next reply on this link would trip the assertions above. */
+                 if (!membership_data.user_name || !membership_data.group_name) {
+                         r = log_debug_errno(SYNTHETIC_ERRNO(EIO), "Reply is missing userName or groupName key");
+                         goto finish;
+                 }
+
+                 iterator->found_user_name = TAKE_PTR(membership_data.user_name);
+                 iterator->found_group_name = TAKE_PTR(membership_data.group_name);
+                 iterator->n_found++;
+
+                 if (FLAGS_SET(flags, VARLINK_REPLY_CONTINUES))
+                         return 0;
+
+                 r = 0;
+                 goto finish;
+         }
+
+         default:
+                 assert_not_reached();
+         }
+
+ finish:
+         /* If we got one ESRCH, let that win. This way when we do a wild dump we won't be tripped up by bad
+          * errors if at least one connection ended cleanly. Note that the code is stored with positive
+          * sign here. */
+         if (r == -ESRCH || iterator->error == 0)
+                 iterator->error = -r;
+
+         /* We are done with this connection: take it out of the set of pending links and release it */
+         assert_se(set_remove(iterator->links, link) == link);
+         link = varlink_unref(link);
+         return 0;
+ }
+
+ /* Connects to the varlink service socket at 'path', attaches the connection to the iterator's event loop
+  * (allocating the loop if there is none yet), issues 'method' with 'query' as parameters and adds the
+  * connection to iterator->links. If 'more' is true the call is issued with varlink_observe(), otherwise
+  * with varlink_invoke(). Replies are delivered to userdb_on_query_reply().
+  *
+  * Returns the result of set_ensure_consume() on success, a negative errno-style code on failure. */
+ static int userdb_connect(
+                 UserDBIterator *iterator,
+                 const char *path,
+                 const char *method,
+                 bool more,
+                 JsonVariant *query) {
+
+         _cleanup_(varlink_unrefp) Varlink *conn = NULL;
+         int r;
+
+         assert(iterator);
+         assert(path);
+         assert(method);
+
+         r = varlink_connect_address(&conn, path);
+         if (r < 0)
+                 return log_debug_errno(r, "Unable to connect to %s: %m", path);
+
+         varlink_set_userdata(conn, iterator);
+
+         /* The event loop is shared by all connections of this iterator and created on first use */
+         if (!iterator->event) {
+                 r = sd_event_new(&iterator->event);
+                 if (r < 0)
+                         return log_debug_errno(r, "Unable to allocate event loop: %m");
+         }
+
+         r = varlink_attach_event(conn, iterator->event, SD_EVENT_PRIORITY_NORMAL);
+         if (r < 0)
+                 return log_debug_errno(r, "Failed to attach varlink connection to event loop: %m");
+
+         (void) varlink_set_description(conn, path);
+
+         r = varlink_bind_reply(conn, userdb_on_query_reply);
+         if (r < 0)
+                 return log_debug_errno(r, "Failed to bind reply callback: %m");
+
+         r = more ? varlink_observe(conn, method, query)
+                  : varlink_invoke(conn, method, query);
+         if (r < 0)
+                 return log_debug_errno(r, "Failed to invoke varlink method: %m");
+
+         /* From here on the set owns the connection */
+         r = set_ensure_consume(&iterator->links, &link_hash_ops, TAKE_PTR(conn));
+         if (r < 0)
+                 return log_debug_errno(r, "Failed to add varlink connection to set: %m");
+
+         return r;
+ }
+
+ /* Starts the varlink query 'method' on the userdb services found in /run/systemd/userdb/.
+  *
+  * The multiplexer service is tried first, unless one of the flags tested below or the environment rules it
+  * out; if connecting to it works no other service is contacted. Otherwise every socket in the directory is
+  * tried, subject to the exclusion flags and to $SYSTEMD_BYPASS_USERDB/$SYSTEMD_ONLY_USERDB. Each query gets
+  * a "service" field set to the name of the service it is sent to. iterator->nss_covered and
+  * iterator->dropin_covered are set when a service that handles NSS/drop-ins could be connected to.
+  *
+  * Returns 0 if at least one query was started, -ENOLINK if varlink lookups are disabled, -ESRCH if the
+  * directory is missing or no service could be used, or another negative errno-style code. */
+ static int userdb_start_query(
+                 UserDBIterator *iterator,
+                 const char *method,
+                 bool more,
+                 JsonVariant *query,
+                 UserDBFlags flags) {
+
+         _cleanup_strv_free_ char **skip_services = NULL, **allowed_services = NULL;
+         _cleanup_closedir_ DIR *dir = NULL;
+         const char *env;
+         bool try_multiplexer;
+         int r, first_error = 0;
+
+         assert(iterator);
+         assert(method);
+
+         if (FLAGS_SET(flags, USERDB_EXCLUDE_VARLINK))
+                 return -ENOLINK;
+
+         /* $SYSTEMD_BYPASS_USERDB is either a boolean, or a colon-separated list of services to skip */
+         env = getenv("SYSTEMD_BYPASS_USERDB");
+         if (env) {
+                 r = parse_boolean(env);
+                 if (r > 0)
+                         return -ENOLINK;
+                 if (r < 0) {
+                         skip_services = strv_split(env, ":");
+                         if (!skip_services)
+                                 return -ENOMEM;
+                 }
+         }
+
+         /* $SYSTEMD_ONLY_USERDB is a colon-separated list of the only services to talk to */
+         env = getenv("SYSTEMD_ONLY_USERDB");
+         if (env) {
+                 allowed_services = strv_split(env, ":");
+                 if (!allowed_services)
+                         return -ENOMEM;
+         }
+
+         /* First, let's talk to the multiplexer, if we can */
+         try_multiplexer =
+                 (flags & (USERDB_AVOID_MULTIPLEXER|USERDB_EXCLUDE_DYNAMIC_USER|USERDB_EXCLUDE_NSS|USERDB_EXCLUDE_DROPIN|USERDB_DONT_SYNTHESIZE)) == 0 &&
+                 !strv_contains(skip_services, "io.systemd.Multiplexer") &&
+                 (!allowed_services || strv_contains(allowed_services, "io.systemd.Multiplexer"));
+
+         if (try_multiplexer) {
+                 _cleanup_(json_variant_unrefp) JsonVariant *patched_query = json_variant_ref(query);
+
+                 r = json_variant_set_field_string(&patched_query, "service", "io.systemd.Multiplexer");
+                 if (r < 0)
+                         return log_debug_errno(r, "Unable to set service JSON field: %m");
+
+                 r = userdb_connect(iterator, "/run/systemd/userdb/io.systemd.Multiplexer", method, more, patched_query);
+                 if (r >= 0) {
+                         iterator->nss_covered = true; /* The multiplexer does NSS */
+                         iterator->dropin_covered = true; /* It also handles drop-in stuff */
+                         return 0;
+                 }
+         }
+
+         dir = opendir("/run/systemd/userdb/");
+         if (!dir)
+                 return errno == ENOENT ? -ESRCH : -errno;
+
+         FOREACH_DIRENT(de, dir, return -errno) {
+                 _cleanup_(json_variant_unrefp) JsonVariant *patched_query = NULL;
+                 _cleanup_free_ char *socket_path = NULL;
+                 const char *service = de->d_name;
+                 bool is_nss, is_dropin;
+
+                 is_nss = streq(service, "io.systemd.NameServiceSwitch");
+                 is_dropin = streq(service, "io.systemd.DropIn");
+
+                 if (streq(service, "io.systemd.Multiplexer")) /* We already tried this above, don't try this again */
+                         continue;
+
+                 if (FLAGS_SET(flags, USERDB_EXCLUDE_DYNAMIC_USER) &&
+                     streq(service, "io.systemd.DynamicUser"))
+                         continue;
+
+                 /* Avoid NSS if this is requested. Note that we also skip NSS when we were asked to skip the
+                  * multiplexer, since in that case it's safer to do NSS in the client side emulation below
+                  * (and when we run as part of systemd-userdbd.service we don't want to talk to ourselves
+                  * anyway). */
+                 if (is_nss && (flags & (USERDB_EXCLUDE_NSS|USERDB_AVOID_MULTIPLEXER)))
+                         continue;
+
+                 /* Similar for the drop-in service */
+                 if (is_dropin && (flags & (USERDB_EXCLUDE_DROPIN|USERDB_AVOID_MULTIPLEXER)))
+                         continue;
+
+                 if (strv_contains(skip_services, service))
+                         continue;
+
+                 if (allowed_services && !strv_contains(allowed_services, service))
+                         continue;
+
+                 socket_path = path_join("/run/systemd/userdb/", service);
+                 if (!socket_path)
+                         return -ENOMEM;
+
+                 patched_query = json_variant_ref(query);
+                 r = json_variant_set_field_string(&patched_query, "service", service);
+                 if (r < 0)
+                         return log_debug_errno(r, "Unable to set service JSON field: %m");
+
+                 r = userdb_connect(iterator, socket_path, method, more, patched_query);
+
+                 /* Turn off fallback NSS + dropin if we found the NSS/dropin service and could connect
+                  * to it */
+                 if (r >= 0 && is_nss)
+                         iterator->nss_covered = true;
+                 if (r >= 0 && is_dropin)
+                         iterator->dropin_covered = true;
+
+                 /* Remember the first connection failure */
+                 if (r < 0 && first_error == 0)
+                         first_error = r;
+         }
+
+         /* Propagate the first error we saw if we couldn't connect to anything */
+         if (set_isempty(iterator->links))
+                 return first_error < 0 ? first_error : -ESRCH;
+
+         /* We connected to some services, in this case, ignore the ones we failed on */
+         return 0;
+ }
+
+ /* Runs the iterator's event loop until userdb_on_query_reply() has stored a result matching
+  * iterator->what, then hands that result to the caller.
+  *
+  * The output parameter matching the lookup type receives the result (or the result is released if that
+  * parameter is NULL); all other non-NULL output parameters are set to NULL. Returns 0 when a result was
+  * handed out. Once no link is pending anymore, returns the recorded error as negative errno-style code,
+  * or -ESRCH if none was recorded. Also returns -ESRCH if there is no event loop, and passes on failures
+  * of sd_event_run(). */
+ static int userdb_process(
+                 UserDBIterator *iterator,
+                 UserRecord **ret_user_record,
+                 GroupRecord **ret_group_record,
+                 char **ret_user_name,
+                 char **ret_group_name) {
+
+         int r;
+
+         assert(iterator);
+
+         for (;;) {
+                 switch (iterator->what) {
+
+                 case LOOKUP_USER:
+                         if (!iterator->found_user)
+                                 break;
+
+                         if (ret_user_record)
+                                 *ret_user_record = TAKE_PTR(iterator->found_user);
+                         else
+                                 iterator->found_user = user_record_unref(iterator->found_user);
+
+                         if (ret_group_record)
+                                 *ret_group_record = NULL;
+                         if (ret_user_name)
+                                 *ret_user_name = NULL;
+                         if (ret_group_name)
+                                 *ret_group_name = NULL;
+
+                         return 0;
+
+                 case LOOKUP_GROUP:
+                         if (!iterator->found_group)
+                                 break;
+
+                         if (ret_group_record)
+                                 *ret_group_record = TAKE_PTR(iterator->found_group);
+                         else
+                                 iterator->found_group = group_record_unref(iterator->found_group);
+
+                         if (ret_user_record)
+                                 *ret_user_record = NULL;
+                         if (ret_user_name)
+                                 *ret_user_name = NULL;
+                         if (ret_group_name)
+                                 *ret_group_name = NULL;
+
+                         return 0;
+
+                 case LOOKUP_MEMBERSHIP:
+                         /* A membership is only complete if both names are known */
+                         if (!iterator->found_user_name || !iterator->found_group_name)
+                                 break;
+
+                         if (ret_user_name)
+                                 *ret_user_name = TAKE_PTR(iterator->found_user_name);
+                         else
+                                 iterator->found_user_name = mfree(iterator->found_user_name);
+
+                         if (ret_group_name)
+                                 *ret_group_name = TAKE_PTR(iterator->found_group_name);
+                         else
+                                 iterator->found_group_name = mfree(iterator->found_group_name);
+
+                         if (ret_user_record)
+                                 *ret_user_record = NULL;
+                         if (ret_group_record)
+                                 *ret_group_record = NULL;
+
+                         return 0;
+
+                 default:
+                         break;
+                 }
+
+                 /* Nothing queued. If no query is pending anymore we are done. */
+                 if (set_isempty(iterator->links))
+                         return iterator->error == 0 ? -ESRCH : -abs(iterator->error);
+
+                 if (!iterator->event)
+                         return -ESRCH;
+
+                 /* Wait for the next reply */
+                 r = sd_event_run(iterator->event, UINT64_MAX);
+                 if (r < 0)
+                         return r;
+         }
+ }
+
+ /* Builds the synthetic user record for "root" (UID 0, GID 0, home directory /root, disposition
+  * "intrinsic"). Returns the result of user_record_build(). */
+ static int synthetic_root_user_build(UserRecord **ret) {
+         int r;
+
+         r = user_record_build(
+                         ret,
+                         JSON_BUILD_OBJECT(
+                                         JSON_BUILD_PAIR("userName", JSON_BUILD_CONST_STRING("root")),
+                                         JSON_BUILD_PAIR("uid", JSON_BUILD_UNSIGNED(0)),
+                                         JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(0)),
+                                         JSON_BUILD_PAIR("homeDirectory", JSON_BUILD_CONST_STRING("/root")),
+                                         JSON_BUILD_PAIR("disposition", JSON_BUILD_CONST_STRING("intrinsic"))));
+         return r;
+ }
+
+ /* Builds the synthetic user record for the nobody user (NOBODY_USER_NAME, UID_NOBODY, GID_NOBODY, shell
+  * NOLOGIN, locked, disposition "intrinsic"). Returns the result of user_record_build(). */
+ static int synthetic_nobody_user_build(UserRecord **ret) {
+         int r;
+
+         r = user_record_build(
+                         ret,
+                         JSON_BUILD_OBJECT(
+                                         JSON_BUILD_PAIR("userName", JSON_BUILD_CONST_STRING(NOBODY_USER_NAME)),
+                                         JSON_BUILD_PAIR("uid", JSON_BUILD_UNSIGNED(UID_NOBODY)),
+                                         JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(GID_NOBODY)),
+                                         JSON_BUILD_PAIR("shell", JSON_BUILD_CONST_STRING(NOLOGIN)),
+                                         JSON_BUILD_PAIR("locked", JSON_BUILD_BOOLEAN(true)),
+                                         JSON_BUILD_PAIR("disposition", JSON_BUILD_CONST_STRING("intrinsic"))));
+         return r;
+ }
+
+ /* Looks up a user record by user name.
+  *
+  * Sources are consulted in this order, the first one that succeeds wins: the varlink services, the drop-in
+  * files (unless excluded by flag or already covered by a service), NSS (ditto), and finally the synthesized
+  * root/nobody records (unless USERDB_DONT_SYNTHESIZE is set).
+  *
+  * Returns -EINVAL for an invalid name, otherwise the result of the source that succeeded, or the error of
+  * the last source that was tried. */
+ int userdb_by_name(const char *name, UserDBFlags flags, UserRecord **ret) {
+         _cleanup_(userdb_iterator_freep) UserDBIterator *iterator = NULL;
+         _cleanup_(json_variant_unrefp) JsonVariant *query = NULL;
+         int r;
+
+         if (!valid_user_group_name(name, VALID_USER_RELAX))
+                 return -EINVAL;
+
+         r = json_build(&query, JSON_BUILD_OBJECT(
+                                        JSON_BUILD_PAIR("userName", JSON_BUILD_STRING(name))));
+         if (r < 0)
+                 return r;
+
+         iterator = userdb_iterator_new(LOOKUP_USER, flags);
+         if (!iterator)
+                 return -ENOMEM;
+
+         /* Varlink services first */
+         r = userdb_start_query(iterator, "io.systemd.UserDatabase.GetUserRecord", false, query, flags);
+         if (r >= 0)
+                 r = userdb_process(iterator, ret, NULL, NULL, NULL);
+         if (r >= 0)
+                 return r;
+
+         /* Client-side drop-in fallback */
+         if (!FLAGS_SET(flags, USERDB_EXCLUDE_DROPIN) && !iterator->dropin_covered) {
+                 r = dropin_user_record_by_name(name, NULL, flags, ret);
+                 if (r >= 0)
+                         return r;
+         }
+
+         /* Client-side NSS fallback */
+         if (!FLAGS_SET(flags, USERDB_EXCLUDE_NSS) && !iterator->nss_covered) {
+                 /* Make sure the NSS lookup doesn't recurse back to us. */
+                 r = userdb_iterator_block_nss_systemd(iterator);
+                 if (r >= 0)
+                         r = nss_user_record_by_name(name, !FLAGS_SET(flags, USERDB_SUPPRESS_SHADOW), ret);
+                 if (r >= 0)
+                         return r;
+         }
+
+         /* Last resort: synthesize the record */
+         if (!FLAGS_SET(flags, USERDB_DONT_SYNTHESIZE)) {
+                 if (streq(name, "root"))
+                         return synthetic_root_user_build(ret);
+
+                 if (streq(name, NOBODY_USER_NAME) && synthesize_nobody())
+                         return synthetic_nobody_user_build(ret);
+         }
+
+         return r;
+ }
+
+ /* Looks up a user record by UID.
+  *
+  * Sources are consulted in this order, the first one that succeeds wins: the varlink services, the drop-in
+  * files (unless excluded by flag or already covered by a service), NSS (ditto), and finally the synthesized
+  * root/nobody records (unless USERDB_DONT_SYNTHESIZE is set).
+  *
+  * Returns -EINVAL for an invalid UID, otherwise the result of the source that succeeded, or the error of
+  * the last source that was tried. */
+ int userdb_by_uid(uid_t uid, UserDBFlags flags, UserRecord **ret) {
+         _cleanup_(userdb_iterator_freep) UserDBIterator *iterator = NULL;
+         _cleanup_(json_variant_unrefp) JsonVariant *query = NULL;
+         int r;
+
+         if (!uid_is_valid(uid))
+                 return -EINVAL;
+
+         r = json_build(&query, JSON_BUILD_OBJECT(
+                                        JSON_BUILD_PAIR("uid", JSON_BUILD_UNSIGNED(uid))));
+         if (r < 0)
+                 return r;
+
+         iterator = userdb_iterator_new(LOOKUP_USER, flags);
+         if (!iterator)
+                 return -ENOMEM;
+
+         /* Varlink services first */
+         r = userdb_start_query(iterator, "io.systemd.UserDatabase.GetUserRecord", false, query, flags);
+         if (r >= 0)
+                 r = userdb_process(iterator, ret, NULL, NULL, NULL);
+         if (r >= 0)
+                 return r;
+
+         /* Client-side drop-in fallback */
+         if (!FLAGS_SET(flags, USERDB_EXCLUDE_DROPIN) && !iterator->dropin_covered) {
+                 r = dropin_user_record_by_uid(uid, NULL, flags, ret);
+                 if (r >= 0)
+                         return r;
+         }
+
+         /* Client-side NSS fallback, with nss-systemd blocked first */
+         if (!FLAGS_SET(flags, USERDB_EXCLUDE_NSS) && !iterator->nss_covered) {
+                 r = userdb_iterator_block_nss_systemd(iterator);
+                 if (r >= 0)
+                         r = nss_user_record_by_uid(uid, !FLAGS_SET(flags, USERDB_SUPPRESS_SHADOW), ret);
+                 if (r >= 0)
+                         return r;
+         }
+
+         /* Last resort: synthesize the record */
+         if (!FLAGS_SET(flags, USERDB_DONT_SYNTHESIZE)) {
+                 if (uid == 0)
+                         return synthetic_root_user_build(ret);
+
+                 if (uid == UID_NOBODY && synthesize_nobody())
+                         return synthetic_nobody_user_build(ret);
+         }
+
+         return r;
+ }
+
+ /* Prepares an iterator that enumerates all user records; entries are fetched with userdb_iterator_get().
+  *
+  * Starts the varlink enumeration and, for whatever the services do not cover (and the flags do not
+  * exclude), sets up client-side enumeration of NSS and of the ".user" drop-in files. A failure of the
+  * varlink query is only returned if neither fallback has anything to offer. On success the iterator is
+  * returned in *ret and 0 is returned. */
+ int userdb_all(UserDBFlags flags, UserDBIterator **ret) {
+         _cleanup_(userdb_iterator_freep) UserDBIterator *it = NULL;
+         int query_result, r;
+
+         assert(ret);
+
+         it = userdb_iterator_new(LOOKUP_USER, flags);
+         if (!it)
+                 return -ENOMEM;
+
+         query_result = userdb_start_query(it, "io.systemd.UserDatabase.GetUserRecord", true, NULL, flags);
+
+         if (!FLAGS_SET(flags, USERDB_EXCLUDE_NSS) && !(query_result >= 0 && it->nss_covered)) {
+                 r = userdb_iterator_block_nss_systemd(it);
+                 if (r < 0)
+                         return r;
+
+                 setpwent();
+                 it->nss_iterating = true;
+         }
+
+         if (!FLAGS_SET(flags, USERDB_EXCLUDE_DROPIN) && !(query_result >= 0 && it->dropin_covered)) {
+                 r = conf_files_list_nulstr(
+                                 &it->dropins,
+                                 ".user",
+                                 NULL,
+                                 CONF_FILES_REGULAR|CONF_FILES_FILTER_MASKED,
+                                 USERDB_DROPIN_DIR_NULSTR("userdb"));
+                 if (r < 0)
+                         log_debug_errno(r, "Failed to find user drop-ins, ignoring: %m");
+         }
+
+         /* Hand out the iterator if the query was started or any fallback source is available;
+          * otherwise propagate the IPC error */
+         if (query_result >= 0 || it->nss_iterating || !strv_isempty(it->dropins)) {
+                 *ret = TAKE_PTR(it);
+                 return 0;
+         }
+
+         return query_result;
+ }
+
+ /* Returns the next user record of an enumeration started with userdb_all().
+  *
+  * Entries are produced in this order: NSS entries (while client-side NSS enumeration is active), then
+  * records from the drop-in files, then records received from the varlink services, and finally the
+  * synthesized root and nobody records if they were not seen before.
+  *
+  * Returns >= 0 if an entry was produced. At the end of the enumeration a negative errno-style code is
+  * returned; this is -ESRCH if at least one entry was found before. */
+ int userdb_iterator_get(UserDBIterator *iterator, UserRecord **ret) {
+         int r;
+
+         assert(iterator);
+         assert(iterator->what == LOOKUP_USER);
+
+         if (iterator->nss_iterating) {
+                 struct passwd *pw;
+
+                 /* If NSS isn't covered elsewhere, let's iterate through it first, since it probably contains
+                  * the more traditional sources, which are probably good to show first. */
+
+                 /* getpwent() returns NULL both at the end of the database and on failure, and only sets
+                  * errno in the latter case. Reset it first so that a stale value is not mistaken for an
+                  * iteration failure below. */
+                 errno = 0;
+                 pw = getpwent();
+                 if (pw) {
+                         _cleanup_free_ char *buffer = NULL;
+                         bool incomplete = false;
+                         struct spwd spwd;
+
+                         if (streq_ptr(pw->pw_name, "root"))
+                                 iterator->synthesize_root = false;
+                         if (pw->pw_uid == UID_NOBODY)
+                                 iterator->synthesize_nobody = false;
+
+                         if (!FLAGS_SET(iterator->flags, USERDB_SUPPRESS_SHADOW)) {
+                                 r = nss_spwd_for_passwd(pw, &spwd, &buffer);
+                                 if (r < 0) {
+                                         log_debug_errno(r, "Failed to acquire shadow entry for user %s, ignoring: %m", pw->pw_name);
+                                         incomplete = ERRNO_IS_PRIVILEGE(r);
+                                 }
+                         } else {
+                                 /* Shadow data was not requested: mark r as failed so that no shadow
+                                  * entry is passed on below */
+                                 r = -EUCLEAN;
+                                 incomplete = true;
+                         }
+
+                         r = nss_passwd_to_user_record(pw, r >= 0 ? &spwd : NULL, ret);
+                         if (r < 0)
+                                 return r;
+
+                         if (ret)
+                                 (*ret)->incomplete = incomplete;
+
+                         iterator->n_found++;
+                         return r;
+                 }
+
+                 if (errno != 0)
+                         log_debug_errno(errno, "Failure to iterate NSS user database, ignoring: %m");
+
+                 iterator->nss_iterating = false;
+                 endpwent();
+         }
+
+         for (; iterator->dropins && iterator->dropins[iterator->current_dropin]; iterator->current_dropin++) {
+                 const char *i = iterator->dropins[iterator->current_dropin];
+                 _cleanup_free_ char *fn = NULL;
+                 uid_t uid;
+                 char *e;
+
+                 /* Next, let's add in the static drop-ins, which are quick to retrieve */
+
+                 r = path_extract_filename(i, &fn);
+                 if (r < 0)
+                         return r;
+
+                 e = endswith(fn, ".user"); /* not actually a .user file? Then skip to next */
+                 if (!e)
+                         continue;
+
+                 *e = 0; /* Chop off suffix */
+
+                 if (parse_uid(fn, &uid) < 0) /* not a UID .user file? Then skip to next */
+                         continue;
+
+                 r = dropin_user_record_by_uid(uid, i, iterator->flags, ret);
+                 if (r < 0) {
+                         log_debug_errno(r, "Failed to parse user record for UID " UID_FMT ", ignoring: %m", uid);
+                         continue; /* If we failed to parse this record, let's suppress it from enumeration,
+                                    * and continue with the next record. Maybe someone is dropping in files
+                                    * and only partially wrote this one. */
+                 }
+
+                 iterator->current_dropin++; /* make sure on the next call of userdb_iterator_get() we continue with the next dropin */
+                 iterator->n_found++;
+                 return 0;
+         }
+
+         /* Then, let's return the users provided by varlink IPC */
+         r = userdb_process(iterator, ret, NULL, NULL, NULL);
+         if (r < 0) {
+
+                 /* Finally, synthesize root + nobody if not done yet */
+                 if (iterator->synthesize_root) {
+                         iterator->synthesize_root = false;
+                         iterator->n_found++;
+                         return synthetic_root_user_build(ret);
+                 }
+
+                 if (iterator->synthesize_nobody) {
+                         iterator->synthesize_nobody = false;
+                         iterator->n_found++;
+                         return synthetic_nobody_user_build(ret);
+                 }
+
+                 /* if we found at least one entry, then ignore errors and indicate that we reached the end */
+                 if (iterator->n_found > 0)
+                         return -ESRCH;
+         }
+
+         return r;
+ }
+
+ /* Builds the synthetic group record for "root" (GID 0, disposition "intrinsic"). Returns the result of
+  * group_record_build(). */
+ static int synthetic_root_group_build(GroupRecord **ret) {
+         int r;
+
+         r = group_record_build(
+                         ret,
+                         JSON_BUILD_OBJECT(
+                                         JSON_BUILD_PAIR("groupName", JSON_BUILD_CONST_STRING("root")),
+                                         JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(0)),
+                                         JSON_BUILD_PAIR("disposition", JSON_BUILD_CONST_STRING("intrinsic"))));
+         return r;
+ }
+
+ /* Builds the synthetic group record for the nobody group (NOBODY_GROUP_NAME, GID_NOBODY, disposition
+  * "intrinsic"). Returns the result of group_record_build(). */
+ static int synthetic_nobody_group_build(GroupRecord **ret) {
+         int r;
+
+         r = group_record_build(
+                         ret,
+                         JSON_BUILD_OBJECT(
+                                         JSON_BUILD_PAIR("groupName", JSON_BUILD_CONST_STRING(NOBODY_GROUP_NAME)),
+                                         JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(GID_NOBODY)),
+                                         JSON_BUILD_PAIR("disposition", JSON_BUILD_CONST_STRING("intrinsic"))));
+         return r;
+ }
+
+ /* Looks up a group record by group name.
+  *
+  * Sources are consulted in this order, the first one that succeeds wins: the varlink services, the drop-in
+  * files (unless excluded by flag or already covered by a service), NSS (ditto), and finally the synthesized
+  * root/nobody records (unless USERDB_DONT_SYNTHESIZE is set).
+  *
+  * Returns -EINVAL for an invalid name, otherwise the result of the source that succeeded, or the error of
+  * the last source that was tried. */
+ int groupdb_by_name(const char *name, UserDBFlags flags, GroupRecord **ret) {
+         _cleanup_(userdb_iterator_freep) UserDBIterator *iterator = NULL;
+         _cleanup_(json_variant_unrefp) JsonVariant *query = NULL;
+         int r;
+
+         if (!valid_user_group_name(name, VALID_USER_RELAX))
+                 return -EINVAL;
+
+         r = json_build(&query, JSON_BUILD_OBJECT(
+                                        JSON_BUILD_PAIR("groupName", JSON_BUILD_STRING(name))));
+         if (r < 0)
+                 return r;
+
+         iterator = userdb_iterator_new(LOOKUP_GROUP, flags);
+         if (!iterator)
+                 return -ENOMEM;
+
+         /* Varlink services first */
+         r = userdb_start_query(iterator, "io.systemd.UserDatabase.GetGroupRecord", false, query, flags);
+         if (r >= 0)
+                 r = userdb_process(iterator, NULL, ret, NULL, NULL);
+         if (r >= 0)
+                 return r;
+
+         /* Client-side drop-in fallback. The iterator is known to be allocated at this point. */
+         if (!FLAGS_SET(flags, USERDB_EXCLUDE_DROPIN) && !iterator->dropin_covered) {
+                 r = dropin_group_record_by_name(name, NULL, flags, ret);
+                 if (r >= 0)
+                         return r;
+         }
+
+         /* Client-side NSS fallback, with nss-systemd blocked first */
+         if (!FLAGS_SET(flags, USERDB_EXCLUDE_NSS) && !iterator->nss_covered) {
+                 r = userdb_iterator_block_nss_systemd(iterator);
+                 if (r >= 0)
+                         r = nss_group_record_by_name(name, !FLAGS_SET(flags, USERDB_SUPPRESS_SHADOW), ret);
+                 if (r >= 0)
+                         return r;
+         }
+
+         /* Last resort: synthesize the record */
+         if (!FLAGS_SET(flags, USERDB_DONT_SYNTHESIZE)) {
+                 if (streq(name, "root"))
+                         return synthetic_root_group_build(ret);
+
+                 if (streq(name, NOBODY_GROUP_NAME) && synthesize_nobody())
+                         return synthetic_nobody_group_build(ret);
+         }
+
+         return r;
+ }
+
+ /* Looks up a group record by GID.
+  *
+  * Sources are consulted in this order, the first one that succeeds wins: the varlink services, the drop-in
+  * files (unless excluded by flag or already covered by a service), NSS (ditto), and finally the synthesized
+  * root/nobody records (unless USERDB_DONT_SYNTHESIZE is set).
+  *
+  * Returns -EINVAL for an invalid GID, otherwise the result of the source that succeeded, or the error of
+  * the last source that was tried. */
+ int groupdb_by_gid(gid_t gid, UserDBFlags flags, GroupRecord **ret) {
+         _cleanup_(userdb_iterator_freep) UserDBIterator *iterator = NULL;
+         _cleanup_(json_variant_unrefp) JsonVariant *query = NULL;
+         int r;
+
+         if (!gid_is_valid(gid))
+                 return -EINVAL;
+
+         r = json_build(&query, JSON_BUILD_OBJECT(
+                                        JSON_BUILD_PAIR("gid", JSON_BUILD_UNSIGNED(gid))));
+         if (r < 0)
+                 return r;
+
+         iterator = userdb_iterator_new(LOOKUP_GROUP, flags);
+         if (!iterator)
+                 return -ENOMEM;
+
+         /* Varlink services first */
+         r = userdb_start_query(iterator, "io.systemd.UserDatabase.GetGroupRecord", false, query, flags);
+         if (r >= 0)
+                 r = userdb_process(iterator, NULL, ret, NULL, NULL);
+         if (r >= 0)
+                 return r;
+
+         /* Client-side drop-in fallback. The iterator is known to be allocated at this point. */
+         if (!FLAGS_SET(flags, USERDB_EXCLUDE_DROPIN) && !iterator->dropin_covered) {
+                 r = dropin_group_record_by_gid(gid, NULL, flags, ret);
+                 if (r >= 0)
+                         return r;
+         }
+
+         /* Client-side NSS fallback, with nss-systemd blocked first */
+         if (!FLAGS_SET(flags, USERDB_EXCLUDE_NSS) && !iterator->nss_covered) {
+                 r = userdb_iterator_block_nss_systemd(iterator);
+                 if (r >= 0)
+                         r = nss_group_record_by_gid(gid, !FLAGS_SET(flags, USERDB_SUPPRESS_SHADOW), ret);
+                 if (r >= 0)
+                         return r;
+         }
+
+         /* Last resort: synthesize the record */
+         if (!FLAGS_SET(flags, USERDB_DONT_SYNTHESIZE)) {
+                 if (gid == 0)
+                         return synthetic_root_group_build(ret);
+
+                 if (gid == GID_NOBODY && synthesize_nobody())
+                         return synthetic_nobody_group_build(ret);
+         }
+
+         return r;
+ }
+
+ /* Prepares an iterator that enumerates all group records; entries are fetched with
+  * groupdb_iterator_get().
+  *
+  * Starts the varlink enumeration and, for whatever the services do not cover (and the flags do not
+  * exclude), sets up client-side enumeration of NSS and of the ".group" drop-in files. A failure of the
+  * varlink query is only returned if neither fallback has anything to offer. On success the iterator is
+  * returned in *ret and 0 is returned. */
+ int groupdb_all(UserDBFlags flags, UserDBIterator **ret) {
+         _cleanup_(userdb_iterator_freep) UserDBIterator *it = NULL;
+         int query_result, r;
+
+         assert(ret);
+
+         it = userdb_iterator_new(LOOKUP_GROUP, flags);
+         if (!it)
+                 return -ENOMEM;
+
+         query_result = userdb_start_query(it, "io.systemd.UserDatabase.GetGroupRecord", true, NULL, flags);
+
+         if (!FLAGS_SET(flags, USERDB_EXCLUDE_NSS) && !(query_result >= 0 && it->nss_covered)) {
+                 r = userdb_iterator_block_nss_systemd(it);
+                 if (r < 0)
+                         return r;
+
+                 setgrent();
+                 it->nss_iterating = true;
+         }
+
+         if (!FLAGS_SET(flags, USERDB_EXCLUDE_DROPIN) && !(query_result >= 0 && it->dropin_covered)) {
+                 r = conf_files_list_nulstr(
+                                 &it->dropins,
+                                 ".group",
+                                 NULL,
+                                 CONF_FILES_REGULAR|CONF_FILES_FILTER_MASKED,
+                                 USERDB_DROPIN_DIR_NULSTR("userdb"));
+                 if (r < 0)
+                         log_debug_errno(r, "Failed to find group drop-ins, ignoring: %m");
+         }
+
+         /* Hand out the iterator if the query was started or any fallback source is available;
+          * otherwise propagate the IPC error */
+         if (query_result >= 0 || it->nss_iterating || !strv_isempty(it->dropins)) {
+                 *ret = TAKE_PTR(it);
+                 return 0;
+         }
+
+         return query_result;
+ }
+
+ /* Returns the next group record of an enumeration started with groupdb_all().
+  *
+  * Entries are produced in this order: NSS entries (while client-side NSS enumeration is active), then
+  * records from the drop-in files, then records received from the varlink services, and finally the
+  * synthesized root and nobody records if they were not seen before.
+  *
+  * Returns >= 0 if an entry was produced. At the end of the enumeration a negative errno-style code is
+  * returned; this is -ESRCH if at least one entry was found before. */
+ int groupdb_iterator_get(UserDBIterator *iterator, GroupRecord **ret) {
+         int r;
+
+         assert(iterator);
+         assert(iterator->what == LOOKUP_GROUP);
+
+         if (iterator->nss_iterating) {
+                 struct group *entry;
+
+                 /* Reset errno, so that we can tell the end of the database from a failure */
+                 errno = 0;
+                 entry = getgrent();
+                 if (!entry) {
+                         if (errno != 0)
+                                 log_debug_errno(errno, "Failure to iterate NSS group database, ignoring: %m");
+
+                         iterator->nss_iterating = false;
+                         endgrent();
+                 } else {
+                         _cleanup_free_ char *shadow_buffer = NULL;
+                         struct sgrp shadow_storage, *shadow = NULL;
+                         bool incomplete;
+
+                         if (streq_ptr(entry->gr_name, "root"))
+                                 iterator->synthesize_root = false;
+                         if (entry->gr_gid == GID_NOBODY)
+                                 iterator->synthesize_nobody = false;
+
+                         if (FLAGS_SET(iterator->flags, USERDB_SUPPRESS_SHADOW))
+                                 incomplete = true; /* shadow data was not requested */
+                         else {
+                                 r = nss_sgrp_for_group(entry, &shadow_storage, &shadow_buffer);
+                                 if (r < 0) {
+                                         log_debug_errno(r, "Failed to acquire shadow entry for group %s, ignoring: %m", entry->gr_name);
+                                         incomplete = ERRNO_IS_PRIVILEGE(r);
+                                 } else {
+                                         shadow = &shadow_storage;
+                                         incomplete = false;
+                                 }
+                         }
+
+                         r = nss_group_to_group_record(entry, shadow, ret);
+                         if (r < 0)
+                                 return r;
+
+                         if (ret)
+                                 (*ret)->incomplete = incomplete;
+
+                         iterator->n_found++;
+                         return r;
+                 }
+         }
+
+         /* Next, the drop-in files. Entries that cannot be used are skipped. */
+         for (; iterator->dropins && iterator->dropins[iterator->current_dropin]; iterator->current_dropin++) {
+                 const char *dropin_path = iterator->dropins[iterator->current_dropin];
+                 _cleanup_free_ char *basename = NULL;
+                 char *suffix;
+                 gid_t gid;
+
+                 r = path_extract_filename(dropin_path, &basename);
+                 if (r < 0)
+                         return r;
+
+                 suffix = endswith(basename, ".group");
+                 if (!suffix)
+                         continue;
+
+                 *suffix = 0; /* Chop off suffix */
+
+                 /* Only files named after a numeric GID are considered */
+                 if (parse_gid(basename, &gid) < 0)
+                         continue;
+
+                 r = dropin_group_record_by_gid(gid, dropin_path, iterator->flags, ret);
+                 if (r < 0) {
+                         log_debug_errno(r, "Failed to parse group record for GID " GID_FMT ", ignoring: %m", gid);
+                         continue;
+                 }
+
+                 /* Make sure the next call continues with the following drop-in */
+                 iterator->current_dropin++;
+                 iterator->n_found++;
+                 return 0;
+         }
+
+         /* Then, the groups provided by varlink IPC */
+         r = userdb_process(iterator, NULL, ret, NULL, NULL);
+         if (r >= 0)
+                 return r;
+
+         /* Finally, synthesize root + nobody if not done yet */
+         if (iterator->synthesize_root) {
+                 iterator->synthesize_root = false;
+                 iterator->n_found++;
+                 return synthetic_root_group_build(ret);
+         }
+
+         if (iterator->synthesize_nobody) {
+                 iterator->synthesize_nobody = false;
+                 iterator->n_found++;
+                 return synthetic_nobody_group_build(ret);
+         }
+
+         /* if we found at least one entry, then ignore errors and indicate that we reached the end */
+         if (iterator->n_found > 0)
+                 return -ESRCH;
+
+         return r;
+ }
+
+ /* Fills i->dropins with the ".membership" files found in the USERDB_DROPIN_DIR_NULSTR("userdb")
+  * directories. Failures are logged at debug level and otherwise ignored. The 'flags' parameter is
+  * currently not used. */
+ static void discover_membership_dropins(UserDBIterator *i, UserDBFlags flags) {
+         int r;
+
+         r = conf_files_list_nulstr(
+                         &i->dropins,
+                         ".membership",
+                         NULL,
+                         CONF_FILES_REGULAR|CONF_FILES_BASENAME|CONF_FILES_FILTER_MASKED,
+                         USERDB_DROPIN_DIR_NULSTR("userdb"));
+         if (r >= 0)
+                 return;
+
+         log_debug_errno(r, "Failed to find membership drop-ins, ignoring: %m");
+ }
+
+ /* Prepares an iterator over the group memberships of the user 'name'.
+  *
+  * The name is stored as the iterator's user filter. Then the varlink query is started and, for whatever
+  * the services do not cover (and the flags do not exclude), client-side enumeration of the NSS group
+  * database and of the membership drop-in files is set up. A failure of the varlink query is only returned
+  * if neither fallback has anything to offer.
+  *
+  * Returns 0 and the iterator in *ret on success, -EINVAL for an invalid name, or another negative
+  * errno-style code. */
+ int membershipdb_by_user(const char *name, UserDBFlags flags, UserDBIterator **ret) {
+         _cleanup_(userdb_iterator_freep) UserDBIterator *it = NULL;
+         _cleanup_(json_variant_unrefp) JsonVariant *query = NULL;
+         int query_result, r;
+
+         assert(ret);
+
+         if (!valid_user_group_name(name, VALID_USER_RELAX))
+                 return -EINVAL;
+
+         r = json_build(&query, JSON_BUILD_OBJECT(
+                                        JSON_BUILD_PAIR("userName", JSON_BUILD_STRING(name))));
+         if (r < 0)
+                 return r;
+
+         it = userdb_iterator_new(LOOKUP_MEMBERSHIP, flags);
+         if (!it)
+                 return -ENOMEM;
+
+         it->filter_user_name = strdup(name);
+         if (!it->filter_user_name)
+                 return -ENOMEM;
+
+         query_result = userdb_start_query(it, "io.systemd.UserDatabase.GetMemberships", true, query, flags);
+
+         if (!FLAGS_SET(flags, USERDB_EXCLUDE_NSS) && !(query_result >= 0 && it->nss_covered)) {
+                 r = userdb_iterator_block_nss_systemd(it);
+                 if (r < 0)
+                         return r;
+
+                 setgrent();
+                 it->nss_iterating = true;
+         }
+
+         if (!FLAGS_SET(flags, USERDB_EXCLUDE_DROPIN) && !(query_result >= 0 && it->dropin_covered))
+                 discover_membership_dropins(it, flags);
+
+         /* Hand out the iterator if the query was started or any fallback source is available;
+          * otherwise propagate the IPC error */
+         if (query_result >= 0 || it->nss_iterating || !strv_isempty(it->dropins)) {
+                 *ret = TAKE_PTR(it);
+                 return 0;
+         }
+
+         return query_result;
+ }
+
+ /* Prepares an iterator over the members of the group 'name'.
+  *
+  * The name is stored as the iterator's group filter. Then the varlink query is started. For whatever the
+  * services do not cover (and the flags do not exclude), the group is looked up via NSS and its member
+  * list is copied into the iterator, and the membership drop-in files are collected. A failure of the
+  * varlink query is only returned if neither fallback has anything to offer.
+  *
+  * Returns 0 and the iterator in *ret on success, -EINVAL for an invalid name, or another negative
+  * errno-style code. */
+ int membershipdb_by_group(const char *name, UserDBFlags flags, UserDBIterator **ret) {
+         _cleanup_(userdb_iterator_freep) UserDBIterator *it = NULL;
+         _cleanup_(json_variant_unrefp) JsonVariant *query = NULL;
+         int query_result, r;
+
+         assert(ret);
+
+         if (!valid_user_group_name(name, VALID_USER_RELAX))
+                 return -EINVAL;
+
+         r = json_build(&query, JSON_BUILD_OBJECT(
+                                        JSON_BUILD_PAIR("groupName", JSON_BUILD_STRING(name))));
+         if (r < 0)
+                 return r;
+
+         it = userdb_iterator_new(LOOKUP_MEMBERSHIP, flags);
+         if (!it)
+                 return -ENOMEM;
+
+         it->filter_group_name = strdup(name);
+         if (!it->filter_group_name)
+                 return -ENOMEM;
+
+         query_result = userdb_start_query(it, "io.systemd.UserDatabase.GetMemberships", true, query, flags);
+
+         if (!FLAGS_SET(flags, USERDB_EXCLUDE_NSS) && !(query_result >= 0 && it->nss_covered)) {
+                 _cleanup_(group_record_unrefp) GroupRecord *nss_group = NULL;
+
+                 r = userdb_iterator_block_nss_systemd(it);
+                 if (r < 0)
+                         return r;
+
+                 /* We ignore all errors here, since the group might be defined by a userdb native service, and we queried them already above. */
+                 (void) nss_group_record_by_name(name, false, &nss_group);
+                 if (nss_group) {
+                         /* Remember the member list, together with the group name it belongs to */
+                         it->members_of_group = strv_copy(nss_group->members);
+                         if (!it->members_of_group)
+                                 return -ENOMEM;
+
+                         it->index_members_of_group = 0;
+
+                         it->found_group_name = strdup(name);
+                         if (!it->found_group_name)
+                                 return -ENOMEM;
+                 }
+         }
+
+         if (!FLAGS_SET(flags, USERDB_EXCLUDE_DROPIN) && !(query_result >= 0 && it->dropin_covered))
+                 discover_membership_dropins(it, flags);
+
+         /* Hand out the iterator if the query was started or any fallback source is available;
+          * otherwise propagate the IPC error */
+         if (query_result >= 0 || !strv_isempty(it->members_of_group) || !strv_isempty(it->dropins)) {
+                 *ret = TAKE_PTR(it);
+                 return 0;
+         }
+
+         return query_result;
+ }
+
+ /* Prepares an iterator that enumerates every user/group membership, without any name filter.
+  *
+  * The GetMemberships Varlink query is started without parameters; NSS group enumeration and ".membership"
+  * drop-ins are added unless excluded via 'flags' or reported as covered by a successful query.
+  *
+  * Returns 0 and stores the iterator in *ret on success, -ENOMEM on allocation failure, or the Varlink
+  * query error if no fallback source is available. */
+ int membershipdb_all(UserDBFlags flags, UserDBIterator **ret) {
+         _cleanup_(userdb_iterator_freep) UserDBIterator *it = NULL;
+         int r, query_result;
+
+         assert(ret);
+
+         it = userdb_iterator_new(LOOKUP_MEMBERSHIP, flags);
+         if (!it)
+                 return -ENOMEM;
+
+         query_result = userdb_start_query(it, "io.systemd.UserDatabase.GetMemberships", true, NULL, flags);
+
+         if (!FLAGS_SET(flags, USERDB_EXCLUDE_NSS))
+                 if (query_result < 0 || !it->nss_covered) {
+                         r = userdb_iterator_block_nss_systemd(it);
+                         if (r < 0)
+                                 return r;
+
+                         setgrent();
+                         it->nss_iterating = true;
+                 }
+
+         if (!FLAGS_SET(flags, USERDB_EXCLUDE_DROPIN))
+                 if (query_result < 0 || !it->dropin_covered)
+                         discover_membership_dropins(it, flags);
+
+         /* Hand out the iterator if at least one source is usable. */
+         if (query_result >= 0 || it->nss_iterating || !strv_isempty(it->dropins)) {
+                 *ret = TAKE_PTR(it);
+                 return 0;
+         }
+
+         return query_result;
+ }
+
+ /* Returns the next (user, group) membership pair from the iterator.
+ *
+ * Sources are drained in this order: (1) a preloaded or NSS-enumerated group member list, (2) ".membership"
+ * drop-in file names of the form "<user>:<group>.membership", (3) userdb_process().
+ *
+ * ret_user and ret_group may each be NULL; when non-NULL they receive newly allocated strings.
+ * Returns 0 if a pair was produced, -ENOMEM on allocation failure, otherwise the result of userdb_process(),
+ * except that any error from it is reported as -ESRCH once at least one drop-in entry was handed out. */
+ int membershipdb_iterator_get(
+ UserDBIterator *iterator,
+ char **ret_user,
+ char **ret_group) {
+
+ int r;
+
+ assert(iterator);
+
+ for (;;) {
+ /* If we are iterating through NSS acquire a new group entry if we haven't acquired one yet. */
+ if (!iterator->members_of_group) {
+ struct group *g;
+
+ if (!iterator->nss_iterating)
+ break;
+
+ assert(!iterator->found_user_name);
+ /* Skip groups that are of no interest: with a user filter, groups not listing that user;
+ * without one, groups that have no members at all. */
+ do {
+ errno = 0;
+ g = getgrent();
+ if (!g) {
+ if (errno != 0)
+ log_debug_errno(errno, "Failure during NSS group iteration, ignoring: %m");
+ break;
+ }
+
+ } while (iterator->filter_user_name ? !strv_contains(g->gr_mem, iterator->filter_user_name) :
+ strv_isempty(g->gr_mem));
+
+ if (g) {
+ r = free_and_strdup(&iterator->found_group_name, g->gr_name);
+ if (r < 0)
+ return r;
+
+ /* With a user filter only that one user is reported for the group, otherwise all members. */
+ if (iterator->filter_user_name)
+ iterator->members_of_group = strv_new(iterator->filter_user_name);
+ else
+ iterator->members_of_group = strv_copy(g->gr_mem);
+ if (!iterator->members_of_group)
+ return -ENOMEM;
+
+ iterator->index_members_of_group = 0;
+ } else {
+ /* NSS enumeration is exhausted (or failed): close it and move on to the drop-ins. */
+ iterator->nss_iterating = false;
+ endgrent();
+ break;
+ }
+ }
+
+ assert(iterator->found_group_name);
+ assert(iterator->members_of_group);
+ assert(!iterator->found_user_name);
+
+ if (iterator->members_of_group[iterator->index_members_of_group]) {
+ _cleanup_free_ char *cu = NULL, *cg = NULL;
+
+ /* Allocate both copies first, so that nothing is returned if either allocation fails. */
+ if (ret_user) {
+ cu = strdup(iterator->members_of_group[iterator->index_members_of_group]);
+ if (!cu)
+ return -ENOMEM;
+ }
+
+ if (ret_group) {
+ cg = strdup(iterator->found_group_name);
+ if (!cg)
+ return -ENOMEM;
+ }
+
+ if (ret_user)
+ *ret_user = TAKE_PTR(cu);
+
+ if (ret_group)
+ *ret_group = TAKE_PTR(cg);
+
+ iterator->index_members_of_group++;
+ return 0;
+ }
+
+ /* Current member list is used up: drop it and loop to fetch the next NSS group, if any. */
+ iterator->members_of_group = strv_free(iterator->members_of_group);
+ iterator->found_group_name = mfree(iterator->found_group_name);
+ }
+
+ /* Second source: drop-in file names, parsed as "<user>:<group>.membership". */
+ for (; iterator->dropins && iterator->dropins[iterator->current_dropin]; iterator->current_dropin++) {
+ const char *i = iterator->dropins[iterator->current_dropin], *e, *c;
+ _cleanup_free_ char *un = NULL, *gn = NULL;
+
+ e = endswith(i, ".membership");
+ if (!e)
+ continue;
+
+ c = memchr(i, ':', e - i);
+ if (!c)
+ continue;
+
+ un = strndup(i, c - i);
+ if (!un)
+ return -ENOMEM;
+ if (iterator->filter_user_name) {
+ if (!streq(un, iterator->filter_user_name))
+ continue;
+ } else if (!valid_user_group_name(un, VALID_USER_RELAX))
+ continue;
+
+ c++; /* skip over ':' */
+ gn = strndup(c, e - c);
+ if (!gn)
+ return -ENOMEM;
+ if (iterator->filter_group_name) {
+ if (!streq(gn, iterator->filter_group_name))
+ continue;
+ } else if (!valid_user_group_name(gn, VALID_USER_RELAX))
+ continue;
+
+ /* We return from inside the loop, so advance past this entry by hand. */
+ iterator->current_dropin++;
+ iterator->n_found++;
+
+ if (ret_user)
+ *ret_user = TAKE_PTR(un);
+ if (ret_group)
+ *ret_group = TAKE_PTR(gn);
+
+ return 0;
+ }
+
+ /* Last source: whatever userdb_process() yields for this iterator. */
+ r = userdb_process(iterator, NULL, NULL, ret_user, ret_group);
+ if (r < 0 && iterator->n_found > 0)
+ return -ESRCH;
+
+ return r;
+ }
+
+ /* Collects the names of all members of group 'name' into a sorted, duplicate-free string list.
+  *
+  * Returns 0 and stores the list in *ret on success, or a negative errno-style error from
+  * membershipdb_by_group(), membershipdb_iterator_get() or the list allocation. */
+ int membershipdb_by_group_strv(const char *name, UserDBFlags flags, char ***ret) {
+         _cleanup_(userdb_iterator_freep) UserDBIterator *it = NULL;
+         _cleanup_strv_free_ char **list = NULL;
+         int r;
+
+         assert(name);
+         assert(ret);
+
+         r = membershipdb_by_group(name, flags, &it);
+         if (r < 0)
+                 return r;
+
+         /* -ESRCH from the iterator marks the regular end of the enumeration. */
+         while (true) {
+                 _cleanup_free_ char *member = NULL;
+
+                 r = membershipdb_iterator_get(it, &member, NULL);
+                 if (r == -ESRCH)
+                         break;
+                 if (r < 0)
+                         return r;
+
+                 r = strv_consume(&list, TAKE_PTR(member));
+                 if (r < 0)
+                         return r;
+         }
+
+         strv_sort(list);
+         strv_uniq(list);
+
+         *ret = TAKE_PTR(list);
+         return 0;
+ }
+
+ /* Loads libnss_systemd.so.2 and forwards 'b' to its _nss_systemd_block() entry point.
+  *
+  * Returns 0 if the library cannot be loaded, -ELIBBAD if it lacks the expected symbol, and otherwise
+  * whatever _nss_systemd_block() returns. */
+ int userdb_block_nss_systemd(int b) {
+         _cleanup_(dlclosep) void *handle = NULL;
+         int (*block_fn)(bool b);
+
+         /* Note that we might be called from libnss_systemd.so.2 itself, but that should be fine, really. */
+
+         handle = dlopen(LIBDIR "/libnss_systemd.so.2", RTLD_LAZY|RTLD_NODELETE);
+         if (!handle) {
+                 /* A missing library is not worth complaining about loudly. */
+                 log_debug("Failed to dlopen(libnss_systemd.so.2), ignoring: %s", dlerror());
+                 return 0;
+         }
+
+         block_fn = (int (*)(bool b)) dlsym(handle, "_nss_systemd_block");
+         if (block_fn)
+                 return block_fn(b);
+
+         /* The library is installed but lacks the symbol we expect: that is odd, so report it. */
+         return log_debug_errno(SYNTHETIC_ERRNO(ELIBBAD),
+                                "Unable to find symbol _nss_systemd_block in libnss_systemd.so.2: %s", dlerror());
+ }
diff --git a/src/shared/userdb.h b/src/shared/userdb.h
new file mode 100644
index 0000000..75eb4b2
--- /dev/null
+++ b/src/shared/userdb.h
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include "group-record.h"
+#include "user-record.h"
+
+/* Inquire local services for user/group records */
+
+typedef struct UserDBIterator UserDBIterator;
+
+UserDBIterator *userdb_iterator_free(UserDBIterator *iterator);
+DEFINE_TRIVIAL_CLEANUP_FUNC(UserDBIterator*, userdb_iterator_free);
+
+ /* Bit flags accepted by the lookup functions declared below. The first group selects which sources are
+ * skipped, the second group tweaks lookup behaviour, and the last group provides ready-made combinations. */
+ typedef enum UserDBFlags {
+ /* The main sources */
+ USERDB_EXCLUDE_NSS = 1 << 0, /* don't do client-side nor server-side NSS */
+ USERDB_EXCLUDE_VARLINK = 1 << 1, /* don't talk to any varlink services */
+ USERDB_EXCLUDE_DROPIN = 1 << 2, /* don't load drop-in user/group definitions */
+
+ /* Modifications */
+ USERDB_SUPPRESS_SHADOW = 1 << 3, /* don't do client-side shadow calls (server side might happen though) */
+ USERDB_EXCLUDE_DYNAMIC_USER = 1 << 4, /* exclude looking up in io.systemd.DynamicUser */
+ USERDB_AVOID_MULTIPLEXER = 1 << 5, /* exclude looking up via io.systemd.Multiplexer */
+ USERDB_DONT_SYNTHESIZE = 1 << 6, /* don't synthesize root/nobody */
+
+ /* Combinations */
+ USERDB_NSS_ONLY = USERDB_EXCLUDE_VARLINK|USERDB_EXCLUDE_DROPIN|USERDB_DONT_SYNTHESIZE, /* leaves NSS as the only main source */
+ USERDB_DROPIN_ONLY = USERDB_EXCLUDE_NSS|USERDB_EXCLUDE_VARLINK|USERDB_DONT_SYNTHESIZE, /* leaves drop-ins as the only main source */
+ } UserDBFlags;
+
+/* Well-known errors we'll return here:
+ *
+ * -ESRCH: No such user/group
+ * -ELINK: Varlink logic turned off (and no other source available)
+ * -EOPNOTSUPP: Enumeration not supported
+ * -ETIMEDOUT: Time-out
+ */
+
+int userdb_by_name(const char *name, UserDBFlags flags, UserRecord **ret);
+int userdb_by_uid(uid_t uid, UserDBFlags flags, UserRecord **ret);
+int userdb_all(UserDBFlags flags, UserDBIterator **ret);
+int userdb_iterator_get(UserDBIterator *iterator, UserRecord **ret);
+
+int groupdb_by_name(const char *name, UserDBFlags flags, GroupRecord **ret);
+int groupdb_by_gid(gid_t gid, UserDBFlags flags, GroupRecord **ret);
+int groupdb_all(UserDBFlags flags, UserDBIterator **ret);
+int groupdb_iterator_get(UserDBIterator *iterator, GroupRecord **ret);
+
+ /* Membership enumeration: the first three calls set up an iterator (filtered by user, filtered by group, or
+ * unfiltered), membershipdb_iterator_get() then returns one user/group pair per call as newly allocated
+ * strings (either output may be NULL), and -ESRCH is what membershipdb_by_group_strv() treats as the end. */
+ int membershipdb_by_user(const char *name, UserDBFlags flags, UserDBIterator **ret);
+ int membershipdb_by_group(const char *name, UserDBFlags flags, UserDBIterator **ret);
+ int membershipdb_all(UserDBFlags flags, UserDBIterator **ret);
+ int membershipdb_iterator_get(UserDBIterator *iterator, char **user, char **group);
+ /* Convenience wrapper: returns the members of a group as a sorted string list without duplicates. */
+ int membershipdb_by_group_strv(const char *name, UserDBFlags flags, char ***ret);
+
+ /* Calls _nss_systemd_block(b) in libnss_systemd.so.2; returns 0 if that library cannot be loaded. */
+ int userdb_block_nss_systemd(int b);
diff --git a/src/shared/utmp-wtmp.c b/src/shared/utmp-wtmp.c
new file mode 100644
index 0000000..267b350
--- /dev/null
+++ b/src/shared/utmp-wtmp.c
@@ -0,0 +1,278 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <sys/utsname.h>
+#include <unistd.h>
+#include <utmpx.h>
+
+#include "alloc-util.h"
+#include "errno-util.h"
+#include "hostname-util.h"
+#include "macro.h"
+#include "memory-util.h"
+#include "path-util.h"
+#include "string-util.h"
+#include "time-util.h"
+#include "user-util.h"
+#include "utmp-wtmp.h"
+
+ /* Determines the current (and optionally the previous) runlevel.
+  *
+  * If $RUNLEVEL is set and non-empty, its first character is returned as the runlevel and the previous
+  * runlevel is reported as 0. Otherwise the RUN_LVL record of the utmp database is consulted; the runlevel
+  * is taken from the low byte of ut_pid, the previous runlevel from the next byte.
+  *
+  * 'previous' may be NULL. Returns 0 on success, or a negative errno-style error; -ESRCH if no RUN_LVL
+  * record could be found. The outputs are only written on success. */
+ int utmp_get_runlevel(int *runlevel, int *previous) {
+         _unused_ _cleanup_(utxent_cleanup) bool utmpx = false;
+         struct utmpx *found, lookup = { .ut_type = RUN_LVL };
+         const char *e;
+
+         assert(runlevel);
+
+         /* If these values are set in the environment this takes
+          * precedence. Presumably, sysvinit does this to work around a
+          * race condition that would otherwise exist where we'd always
+          * go to disk and hence might read runlevel data that might be
+          * very new and not apply to the current script being executed. */
+
+         e = getenv("RUNLEVEL");
+         if (!isempty(e)) {
+                 *runlevel = e[0];
+                 if (previous)
+                         *previous = 0;
+
+                 return 0;
+         }
+
+         if (utmpxname(_PATH_UTMPX) < 0)
+                 return -errno;
+
+         utmpx = utxent_start();
+
+         /* getutxid() is not guaranteed to set errno when it finds nothing. Clear it first and fall back to
+          * -ESRCH, so that we never return 0 ("success") without having filled in the outputs. */
+         errno = 0;
+         found = getutxid(&lookup);
+         if (!found)
+                 return errno > 0 ? -errno : -ESRCH;
+
+         *runlevel = found->ut_pid & 0xFF;
+         if (previous)
+                 *previous = (found->ut_pid >> 8) & 0xFF;
+
+         return 0;
+ }
+
+ /* Fills in ut_tv of 'store' from the microsecond timestamp 't'; a zero timestamp means "use the current
+  * CLOCK_REALTIME time". */
+ static void init_timestamp(struct utmpx *store, usec_t t) {
+         usec_t stamp;
+
+         assert(store);
+
+         stamp = t > 0 ? t : now(CLOCK_REALTIME);
+
+         store->ut_tv.tv_sec = stamp / USEC_PER_SEC;
+         store->ut_tv.tv_usec = stamp % USEC_PER_SEC;
+ }
+
+ /* Prepares a system-level utmp record: sets the timestamp, stores the kernel release in ut_host (if
+  * uname() succeeds) and marks ut_line/ut_id with the "~"/"~~" placeholders. The fields are fixed-size
+  * and are filled with strncpy(), i.e. they are not necessarily NUL terminated. */
+ static void init_entry(struct utmpx *store, usec_t t) {
+         struct utsname kernel = {};
+
+         assert(store);
+
+         init_timestamp(store, t);
+
+         strncpy(store->ut_id, "~~", sizeof(store->ut_id));
+         strncpy(store->ut_line, "~", sizeof(store->ut_line)); /* or ~~ ? */
+
+         if (uname(&kernel) < 0)
+                 return;
+
+         strncpy(store->ut_host, kernel.release, sizeof(store->ut_host));
+ }
+
+ /* Writes 'store' into the utmp database. Returns 0 on success or if the database does not exist
+  * (ENOENT), a negative errno otherwise. */
+ static int write_entry_utmp(const struct utmpx *store) {
+         _unused_ _cleanup_(utxent_cleanup) bool utmpx = false;
+
+         assert(store);
+
+         /* utmp is similar to wtmp, but there is only one entry for
+          * each entry type resp. user; i.e. basically a key/value
+          * table. */
+
+         if (utmpxname(_PATH_UTMPX) < 0)
+                 return -errno;
+
+         utmpx = utxent_start();
+
+         if (!pututxline(store)) {
+                 if (errno != ENOENT)
+                         return -errno;
+
+                 /* If utmp/wtmp have been disabled, that's a good thing, hence ignore the error. */
+                 log_debug_errno(errno, "Not writing utmp: %m");
+         }
+
+         return 0;
+ }
+
+ /* Appends 'store' to the wtmp file. updwtmpx() reports nothing, hence errno is cleared beforehand and
+  * inspected afterwards. ENOENT and EROFS are logged and ignored; any other errno is returned negated. */
+ static int write_entry_wtmp(const struct utmpx *store) {
+         assert(store);
+
+         /* wtmp is a simple append-only file where each entry is
+          * simply appended to the end; i.e. basically a log. */
+
+         errno = 0;
+         updwtmpx(_PATH_WTMPX, store);
+
+         switch (errno) {
+
+         case ENOENT:
+                 /* If utmp/wtmp have been disabled, that's a good thing, hence ignore the error. */
+                 log_debug_errno(errno, "Not writing wtmp: %m");
+                 return 0;
+
+         case EROFS:
+                 log_warning_errno(errno, "Failed to write wtmp record, ignoring: %m");
+                 return 0;
+
+         default:
+                 return -errno;
+         }
+ }
+
+ /* Writes one record to utmp and one to wtmp. Both writes are always attempted; a utmp error takes
+  * precedence over a wtmp error in the return value. */
+ static int write_utmp_wtmp(const struct utmpx *store_utmp, const struct utmpx *store_wtmp) {
+         int utmp_result, wtmp_result;
+
+         utmp_result = write_entry_utmp(store_utmp);
+         wtmp_result = write_entry_wtmp(store_wtmp);
+
+         if (utmp_result < 0)
+                 return utmp_result;
+
+         return wtmp_result;
+ }
+
+ /* Writes the very same record to both utmp and wtmp. */
+ static int write_entry_both(const struct utmpx *store) {
+         const struct utmpx *record = store;
+
+         return write_utmp_wtmp(record, record);
+ }
+
+ /* Records a shutdown: a RUN_LVL record with user "shutdown" and the current time, written to both utmp
+  * and wtmp. Returns the result of the write. */
+ int utmp_put_shutdown(void) {
+         struct utmpx record = { .ut_type = RUN_LVL };
+
+         init_entry(&record, 0);
+         strncpy(record.ut_user, "shutdown", sizeof(record.ut_user));
+
+         return write_entry_both(&record);
+ }
+
+ /* Records a boot: a BOOT_TIME record with user "reboot" and timestamp 't' (0 meaning "now"), written to
+  * both utmp and wtmp. Returns the result of the write. */
+ int utmp_put_reboot(usec_t t) {
+         struct utmpx record = { .ut_type = BOOT_TIME };
+
+         init_entry(&record, t);
+         strncpy(record.ut_user, "reboot", sizeof(record.ut_user));
+
+         return write_entry_both(&record);
+ }
+
+ /* Fills the fixed-size field 'buf' from 'src': if the string is shorter than the field it is copied
+  * whole and the remainder is zero-padded; otherwise only the trailing 'buf_size' bytes are copied and the
+  * field is left without a terminating NUL. */
+ static void copy_suffix(char *buf, size_t buf_size, const char *src) {
+         size_t len = strlen(src);
+
+         if (len >= buf_size) {
+                 memcpy(buf, src + (len - buf_size), buf_size);
+                 return;
+         }
+
+         strncpy(buf, src, buf_size);
+ }
+
+ /* Registers a process in utmp/wtmp. An INIT_PROCESS record is always written; if 'ut_type' is
+  * LOGIN_PROCESS or USER_PROCESS a LOGIN_PROCESS record follows, and for USER_PROCESS a final
+  * USER_PROCESS record carrying 'user' is written too.
+  *
+  * 'line' may be NULL; 'user' must be set when 'ut_type' is USER_PROCESS. Returns 0 on success or the first
+  * negative error from writing a record. */
+ int utmp_put_init_process(const char *id, pid_t pid, pid_t sid, const char *line, int ut_type, const char *user) {
+         struct utmpx record = {
+                 .ut_type = INIT_PROCESS,
+                 .ut_pid = pid,
+                 .ut_session = sid,
+         };
+         int r;
+
+         assert(id);
+         assert(ut_type != USER_PROCESS || user);
+
+         init_timestamp(&record, 0);
+
+         /* Copy the whole string if it fits, or just the suffix without the terminating NUL. */
+         copy_suffix(record.ut_id, sizeof(record.ut_id), id);
+
+         if (line)
+                 strncpy_exact(record.ut_line, line, sizeof(record.ut_line));
+
+         r = write_entry_both(&record);
+         if (r < 0)
+                 return r;
+
+         if (!IN_SET(ut_type, LOGIN_PROCESS, USER_PROCESS))
+                 return 0;
+
+         record.ut_type = LOGIN_PROCESS;
+         r = write_entry_both(&record);
+         if (r < 0)
+                 return r;
+
+         if (ut_type != USER_PROCESS)
+                 return 0;
+
+         record.ut_type = USER_PROCESS;
+         strncpy(record.ut_user, user, sizeof(record.ut_user)-1);
+         r = write_entry_both(&record);
+         if (r < 0)
+                 return r;
+
+         return 0;
+ }
+
+ /* Marks the utmp record identified by 'id' as DEAD_PROCESS, provided it belongs to process 'pid'.
+ *
+ * 'code' is stored as ut_exit.e_termination and 'status' as ut_exit.e_exit. The utmp copy gets user, host
+ * and timestamp cleared; the wtmp copy is identical except that it carries the current time.
+ *
+ * Returns 0 if no matching record exists or its PID differs, otherwise the result of writing both records. */
+ int utmp_put_dead_process(const char *id, pid_t pid, int code, int status) {
+ _unused_ _cleanup_(utxent_cleanup) bool utmpx = false;
+ struct utmpx lookup = {
+ .ut_type = INIT_PROCESS /* looks for DEAD_PROCESS, LOGIN_PROCESS, USER_PROCESS, too */
+ }, store, store_wtmp, *found;
+
+ assert(id);
+
+ utmpx = utxent_start();
+
+ /* Copy the whole string if it fits, or just the suffix without the terminating NUL. */
+ copy_suffix(lookup.ut_id, sizeof(lookup.ut_id), id);
+
+ found = getutxid(&lookup);
+ if (!found)
+ return 0;
+
+ /* Not our process: leave the record alone. */
+ if (found->ut_pid != pid)
+ return 0;
+
+ /* Work on a private copy of the record that was found. */
+ memcpy(&store, found, sizeof(store));
+ store.ut_type = DEAD_PROCESS;
+ store.ut_exit.e_termination = code;
+ store.ut_exit.e_exit = status;
+
+ zero(store.ut_user);
+ zero(store.ut_host);
+ zero(store.ut_tv);
+
+ memcpy(&store_wtmp, &store, sizeof(store_wtmp));
+ /* wtmp wants the current time */
+ init_timestamp(&store_wtmp, 0);
+
+ return write_utmp_wtmp(&store, &store_wtmp);
+ }
+
+ /* Records a runlevel change as a RUN_LVL record with user "runlevel"; ut_pid carries the new runlevel in
+  * its low byte and the previous one in the next byte.
+  *
+  * If 'previous' is not positive it is looked up via utmp_get_runlevel(), with -ESRCH treated as "no
+  * previous runlevel". Nothing is written if the runlevel did not change. Returns 0 or a negative error. */
+ int utmp_put_runlevel(int runlevel, int previous) {
+         struct utmpx record = {};
+         int r;
+
+         assert(runlevel > 0);
+
+         if (previous <= 0) {
+                 /* Find the old runlevel automatically */
+
+                 r = utmp_get_runlevel(&previous, NULL);
+                 if (r == -ESRCH)
+                         previous = 0;
+                 else if (r < 0)
+                         return r;
+         }
+
+         if (runlevel == previous)
+                 return 0;
+
+         init_entry(&record, 0);
+
+         record.ut_type = RUN_LVL;
+         record.ut_pid = ((previous & 0xFF) << 8) | (runlevel & 0xFF);
+         strncpy(record.ut_user, "runlevel", sizeof(record.ut_user));
+
+         return write_entry_both(&record);
+ }
diff --git a/src/shared/utmp-wtmp.h b/src/shared/utmp-wtmp.h
new file mode 100644
index 0000000..2e04fac
--- /dev/null
+++ b/src/shared/utmp-wtmp.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+#include <sys/types.h>
+
+#include "time-util.h"
+
+#if ENABLE_UTMP
+#include <utmpx.h>
+
+ /* Reads the current runlevel (and optionally the previous one) from $RUNLEVEL or the utmp RUN_LVL record. */
+ int utmp_get_runlevel(int *runlevel, int *previous);
+
+ /* Each of these writes a record to both utmp and wtmp; a timestamp of 0 means "now". */
+ int utmp_put_shutdown(void);
+ int utmp_put_reboot(usec_t timestamp);
+ int utmp_put_runlevel(int runlevel, int previous);
+
+ /* Per-process records, looked up and stored under the (possibly suffix-truncated) 'id'. */
+ int utmp_put_dead_process(const char *id, pid_t pid, int code, int status);
+ int utmp_put_init_process(const char *id, pid_t pid, pid_t sid, const char *line, int ut_type, const char *user);
+
+ /* Rewinds/opens the utmpx database. Always returns true, so that the result can be stored in the flag
+  * that utxent_cleanup() checks. */
+ static inline bool utxent_start(void) {
+         bool started = true;
+
+         setutxent();
+         return started;
+ }
+ /* Cleanup handler: closes the utmpx database again, but only if the flag says it was opened. */
+ static inline void utxent_cleanup(bool *initialized) {
+         assert(initialized);
+
+         if (!*initialized)
+                 return;
+
+         endutxent();
+ }
+
+#else /* ENABLE_UTMP */
+
+ /* Stubs used when utmp support is compiled out: looking up the runlevel reports -ESRCH ("not found")
+ * without touching its outputs, and all record writers succeed without doing anything. */
+ static inline int utmp_get_runlevel(int *runlevel, int *previous) {
+ return -ESRCH;
+ }
+ static inline int utmp_put_shutdown(void) {
+ return 0;
+ }
+ static inline int utmp_put_reboot(usec_t timestamp) {
+ return 0;
+ }
+ static inline int utmp_put_runlevel(int runlevel, int previous) {
+ return 0;
+ }
+ static inline int utmp_put_dead_process(const char *id, pid_t pid, int code, int status) {
+ return 0;
+ }
+ static inline int utmp_put_init_process(const char *id, pid_t pid, pid_t sid, const char *line, int ut_type, const char *user) {
+ return 0;
+ }
+
+#endif /* ENABLE_UTMP */
diff --git a/src/shared/varlink-idl.c b/src/shared/varlink-idl.c
new file mode 100644
index 0000000..655324c
--- /dev/null
+++ b/src/shared/varlink-idl.c
@@ -0,0 +1,1603 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "memstream-util.h"
+#include "strv.h"
+#include "terminal-util.h"
+#include "varlink-idl.h"
+#include "set.h"
+
+ /* Upper bound on the nesting depth of anonymous struct/enum definitions accepted by the parser. */
+ #define DEPTH_MAX 64U
+
+ /* Indices into the color tables passed to the formatting helpers below: one slot per syntactic category,
+ * with _COLOR_MAX giving the table size. */
+ enum {
+ COLOR_SYMBOL_TYPE, /* interface, method, type, error */
+ COLOR_FIELD_TYPE, /* string, bool, … */
+ COLOR_IDENTIFIER,
+ COLOR_MARKS, /* [], ->, ?, … */
+ COLOR_RESET,
+ _COLOR_MAX,
+ };
+
+static int varlink_idl_format_all_fields(FILE *f, const VarlinkSymbol *symbol, VarlinkFieldDirection direction, const char *indent, const char *const colors[static _COLOR_MAX]);
+
+ /* Writes the value list of an enum symbol to 'f': "()" if there are no values, otherwise a parenthesized,
+  * comma-separated list with one value per line, each indented by 'indent' plus one tab. Always returns 0. */
+ static int varlink_idl_format_enum_values(
+                 FILE *f,
+                 const VarlinkSymbol *symbol,
+                 const char *indent,
+                 const char *const colors[static _COLOR_MAX]) {
+
+         size_t n_values = 0;
+
+         assert(f);
+         assert(symbol);
+         assert(symbol->symbol_type == VARLINK_ENUM_TYPE);
+
+         for (const VarlinkField *value = symbol->fields; value->field_type != _VARLINK_FIELD_TYPE_END_MARKER; value++) {
+
+                 /* The first value opens the list, all later ones are preceded by a separator. */
+                 fputs(n_values++ == 0 ? "(\n" : ",\n", f);
+
+                 fputs(strempty(indent), f);
+                 fputs("\t", f);
+                 fputs(colors[COLOR_IDENTIFIER], f);
+                 fputs(value->name, f);
+                 fputs(colors[COLOR_RESET], f);
+         }
+
+         if (n_values == 0) {
+                 fputs("()", f);
+                 return 0;
+         }
+
+         fputs("\n", f);
+         fputs(strempty(indent), f);
+         fputs(")", f);
+
+         return 0;
+ }
+
+ /* Writes a single field as "<indent><name>: <type>" to 'f'. The type is rendered as an optional "?"
+  * (nullable), an optional "[string]" (map) or "[]" (array) prefix, and then the element type; anonymous
+  * structs and enums are expanded inline. Returns 0 on success or the error of the nested formatter. */
+ static int varlink_idl_format_field(
+                 FILE *f,
+                 const VarlinkField *field,
+                 const char *indent,
+                 const char *const colors[static _COLOR_MAX]) {
+
+         const char *builtin;
+
+         assert(f);
+         assert(field);
+
+         fputs(strempty(indent), f);
+         fputs(colors[COLOR_IDENTIFIER], f);
+         fputs(field->name, f);
+         fputs(colors[COLOR_RESET], f);
+         fputs(": ", f);
+
+         if (FLAGS_SET(field->field_flags, VARLINK_NULLABLE)) {
+                 fputs(colors[COLOR_MARKS], f);
+                 fputs("?", f);
+                 fputs(colors[COLOR_RESET], f);
+         }
+
+         /* Map and array are mutually exclusive. */
+         switch (field->field_flags & (VARLINK_MAP|VARLINK_ARRAY)) {
+
+         case 0:
+                 break;
+
+         case VARLINK_ARRAY:
+                 fputs(colors[COLOR_MARKS], f);
+                 fputs("[]", f);
+                 fputs(colors[COLOR_RESET], f);
+                 break;
+
+         case VARLINK_MAP:
+                 fputs(colors[COLOR_MARKS], f);
+                 fputs("[", f);
+                 fputs(colors[COLOR_FIELD_TYPE], f);
+                 fputs("string", f);
+                 fputs(colors[COLOR_MARKS], f);
+                 fputs("]", f);
+                 fputs(colors[COLOR_RESET], f);
+                 break;
+
+         default:
+                 assert_not_reached();
+         }
+
+         switch (field->field_type) {
+
+         /* Anonymous compound types are expanded in place. */
+         case VARLINK_STRUCT:
+                 return varlink_idl_format_all_fields(f, ASSERT_PTR(field->symbol), VARLINK_REGULAR, indent, colors);
+
+         case VARLINK_ENUM:
+                 return varlink_idl_format_enum_values(f, ASSERT_PTR(field->symbol), indent, colors);
+
+         /* References to named types are shown in identifier color. */
+         case VARLINK_NAMED_TYPE:
+                 fputs(colors[COLOR_IDENTIFIER], f);
+                 fputs(ASSERT_PTR(field->named_type), f);
+                 fputs(colors[COLOR_RESET], f);
+                 return 0;
+
+         /* The built-in types only differ in their keyword. */
+         case VARLINK_BOOL:
+                 builtin = "bool";
+                 break;
+
+         case VARLINK_INT:
+                 builtin = "int";
+                 break;
+
+         case VARLINK_FLOAT:
+                 builtin = "float";
+                 break;
+
+         case VARLINK_STRING:
+                 builtin = "string";
+                 break;
+
+         case VARLINK_OBJECT:
+                 builtin = "object";
+                 break;
+
+         default:
+                 assert_not_reached();
+         }
+
+         fputs(colors[COLOR_FIELD_TYPE], f);
+         fputs(builtin, f);
+         fputs(colors[COLOR_RESET], f);
+
+         return 0;
+ }
+
+ /* Writes those fields of 'symbol' whose direction equals 'filter_direction' as a parenthesized list:
+  * "()" if none match, otherwise one field per line, indented one tab deeper than 'indent'.
+  * Returns 0 on success, -ENOMEM if the nested indentation cannot be allocated, or the error of the field
+  * formatter. */
+ static int varlink_idl_format_all_fields(
+                 FILE *f,
+                 const VarlinkSymbol *symbol,
+                 VarlinkFieldDirection filter_direction,
+                 const char *indent,
+                 const char *const colors[static _COLOR_MAX]) {
+
+         _cleanup_free_ char *nested_indent = NULL;
+         size_t n_printed = 0;
+         int r;
+
+         assert(f);
+         assert(symbol);
+         assert(IN_SET(symbol->symbol_type, VARLINK_STRUCT_TYPE, VARLINK_METHOD, VARLINK_ERROR));
+
+         nested_indent = strjoin(strempty(indent), "\t");
+         if (!nested_indent)
+                 return -ENOMEM;
+
+         for (const VarlinkField *field = symbol->fields; field->field_type != _VARLINK_FIELD_TYPE_END_MARKER; field++) {
+
+                 if (field->field_direction != filter_direction)
+                         continue;
+
+                 /* The first matching field opens the list, all later ones are preceded by a separator. */
+                 fputs(n_printed++ == 0 ? "(\n" : ",\n", f);
+
+                 r = varlink_idl_format_field(f, field, nested_indent, colors);
+                 if (r < 0)
+                         return r;
+         }
+
+         if (n_printed == 0) {
+                 fputs("()", f);
+                 return 0;
+         }
+
+         fputs("\n", f);
+         fputs(strempty(indent), f);
+         fputs(")", f);
+
+         return 0;
+ }
+
+ /* Writes one top-level symbol to 'f': the keyword ("type", "method" or "error"), the symbol name, and the
+  * body. Enums get their value list, structs and errors their regular fields, and methods their input
+  * fields, " -> ", and their output fields. A trailing newline is added on success.
+  * Returns 0 on success or the error of the nested formatter. */
+ static int varlink_idl_format_symbol(
+                 FILE *f,
+                 const VarlinkSymbol *symbol,
+                 const char *const colors[static _COLOR_MAX]) {
+
+         const char *keyword;
+         int r;
+
+         assert(f);
+         assert(symbol);
+
+         switch (symbol->symbol_type) {
+
+         case VARLINK_ENUM_TYPE:
+         case VARLINK_STRUCT_TYPE:
+                 keyword = "type ";
+                 break;
+
+         case VARLINK_METHOD:
+                 keyword = "method ";
+                 break;
+
+         case VARLINK_ERROR:
+                 keyword = "error ";
+                 break;
+
+         default:
+                 assert_not_reached();
+         }
+
+         /* Common header: colored keyword followed by the symbol name. */
+         fputs(colors[COLOR_SYMBOL_TYPE], f);
+         fputs(keyword, f);
+         fputs(colors[COLOR_IDENTIFIER], f);
+         fputs(symbol->name, f);
+         fputs(colors[COLOR_RESET], f);
+
+         if (symbol->symbol_type == VARLINK_ENUM_TYPE)
+                 r = varlink_idl_format_enum_values(f, symbol, /* indent= */ NULL, colors);
+         else if (symbol->symbol_type == VARLINK_METHOD) {
+                 r = varlink_idl_format_all_fields(f, symbol, VARLINK_INPUT, /* indent= */ NULL, colors);
+                 if (r < 0)
+                         return r;
+
+                 fputs(colors[COLOR_MARKS], f);
+                 fputs(" -> ", f);
+                 fputs(colors[COLOR_RESET], f);
+
+                 r = varlink_idl_format_all_fields(f, symbol, VARLINK_OUTPUT, /* indent= */ NULL, colors);
+         } else
+                 r = varlink_idl_format_all_fields(f, symbol, VARLINK_REGULAR, /* indent= */ NULL, colors);
+         if (r < 0)
+                 return r;
+
+         fputs("\n", f);
+         return 0;
+ }
+
+ /* Writes all symbols of 'interface' whose type equals 'filter_type', each preceded by an empty line.
+  * Returns 0 on success or the first error of the symbol formatter. */
+ static int varlink_idl_format_all_symbols(
+                 FILE *f,
+                 const VarlinkInterface *interface,
+                 VarlinkSymbolType filter_type,
+                 const char *const colors[static _COLOR_MAX]) {
+
+         int r;
+
+         assert(f);
+         assert(interface);
+
+         for (size_t i = 0; interface->symbols[i]; i++) {
+                 const VarlinkSymbol *candidate = interface->symbols[i];
+
+                 if (candidate->symbol_type == filter_type) {
+                         fputs("\n", f);
+
+                         r = varlink_idl_format_symbol(f, candidate, colors);
+                         if (r < 0)
+                                 return r;
+                 }
+         }
+
+         return 0;
+ }
+
+ /* Writes a textual description of 'interface' to 'f' (stdout if NULL): the "interface <name>" header
+  * followed by all symbols, grouped by symbol type. 'use_colors' > 0 enables ANSI colors, 0 disables them,
+  * and a negative value defers to colors_enabled(). Returns 0 on success or a negative error. */
+ int varlink_idl_dump(FILE *f, int use_colors, const VarlinkInterface *interface) {
+         static const char* const palette_on[_COLOR_MAX] = {
+                 [COLOR_SYMBOL_TYPE] = ANSI_HIGHLIGHT_GREEN,
+                 [COLOR_FIELD_TYPE]  = ANSI_HIGHLIGHT_BLUE,
+                 [COLOR_IDENTIFIER]  = ANSI_NORMAL,
+                 [COLOR_MARKS]       = ANSI_HIGHLIGHT_MAGENTA,
+                 [COLOR_RESET]       = ANSI_NORMAL,
+         };
+
+         static const char* const palette_off[_COLOR_MAX] = {
+                 [COLOR_SYMBOL_TYPE] = "",
+                 [COLOR_FIELD_TYPE]  = "",
+                 [COLOR_IDENTIFIER]  = "",
+                 [COLOR_MARKS]       = "",
+                 [COLOR_RESET]       = "",
+         };
+
+         const char *const *palette;
+         int r;
+
+         assert(interface);
+
+         if (!f)
+                 f = stdout;
+
+         if (use_colors < 0)
+                 use_colors = colors_enabled();
+
+         if (use_colors)
+                 palette = palette_on;
+         else
+                 palette = palette_off;
+
+         fputs(palette[COLOR_SYMBOL_TYPE], f);
+         fputs("interface ", f);
+         fputs(palette[COLOR_IDENTIFIER], f);
+         fputs(ASSERT_PTR(interface->name), f);
+         fputs(palette[COLOR_RESET], f);
+         fputs("\n", f);
+
+         /* One pass per symbol type, so that symbols of the same kind end up next to each other. */
+         for (VarlinkSymbolType t = 0; t < _VARLINK_SYMBOL_TYPE_MAX; t++) {
+                 r = varlink_idl_format_all_symbols(f, interface, t, palette);
+                 if (r < 0)
+                         return r;
+         }
+
+         return 0;
+ }
+
+ /* Renders 'interface' without colors into a newly allocated string stored in *ret.
+  * Returns a negative error if the memory stream cannot be set up or the dump fails, otherwise the result
+  * of memstream_finalize(). */
+ int varlink_idl_format(const VarlinkInterface *interface, char **ret) {
+         _cleanup_(memstream_done) MemStream m = {};
+         int r;
+
+         if (!memstream_init(&m))
+                 return -errno;
+
+         r = varlink_idl_dump(m.f, /* use_colors= */ false, interface);
+         if (r >= 0)
+                 return memstream_finalize(&m, ret, NULL);
+
+         return r;
+ }
+
+ /* Frees a heap-allocated symbol together with its name, the field names and named-type strings, and any
+  * anonymous symbols nested in its fields. Accepts NULL; always returns NULL. */
+ static VarlinkSymbol *varlink_symbol_free(VarlinkSymbol *symbol) {
+         if (!symbol)
+                 return NULL;
+
+         /* See comment in varlink_interface_free() regarding the casting away of `const` */
+
+         free((char*) symbol->name);
+
+         for (VarlinkField *field = symbol->fields; field->field_type != _VARLINK_FIELD_TYPE_END_MARKER; field++) {
+
+                 free((void*) field->name);
+                 free((void*) field->named_type);
+
+                 /* The symbol pointer might either point to a named symbol, in which case that symbol is
+                  * owned by the interface, or to an anonymous symbol, in which case it is owned by us, and
+                  * we need to free it */
+                 if (field->field_type == VARLINK_NAMED_TYPE || !field->symbol)
+                         continue;
+
+                 varlink_symbol_free((VarlinkSymbol*) field->symbol);
+         }
+
+         return mfree(symbol);
+ }
+
+ /* Frees a heap-allocated interface description together with its name and all its symbols.
+  * Accepts NULL; always returns NULL. */
+ VarlinkInterface* varlink_interface_free(VarlinkInterface *interface) {
+         if (!interface)
+                 return NULL;
+
+         /* So here's the thing: in most cases we want that users of this define their interface descriptions
+          * in C code, and hence the definitions are constant and immutable during the lifecycle of the
+          * system. Because of that we define all structs with const* pointers. It makes it very nice and
+          * straight-forward to populate these structs with literal C strings. However, in some not so common
+          * cases we also want to allocate these structures dynamically on the heap, when parsing interface
+          * descriptions. But given this should be the exceptional and not the common case, we decided to
+          * simply cast away the 'const' where needed, even if it is ugly. */
+
+         free((char*) interface->name);
+
+         for (const VarlinkSymbol *const *s = interface->symbols; *s; s++)
+                 varlink_symbol_free((VarlinkSymbol*) *s);
+
+         return mfree(interface);
+ }
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(VarlinkSymbol*, varlink_symbol_free);
+
+ /* Resizes *interface so that it has room for 'n_symbols' symbol pointers plus the trailing NULL end
+  * marker. Returns 0 on success and -ENOMEM on size overflow or allocation failure, in which case
+  * *interface is left untouched. */
+ static int varlink_interface_realloc(VarlinkInterface **interface, size_t n_symbols) {
+         const size_t header_size = offsetof(VarlinkInterface, symbols);
+         size_t n_slots = n_symbols + 1; /* Space for trailing NULL end marker symbol */
+         VarlinkInterface *resized;
+
+         assert(interface);
+
+         /* Overflow check */
+         if (n_slots > (SIZE_MAX - header_size) / sizeof(VarlinkSymbol*))
+                 return -ENOMEM;
+
+         resized = realloc0(*interface, header_size + sizeof(VarlinkSymbol*) * n_slots);
+         if (!resized)
+                 return -ENOMEM;
+
+         *interface = resized;
+         return 0;
+ }
+
+ /* Resizes *symbol so that it has room for 'n_fields' fields plus the trailing end marker field.
+  * Returns 0 on success and -ENOMEM on size overflow or allocation failure, in which case *symbol is left
+  * untouched. */
+ static int varlink_symbol_realloc(VarlinkSymbol **symbol, size_t n_fields) {
+         const size_t header_size = offsetof(VarlinkSymbol, fields);
+         size_t n_slots = n_fields + 1; /* Space for trailing end marker field */
+         VarlinkSymbol *resized;
+
+         assert(symbol);
+
+         /* Overflow check */
+         if (n_slots > (SIZE_MAX - header_size) / sizeof(VarlinkField))
+                 return -ENOMEM;
+
+         resized = realloc0(*symbol, header_size + sizeof(VarlinkField) * n_slots);
+         if (!resized)
+                 return -ENOMEM;
+
+         *symbol = resized;
+         return 0;
+ }
+
+ /* Character sets handed to the tokenizer as 'allowed_chars'. VALID_CHARS_IDENTIFIER is what named type
+ * references are parsed with; the other two presumably cover reserved keywords and interface names
+ * respectively (their users are outside this excerpt — confirm before relying on this). */
+ #define VALID_CHARS_IDENTIFIER ALPHANUMERICAL "_"
+ #define VALID_CHARS_RESERVED LOWERCASE_LETTERS
+ #define VALID_CHARS_INTERFACE_NAME ALPHANUMERICAL ".-"
+
+ /* Updates the *line / *column position for the 'n' characters starting at 'p': a newline moves on to
+  * column 1 of the next line, any other character moves one column to the right. */
+ static void advance_line_column(const char *p, size_t n, unsigned *line, unsigned *column) {
+
+         assert(p);
+         assert(line);
+         assert(column);
+
+         for (size_t i = 0; i < n; i++) {
+
+                 if (p[i] != '\n') {
+                         (*column)++;
+                         continue;
+                 }
+
+                 (*line)++;
+                 *column = 1;
+         }
+ }
+
+ /* Returns the length of the token at the start of 'p': 1 if the first character is one of
+  * 'allowed_delimiters', otherwise the length of the leading run of characters from 'allowed_chars'.
+  * Either set may be NULL; 0 means that no token was found. */
+ static size_t token_match(
+                 const char *p,
+                 const char *allowed_delimiters,
+                 const char *allowed_chars) {
+
+         assert(p);
+
+         /* Delimiters are always single-character tokens and take precedence. */
+         if (allowed_delimiters != NULL && strchr(allowed_delimiters, *p) != NULL)
+                 return 1;
+
+         return allowed_chars != NULL ? strspn(p, allowed_chars) : 0;
+ }
+
+ /* Extracts the next token from *p, which is either a single character from 'allowed_delimiters' or a run
+  * of characters from 'allowed_chars'. Leading whitespace is skipped if no token starts right at *p.
+  * On success *p, *line and *column are advanced past the token.
+  *
+  * Returns 1 and a newly allocated token in *ret_token, 0 with *ret_token set to NULL at the end of the
+  * input, -EBADMSG if the input continues with something that is not an allowed token, or -ENOMEM. */
+ static int varlink_idl_subparse_token(
+                 const char **p,
+                 unsigned *line,
+                 unsigned *column,
+                 const char *allowed_delimiters,
+                 const char *allowed_chars,
+                 char **ret_token) {
+
+         _cleanup_free_ char *token = NULL;
+         size_t token_len;
+
+         assert(p);
+         assert(*p);
+         assert(line);
+         assert(column);
+         assert(ret_token);
+
+         if (**p == '\0') { /* eof */
+                 *ret_token = NULL;
+                 return 0;
+         }
+
+         token_len = token_match(*p, allowed_delimiters, allowed_chars);
+         if (token_len == 0) {
+                 /* No token of the permitted character set found? Then let's try to skip over whitespace and try again */
+                 size_t n_blank = strspn(*p, WHITESPACE);
+
+                 advance_line_column(*p, n_blank, line, column);
+                 *p += n_blank;
+
+                 if (**p == '\0') { /* eof */
+                         *ret_token = NULL;
+                         return 0;
+                 }
+
+                 token_len = token_match(*p, allowed_delimiters, allowed_chars);
+                 if (token_len == 0)
+                         return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "Couldn't find token of allowed chars '%s' or allowed delimiters '%s'.", strempty(allowed_chars), strempty(allowed_delimiters));
+         }
+
+         token = strndup(*p, token_len);
+         if (!token)
+                 return -ENOMEM;
+
+         advance_line_column(*p, token_len, line, column);
+         *p += token_len;
+
+         *ret_token = TAKE_PTR(token);
+         return 1;
+ }
+
+ /* Skips the remainder of a comment, i.e. everything up to (but not including) the next newline or the end
+  * of the input, and advances *p, *line and *column accordingly. Always returns 1.
+  *
+  * The terminating newline is deliberately left in the input: it is consumed — and accounted for in the
+  * line counter — by whoever skips whitespace next. */
+ static int varlink_idl_subparse_comment(
+                 const char **p,
+                 unsigned *line,
+                 unsigned *column) {
+
+         size_t l;
+
+         assert(p);
+         assert(*p);
+         assert(line);
+         assert(column);
+
+         l = strcspn(*p, NEWLINE);
+
+         /* Account for exactly the characters we consume. Covering the newline here as well, while leaving
+          * it in the input, would count the line break twice and skew all later positions by one line per
+          * comment. At the end of the input it would count the NUL terminator as a column. */
+         advance_line_column(*p, l, line, column);
+         *p += l;
+
+         return 1;
+ }
+
+ /* Skips any whitespace at the start of *p, advancing *p, *line and *column accordingly.
+  * Always returns 1. */
+ static int varlink_idl_subparse_whitespace(
+                 const char **p,
+                 unsigned *line,
+                 unsigned *column) {
+
+         size_t n_blank;
+
+         assert(p);
+         assert(*p);
+         assert(line);
+         assert(column);
+
+         n_blank = strspn(*p, WHITESPACE);
+
+         advance_line_column(*p, n_blank, line, column);
+         *p = *p + n_blank;
+
+         return 1;
+ }
+
+static int varlink_idl_subparse_struct_or_enum(const char **p, unsigned *line, unsigned *column, VarlinkSymbol **symbol, size_t *n_fields, VarlinkFieldDirection direction, unsigned depth);
+
+ /* Parses a field type at *p and stores the result in 'field'.
+ *
+ * Accepted, in this order: optional whitespace, an optional "?" (nullable), an optional "[]" (array) or
+ * "[string]" (map) prefix, and then either a built-in type keyword, an anonymous struct/enum definition
+ * starting with "(", or the name of a named type. *p, *line and *column are advanced past what was parsed.
+ *
+ * 'depth' is the current nesting level; it is passed on incremented when an anonymous definition is parsed.
+ * Returns 0 on success, -EBADMSG on premature end of input, or the error of a nested parser/allocation. */
+ static int varlink_idl_subparse_field_type(
+ const char **p,
+ unsigned *line,
+ unsigned *column,
+ VarlinkField *field,
+ unsigned depth) {
+
+ size_t l;
+ int r;
+
+ assert(p);
+ assert(*p);
+ assert(line);
+ assert(field);
+
+ r = varlink_idl_subparse_whitespace(p, line, column);
+ if (r < 0)
+ return r;
+
+ /* Each stage below sets 'l' to the number of characters it matched; the cursor is advanced after it. */
+ if (startswith(*p, "?")) {
+ field->field_flags |= VARLINK_NULLABLE;
+ l = 1;
+ } else {
+ field->field_flags &= ~VARLINK_NULLABLE;
+ l = 0;
+ }
+
+ advance_line_column(*p, l, line, column);
+ *p += l;
+
+ /* Array and map are mutually exclusive: setting one flag clears the other. */
+ if (startswith(*p, "[]")) {
+ l = 2;
+ field->field_flags = (field->field_flags & ~VARLINK_MAP) | VARLINK_ARRAY;
+ } else if (startswith(*p, "[string]")) {
+ l = 8;
+ field->field_flags = (field->field_flags & ~VARLINK_ARRAY) | VARLINK_MAP;
+ } else {
+ l = 0;
+ field->field_flags = field->field_flags & ~(VARLINK_MAP | VARLINK_ARRAY);
+ }
+
+ advance_line_column(*p, l, line, column);
+ *p += l;
+
+ /* Built-in types are recognized by prefix only, i.e. without checking what follows the keyword. */
+ if (startswith(*p, "bool")) {
+ l = 4;
+ field->field_type = VARLINK_BOOL;
+ } else if (startswith(*p, "int")) {
+ l = 3;
+ field->field_type = VARLINK_INT;
+ } else if (startswith(*p, "float")) {
+ l = 5;
+ field->field_type = VARLINK_FLOAT;
+ } else if (startswith(*p, "string")) {
+ l = 6;
+ field->field_type = VARLINK_STRING;
+ } else if (startswith(*p, "object")) {
+ l = 6;
+ field->field_type = VARLINK_OBJECT;
+ } else if (**p == '(') {
+ /* Anonymous struct or enum: parse it into a fresh symbol that the field takes ownership of. */
+ _cleanup_(varlink_symbol_freep) VarlinkSymbol *symbol = NULL;
+ size_t n_fields = 0;
+
+ r = varlink_symbol_realloc(&symbol, n_fields);
+ if (r < 0)
+ return r;
+
+ symbol->symbol_type = _VARLINK_SYMBOL_TYPE_INVALID;
+
+ r = varlink_idl_subparse_struct_or_enum(
+ p,
+ line,
+ column,
+ &symbol,
+ &n_fields,
+ VARLINK_REGULAR,
+ depth + 1);
+ if (r < 0)
+ return r;
+
+ /* The nested parser is expected to have settled on struct or enum by now. */
+ if (symbol->symbol_type == VARLINK_STRUCT_TYPE)
+ field->field_type = VARLINK_STRUCT;
+ else {
+ assert(symbol->symbol_type == VARLINK_ENUM_TYPE);
+ field->field_type = VARLINK_ENUM;
+ }
+
+ field->symbol = TAKE_PTR(symbol);
+ l = 0; /* the nested parser already advanced the cursor */
+ } else {
+ /* Anything else is taken to be a reference to a named type. */
+ _cleanup_free_ char *token = NULL;
+
+ r = varlink_idl_subparse_token(p, line, column, /* valid_tokens= */ NULL, VALID_CHARS_IDENTIFIER, &token);
+ if (r < 0)
+ return r;
+ if (!token)
+ return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "%u:%u: Premature EOF.", *line, *column);
+
+ field->named_type = TAKE_PTR(token);
+ field->field_type = VARLINK_NAMED_TYPE;
+ l = 0; /* the tokenizer already advanced the cursor */
+ }
+
+ advance_line_column(*p, l, line, column);
+ *p += l;
+
+ return 0;
+ }
+
+static int varlink_idl_subparse_struct_or_enum(
+ const char **p,
+ unsigned *line,
+ unsigned *column,
+ VarlinkSymbol **symbol,
+ size_t *n_fields,
+ VarlinkFieldDirection direction,
+ unsigned depth) {
+ /* Parses a parenthesized list at *p that consists either of struct fields ("name: type") or of
+ * enum values (bare "name"), separated by commas. One VarlinkField is appended to *symbol per entry
+ * and *n_fields is incremented accordingly. If the symbol type is still unset (negative) it is
+ * derived from the first entry seen; entries of the other kind are refused afterwards. 'direction'
+ * is stored in each struct field. Returns 0 on success. */
+ enum {
+ STATE_OPEN, /* expecting the opening "(" */
+ STATE_NAME, /* expecting a field name (or, right after "(", the closing ")") */
+ STATE_COLON, /* got a name, expecting ":" (struct field) or ","/")" (enum value) */
+ STATE_COMMA, /* got a complete struct field, expecting "," or ")" */
+ STATE_DONE,
+ } state = STATE_OPEN;
+ _cleanup_free_ char *field_name = NULL;
+ const char *allowed_delimiters = "(", *allowed_chars = NULL;
+ int r;
+
+ assert(p);
+ assert(*p);
+ assert(line);
+ assert(column);
+ assert(symbol);
+ assert(*symbol);
+ assert(n_fields);
+ /* Bound the recursion that goes through varlink_idl_subparse_field_type() for inline types */
+ if (depth > DEPTH_MAX)
+ return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "%u:%u: Maximum nesting depth reached (%u).", *line, *column, DEPTH_MAX);
+
+ while (state != STATE_DONE) {
+ _cleanup_free_ char *token = NULL;
+ /* Fetch the next token; what counts as one depends on the state we are in */
+ r = varlink_idl_subparse_token(
+ p,
+ line,
+ column,
+ allowed_delimiters,
+ allowed_chars,
+ &token);
+ if (r < 0)
+ return r;
+
+ switch (state) {
+
+ case STATE_OPEN:
+ if (!token)
+ return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "%u:%u: Premature EOF.", *line, *column);
+ if (!streq(token, "("))
+ return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "%u:%u: Unexpected token '%s'.", *line, *column, token);
+
+ state = STATE_NAME;
+ allowed_delimiters = ")";
+ allowed_chars = VALID_CHARS_IDENTIFIER;
+ break;
+
+ case STATE_NAME:
+ assert(!field_name);
+
+ if (!token)
+ return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "%u:%u: Premature EOF.", *line, *column);
+ if (streq(token, ")"))
+ state = STATE_DONE;
+ else {
+ field_name = TAKE_PTR(token); /* keep the name until we know what kind of entry this is */
+ state = STATE_COLON;
+ allowed_delimiters = ":,)";
+ allowed_chars = NULL;
+ }
+
+ break;
+
+ case STATE_COLON:
+ assert(field_name);
+
+ if (!token)
+ return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "%u:%u: Premature EOF.", *line, *column);
+
+ if (streq(token, ":")) {
+ VarlinkField *field;
+ /* A colon follows the name, hence this is a struct field */
+ if ((*symbol)->symbol_type < 0)
+ (*symbol)->symbol_type = VARLINK_STRUCT_TYPE;
+ if ((*symbol)->symbol_type == VARLINK_ENUM_TYPE)
+ return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "%u:%u: Enum with struct fields, refusing.", *line, *column);
+
+ r = varlink_symbol_realloc(symbol, *n_fields + 1);
+ if (r < 0)
+ return r;
+
+ field = (*symbol)->fields + (*n_fields)++;
+ *field = (VarlinkField) {
+ .name = TAKE_PTR(field_name),
+ .field_type = _VARLINK_FIELD_TYPE_INVALID,
+ .field_direction = direction,
+ };
+ /* The actual type follows the colon and is filled in by the field type parser */
+ r = varlink_idl_subparse_field_type(p, line, column, field, depth);
+ if (r < 0)
+ return r;
+
+ state = STATE_COMMA;
+ allowed_delimiters = ",)";
+ allowed_chars = NULL;
+
+ } else if (STR_IN_SET(token, ",", ")")) {
+ VarlinkField *field;
+ /* The name is directly followed by a separator, hence this is an enum value */
+ if ((*symbol)->symbol_type < 0)
+ (*symbol)->symbol_type = VARLINK_ENUM_TYPE;
+ if ((*symbol)->symbol_type != VARLINK_ENUM_TYPE)
+ return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "%u:%u: Struct with enum fields, refusing.", *line, *column);
+
+ r = varlink_symbol_realloc(symbol, *n_fields + 1);
+ if (r < 0)
+ return r;
+
+ field = (*symbol)->fields + (*n_fields)++;
+ *field = (VarlinkField) {
+ .name = TAKE_PTR(field_name),
+ .field_type = VARLINK_ENUM_VALUE,
+ };
+
+ if (streq(token, ",")) {
+ state = STATE_NAME;
+ allowed_delimiters = NULL;
+ allowed_chars = VALID_CHARS_IDENTIFIER;
+ } else {
+ assert(streq(token, ")"));
+ state = STATE_DONE;
+ }
+ } else
+ return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "%u:%u: Unexpected token '%s'.", *line, *column, token);
+
+ break;
+
+ case STATE_COMMA:
+ assert(!field_name);
+
+ if (!token)
+ return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "%u:%u: Premature EOF.", *line, *column);
+ if (streq(token, ",")) {
+ state = STATE_NAME;
+ allowed_delimiters = NULL;
+ allowed_chars = VALID_CHARS_IDENTIFIER;
+ } else if (streq(token, ")"))
+ state = STATE_DONE;
+ else
+ return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "%u:%u: Unexpected token '%s'.", *line, *column, token);
+ break;
+
+ default:
+ assert_not_reached();
+ }
+ }
+
+ /* If we don't know the type of the symbol by now it was an empty () which doesn't allow us to
+ * determine if we look at an enum or a struct */
+ if ((*symbol)->symbol_type < 0)
+ return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "%u:%u: Ambiguous empty () enum/struct is not permitted.", *line, *column);
+
+ return 0;
+}
+
+/* Walks the fields of 'symbol' and, for each named-type field that has no symbol attached yet, looks the
+ * type name up in 'interface' and attaches the struct or enum type found. Failures are reported through
+ * log_debug_errno() with ENETUNREACH. Returns 0 on success. */
+static int varlink_idl_resolve_symbol_types(VarlinkInterface *interface, VarlinkSymbol *symbol) {
+        assert(interface);
+        assert(symbol);
+
+        for (VarlinkField *f = symbol->fields; f->field_type != _VARLINK_FIELD_TYPE_END_MARKER; f++) {
+                const VarlinkSymbol *target;
+
+                /* Nothing to do for fields that are not named types, or that are resolved already */
+                if (f->field_type != VARLINK_NAMED_TYPE || f->symbol)
+                        continue;
+
+                if (!f->named_type)
+                        return log_debug_errno(SYNTHETIC_ERRNO(ENETUNREACH), "Named type field lacking a type name.");
+
+                target = varlink_idl_find_symbol(interface, _VARLINK_SYMBOL_TYPE_INVALID, f->named_type);
+                if (!target)
+                        return log_debug_errno(SYNTHETIC_ERRNO(ENETUNREACH), "Failed to find type '%s'.", f->named_type);
+
+                /* Only struct and enum type definitions may be referenced as a type */
+                switch (target->symbol_type) {
+
+                case VARLINK_STRUCT_TYPE:
+                case VARLINK_ENUM_TYPE:
+                        f->symbol = target;
+                        break;
+
+                default:
+                        return log_debug_errno(SYNTHETIC_ERRNO(ENETUNREACH), "Symbol '%s' is referenced as type but is not a type.", f->named_type);
+                }
+        }
+
+        return 0;
+}
+
+/* Runs varlink_idl_resolve_symbol_types() on every symbol of the NULL-terminated symbol list of the
+ * interface, stopping at the first failure. Returns 0 on success. */
+static int varlink_idl_resolve_types(VarlinkInterface *interface) {
+        assert(interface);
+
+        VarlinkSymbol **cursor = (VarlinkSymbol**) interface->symbols;
+
+        while (*cursor) {
+                int r = varlink_idl_resolve_symbol_types(interface, *cursor);
+                if (r < 0)
+                        return r;
+
+                cursor++;
+        }
+
+        return 0;
+}
+
+int varlink_idl_parse(
+ const char *text,
+ unsigned *line,
+ unsigned *column,
+ VarlinkInterface **ret) {
+ /* Parses the textual interface definition in 'text' and, on success, returns 0 and stores the
+ * resulting interface in *ret. The expected input is: optional "#" comments, then "interface" and
+ * the interface name, then any number of "method", "type" and "error" definitions (and further
+ * comments) until the end of the input. Once everything is parsed, named types are resolved.
+ *
+ * 'line' and 'column' may be NULL, in which case local dummy counters are used.
+ * NOTE(review): counters supplied by the caller are not initialized in this function. */
+ _cleanup_(varlink_interface_freep) VarlinkInterface *interface = NULL;
+ _cleanup_(varlink_symbol_freep) VarlinkSymbol *symbol = NULL;
+ enum {
+ STATE_PRE_INTERFACE,
+ STATE_INTERFACE,
+ STATE_PRE_SYMBOL,
+ STATE_METHOD,
+ STATE_METHOD_NAME,
+ STATE_METHOD_ARROW,
+ STATE_TYPE,
+ STATE_TYPE_NAME,
+ STATE_ERROR,
+ STATE_ERROR_NAME,
+ STATE_DONE,
+ } state = STATE_PRE_INTERFACE;
+ const char *allowed_delimiters = "#", *allowed_chars = VALID_CHARS_RESERVED;
+ size_t n_symbols = 0, n_fields = 1;
+ unsigned _line = 0, _column = 1;
+ const char **p = &text; /* 'p' aliases 'text': the calls below pass either 'p' or '&text', both advance the same pointer */
+ int r;
+
+ if (!line)
+ line = &_line;
+ if (!column)
+ column = &_column;
+
+ while (state != STATE_DONE) {
+ _cleanup_free_ char *token = NULL;
+ /* Fetch the next token; what counts as one depends on the state we are in */
+ r = varlink_idl_subparse_token(
+ p,
+ line,
+ column,
+ allowed_delimiters,
+ allowed_chars,
+ &token);
+ if (r < 0)
+ return r;
+
+ switch (state) {
+
+ case STATE_PRE_INTERFACE:
+ /* Before the "interface" keyword only comments are allowed */
+ if (!token)
+ return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "%u:%u: Premature EOF.", *line, *column);
+ if (streq(token, "#")) {
+ r = varlink_idl_subparse_comment(&text, line, column);
+ if (r < 0)
+ return r;
+ } else if (streq(token, "interface")) {
+ state = STATE_INTERFACE;
+ allowed_delimiters = NULL;
+ allowed_chars = VALID_CHARS_INTERFACE_NAME;
+ } else
+ return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "%u:%u: Unexpected token '%s'.", *line, *column, token);
+ break;
+
+ case STATE_INTERFACE:
+ /* The token is the interface name: allocate the interface object and store it */
+ assert(!interface);
+ assert(n_symbols == 0);
+
+ if (!token)
+ return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "%u:%u: Premature EOF.", *line, *column);
+
+ r = varlink_interface_realloc(&interface, n_symbols);
+ if (r < 0)
+ return r;
+
+ interface->name = TAKE_PTR(token);
+ state = STATE_PRE_SYMBOL;
+ allowed_delimiters = "#";
+ allowed_chars = VALID_CHARS_RESERVED;
+ break;
+
+ case STATE_PRE_SYMBOL:
+ /* Between symbols: the end of the input is fine here and finishes parsing */
+ if (!token) {
+ state = STATE_DONE;
+ break;
+ }
+
+ if (streq(token, "#")) {
+ r = varlink_idl_subparse_comment(&text, line, column);
+ if (r < 0)
+ return r;
+ } else if (streq(token, "method")) {
+ state = STATE_METHOD;
+ allowed_chars = VALID_CHARS_IDENTIFIER;
+ } else if (streq(token, "type")) {
+ state = STATE_TYPE;
+ allowed_chars = VALID_CHARS_IDENTIFIER;
+ } else if (streq(token, "error")) {
+ state = STATE_ERROR;
+ allowed_chars = VALID_CHARS_IDENTIFIER;
+ } else
+ return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "%u:%u: Unexpected token '%s'.", *line, *column, token);
+
+ break;
+
+ case STATE_METHOD:
+ /* The token is the method name; the input parameter list follows right away */
+ assert(!symbol);
+ n_fields = 0;
+
+ if (!token)
+ return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "%u:%u: Premature EOF.", *line, *column);
+
+ r = varlink_symbol_realloc(&symbol, n_fields);
+ if (r < 0)
+ return r;
+
+ symbol->symbol_type = VARLINK_METHOD;
+ symbol->name = TAKE_PTR(token);
+
+ r = varlink_idl_subparse_struct_or_enum(&text, line, column, &symbol, &n_fields, VARLINK_INPUT, 0);
+ if (r < 0)
+ return r;
+
+ state = STATE_METHOD_ARROW;
+ allowed_chars = "->";
+ break;
+
+ case STATE_METHOD_ARROW:
+ /* After "->" comes the output parameter list. Its fields are appended to the same symbol
+ * as the input fields, told apart by their direction. */
+ assert(symbol);
+
+ if (!token)
+ return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "%u:%u: Premature EOF.", *line, *column);
+
+ if (!streq(token, "->"))
+ return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "%u:%u: Unexpected token '%s'.", *line, *column, token);
+
+ r = varlink_idl_subparse_struct_or_enum(&text, line, column, &symbol, &n_fields, VARLINK_OUTPUT, 0);
+ if (r < 0)
+ return r;
+
+ r = varlink_interface_realloc(&interface, n_symbols + 1);
+ if (r < 0)
+ return r;
+
+ interface->symbols[n_symbols++] = TAKE_PTR(symbol); /* the interface takes over the finished method */
+
+ state = STATE_PRE_SYMBOL;
+ allowed_chars = VALID_CHARS_RESERVED "#";
+ break;
+
+ case STATE_TYPE:
+ /* The token is the type name; the struct or enum body follows right away */
+ assert(!symbol);
+ n_fields = 0;
+
+ if (!token)
+ return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "%u:%u: Premature EOF.", *line, *column);
+
+ r = varlink_symbol_realloc(&symbol, n_fields);
+ if (r < 0)
+ return r;
+
+ symbol->symbol_type = _VARLINK_SYMBOL_TYPE_INVALID; /* don't know yet if enum or struct, will be filled in by varlink_idl_subparse_struct_or_enum() */
+ symbol->name = TAKE_PTR(token);
+
+ r = varlink_idl_subparse_struct_or_enum(&text, line, column, &symbol, &n_fields, VARLINK_REGULAR, 0);
+ if (r < 0)
+ return r;
+
+ r = varlink_interface_realloc(&interface, n_symbols + 1);
+ if (r < 0)
+ return r;
+
+ interface->symbols[n_symbols++] = TAKE_PTR(symbol);
+
+ state = STATE_PRE_SYMBOL;
+ allowed_chars = VALID_CHARS_RESERVED "#";
+ break;
+
+ case STATE_ERROR:
+ /* The token is the error name; its field list follows right away */
+ assert(!symbol);
+ n_fields = 0;
+
+ if (!token)
+ return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "%u:%u: Premature EOF.", *line, *column);
+
+ r = varlink_symbol_realloc(&symbol, n_fields);
+ if (r < 0)
+ return r;
+
+ symbol->symbol_type = VARLINK_ERROR;
+ symbol->name = TAKE_PTR(token);
+
+ r = varlink_idl_subparse_struct_or_enum(&text, line, column, &symbol, &n_fields, VARLINK_REGULAR, 0);
+ if (r < 0)
+ return r;
+
+ r = varlink_interface_realloc(&interface, n_symbols + 1);
+ if (r < 0)
+ return r;
+
+ interface->symbols[n_symbols++] = TAKE_PTR(symbol);
+
+ state = STATE_PRE_SYMBOL;
+ allowed_chars = VALID_CHARS_RESERVED "#";
+ break;
+
+ default:
+ assert_not_reached();
+ }
+ }
+ /* All symbols are known now, hence references to named types can be resolved */
+ r = varlink_idl_resolve_types(interface);
+ if (r < 0)
+ return r;
+
+ *ret = TAKE_PTR(interface);
+ return 0;
+}
+
+/* Checks whether 'name' is acceptable as a field name. */
+bool varlink_idl_field_name_is_valid(const char *name) {
+        bool prev_underscore = false;
+
+        /* The name must be non-empty and begin with a letter of either case; numerals and underscores
+         * are not allowed in first position. */
+        if (isempty(name) || !strchr(LETTERS, name[0]))
+                return false;
+
+        /* What follows may be alphanumerical or an underscore, but an underscore may not directly
+         * follow another one. */
+        for (size_t i = 1; name[i] != '\0'; i++) {
+                bool is_underscore = name[i] == '_';
+
+                if (is_underscore ? prev_underscore : !strchr(ALPHANUMERICAL, name[i]))
+                        return false;
+
+                prev_underscore = is_underscore;
+        }
+
+        /* A trailing underscore is not allowed either */
+        return !prev_underscore;
+}
+
+bool varlink_idl_symbol_name_is_valid(const char *name) {
+ if (isempty(name))
+ return false;
+
+ /* We might want to reference VARLINK_STRUCT_TYPE and VARLINK_ENUM_TYPE symbols where we also
+ * reference native types, hence make sure the native type names are refused as symbol names. */
+ if (STR_IN_SET(name, "bool", "int", "float", "string", "object"))
+ return false;
+
+ /* Symbols must be named with an uppercase letter as first character */
+ if (!strchr(UPPERCASE_LETTERS, name[0]))
+ return false;
+
+ for (const char *c = name + 1; *c; c++)
+ if (!strchr(ALPHANUMERICAL, *c))
+ return false;
+
+ return true;
+}
+
+/* Checks whether 'name' is acceptable as an interface name. */
+bool varlink_idl_interface_name_is_valid(const char *name) {
+        /* Whether the character seen last was a separator, i.e. a dot or a dash */
+        bool after_separator = false;
+
+        if (isempty(name))
+                return false;
+
+        /* The very first character must be a letter of either case, nothing else */
+        if (!strchr(LETTERS, name[0]))
+                return false;
+
+        /* The rest is made up of alphanumerical characters, dots and dashes, where a dot or dash may
+         * never directly follow another dot or dash. */
+        for (const char *c = name + 1; *c; c++) {
+                if (*c == '.' || *c == '-') {
+                        if (after_separator)
+                                return false;
+
+                        after_separator = true;
+                } else if (strchr(ALPHANUMERICAL, *c))
+                        after_separator = false;
+                else
+                        return false;
+        }
+
+        /* ... and the name may not end in one */
+        return !after_separator;
+}
+
+static int varlink_idl_symbol_consistent(const VarlinkInterface *interface, const VarlinkSymbol *symbol, int level);
+
+/* Checks a single field of 'symbol' for internal consistency: valid field type, flags that fit the field
+ * type, a direction that fits the symbol type, and agreement between field type, attached symbol and
+ * named type. Problems are logged at 'level' via log_full_errno() with EUCLEAN. Anonymous struct/enum
+ * symbols attached to the field are checked recursively. Returns 0 if the field is consistent. */
+static int varlink_idl_field_consistent(
+                const VarlinkInterface *interface,
+                const VarlinkSymbol *symbol,
+                const VarlinkField *field,
+                int level) {
+
+        const char *symbol_name;
+        int r;
+
+        assert(interface);
+        assert(symbol);
+        assert(field);
+        assert(field->name);
+
+        symbol_name = symbol->name ?: "<anonymous>";
+
+        /* Zero is the end marker, which never is a valid type for an actual field */
+        if (field->field_type <= 0 || field->field_type >= _VARLINK_FIELD_TYPE_MAX)
+                return log_full_errno(level, SYNTHETIC_ERRNO(EUCLEAN), "Field type for '%s' in symbol '%s' is not valid, refusing.", field->name, symbol_name);
+
+        if (field->field_type == VARLINK_ENUM_VALUE) {
+
+                if (symbol->symbol_type != VARLINK_ENUM_TYPE)
+                        return log_full_errno(level, SYNTHETIC_ERRNO(EUCLEAN), "Enum field type for '%s' in non-enum symbol '%s', refusing.", field->name, symbol_name);
+
+                if (field->field_flags != 0)
+                        return log_full_errno(level, SYNTHETIC_ERRNO(EUCLEAN), "Enum field '%s' in symbol '%s' has non-zero flags set, refusing.", field->name, symbol_name);
+        } else {
+                if (symbol->symbol_type == VARLINK_ENUM_TYPE)
+                        return log_full_errno(level, SYNTHETIC_ERRNO(EUCLEAN), "Non-enum field type for '%s' in enum symbol '%s', refusing.", field->name, symbol_name);
+
+                /* Apart from the nullable flag, at most one of the array and map flags may be set */
+                if (!IN_SET(field->field_flags & ~VARLINK_NULLABLE, 0, VARLINK_ARRAY, VARLINK_MAP))
+                        return log_full_errno(level, SYNTHETIC_ERRNO(EUCLEAN), "Flags of field '%s' in symbol '%s' is invalid, refusing.", field->name, symbol_name);
+        }
+
+        if (symbol->symbol_type != VARLINK_METHOD) {
+                if (field->field_direction != VARLINK_REGULAR)
+                        return log_full_errno(level, SYNTHETIC_ERRNO(EUCLEAN), "Direction of '%s' in non-method symbol '%s' not regular, refusing.", field->name, symbol_name);
+        } else {
+                if (!IN_SET(field->field_direction, VARLINK_INPUT, VARLINK_OUTPUT))
+                        return log_full_errno(level, SYNTHETIC_ERRNO(EUCLEAN), "Direction of '%s' in method symbol '%s' is not input or output, refusing.", field->name, symbol_name);
+        }
+
+        if (field->symbol) {
+                if (!IN_SET(field->field_type, VARLINK_STRUCT, VARLINK_ENUM, VARLINK_NAMED_TYPE))
+                        return log_full_errno(level, SYNTHETIC_ERRNO(EUCLEAN), "Target symbol for field '%s' in symbol '%s' defined for elemental field, refusing.", field->name, symbol_name);
+
+                if (field->field_type == VARLINK_NAMED_TYPE) {
+                        const VarlinkSymbol *found;
+
+                        if (!field->symbol->name || !field->named_type || !streq(field->symbol->name, field->named_type))
+                                return log_full_errno(level, SYNTHETIC_ERRNO(EUCLEAN), "Resolved symbol name and named type of field '%s' in symbol '%s' do not match, refusing.", field->name, symbol_name);
+
+                        /* If this is a named type, then check if it's properly part of the interface */
+                        found = varlink_idl_find_symbol(interface, _VARLINK_SYMBOL_TYPE_INVALID, field->symbol->name);
+                        if (!found)
+                                return log_full_errno(level, SYNTHETIC_ERRNO(EUCLEAN), "Resolved symbol of named type of field '%s' in symbol '%s' is not part of the interface, refusing.", field->name, symbol_name);
+
+                        if (!IN_SET(found->symbol_type, VARLINK_ENUM_TYPE, VARLINK_STRUCT_TYPE))
+                                return log_full_errno(level, SYNTHETIC_ERRNO(EUCLEAN), "Resolved symbol of named type of field '%s' in symbol '%s' is not a type, refusing.", field->name, symbol_name);
+                } else {
+                        /* If this is an anonymous type, then we recursively check if it's consistent, since
+                         * it's not part of the interface, and hence we won't validate it from there. */
+
+                        r = varlink_idl_symbol_consistent(interface, field->symbol, level);
+                        if (r < 0)
+                                return r;
+                }
+
+        } else {
+                /* Struct, enum and named type fields are exactly the ones that need a target symbol */
+                if (IN_SET(field->field_type, VARLINK_STRUCT, VARLINK_ENUM, VARLINK_NAMED_TYPE))
+                        return log_full_errno(level, SYNTHETIC_ERRNO(EUCLEAN), "No target symbol for field '%s' in symbol '%s' defined for non-elemental field, refusing.", field->name, symbol_name);
+
+                if (field->named_type)
+                        return log_full_errno(level, SYNTHETIC_ERRNO(EUCLEAN), "Unresolved symbol in field '%s' in symbol '%s', refusing.", field->name, symbol_name);
+        }
+
+        /* A type name must be set if and only if this is a named type field */
+        if (field->named_type) {
+                if (field->field_type != VARLINK_NAMED_TYPE)
+                        return log_full_errno(level, SYNTHETIC_ERRNO(EUCLEAN), "Named type set for field '%s' in symbol '%s' but not a named type field, refusing.", field->name, symbol_name);
+        } else {
+                if (field->field_type == VARLINK_NAMED_TYPE)
+                        return log_full_errno(level, SYNTHETIC_ERRNO(EUCLEAN), "No named type set for field '%s' in symbol '%s' but field is a named type field, refusing.", field->name, symbol_name);
+        }
+
+        return 0;
+}
+
+/* A symbol is empty if the very first entry of its field list already is the end marker. */
+static bool varlink_symbol_is_empty(const VarlinkSymbol *symbol) {
+        assert(symbol);
+
+        const VarlinkField *first = symbol->fields;
+
+        return first->field_type == _VARLINK_FIELD_TYPE_END_MARKER;
+}
+
+/* Checks a symbol for internal consistency: valid symbol type, struct and enum types not empty, valid
+ * and unique field names, and each field consistent by itself. Problems are logged at 'level'. Returns 0
+ * if the symbol is consistent. */
+static int varlink_idl_symbol_consistent(
+                const VarlinkInterface *interface,
+                const VarlinkSymbol *symbol,
+                int level) {
+
+        _cleanup_(set_freep) Set *input_set = NULL, *output_set = NULL;
+        const char *symbol_name;
+        int r;
+
+        assert(interface);
+        assert(symbol);
+
+        symbol_name = symbol->name ?: "<anonymous>";
+
+        if (symbol->symbol_type < 0 || symbol->symbol_type >= _VARLINK_SYMBOL_TYPE_MAX)
+                return log_full_errno(level, SYNTHETIC_ERRNO(EUCLEAN), "Symbol type for '%s' is not valid, refusing.", symbol_name);
+
+        if (IN_SET(symbol->symbol_type, VARLINK_STRUCT_TYPE, VARLINK_ENUM_TYPE) && varlink_symbol_is_empty(symbol))
+                return log_full_errno(level, SYNTHETIC_ERRNO(EUCLEAN), "Symbol '%s' is empty, refusing.", symbol_name);
+
+        for (const VarlinkField *field = symbol->fields; field->field_type != _VARLINK_FIELD_TYPE_END_MARKER; field++) {
+                Set **name_set = field->field_direction == VARLINK_OUTPUT ? &output_set : &input_set; /* for the method case we need two separate sets, otherwise we use the same */
+
+                /* The name check also fails for a NULL name, hence don't pass the name to "%s" as is */
+                if (!varlink_idl_field_name_is_valid(field->name))
+                        return log_full_errno(level, SYNTHETIC_ERRNO(EUCLEAN), "Field name '%s' in symbol '%s' not valid, refusing.", strna(field->name), symbol_name);
+
+                if (set_contains(*name_set, field->name))
+                        return log_full_errno(level, SYNTHETIC_ERRNO(ENOTUNIQ), "Field '%s' defined twice in symbol '%s', refusing.", field->name, symbol_name);
+
+                if (set_ensure_put(name_set, &string_hash_ops, field->name) < 0)
+                        return log_oom();
+
+                r = varlink_idl_field_consistent(interface, symbol, field, level);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+/* Checks an interface definition for internal consistency: valid interface name, valid and unique symbol
+ * names, and each symbol consistent by itself. Problems are logged at 'level'. Returns 0 if the interface
+ * is consistent. */
+int varlink_idl_consistent(const VarlinkInterface *interface, int level) {
+        _cleanup_(set_freep) Set *name_set = NULL;
+        int r;
+
+        assert(interface);
+
+        /* The name check also fails for a NULL name, hence don't pass the name to "%s" as is */
+        if (!varlink_idl_interface_name_is_valid(interface->name))
+                return log_full_errno(level, SYNTHETIC_ERRNO(EUCLEAN), "Interface name '%s' is not valid, refusing.", strempty(interface->name));
+
+        for (const VarlinkSymbol *const *symbol = interface->symbols; *symbol; symbol++) {
+
+                if (!varlink_idl_symbol_name_is_valid((*symbol)->name))
+                        return log_full_errno(level, SYNTHETIC_ERRNO(EUCLEAN), "Symbol name '%s' is not valid, refusing.", strempty((*symbol)->name));
+
+                if (set_contains(name_set, (*symbol)->name))
+                        return log_full_errno(level, SYNTHETIC_ERRNO(ENOTUNIQ), "Symbol '%s' defined twice in interface, refusing.", (*symbol)->name);
+
+                if (set_ensure_put(&name_set, &string_hash_ops, (*symbol)->name) < 0)
+                        return log_oom();
+
+                r = varlink_idl_symbol_consistent(interface, *symbol, level);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+static int varlink_idl_validate_symbol(const VarlinkSymbol *symbol, JsonVariant *v, VarlinkFieldDirection direction, const char **bad_field);
+
+/* Validates a single JSON value against the element type of 'field', disregarding the array, map and
+ * nullable flags. Struct, enum and named type fields are delegated to the symbol attached to the field,
+ * and the result of that validation is returned as is. For the elemental types 0 is returned on a match
+ * and a mismatch is reported through log_debug_errno() with EMEDIUMTYPE. */
+static int varlink_idl_validate_field_element_type(const VarlinkField *field, JsonVariant *v) {
+        assert(field);
+
+        switch (field->field_type) {
+
+        case VARLINK_STRUCT:
+        case VARLINK_ENUM:
+        case VARLINK_NAMED_TYPE:
+                return varlink_idl_validate_symbol(field->symbol, v, VARLINK_REGULAR, NULL);
+
+        case VARLINK_BOOL:
+                if (json_variant_is_boolean(v))
+                        return 0;
+
+                return log_debug_errno(SYNTHETIC_ERRNO(EMEDIUMTYPE), "Field '%s' should be a bool, but it is not, refusing.", strna(field->name));
+
+        case VARLINK_INT:
+                /* Both signed and unsigned integers are fine */
+                if (json_variant_is_integer(v) || json_variant_is_unsigned(v))
+                        return 0;
+
+                return log_debug_errno(SYNTHETIC_ERRNO(EMEDIUMTYPE), "Field '%s' should be an int, but it is not, refusing.", strna(field->name));
+
+        case VARLINK_FLOAT:
+                if (json_variant_is_number(v))
+                        return 0;
+
+                return log_debug_errno(SYNTHETIC_ERRNO(EMEDIUMTYPE), "Field '%s' should be a float, but it is not, refusing.", strna(field->name));
+
+        case VARLINK_STRING:
+                if (json_variant_is_string(v))
+                        return 0;
+
+                return log_debug_errno(SYNTHETIC_ERRNO(EMEDIUMTYPE), "Field '%s' should be a string, but it is not, refusing.", strna(field->name));
+
+        case VARLINK_OBJECT:
+                if (json_variant_is_object(v))
+                        return 0;
+
+                return log_debug_errno(SYNTHETIC_ERRNO(EMEDIUMTYPE), "Field '%s' should be an object, but it is not, refusing.", strna(field->name));
+
+        default:
+                assert_not_reached();
+        }
+
+        return 0;
+}
+
+/* Validates the JSON value 'v' (which may be NULL if the key is missing) against 'field', taking the
+ * nullable, array and map flags into account. Returns 0 if the value is acceptable. */
+static int varlink_idl_validate_field(const VarlinkField *field, JsonVariant *v) {
+        int r;
+
+        assert(field);
+
+        /* Missing or null: only acceptable for nullable fields */
+        if (!v || json_variant_is_null(v)) {
+                if (FLAGS_SET(field->field_flags, VARLINK_NULLABLE))
+                        return 0;
+
+                return log_debug_errno(SYNTHETIC_ERRNO(ENOANO), "Mandatory field '%s' is null or missing on object, refusing.", strna(field->name));
+        }
+
+        /* Array: every element must be of the element type */
+        if (FLAGS_SET(field->field_flags, VARLINK_ARRAY)) {
+                JsonVariant *element;
+
+                if (!json_variant_is_array(v))
+                        return log_debug_errno(SYNTHETIC_ERRNO(EMEDIUMTYPE), "Field '%s' should be an array, but it is not, refusing.", strna(field->name));
+
+                JSON_VARIANT_ARRAY_FOREACH(element, v) {
+                        r = varlink_idl_validate_field_element_type(field, element);
+                        if (r < 0)
+                                return r;
+                }
+
+                return 0;
+        }
+
+        /* Map: every value of the object must be of the element type, the keys are not looked at */
+        if (FLAGS_SET(field->field_flags, VARLINK_MAP)) {
+                _unused_ const char *key;
+                JsonVariant *value;
+
+                if (!json_variant_is_object(v))
+                        return log_debug_errno(SYNTHETIC_ERRNO(EMEDIUMTYPE), "Field '%s' should be an object, but it is not, refusing.", strna(field->name));
+
+                JSON_VARIANT_OBJECT_FOREACH(key, value, v) {
+                        r = varlink_idl_validate_field_element_type(field, value);
+                        if (r < 0)
+                                return r;
+                }
+
+                return 0;
+        }
+
+        /* Plain value. Only failures are propagated, any non-negative result is turned into zero. */
+        r = varlink_idl_validate_field_element_type(field, v);
+        if (r < 0)
+                return r;
+
+        return 0;
+}
+
+/* Validates the JSON value 'v' against 'symbol'. For enum types 'v' must be a string naming one of the
+ * enum values. For struct types, methods and errors 'v' must be an object: each field of the symbol
+ * whose direction equals 'direction' is validated against the matching key, and keys that match no
+ * field of the symbol at all are refused. Returns 1 if validated. On failure, and if 'bad_field' is
+ * non-NULL, *bad_field is set to the offending name (or NULL if 'v' itself is NULL). */
+static int varlink_idl_validate_symbol(const VarlinkSymbol *symbol, JsonVariant *v, VarlinkFieldDirection direction, const char **bad_field) {
+        int r;
+
+        assert(symbol);
+
+        if (!v) {
+                if (bad_field)
+                        *bad_field = NULL;
+                return log_debug_errno(SYNTHETIC_ERRNO(EMEDIUMTYPE), "Null object passed, refusing.");
+        }
+
+        switch (symbol->symbol_type) {
+
+        case VARLINK_ENUM_TYPE: {
+                const VarlinkField *match = NULL;
+                const char *s;
+
+                if (!json_variant_is_string(v)) {
+                        if (bad_field)
+                                *bad_field = symbol->name;
+                        return log_debug_errno(SYNTHETIC_ERRNO(EMEDIUMTYPE), "Passed non-string to enum field '%s', refusing.", strna(symbol->name));
+                }
+
+                assert_se(s = json_variant_string(v));
+
+                /* Look for an enum value carrying the name passed */
+                for (const VarlinkField *candidate = symbol->fields; candidate->field_type != _VARLINK_FIELD_TYPE_END_MARKER; candidate++) {
+
+                        assert(candidate->field_type == VARLINK_ENUM_VALUE);
+
+                        if (streq_ptr(candidate->name, s)) {
+                                match = candidate;
+                                break;
+                        }
+                }
+
+                if (!match) {
+                        if (bad_field)
+                                *bad_field = s;
+                        return log_debug_errno(SYNTHETIC_ERRNO(EMEDIUMTYPE), "Passed unrecognized string '%s' to enum field '%s', refusing.", s, strna(symbol->name));
+                }
+
+                break;
+        }
+
+        case VARLINK_STRUCT_TYPE:
+        case VARLINK_METHOD:
+        case VARLINK_ERROR: {
+                if (!json_variant_is_object(v)) {
+                        if (bad_field)
+                                *bad_field = symbol->name;
+                        return log_debug_errno(SYNTHETIC_ERRNO(EMEDIUMTYPE), "Passed non-object to field '%s', refusing.", strna(symbol->name));
+                }
+
+                /* First pass: check every field of the requested direction against the object */
+                for (const VarlinkField *f = symbol->fields; f->field_type != _VARLINK_FIELD_TYPE_END_MARKER; f++) {
+
+                        if (f->field_direction != direction)
+                                continue;
+
+                        r = varlink_idl_validate_field(f, json_variant_by_key(v, f->name));
+                        if (r < 0) {
+                                if (bad_field)
+                                        *bad_field = f->name;
+                                return r;
+                        }
+                }
+
+                /* Second pass: check that the object carries no keys the symbol doesn't know */
+                _unused_ JsonVariant *value;
+                const char *key;
+                JSON_VARIANT_OBJECT_FOREACH(key, value, v) {
+                        if (varlink_idl_find_field(symbol, key))
+                                continue;
+
+                        if (bad_field)
+                                *bad_field = key;
+                        return log_debug_errno(SYNTHETIC_ERRNO(EBUSY), "Field '%s' not defined for object, refusing.", key);
+                }
+
+                break;
+        }
+
+        default:
+                assert_not_reached();
+        }
+
+        return 1; /* validated */
+}
+
+/* Validates 'v' against the fields of the given direction of a method symbol. Returns 0 if no method
+ * symbol was passed, -EBADMSG if the symbol is not a method, and the result of the symbol validation
+ * otherwise. */
+static int varlink_idl_validate_method(const VarlinkSymbol *method, JsonVariant *v, VarlinkFieldDirection direction, const char **bad_field) {
+        assert(direction == VARLINK_INPUT || direction == VARLINK_OUTPUT);
+
+        if (!method)
+                return 0; /* Can't validate */
+
+        return method->symbol_type == VARLINK_METHOD
+                ? varlink_idl_validate_symbol(method, v, direction, bad_field)
+                : -EBADMSG;
+}
+
+/* Validates the parameters of a method call, i.e. checks 'v' against the input fields of 'method'. */
+int varlink_idl_validate_method_call(const VarlinkSymbol *method, JsonVariant *v, const char **bad_field) {
+        const VarlinkFieldDirection direction = VARLINK_INPUT;
+
+        return varlink_idl_validate_method(method, v, direction, bad_field);
+}
+
+/* Validates the parameters of a method reply, i.e. checks 'v' against the output fields of 'method'. */
+int varlink_idl_validate_method_reply(const VarlinkSymbol *method, JsonVariant *v, const char **bad_field) {
+        const VarlinkFieldDirection direction = VARLINK_OUTPUT;
+
+        return varlink_idl_validate_method(method, v, direction, bad_field);
+}
+
+int varlink_idl_validate_error(const VarlinkSymbol *error, JsonVariant *v, const char **bad_field) {
+ if (!error)
+ return 0; /* Can't validate */
+ if (error->symbol_type != VARLINK_ERROR)
+ return -EBADMSG;
+
+ return varlink_idl_validate_symbol(error, v, VARLINK_REGULAR, bad_field);
+}
+
+/* Returns the first symbol of the interface that carries the specified name, or NULL if there is none
+ * (or if the name is empty). If 'type' is negative symbols of any type match, otherwise only symbols of
+ * the specified type are considered. */
+const VarlinkSymbol* varlink_idl_find_symbol(
+                const VarlinkInterface *interface,
+                VarlinkSymbolType type,
+                const char *name) {
+
+        assert(interface);
+        assert(type < _VARLINK_SYMBOL_TYPE_MAX);
+
+        if (isempty(name))
+                return NULL;
+
+        for (size_t i = 0; interface->symbols[i]; i++) {
+                const VarlinkSymbol *candidate = interface->symbols[i];
+                bool type_matches = type < 0 || candidate->symbol_type == type;
+
+                if (type_matches && streq_ptr(candidate->name, name))
+                        return candidate;
+        }
+
+        return NULL;
+}
+
+/* Returns the first field of the symbol that carries the specified name, regardless of its direction, or
+ * NULL if there is none (or if the name is empty). */
+const VarlinkField* varlink_idl_find_field(
+                const VarlinkSymbol *symbol,
+                const char *name) {
+
+        const VarlinkField *f;
+
+        assert(symbol);
+
+        if (isempty(name))
+                return NULL;
+
+        f = symbol->fields;
+        while (f->field_type != _VARLINK_FIELD_TYPE_END_MARKER) {
+                if (streq_ptr(f->name, name))
+                        return f;
+
+                f++;
+        }
+
+        return NULL;
+}
diff --git a/src/shared/varlink-idl.h b/src/shared/varlink-idl.h
new file mode 100644
index 0000000..140b937
--- /dev/null
+++ b/src/shared/varlink-idl.h
@@ -0,0 +1,158 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stdio.h>
+
+#include "json.h"
+#include "macro.h"
+
+/* This implements the Varlink Interface Definition Language ("Varlink IDL"),
+ * i.e. https://varlink.org/Interface-Definition
+ *
+ * Primarily allows encoding static interface definitions in C code, that can be converted to the textual IDL
+ * format on-the-fly. Can also parse the textual format back to C structures. Validates the interface
+ * definitions for internal consistency and validates JSON objects against the interface definitions. */
+
+typedef enum VarlinkSymbolType { /* what kind of definition a VarlinkSymbol is */
+ VARLINK_ENUM_TYPE, /* type definition made up of enum values */
+ VARLINK_STRUCT_TYPE, /* type definition made up of typed fields */
+ VARLINK_METHOD, /* method, whose fields are either input or output parameters */
+ VARLINK_ERROR, /* error definition */
+ _VARLINK_SYMBOL_TYPE_MAX,
+ _VARLINK_SYMBOL_TYPE_INVALID = -EINVAL, /* also means "any type" when passed to varlink_idl_find_symbol() */
+} VarlinkSymbolType;
+
+typedef enum VarlinkFieldType { /* the element type of a VarlinkField */
+ _VARLINK_FIELD_TYPE_END_MARKER = 0, /* zero type means: this is the last entry in the fields[] array of VarlinkSymbol */
+ VARLINK_STRUCT, /* anonymous struct, defined by the symbol attached to the field */
+ VARLINK_ENUM, /* anonymous enum, defined by the symbol attached to the field */
+ VARLINK_NAMED_TYPE, /* reference to a struct or enum type by name */
+ VARLINK_BOOL,
+ VARLINK_INT,
+ VARLINK_FLOAT,
+ VARLINK_STRING,
+ VARLINK_OBJECT,
+ VARLINK_ENUM_VALUE, /* not a type proper: marks an entry of an enum symbol as one of its values */
+ _VARLINK_FIELD_TYPE_MAX,
+ _VARLINK_FIELD_TYPE_INVALID = -EINVAL,
+} VarlinkFieldType;
+
+typedef enum VarlinkFieldDirection { /* which side of a method call a field belongs to */
+ VARLINK_REGULAR, /* field of a symbol that is not a method; zero, hence the default in initializers */
+ VARLINK_INPUT, /* method field: parameter of the call */
+ VARLINK_OUTPUT, /* method field: parameter of the reply */
+ _VARLINK_FIELD_DIRECTION_MAX,
+ _VARLINK_FIELD_DIRECTION_INVALID = -EINVAL,
+} VarlinkFieldDirection;
+
+typedef enum VarlinkFieldFlags { /* bit flags modifying the element type of a field */
+ VARLINK_ARRAY = 1 << 0, /* array of the element type, "[]" in the IDL */
+ VARLINK_MAP = 1 << 1, /* map from strings to the element type, "[string]" in the IDL */
+ VARLINK_NULLABLE = 1 << 2, /* value may be null or missing, "?" in the IDL */
+ _VARLINK_FIELD_FLAGS_MAX = (1 << 3) - 1, /* all three flag bits set */
+ _VARLINK_FIELD_FLAGS_INVALID = -EINVAL,
+} VarlinkFieldFlags;
+
+typedef struct VarlinkField VarlinkField;
+typedef struct VarlinkSymbol VarlinkSymbol;
+typedef struct VarlinkInterface VarlinkInterface;
+
+/* Fields are the components making up symbols */
+struct VarlinkField {
+ const char *name; /* name of the field, or of the enum value in case of VARLINK_ENUM_VALUE */
+ VarlinkFieldType field_type; /* element type; zero marks the end of a fields[] array */
+ VarlinkFieldFlags field_flags; /* combination of VARLINK_ARRAY, VARLINK_MAP, VARLINK_NULLABLE */
+ VarlinkFieldDirection field_direction; /* in case of method call fields: whether input or output argument */
+ const VarlinkSymbol *symbol; /* VARLINK_STRUCT, VARLINK_ENUM: anonymous symbol that carries the definitions, VARLINK_NAMED_TYPE: resolved symbol */
+ const char *named_type; /* VARLINK_NAMED_TYPE */
+};
+
+/* Symbols are primary named concepts in an interface, and are methods, errors or named types (either enum or struct). */
+struct VarlinkSymbol {
+ const char *name; /* most symbols have a name, but sometimes they are created on-the-fly for fields, in which case they are anonymous */
+ VarlinkSymbolType symbol_type;
+ VarlinkField fields[]; /* terminated by an entry whose field_type is _VARLINK_FIELD_TYPE_END_MARKER */
+};
+
+/* An interface definition has a name and consist of symbols */
+struct VarlinkInterface {
+ const char *name; /* full interface name, e.g. "io.systemd.Journal" */
+ const VarlinkSymbol *symbols[]; /* NULL terminated */
+};
+
+#define VARLINK_DEFINE_FIELD(_name, _field_type, _field_flags) \
+ { .name = #_name, .field_type = (_field_type), .field_flags = (_field_flags) }
+/* VARLINK_DEFINE_FIELD() above expands to the initializer of a VarlinkField with the direction left at
+ * zero, i.e. VARLINK_REGULAR. The _BY_TYPE variant below defines a field of a named type instead: it
+ * stores the type name and points the field at the vl_type_<name> symbol, which hence must be declared
+ * at the place of use. */
+#define VARLINK_DEFINE_FIELD_BY_TYPE(_name, _named_type, _field_flags) \
+ { .name = #_name, .field_type = VARLINK_NAMED_TYPE, .named_type = #_named_type, .symbol = &vl_type_ ## _named_type, .field_flags = (_field_flags) }
+/* Same two flavours, but with the direction set to VARLINK_INPUT */
+#define VARLINK_DEFINE_INPUT(_name, _field_type, _field_flags) \
+ { .name = #_name, .field_type = (_field_type), .field_flags = (_field_flags), .field_direction = VARLINK_INPUT }
+
+#define VARLINK_DEFINE_INPUT_BY_TYPE(_name, _named_type, _field_flags) \
+ { .name = #_name, .field_type = VARLINK_NAMED_TYPE, .named_type = #_named_type, .symbol = &vl_type_ ## _named_type, .field_flags = (_field_flags), .field_direction = VARLINK_INPUT }
+/* Same two flavours, but with the direction set to VARLINK_OUTPUT */
+#define VARLINK_DEFINE_OUTPUT(_name, _field_type, _field_flags) \
+ { .name = #_name, .field_type = (_field_type), .field_flags = (_field_flags), .field_direction = VARLINK_OUTPUT }
+
+#define VARLINK_DEFINE_OUTPUT_BY_TYPE(_name, _named_type, _field_flags) \
+ { .name = #_name, .field_type = VARLINK_NAMED_TYPE, .named_type = #_named_type, .symbol = &vl_type_ ## _named_type, .field_flags = (_field_flags), .field_direction = VARLINK_OUTPUT }
+/* Initializer for one value of an enum type; flags and direction are left at zero */
+#define VARLINK_DEFINE_ENUM_VALUE(_name) \
+ { .name = #_name, .field_type = VARLINK_ENUM_VALUE }
+
+#define VARLINK_DEFINE_METHOD(_name, ...) \
+ const VarlinkSymbol vl_method_ ## _name = { \
+ .name = #_name, \
+ .symbol_type = VARLINK_METHOD, \
+ .fields = { __VA_ARGS__ __VA_OPT__(,) {}}, \
+ }
+/* The macro above and the three below each define a constant VarlinkSymbol of the respective symbol
+ * type. The variable arguments are the field initializers, to which an empty entry is appended: its
+ * zero field type is the end marker of the fields[] array. The variables are named vl_method_<name>,
+ * vl_error_<name> and (for both struct and enum types) vl_type_<name>. */
+#define VARLINK_DEFINE_ERROR(_name, ...) \
+ const VarlinkSymbol vl_error_ ## _name = { \
+ .name = #_name, \
+ .symbol_type = VARLINK_ERROR, \
+ .fields = { __VA_ARGS__ __VA_OPT__(,) {}}, \
+ }
+
+#define VARLINK_DEFINE_STRUCT_TYPE(_name, ...) \
+ const VarlinkSymbol vl_type_ ## _name = { \
+ .name = #_name, \
+ .symbol_type = VARLINK_STRUCT_TYPE, \
+ .fields = { __VA_ARGS__ __VA_OPT__(,) {}}, \
+ }
+
+#define VARLINK_DEFINE_ENUM_TYPE(_name, ...) \
+ const VarlinkSymbol vl_type_ ## _name = { \
+ .name = #_name, \
+ .symbol_type = VARLINK_ENUM_TYPE, \
+ .fields = { __VA_ARGS__ __VA_OPT__(,) {}}, \
+ }
+/* Defines a constant VarlinkInterface named vl_interface_<name>. '_full_name' is the interface name as
+ * a string, the variable arguments are pointers to the symbols making up the interface, to which the
+ * terminating NULL is appended. */
+#define VARLINK_DEFINE_INTERFACE(_name, _full_name, ...) \
+ const VarlinkInterface vl_interface_ ## _name = { \
+ .name = (_full_name), \
+ .symbols = { __VA_ARGS__ __VA_OPT__(,) NULL}, \
+ }
+
+int varlink_idl_dump(FILE *f, int use_colors, const VarlinkInterface *interface);
+int varlink_idl_format(const VarlinkInterface *interface, char **ret);
+
+int varlink_idl_parse(const char *text, unsigned *ret_line, unsigned *ret_column, VarlinkInterface **ret);
+VarlinkInterface* varlink_interface_free(VarlinkInterface *interface);
+DEFINE_TRIVIAL_CLEANUP_FUNC(VarlinkInterface*, varlink_interface_free);
+
+bool varlink_idl_field_name_is_valid(const char *name);
+bool varlink_idl_symbol_name_is_valid(const char *name);
+bool varlink_idl_interface_name_is_valid(const char *name);
+
+int varlink_idl_consistent(const VarlinkInterface *interface, int level);
+
+const VarlinkSymbol* varlink_idl_find_symbol(const VarlinkInterface *interface, VarlinkSymbolType type, const char *name);
+const VarlinkField* varlink_idl_find_field(const VarlinkSymbol *symbol, const char *name);
+
+int varlink_idl_validate_method_call(const VarlinkSymbol *method, JsonVariant *v, const char **bad_field);
+int varlink_idl_validate_method_reply(const VarlinkSymbol *method, JsonVariant *v, const char **bad_field);
+int varlink_idl_validate_error(const VarlinkSymbol *error, JsonVariant *v, const char **bad_field);
diff --git a/src/shared/varlink-internal.h b/src/shared/varlink-internal.h
new file mode 100644
index 0000000..715202a
--- /dev/null
+++ b/src/shared/varlink-internal.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdio.h>
+
+#include "fdset.h"
+#include "varlink.h"
+
+int varlink_server_serialize(VarlinkServer *s, FILE *f, FDSet *fds);
+int varlink_server_deserialize_one(VarlinkServer *s, const char *value, FDSet *fds);
diff --git a/src/shared/varlink-io.systemd.Journal.c b/src/shared/varlink-io.systemd.Journal.c
new file mode 100644
index 0000000..b93fb72
--- /dev/null
+++ b/src/shared/varlink-io.systemd.Journal.c
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "varlink-io.systemd.Journal.h"
+
+static VARLINK_DEFINE_METHOD(Synchronize);
+static VARLINK_DEFINE_METHOD(Rotate);
+static VARLINK_DEFINE_METHOD(FlushToVar);
+static VARLINK_DEFINE_METHOD(RelinquishVar);
+
+static VARLINK_DEFINE_ERROR(NotSupportedByNamespaces);
+
+VARLINK_DEFINE_INTERFACE(
+ io_systemd_Journal,
+ "io.systemd.Journal",
+ &vl_method_Synchronize,
+ &vl_method_Rotate,
+ &vl_method_FlushToVar,
+ &vl_method_RelinquishVar,
+ &vl_error_NotSupportedByNamespaces);
diff --git a/src/shared/varlink-io.systemd.Journal.h b/src/shared/varlink-io.systemd.Journal.h
new file mode 100644
index 0000000..0bc94a7
--- /dev/null
+++ b/src/shared/varlink-io.systemd.Journal.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "varlink-idl.h"
+
+extern const VarlinkInterface vl_interface_io_systemd_Journal;
diff --git a/src/shared/varlink-io.systemd.ManagedOOM.c b/src/shared/varlink-io.systemd.ManagedOOM.c
new file mode 100644
index 0000000..d6414b3
--- /dev/null
+++ b/src/shared/varlink-io.systemd.ManagedOOM.c
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "varlink-io.systemd.ManagedOOM.h"
+
+/* Pull in vl_type_ControlGroup, since both interfaces need it */
+#include "varlink-io.systemd.oom.h"
+
+/* This is PID1's Varlink service, where PID 1 is the server and oomd is the client.
+ *
+ * Compare with io.systemd.oom where the client/server roles of oomd and the service manager are swapped! */
+
+static VARLINK_DEFINE_METHOD(
+ SubscribeManagedOOMCGroups,
+ VARLINK_DEFINE_OUTPUT_BY_TYPE(cgroups, ControlGroup, VARLINK_ARRAY));
+
+static VARLINK_DEFINE_ERROR(SubscriptionTaken);
+
+VARLINK_DEFINE_INTERFACE(
+ io_systemd_ManagedOOM,
+ "io.systemd.ManagedOOM",
+ &vl_method_SubscribeManagedOOMCGroups,
+ &vl_type_ControlGroup,
+ &vl_error_SubscriptionTaken);
diff --git a/src/shared/varlink-io.systemd.ManagedOOM.h b/src/shared/varlink-io.systemd.ManagedOOM.h
new file mode 100644
index 0000000..2c8bf54
--- /dev/null
+++ b/src/shared/varlink-io.systemd.ManagedOOM.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "varlink-idl.h"
+
+extern const VarlinkInterface vl_interface_io_systemd_ManagedOOM;
diff --git a/src/shared/varlink-io.systemd.PCRExtend.c b/src/shared/varlink-io.systemd.PCRExtend.c
new file mode 100644
index 0000000..37d403f
--- /dev/null
+++ b/src/shared/varlink-io.systemd.PCRExtend.c
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "varlink-io.systemd.PCRExtend.h"
+
+static VARLINK_DEFINE_METHOD(
+ Extend,
+ VARLINK_DEFINE_INPUT(pcr, VARLINK_INT, 0),
+ VARLINK_DEFINE_INPUT(text, VARLINK_STRING, VARLINK_NULLABLE),
+ VARLINK_DEFINE_INPUT(data, VARLINK_STRING, VARLINK_NULLABLE));
+
+VARLINK_DEFINE_INTERFACE(
+ io_systemd_PCRExtend,
+ "io.systemd.PCRExtend",
+ &vl_method_Extend);
diff --git a/src/shared/varlink-io.systemd.PCRExtend.h b/src/shared/varlink-io.systemd.PCRExtend.h
new file mode 100644
index 0000000..ffc075a
--- /dev/null
+++ b/src/shared/varlink-io.systemd.PCRExtend.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "varlink-idl.h"
+
+extern const VarlinkInterface vl_interface_io_systemd_PCRExtend;
diff --git a/src/shared/varlink-io.systemd.Resolve.Monitor.c b/src/shared/varlink-io.systemd.Resolve.Monitor.c
new file mode 100644
index 0000000..d95b613
--- /dev/null
+++ b/src/shared/varlink-io.systemd.Resolve.Monitor.c
@@ -0,0 +1,176 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "varlink-io.systemd.Resolve.Monitor.h"
+
+static VARLINK_DEFINE_STRUCT_TYPE( /* File-local: the header exports only vl_interface_io_systemd_Resolve_Monitor, which lists this type. */
+ ResourceKey,
+ VARLINK_DEFINE_FIELD(class, VARLINK_INT, 0),
+ VARLINK_DEFINE_FIELD(type, VARLINK_INT, 0),
+ VARLINK_DEFINE_FIELD(name, VARLINK_STRING, 0));
+
+static VARLINK_DEFINE_STRUCT_TYPE( /* File-local: the header exports only vl_interface_io_systemd_Resolve_Monitor, which lists this type. */
+ ResourceRecord,
+ VARLINK_DEFINE_FIELD_BY_TYPE(key, ResourceKey, 0), /* the only field without VARLINK_NULLABLE; every field below carries it */
+ VARLINK_DEFINE_FIELD(priority, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(weight, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(port, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(name, VARLINK_STRING, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(cpu, VARLINK_STRING, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(os, VARLINK_STRING, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(items, VARLINK_STRING, VARLINK_NULLABLE|VARLINK_ARRAY),
+ VARLINK_DEFINE_FIELD(address, VARLINK_INT, VARLINK_NULLABLE|VARLINK_ARRAY),
+ VARLINK_DEFINE_FIELD(mname, VARLINK_STRING, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(rname, VARLINK_STRING, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(serial, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(refresh, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(expire, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(minimum, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(exchange, VARLINK_STRING, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(version, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(size, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(horiz_pre, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(vert_pre, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(latitude, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(longitude, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(altitude, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(keyTag, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(algorithm, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(digestType, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(digest, VARLINK_STRING, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(fptype, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(fingerprint, VARLINK_STRING, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(flags, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(protocol, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(dnskey, VARLINK_STRING, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(signer, VARLINK_STRING, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(typeCovered, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(labels, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(originalTtl, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(expiration, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(inception, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(signature, VARLINK_STRING, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(nextDomain, VARLINK_STRING, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(types, VARLINK_INT, VARLINK_NULLABLE|VARLINK_ARRAY),
+ VARLINK_DEFINE_FIELD(iterations, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(salt, VARLINK_STRING, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(hash, VARLINK_STRING, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(certUsage, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(selector, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(matchingType, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(data, VARLINK_STRING, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(tag, VARLINK_STRING, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(value, VARLINK_STRING, VARLINK_NULLABLE));
+
+static VARLINK_DEFINE_STRUCT_TYPE( /* File-local: the header exports only vl_interface_io_systemd_Resolve_Monitor, which lists this type. */
+ ResourceRecordArray,
+ VARLINK_DEFINE_FIELD_BY_TYPE(rr, ResourceRecord, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(raw, VARLINK_STRING, 0));
+
+static VARLINK_DEFINE_STRUCT_TYPE( /* File-local: the header exports only vl_interface_io_systemd_Resolve_Monitor, which lists this type. */
+ Answer,
+ VARLINK_DEFINE_FIELD_BY_TYPE(rr, ResourceRecord, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(raw, VARLINK_STRING, 0),
+ VARLINK_DEFINE_FIELD(ifindex, VARLINK_INT, VARLINK_NULLABLE));
+
+static VARLINK_DEFINE_METHOD( /* File-local: the header exports only vl_interface_io_systemd_Resolve_Monitor, which lists this method. */
+ SubscribeQueryResults, /* no input fields; every output is nullable */
+ /* First reply */
+ VARLINK_DEFINE_OUTPUT(ready, VARLINK_BOOL, VARLINK_NULLABLE),
+ /* Subsequent replies */
+ VARLINK_DEFINE_OUTPUT(state, VARLINK_STRING, VARLINK_NULLABLE),
+ VARLINK_DEFINE_OUTPUT(rcode, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_OUTPUT(errno, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_OUTPUT_BY_TYPE(question, ResourceKey, VARLINK_NULLABLE|VARLINK_ARRAY),
+ VARLINK_DEFINE_OUTPUT_BY_TYPE(collectedQuestions, ResourceKey, VARLINK_NULLABLE|VARLINK_ARRAY),
+ VARLINK_DEFINE_OUTPUT_BY_TYPE(answer, Answer, VARLINK_NULLABLE|VARLINK_ARRAY));
+
+static VARLINK_DEFINE_STRUCT_TYPE( /* File-local: the header exports only vl_interface_io_systemd_Resolve_Monitor, which lists this type. */
+ CacheEntry,
+ VARLINK_DEFINE_FIELD_BY_TYPE(key, ResourceKey, 0),
+ VARLINK_DEFINE_FIELD_BY_TYPE(rrs, ResourceRecordArray, VARLINK_NULLABLE|VARLINK_ARRAY),
+ VARLINK_DEFINE_FIELD(type, VARLINK_STRING, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(until, VARLINK_INT, 0));
+
+static VARLINK_DEFINE_STRUCT_TYPE( /* File-local: the header exports only vl_interface_io_systemd_Resolve_Monitor, which lists this type. */
+ ScopeCache,
+ VARLINK_DEFINE_FIELD(protocol, VARLINK_STRING, 0),
+ VARLINK_DEFINE_FIELD(family, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(ifindex, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(ifname, VARLINK_STRING, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD_BY_TYPE(cache, CacheEntry, VARLINK_ARRAY));
+
+static VARLINK_DEFINE_METHOD( /* File-local: the header exports only vl_interface_io_systemd_Resolve_Monitor, which lists this method. */
+ DumpCache, /* no input fields; one output, an array of ScopeCache */
+ VARLINK_DEFINE_OUTPUT_BY_TYPE(dump, ScopeCache, VARLINK_ARRAY));
+
+static VARLINK_DEFINE_STRUCT_TYPE( /* File-local: the header exports only vl_interface_io_systemd_Resolve_Monitor, which lists this type. */
+ ServerState, /* only Interface and InterfaceIndex are nullable */
+ VARLINK_DEFINE_FIELD(Server, VARLINK_STRING, 0),
+ VARLINK_DEFINE_FIELD(Type, VARLINK_STRING, 0),
+ VARLINK_DEFINE_FIELD(Interface, VARLINK_STRING, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(InterfaceIndex, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(VerifiedFeatureLevel, VARLINK_STRING, 0),
+ VARLINK_DEFINE_FIELD(PossibleFeatureLevel, VARLINK_STRING, 0),
+ VARLINK_DEFINE_FIELD(DNSSECMode, VARLINK_STRING, 0),
+ VARLINK_DEFINE_FIELD(DNSSECSupported, VARLINK_BOOL, 0),
+ VARLINK_DEFINE_FIELD(ReceivedUDPFragmentMax, VARLINK_INT, 0),
+ VARLINK_DEFINE_FIELD(FailedUDPAttempts, VARLINK_INT, 0),
+ VARLINK_DEFINE_FIELD(FailedTCPAttempts, VARLINK_INT, 0),
+ VARLINK_DEFINE_FIELD(PacketTruncated, VARLINK_BOOL, 0),
+ VARLINK_DEFINE_FIELD(PacketBadOpt, VARLINK_BOOL, 0),
+ VARLINK_DEFINE_FIELD(PacketRRSIGMissing, VARLINK_BOOL, 0),
+ VARLINK_DEFINE_FIELD(PacketInvalid, VARLINK_BOOL, 0),
+ VARLINK_DEFINE_FIELD(PacketDoOff, VARLINK_BOOL, 0));
+
+static VARLINK_DEFINE_METHOD( /* File-local: the header exports only vl_interface_io_systemd_Resolve_Monitor, which lists this method. */
+ DumpServerState, /* no input fields; one output, an array of ServerState */
+ VARLINK_DEFINE_OUTPUT_BY_TYPE(dump, ServerState, VARLINK_ARRAY));
+
+static VARLINK_DEFINE_STRUCT_TYPE( /* File-local: the header exports only vl_interface_io_systemd_Resolve_Monitor, which lists this type. */
+ TransactionStatistics, /* six non-nullable integer fields */
+ VARLINK_DEFINE_FIELD(currentTransactions, VARLINK_INT, 0),
+ VARLINK_DEFINE_FIELD(totalTransactions, VARLINK_INT, 0),
+ VARLINK_DEFINE_FIELD(totalTimeouts, VARLINK_INT, 0),
+ VARLINK_DEFINE_FIELD(totalTimeoutsServedStale, VARLINK_INT, 0),
+ VARLINK_DEFINE_FIELD(totalFailedResponses, VARLINK_INT, 0),
+ VARLINK_DEFINE_FIELD(totalFailedResponsesServedStale, VARLINK_INT, 0));
+
+static VARLINK_DEFINE_STRUCT_TYPE( /* File-local: the header exports only vl_interface_io_systemd_Resolve_Monitor, which lists this type. */
+ CacheStatistics, /* three non-nullable integer fields */
+ VARLINK_DEFINE_FIELD(size, VARLINK_INT, 0),
+ VARLINK_DEFINE_FIELD(hits, VARLINK_INT, 0),
+ VARLINK_DEFINE_FIELD(misses, VARLINK_INT, 0));
+
+static VARLINK_DEFINE_STRUCT_TYPE( /* File-local: the header exports only vl_interface_io_systemd_Resolve_Monitor, which lists this type. */
+ DnssecStatistics, /* four non-nullable integer fields */
+ VARLINK_DEFINE_FIELD(secure, VARLINK_INT, 0),
+ VARLINK_DEFINE_FIELD(insecure, VARLINK_INT, 0),
+ VARLINK_DEFINE_FIELD(bogus, VARLINK_INT, 0),
+ VARLINK_DEFINE_FIELD(indeterminate, VARLINK_INT, 0));
+
+static VARLINK_DEFINE_METHOD( /* File-local: the header exports only vl_interface_io_systemd_Resolve_Monitor, which lists this method. */
+ DumpStatistics, /* no input fields; three non-nullable struct outputs */
+ VARLINK_DEFINE_OUTPUT_BY_TYPE(transactions, TransactionStatistics, 0),
+ VARLINK_DEFINE_OUTPUT_BY_TYPE(cache, CacheStatistics, 0),
+ VARLINK_DEFINE_OUTPUT_BY_TYPE(dnssec, DnssecStatistics, 0));
+
+static VARLINK_DEFINE_METHOD(ResetStatistics); /* File-local method symbol with no input or output fields. */
+
+VARLINK_DEFINE_INTERFACE(
+ io_systemd_Resolve_Monitor,
+ "io.systemd.Resolve.Monitor",
+ &vl_method_SubscribeQueryResults,
+ &vl_method_DumpCache,
+ &vl_method_DumpServerState,
+ &vl_method_DumpStatistics,
+ &vl_method_ResetStatistics,
+ &vl_type_ResourceKey,
+ &vl_type_ResourceRecord,
+ &vl_type_ResourceRecordArray,
+ &vl_type_Answer,
+ &vl_type_CacheEntry,
+ &vl_type_ScopeCache,
+ &vl_type_TransactionStatistics,
+ &vl_type_CacheStatistics,
+ &vl_type_DnssecStatistics,
+ &vl_type_ServerState);
diff --git a/src/shared/varlink-io.systemd.Resolve.Monitor.h b/src/shared/varlink-io.systemd.Resolve.Monitor.h
new file mode 100644
index 0000000..a133ec3
--- /dev/null
+++ b/src/shared/varlink-io.systemd.Resolve.Monitor.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "varlink-idl.h"
+
+extern const VarlinkInterface vl_interface_io_systemd_Resolve_Monitor;
diff --git a/src/shared/varlink-io.systemd.Resolve.c b/src/shared/varlink-io.systemd.Resolve.c
new file mode 100644
index 0000000..0d8ad28
--- /dev/null
+++ b/src/shared/varlink-io.systemd.Resolve.c
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "varlink-io.systemd.Resolve.h"
+
+static VARLINK_DEFINE_STRUCT_TYPE(
+ ResolvedAddress,
+ VARLINK_DEFINE_FIELD(ifindex, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(family, VARLINK_INT, 0),
+ VARLINK_DEFINE_FIELD(address, VARLINK_INT, VARLINK_ARRAY));
+
+static VARLINK_DEFINE_METHOD(
+ ResolveHostname,
+ VARLINK_DEFINE_INPUT(ifindex, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_INPUT(name, VARLINK_STRING, 0),
+ VARLINK_DEFINE_INPUT(family, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_INPUT(flags, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_OUTPUT_BY_TYPE(addresses, ResolvedAddress, VARLINK_ARRAY),
+ VARLINK_DEFINE_OUTPUT(name, VARLINK_STRING, 0),
+ VARLINK_DEFINE_OUTPUT(flags, VARLINK_INT, 0));
+
+static VARLINK_DEFINE_STRUCT_TYPE(
+ ResolvedName,
+ VARLINK_DEFINE_FIELD(ifindex, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_FIELD(name, VARLINK_STRING, 0));
+
+static VARLINK_DEFINE_METHOD(
+ ResolveAddress,
+ VARLINK_DEFINE_INPUT(ifindex, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_INPUT(family, VARLINK_INT, 0),
+ VARLINK_DEFINE_INPUT(address, VARLINK_INT, VARLINK_ARRAY),
+ VARLINK_DEFINE_INPUT(flags, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_OUTPUT_BY_TYPE(names, ResolvedName, VARLINK_ARRAY),
+ VARLINK_DEFINE_OUTPUT(flags, VARLINK_INT, 0));
+
+static VARLINK_DEFINE_ERROR(NoNameServers);
+static VARLINK_DEFINE_ERROR(NoSuchResourceRecord);
+static VARLINK_DEFINE_ERROR(QueryTimedOut);
+static VARLINK_DEFINE_ERROR(MaxAttemptsReached);
+static VARLINK_DEFINE_ERROR(InvalidReply);
+static VARLINK_DEFINE_ERROR(QueryAborted);
+static VARLINK_DEFINE_ERROR(
+ DNSSECValidationFailed,
+ VARLINK_DEFINE_FIELD(result, VARLINK_STRING, 0));
+static VARLINK_DEFINE_ERROR(NoTrustAnchor);
+static VARLINK_DEFINE_ERROR(ResourceRecordTypeUnsupported);
+static VARLINK_DEFINE_ERROR(NetworkDown);
+static VARLINK_DEFINE_ERROR(NoSource);
+static VARLINK_DEFINE_ERROR(StubLoop);
+static VARLINK_DEFINE_ERROR(
+ DNSError,
+ VARLINK_DEFINE_FIELD(rcode, VARLINK_INT, 0));
+static VARLINK_DEFINE_ERROR(CNAMELoop);
+static VARLINK_DEFINE_ERROR(BadAddressSize);
+
+VARLINK_DEFINE_INTERFACE(
+ io_systemd_Resolve,
+ "io.systemd.Resolve",
+ &vl_method_ResolveHostname,
+ &vl_method_ResolveAddress,
+ &vl_type_ResolvedAddress,
+ &vl_type_ResolvedName,
+ &vl_error_NoNameServers,
+ &vl_error_NoSuchResourceRecord,
+ &vl_error_QueryTimedOut,
+ &vl_error_MaxAttemptsReached,
+ &vl_error_InvalidReply,
+ &vl_error_QueryAborted,
+ &vl_error_DNSSECValidationFailed,
+ &vl_error_NoTrustAnchor,
+ &vl_error_ResourceRecordTypeUnsupported,
+ &vl_error_NetworkDown,
+ &vl_error_NoSource,
+ &vl_error_StubLoop,
+ &vl_error_DNSError,
+ &vl_error_CNAMELoop,
+ &vl_error_BadAddressSize);
diff --git a/src/shared/varlink-io.systemd.Resolve.h b/src/shared/varlink-io.systemd.Resolve.h
new file mode 100644
index 0000000..5c7ed39
--- /dev/null
+++ b/src/shared/varlink-io.systemd.Resolve.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "varlink-idl.h"
+
+extern const VarlinkInterface vl_interface_io_systemd_Resolve;
diff --git a/src/shared/varlink-io.systemd.UserDatabase.c b/src/shared/varlink-io.systemd.UserDatabase.c
new file mode 100644
index 0000000..c10a7d3
--- /dev/null
+++ b/src/shared/varlink-io.systemd.UserDatabase.c
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "varlink-io.systemd.UserDatabase.h"
+
+static VARLINK_DEFINE_METHOD(
+ GetUserRecord,
+ VARLINK_DEFINE_INPUT(uid, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_INPUT(userName, VARLINK_STRING, VARLINK_NULLABLE),
+ VARLINK_DEFINE_INPUT(service, VARLINK_STRING, 0),
+ VARLINK_DEFINE_OUTPUT(record, VARLINK_OBJECT, 0),
+ VARLINK_DEFINE_OUTPUT(incomplete, VARLINK_BOOL, VARLINK_NULLABLE));
+
+static VARLINK_DEFINE_METHOD(
+ GetGroupRecord,
+ VARLINK_DEFINE_INPUT(gid, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_INPUT(groupName, VARLINK_STRING, VARLINK_NULLABLE),
+ VARLINK_DEFINE_INPUT(service, VARLINK_STRING, 0),
+ VARLINK_DEFINE_OUTPUT(record, VARLINK_OBJECT, 0),
+ VARLINK_DEFINE_OUTPUT(incomplete, VARLINK_BOOL, VARLINK_NULLABLE));
+
+static VARLINK_DEFINE_METHOD(
+ GetMemberships,
+ VARLINK_DEFINE_INPUT(userName, VARLINK_STRING, VARLINK_NULLABLE),
+ VARLINK_DEFINE_INPUT(groupName, VARLINK_STRING, VARLINK_NULLABLE),
+ VARLINK_DEFINE_INPUT(service, VARLINK_STRING, 0),
+ VARLINK_DEFINE_OUTPUT(userName, VARLINK_STRING, 0),
+ VARLINK_DEFINE_OUTPUT(groupName, VARLINK_STRING, 0));
+
+static VARLINK_DEFINE_ERROR(NoRecordFound);
+static VARLINK_DEFINE_ERROR(BadService);
+static VARLINK_DEFINE_ERROR(ServiceNotAvailable);
+static VARLINK_DEFINE_ERROR(ConflictingRecordNotFound);
+static VARLINK_DEFINE_ERROR(EnumerationNotSupported);
+
+/* As per https://systemd.io/USER_GROUP_API/ */
+VARLINK_DEFINE_INTERFACE(
+ io_systemd_UserDatabase,
+ "io.systemd.UserDatabase",
+ &vl_method_GetUserRecord,
+ &vl_method_GetGroupRecord,
+ &vl_method_GetMemberships,
+ &vl_error_NoRecordFound,
+ &vl_error_BadService,
+ &vl_error_ServiceNotAvailable,
+ &vl_error_ConflictingRecordNotFound,
+ &vl_error_EnumerationNotSupported);
diff --git a/src/shared/varlink-io.systemd.UserDatabase.h b/src/shared/varlink-io.systemd.UserDatabase.h
new file mode 100644
index 0000000..346ca84
--- /dev/null
+++ b/src/shared/varlink-io.systemd.UserDatabase.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "varlink-idl.h"
+
+extern const VarlinkInterface vl_interface_io_systemd_UserDatabase;
diff --git a/src/shared/varlink-io.systemd.c b/src/shared/varlink-io.systemd.c
new file mode 100644
index 0000000..cdfe9ac
--- /dev/null
+++ b/src/shared/varlink-io.systemd.c
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "varlink-io.systemd.h"
+
+/* These are local errors that never cross the wire, and are our own invention */
+static VARLINK_DEFINE_ERROR(Disconnected);
+static VARLINK_DEFINE_ERROR(TimedOut);
+static VARLINK_DEFINE_ERROR(Protocol);
+
+/* This one we invented, and use for generically propagating system errors (errno) to clients */
+static VARLINK_DEFINE_ERROR(
+ System,
+ VARLINK_DEFINE_FIELD(errno, VARLINK_INT, 0));
+
+VARLINK_DEFINE_INTERFACE(
+ io_systemd,
+ "io.systemd",
+ &vl_error_Disconnected,
+ &vl_error_TimedOut,
+ &vl_error_Protocol,
+ &vl_error_System);
diff --git a/src/shared/varlink-io.systemd.h b/src/shared/varlink-io.systemd.h
new file mode 100644
index 0000000..6c17c6c
--- /dev/null
+++ b/src/shared/varlink-io.systemd.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "varlink-idl.h"
+
+extern const VarlinkInterface vl_interface_io_systemd;
diff --git a/src/shared/varlink-io.systemd.oom.c b/src/shared/varlink-io.systemd.oom.c
new file mode 100644
index 0000000..e1da3fa
--- /dev/null
+++ b/src/shared/varlink-io.systemd.oom.c
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "varlink-io.systemd.oom.h"
+
+/* This is oomd's Varlink service, where oomd is server and systemd --user is the client.
+ *
+ * Compare with io.systemd.ManagedOOM where the client/server roles of the service manager and oomd are
+ * swapped! */
+
+VARLINK_DEFINE_STRUCT_TYPE( /* Not static on purpose: declared extern in varlink-io.systemd.oom.h and also listed by the io.systemd.ManagedOOM interface. */
+ ControlGroup,
+ VARLINK_DEFINE_FIELD(mode, VARLINK_STRING, 0),
+ VARLINK_DEFINE_FIELD(path, VARLINK_STRING, 0),
+ VARLINK_DEFINE_FIELD(property, VARLINK_STRING, 0),
+ VARLINK_DEFINE_FIELD(limit, VARLINK_INT, VARLINK_NULLABLE)); /* the only nullable field of this struct */
+
+static VARLINK_DEFINE_METHOD(
+ ReportManagedOOMCGroups,
+ VARLINK_DEFINE_INPUT_BY_TYPE(cgroups, ControlGroup, VARLINK_ARRAY));
+
+VARLINK_DEFINE_INTERFACE(
+ io_systemd_oom,
+ "io.systemd.oom",
+ &vl_method_ReportManagedOOMCGroups,
+ &vl_type_ControlGroup);
diff --git a/src/shared/varlink-io.systemd.oom.h b/src/shared/varlink-io.systemd.oom.h
new file mode 100644
index 0000000..911dbc2
--- /dev/null
+++ b/src/shared/varlink-io.systemd.oom.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "varlink-idl.h"
+
+extern const VarlinkSymbol vl_type_ControlGroup;
+extern const VarlinkInterface vl_interface_io_systemd_oom;
diff --git a/src/shared/varlink-io.systemd.service.c b/src/shared/varlink-io.systemd.service.c
new file mode 100644
index 0000000..e9df5de
--- /dev/null
+++ b/src/shared/varlink-io.systemd.service.c
@@ -0,0 +1,70 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#include <unistd.h>
+
+#include "varlink-io.systemd.service.h"
+
+static VARLINK_DEFINE_METHOD(Ping);
+
+static VARLINK_DEFINE_METHOD(Reload);
+
+static VARLINK_DEFINE_METHOD(
+ SetLogLevel,
+ VARLINK_DEFINE_INPUT(level, VARLINK_INT, 0));
+
+VARLINK_DEFINE_INTERFACE(
+ io_systemd_service,
+ "io.systemd.service",
+ &vl_method_Ping,
+ &vl_method_Reload,
+ &vl_method_SetLogLevel);
+
+int varlink_method_ping(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { /* Handler for io.systemd.service.Ping; 'flags' and 'userdata' are not used. */
+ assert(link);
+
+ if (json_variant_elements(parameters) > 0) /* Ping is declared without fields, so any element in the parameters is rejected */
+ return varlink_error_invalid_parameter(link, parameters);
+
+ log_debug("Received io.systemd.service.Ping");
+
+ return varlink_reply(link, NULL); /* the reply is sent with NULL as its parameters argument */
+}
+
+int varlink_method_set_log_level(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) { /* Handler for io.systemd.service.SetLogLevel; 'flags' and 'userdata' are not used. */
+ static const JsonDispatch dispatch_table[] = {
+ { "level", _JSON_VARIANT_TYPE_INVALID, json_dispatch_int64, 0, JSON_MANDATORY }, /* "level" is flagged mandatory and parsed by json_dispatch_int64 */
+ {}
+ };
+
+ int64_t level;
+ uid_t uid;
+ int r;
+
+ assert(link);
+ assert(parameters);
+
+ /* NOTE: The method does have 1 parameter, but we must compare to 2 here, because
+ * json_variant_elements() breaks abstraction and exposes internal structure of JsonObject. */
+ if (json_variant_elements(parameters) != 2)
+ return varlink_error_invalid_parameter(link, parameters);
+
+ r = varlink_dispatch(link, parameters, dispatch_table, &level); /* writes the parsed value directly into 'level' */
+ if (r != 0) /* any non-zero result, positive or negative, ends the call here */
+ return r;
+
+ if (LOG_PRI(level) != level) /* reject any value that LOG_PRI() does not leave unchanged */
+ return varlink_error_invalid_parameter(link, parameters);
+
+ r = varlink_get_peer_uid(link, &uid);
+ if (r < 0)
+ return r;
+
+ if (uid != getuid() && uid != 0) /* only a peer running as our own UID or as UID 0 may proceed */
+ return varlink_error(link, VARLINK_ERROR_PERMISSION_DENIED, parameters);
+
+ log_debug("Received io.systemd.service.SetLogLevel(%" PRIi64 ")", level);
+
+ log_set_max_level(level); /* applied only after both the value check and the UID check have passed */
+
+ return varlink_reply(link, NULL);
+}
diff --git a/src/shared/varlink-io.systemd.service.h b/src/shared/varlink-io.systemd.service.h
new file mode 100644
index 0000000..bc90ff0
--- /dev/null
+++ b/src/shared/varlink-io.systemd.service.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#pragma once
+
+#include "varlink.h"
+#include "varlink-idl.h"
+
+extern const VarlinkInterface vl_interface_io_systemd_service;
+
+int varlink_method_ping(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata);
+int varlink_method_set_log_level(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata);
diff --git a/src/shared/varlink-io.systemd.sysext.c b/src/shared/varlink-io.systemd.sysext.c
new file mode 100644
index 0000000..66e3534
--- /dev/null
+++ b/src/shared/varlink-io.systemd.sysext.c
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "varlink-io.systemd.sysext.h"
+
+static VARLINK_DEFINE_ENUM_TYPE(
+ ImageClass,
+ VARLINK_DEFINE_ENUM_VALUE(sysext),
+ VARLINK_DEFINE_ENUM_VALUE(confext));
+
+static VARLINK_DEFINE_ENUM_TYPE(
+ ImageType,
+ VARLINK_DEFINE_ENUM_VALUE(directory),
+ VARLINK_DEFINE_ENUM_VALUE(subvolume),
+ VARLINK_DEFINE_ENUM_VALUE(raw),
+ VARLINK_DEFINE_ENUM_VALUE(block));
+
+static VARLINK_DEFINE_METHOD(
+ Merge,
+ VARLINK_DEFINE_INPUT_BY_TYPE(class, ImageClass, VARLINK_NULLABLE),
+ VARLINK_DEFINE_INPUT(force, VARLINK_BOOL, VARLINK_NULLABLE),
+ VARLINK_DEFINE_INPUT(noReload, VARLINK_BOOL, VARLINK_NULLABLE),
+ VARLINK_DEFINE_INPUT(noexec, VARLINK_BOOL, VARLINK_NULLABLE));
+
+static VARLINK_DEFINE_METHOD(
+ Unmerge,
+ VARLINK_DEFINE_INPUT_BY_TYPE(class, ImageClass, VARLINK_NULLABLE),
+ VARLINK_DEFINE_INPUT(noReload, VARLINK_BOOL, VARLINK_NULLABLE));
+
+static VARLINK_DEFINE_METHOD(
+ Refresh,
+ VARLINK_DEFINE_INPUT_BY_TYPE(class, ImageClass, VARLINK_NULLABLE),
+ VARLINK_DEFINE_INPUT(force, VARLINK_BOOL, VARLINK_NULLABLE),
+ VARLINK_DEFINE_INPUT(noReload, VARLINK_BOOL, VARLINK_NULLABLE),
+ VARLINK_DEFINE_INPUT(noexec, VARLINK_BOOL, VARLINK_NULLABLE));
+
+static VARLINK_DEFINE_METHOD(
+ List,
+ VARLINK_DEFINE_INPUT_BY_TYPE(class, ImageClass, VARLINK_NULLABLE),
+ VARLINK_DEFINE_OUTPUT_BY_TYPE(Class, ImageClass, 0),
+ VARLINK_DEFINE_OUTPUT_BY_TYPE(Type, ImageType, 0),
+ VARLINK_DEFINE_OUTPUT(Name, VARLINK_STRING, 0),
+ VARLINK_DEFINE_OUTPUT(Path, VARLINK_STRING, VARLINK_NULLABLE),
+ VARLINK_DEFINE_OUTPUT(ReadOnly, VARLINK_BOOL, 0),
+ VARLINK_DEFINE_OUTPUT(CreationTimestamp, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_OUTPUT(ModificationTimestamp, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_OUTPUT(Usage, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_OUTPUT(UsageExclusive, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_OUTPUT(Limit, VARLINK_INT, VARLINK_NULLABLE),
+ VARLINK_DEFINE_OUTPUT(LimitExclusive, VARLINK_INT, VARLINK_NULLABLE));
+
+static VARLINK_DEFINE_ERROR(NoImagesFound);
+
+static VARLINK_DEFINE_ERROR(
+ AlreadyMerged,
+ VARLINK_DEFINE_FIELD(hierarchy, VARLINK_STRING, 0));
+
+VARLINK_DEFINE_INTERFACE(
+ io_systemd_sysext,
+ "io.systemd.sysext",
+ &vl_type_ImageClass,
+ &vl_type_ImageType,
+ &vl_method_Merge,
+ &vl_method_Unmerge,
+ &vl_method_Refresh,
+ &vl_method_List,
+ &vl_error_NoImagesFound,
+ &vl_error_AlreadyMerged);
diff --git a/src/shared/varlink-io.systemd.sysext.h b/src/shared/varlink-io.systemd.sysext.h
new file mode 100644
index 0000000..ee649c6
--- /dev/null
+++ b/src/shared/varlink-io.systemd.sysext.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "varlink-idl.h"
+
+extern const VarlinkInterface vl_interface_io_systemd_sysext;
diff --git a/src/shared/varlink-org.varlink.service.c b/src/shared/varlink-org.varlink.service.c
new file mode 100644
index 0000000..e5122c0
--- /dev/null
+++ b/src/shared/varlink-org.varlink.service.c
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "varlink-org.varlink.service.h"
+
+static VARLINK_DEFINE_METHOD(
+ GetInfo,
+ VARLINK_DEFINE_OUTPUT(vendor, VARLINK_STRING, 0),
+ VARLINK_DEFINE_OUTPUT(product, VARLINK_STRING, 0),
+ VARLINK_DEFINE_OUTPUT(version, VARLINK_STRING, 0),
+ VARLINK_DEFINE_OUTPUT(url, VARLINK_STRING, 0),
+ VARLINK_DEFINE_OUTPUT(interfaces, VARLINK_STRING, VARLINK_ARRAY));
+
+static VARLINK_DEFINE_METHOD(
+ GetInterfaceDescription,
+ VARLINK_DEFINE_INPUT(interface, VARLINK_STRING, 0),
+ VARLINK_DEFINE_OUTPUT(description, VARLINK_STRING, 0));
+
+static VARLINK_DEFINE_ERROR(
+ InterfaceNotFound,
+ VARLINK_DEFINE_FIELD(interface, VARLINK_STRING, 0));
+
+static VARLINK_DEFINE_ERROR(
+ MethodNotFound,
+ VARLINK_DEFINE_FIELD(method, VARLINK_STRING, 0));
+
+static VARLINK_DEFINE_ERROR(
+ MethodNotImplemented,
+ VARLINK_DEFINE_FIELD(method, VARLINK_STRING, 0));
+
+static VARLINK_DEFINE_ERROR(
+ InvalidParameter,
+ VARLINK_DEFINE_FIELD(parameter, VARLINK_STRING, 0));
+
+static VARLINK_DEFINE_ERROR(PermissionDenied);
+
+static VARLINK_DEFINE_ERROR(ExpectedMore);
+
+/* As per https://varlink.org/Service */
+VARLINK_DEFINE_INTERFACE(
+ org_varlink_service,
+ "org.varlink.service",
+ &vl_method_GetInfo,
+ &vl_method_GetInterfaceDescription,
+ &vl_error_InterfaceNotFound,
+ &vl_error_MethodNotFound,
+ &vl_error_MethodNotImplemented,
+ &vl_error_InvalidParameter,
+ &vl_error_PermissionDenied,
+ &vl_error_ExpectedMore);
diff --git a/src/shared/varlink-org.varlink.service.h b/src/shared/varlink-org.varlink.service.h
new file mode 100644
index 0000000..75c55e6
--- /dev/null
+++ b/src/shared/varlink-org.varlink.service.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "varlink-idl.h"
+
+extern const VarlinkInterface vl_interface_org_varlink_service;
diff --git a/src/shared/varlink.c b/src/shared/varlink.c
new file mode 100644
index 0000000..749b644
--- /dev/null
+++ b/src/shared/varlink.c
@@ -0,0 +1,3767 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <malloc.h>
+#include <poll.h>
+
+#include <sd-daemon.h>
+
+#include "alloc-util.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "glyph-util.h"
+#include "hashmap.h"
+#include "io-util.h"
+#include "iovec-util.h"
+#include "list.h"
+#include "path-util.h"
+#include "process-util.h"
+#include "selinux-util.h"
+#include "serialize.h"
+#include "set.h"
+#include "socket-util.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "strv.h"
+#include "time-util.h"
+#include "umask-util.h"
+#include "user-util.h"
+#include "varlink.h"
+#include "varlink-internal.h"
+#include "varlink-org.varlink.service.h"
+#include "varlink-io.systemd.h"
+#include "version.h"
+
+/* Connection limits. Presumably the defaults for the connections_max and connections_per_uid_max fields of
+ * VarlinkServer (the place where they are assigned is not part of this section; TODO confirm). */
+#define VARLINK_DEFAULT_CONNECTIONS_MAX 4096U
+#define VARLINK_DEFAULT_CONNECTIONS_PER_UID_MAX 1024U
+
+/* Initial value of the 'timeout' field of every new Varlink object, see varlink_new() */
+#define VARLINK_DEFAULT_TIMEOUT_USEC (45U*USEC_PER_SEC)
+/* Once the input buffer holds this many bytes varlink_read() fails with ENOBUFS instead of reading more */
+#define VARLINK_BUFFER_MAX (16U*1024U*1024U)
+/* Upper bound for the amount of space varlink_read() asks the input buffer to grow by in one step */
+#define VARLINK_READ_SIZE (64U*1024U)
+
+/* High-level ("application layer") state of a Varlink connection object. The states are grouped into
+ * client-side, server-side and shutdown states. The printable names live in varlink_state_table[], which
+ * must be kept in sync with this enum. */
+typedef enum VarlinkState {
+ /* Client side states */
+ VARLINK_IDLE_CLIENT,
+ VARLINK_AWAITING_REPLY,
+ VARLINK_AWAITING_REPLY_MORE,
+ VARLINK_CALLING,
+ VARLINK_CALLED,
+ VARLINK_PROCESSING_REPLY,
+
+ /* Server side states */
+ VARLINK_IDLE_SERVER,
+ VARLINK_PROCESSING_METHOD,
+ VARLINK_PROCESSING_METHOD_MORE,
+ VARLINK_PROCESSING_METHOD_ONEWAY,
+ VARLINK_PROCESSED_METHOD,
+ VARLINK_PENDING_METHOD,
+ VARLINK_PENDING_METHOD_MORE,
+
+ /* Common states (only during shutdown) */
+ VARLINK_PENDING_DISCONNECT,
+ VARLINK_PENDING_TIMEOUT,
+ VARLINK_PROCESSING_DISCONNECT,
+ VARLINK_PROCESSING_TIMEOUT,
+ VARLINK_PROCESSING_FAILURE,
+ VARLINK_DISCONNECTED,
+
+ /* Number of valid states (used to size the string table), and the "not initialized yet" marker that
+ * varlink_new() assigns */
+ _VARLINK_STATE_MAX,
+ _VARLINK_STATE_INVALID = -EINVAL,
+} VarlinkState;
+
+/* Tests whether we are not yet disconnected. Note that this is true during all states where the connection
+ * is still good for something, and false only when it's dead for good. This means: when we are
+ * asynchronously connecting to a peer and the connect() is still pending, then this will return 'true', as
+ * the connection is still good, and we are likely to be able to properly operate on it soon.
+ *
+ * The list below covers exactly the client side and server side states; all of the shutdown states
+ * (VARLINK_PENDING_DISCONNECT through VARLINK_DISCONNECTED) as well as the invalid state yield 'false'. */
+#define VARLINK_STATE_IS_ALIVE(state) \
+ IN_SET(state, \
+ VARLINK_IDLE_CLIENT, \
+ VARLINK_AWAITING_REPLY, \
+ VARLINK_AWAITING_REPLY_MORE, \
+ VARLINK_CALLING, \
+ VARLINK_CALLED, \
+ VARLINK_PROCESSING_REPLY, \
+ VARLINK_IDLE_SERVER, \
+ VARLINK_PROCESSING_METHOD, \
+ VARLINK_PROCESSING_METHOD_MORE, \
+ VARLINK_PROCESSING_METHOD_ONEWAY, \
+ VARLINK_PROCESSED_METHOD, \
+ VARLINK_PENDING_METHOD, \
+ VARLINK_PENDING_METHOD_MORE)
+
+typedef struct VarlinkJsonQueueItem VarlinkJsonQueueItem;
+
+/* A queued message we shall write into the socket, along with the file descriptors to send at the same
+ * time. This queue item binds them together so that message/fd boundaries are maintained throughout the
+ * whole pipeline. */
+struct VarlinkJsonQueueItem {
+ LIST_FIELDS(VarlinkJsonQueueItem, queue);
+ JsonVariant *data; /* the item holds its own reference, dropped in varlink_json_queue_item_free() */
+ size_t n_fds; /* number of entries in fds[] */
+ int fds[]; /* owned by the item: varlink_json_queue_item_free() closes them */
+};
+
+/* One Varlink connection, used for both the client and the server side of a link. */
+struct Varlink {
+ unsigned n_ref;
+
+ VarlinkServer *server;
+
+ VarlinkState state;
+ bool connecting; /* This boolean indicates whether the socket fd we are operating on is currently
+ * processing an asynchronous connect(). In that state we watch the socket for
+ * EPOLLOUT, but we refrain from calling read() or write() on the socket as that
+ * will trigger ENOTCONN. Note that this boolean is kept separate from the
+ * VarlinkState above on purpose: while the connect() is still not complete we
+ * already want to allow queuing of messages and similar. Thus it's nice to keep
+ * these two state concepts separate: the VarlinkState encodes what our own view of
+ * the connection is, i.e. whether we think it's a server, a client, and has
+ * something queued already, while 'connecting' tells us a detail about the
+ * transport used below, that should have no effect on how we otherwise accept and
+ * process operations from the user.
+ *
+ * Or to say this differently: VARLINK_STATE_IS_ALIVE(state) tells you whether the
+ * connection is good to use, even if it might not be fully connected
+ * yet. connecting=true then informs you that actually we are still connecting, and
+ * the connection is actually not established yet and thus any requests you enqueue
+ * now will still work fine but will be queued only, not sent yet, but that
+ * shouldn't stop you from using the connection, since eventually whatever you queue
+ * *will* be sent.
+ *
+ * Or to say this even differently: 'state' is a high-level ("application layer"
+ * high, if you so will) state, while 'connecting' is a low-level ("transport layer"
+ * low, if you so will) state, and while they are not entirely unrelated and
+ * sometimes propagate effects to each other they are only asynchronously connected
+ * at most. */
+ unsigned n_pending;
+
+ int fd;
+
+ char *input_buffer; /* valid data starts at input_buffer_index, ends at input_buffer_index+input_buffer_size */
+ size_t input_buffer_index;
+ size_t input_buffer_size;
+ size_t input_buffer_unscanned;
+
+ void *input_control_buffer;
+ size_t input_control_buffer_size;
+
+ char *output_buffer; /* valid data starts at output_buffer_index, ends at output_buffer_index+output_buffer_size */
+ size_t output_buffer_index;
+ size_t output_buffer_size;
+
+ int *input_fds; /* file descriptors associated with the data in input_buffer (for fd passing) */
+ size_t n_input_fds;
+
+ int *output_fds; /* file descriptors associated with the data in output_buffer (for fd passing) */
+ size_t n_output_fds;
+
+ /* Further messages to output not yet formatted into text, and thus not included in output_buffer
+ * yet. We keep them separate from output_buffer, to not violate fd message boundaries: we want that
+ * each message that is sent is associated with its fds, and that fds cannot be accidentally associated
+ * with preceding or following messages. */
+ LIST_HEAD(VarlinkJsonQueueItem, output_queue);
+ VarlinkJsonQueueItem *output_queue_tail;
+
+ /* The fds to associate with the next message that is about to be enqueued. The user first pushes the
+ * fds it intends to send via varlink_push_fd() into this queue, and then once the message data is
+ * submitted we'll combine the fds and the message data into one. */
+ int *pushed_fds;
+ size_t n_pushed_fds;
+
+ VarlinkReply reply_callback;
+
+ JsonVariant *current;
+ VarlinkSymbol *current_method;
+
+ struct ucred ucred;
+ bool ucred_acquired:1;
+
+ bool write_disconnected:1;
+ bool read_disconnected:1;
+ bool prefer_read_write:1;
+ bool got_pollhup:1;
+
+ bool allow_fd_passing_input:1;
+ bool allow_fd_passing_output:1;
+
+ bool output_buffer_sensitive:1; /* whether to erase the output buffer after writing it to the socket */
+
+ int af; /* address family if socket; AF_UNSPEC if not socket; negative if not known */
+
+ usec_t timestamp;
+ usec_t timeout;
+
+ void *userdata;
+ char *description;
+
+ sd_event *event;
+ sd_event_source *io_event_source;
+ sd_event_source *time_event_source;
+ sd_event_source *quit_event_source;
+ sd_event_source *defer_event_source;
+
+ pid_t exec_pid;
+};
+
+typedef struct VarlinkServerSocket VarlinkServerSocket;
+
+/* One socket entry of a VarlinkServer; entries are chained into the server's 'sockets' list. Presumably
+ * 'fd' is a listening socket and 'address' the address it is bound to (TODO confirm against the code that
+ * fills this in, which is not part of this section). */
+struct VarlinkServerSocket {
+ VarlinkServer *server;
+
+ int fd;
+ char *address;
+
+ sd_event_source *event_source;
+
+ LIST_FIELDS(VarlinkServerSocket, sockets);
+};
+
+/* Server side object: holds the list of server sockets, the lookup tables for interfaces, methods and
+ * symbols, the connect/disconnect callbacks and the connection accounting. */
+struct VarlinkServer {
+ unsigned n_ref;
+ VarlinkServerFlags flags;
+
+ LIST_HEAD(VarlinkServerSocket, sockets);
+
+ Hashmap *methods; /* Fully qualified symbol name of a method → VarlinkMethod */
+ Hashmap *interfaces; /* Fully qualified interface name → VarlinkInterface* */
+ Hashmap *symbols; /* Fully qualified symbol name of method/error → VarlinkSymbol* */
+ VarlinkConnect connect_callback;
+ VarlinkDisconnect disconnect_callback;
+
+ sd_event *event;
+ int64_t event_priority;
+
+ unsigned n_connections;
+ Hashmap *by_uid; /* UID_TO_PTR(uid) → UINT_TO_PTR(n_connections) */
+
+ void *userdata;
+ char *description;
+
+ /* Connection limits, overall and per UID (presumably enforced against n_connections and by_uid
+ * above; TODO confirm) */
+ unsigned connections_max;
+ unsigned connections_per_uid_max;
+
+ bool exit_on_idle;
+};
+
+/* Small bundle of a parameters object, an error identifier and reply flags. Judging by the name this is
+ * presumably used while collecting replies; its users are not part of this section (TODO confirm). */
+typedef struct VarlinkCollectContext {
+ JsonVariant *parameters;
+ const char *error_id;
+ VarlinkReplyFlags flags;
+} VarlinkCollectContext ;
+
+/* Printable name for each VarlinkState value, used for the state change debug log messages emitted by
+ * varlink_set_state(). One entry per enum value. */
+static const char* const varlink_state_table[_VARLINK_STATE_MAX] = {
+ [VARLINK_IDLE_CLIENT] = "idle-client",
+ [VARLINK_AWAITING_REPLY] = "awaiting-reply",
+ [VARLINK_AWAITING_REPLY_MORE] = "awaiting-reply-more",
+ [VARLINK_CALLING] = "calling",
+ [VARLINK_CALLED] = "called",
+ [VARLINK_PROCESSING_REPLY] = "processing-reply",
+ [VARLINK_IDLE_SERVER] = "idle-server",
+ [VARLINK_PROCESSING_METHOD] = "processing-method",
+ [VARLINK_PROCESSING_METHOD_MORE] = "processing-method-more",
+ [VARLINK_PROCESSING_METHOD_ONEWAY] = "processing-method-oneway",
+ [VARLINK_PROCESSED_METHOD] = "processed-method",
+ [VARLINK_PENDING_METHOD] = "pending-method",
+ [VARLINK_PENDING_METHOD_MORE] = "pending-method-more",
+ [VARLINK_PENDING_DISCONNECT] = "pending-disconnect",
+ [VARLINK_PENDING_TIMEOUT] = "pending-timeout",
+ [VARLINK_PROCESSING_DISCONNECT] = "processing-disconnect",
+ [VARLINK_PROCESSING_TIMEOUT] = "processing-timeout",
+ [VARLINK_PROCESSING_FAILURE] = "processing-failure",
+ [VARLINK_DISCONNECTED] = "disconnected",
+};
+
+/* Generates the file-local varlink_state_to_string() lookup for the table above */
+DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(varlink_state, VarlinkState);
+
+/* Debug-level logging helpers that prefix every message with the description of the connection ('v') or
+ * server ('s') it refers to. The description getters fall back to "varlink" if no description is set or
+ * the object is NULL. The *_errno variants additionally take an error code and hand it to
+ * log_debug_errno(). */
+#define varlink_log_errno(v, error, fmt, ...) \
+ log_debug_errno(error, "%s: " fmt, varlink_description(v), ##__VA_ARGS__)
+
+#define varlink_log(v, fmt, ...) \
+ log_debug("%s: " fmt, varlink_description(v), ##__VA_ARGS__)
+
+#define varlink_server_log_errno(s, error, fmt, ...) \
+ log_debug_errno(error, "%s: " fmt, varlink_server_description(s), ##__VA_ARGS__)
+
+#define varlink_server_log(s, fmt, ...) \
+ log_debug("%s: " fmt, varlink_server_description(s), ##__VA_ARGS__)
+
+static int varlink_format_queue(Varlink *v);
+static void varlink_server_test_exit_on_idle(VarlinkServer *s);
+
+/* Returns the description string of the connection, or the generic "varlink" if 'v' is NULL or has no
+ * description set. Never returns NULL. */
+static const char *varlink_description(Varlink *v) {
+        if (v && v->description)
+                return v->description;
+
+        return "varlink";
+}
+
+/* Returns the description string of the server, or the generic "varlink" if 's' is NULL or has no
+ * description set. Never returns NULL. */
+static const char *varlink_server_description(VarlinkServer *s) {
+        if (s && s->description)
+                return s->description;
+
+        return "varlink";
+}
+
+/* Releases a queue item: drops the reference on the JSON message, closes all fds attached to the item and
+ * frees the item itself. Accepts NULL. Always returns NULL. */
+static VarlinkJsonQueueItem *varlink_json_queue_item_free(VarlinkJsonQueueItem *q) {
+        if (q) {
+                json_variant_unref(q->data);
+                close_many(q->fds, q->n_fds);
+                q = mfree(q);
+        }
+
+        return q;
+}
+
+/* Allocates a new queue item for message 'm' with 'n_fds' file descriptors attached.
+ *
+ * The item takes its own reference on 'm' and stores a copy of the fd numbers (the descriptors are not
+ * duplicated). 'fds' may be NULL only if 'n_fds' is zero. Returns NULL if memory allocation fails. */
+static VarlinkJsonQueueItem *varlink_json_queue_item_new(JsonVariant *m, const int fds[], size_t n_fds) {
+        VarlinkJsonQueueItem *item;
+        size_t sz;
+
+        assert(m);
+        assert(fds || n_fds == 0);
+
+        /* Header plus trailing flexible array of fds */
+        sz = offsetof(VarlinkJsonQueueItem, fds) + sizeof(int) * n_fds;
+
+        item = malloc(sz);
+        if (!item)
+                return NULL;
+
+        *item = (VarlinkJsonQueueItem) {
+                .n_fds = n_fds,
+                .data = json_variant_ref(m),
+        };
+
+        memcpy_safe(item->fds, fds, sizeof(int) * n_fds);
+
+        return item;
+}
+
+/* Moves the connection into 'state' and logs the change at debug level. If the connection has no valid
+ * state yet (i.e. it was just allocated) only the new state is logged. */
+static void varlink_set_state(Varlink *v, VarlinkState state) {
+        assert(v);
+        assert(state >= 0 && state < _VARLINK_STATE_MAX);
+
+        if (v->state >= 0) {
+                varlink_log(v, "Changing state %s %s %s",
+                            varlink_state_to_string(v->state),
+                            special_glyph(SPECIAL_GLYPH_ARROW_RIGHT),
+                            varlink_state_to_string(state));
+        } else {
+                varlink_log(v, "Setting state %s",
+                            varlink_state_to_string(state));
+        }
+
+        v->state = state;
+}
+
+/* Allocates a fresh, unconnected Varlink object with one reference.
+ *
+ * The object starts without an fd, in the invalid state, with invalid credentials, no timestamp, the
+ * default timeout and an unknown address family. All other fields are zero.
+ *
+ * Returns 0 and stores the object in *ret on success, -ENOMEM if allocation fails. */
+static int varlink_new(Varlink **ret) {
+        Varlink *link;
+
+        assert(ret);
+
+        link = new(Varlink, 1);
+        if (!link)
+                return -ENOMEM;
+
+        *link = (Varlink) {
+                .n_ref = 1,
+
+                .state = _VARLINK_STATE_INVALID,
+
+                .fd = -EBADF,
+                .af = -1,
+
+                .ucred = UCRED_INVALID,
+
+                .timeout = VARLINK_DEFAULT_TIMEOUT_USEC,
+                .timestamp = USEC_INFINITY,
+        };
+
+        *ret = link;
+        return 0;
+}
+
+/* Creates a client connection object and connects it to the AF_UNIX socket at 'address'.
+ *
+ * The socket is non-blocking. If the connect() does not complete right away the object is returned with
+ * 'connecting' set, and the connection is finished in the background.
+ *
+ * Returns 0 and stores the new object in *ret on success, a negative errno-style value on failure
+ * (-EINVAL if 'ret' or 'address' is NULL). */
+int varlink_connect_address(Varlink **ret, const char *address) {
+        _cleanup_(varlink_unrefp) Varlink *v = NULL;
+        union sockaddr_union sockaddr;
+        int r;
+
+        assert_return(ret, -EINVAL);
+        assert_return(address, -EINVAL);
+
+        r = varlink_new(&v);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to create varlink object: %m");
+
+        v->fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
+        if (v->fd < 0)
+                return log_debug_errno(errno, "Failed to create AF_UNIX socket: %m");
+
+        v->fd = fd_move_above_stdio(v->fd);
+        v->af = AF_UNIX;
+
+        r = sockaddr_un_set_path(&sockaddr.un, address);
+        if (r >= 0)
+                /* Fits into sockaddr_un: plain connect(), 'r' is the address length */
+                r = RET_NERRNO(connect(v->fd, &sockaddr.sa, r));
+        else if (r == -ENAMETOOLONG)
+                /* A file system path that is too long for sockaddr_un: connect via O_PATH instead */
+                r = connect_unix_path(v->fd, AT_FDCWD, address);
+        else
+                return log_debug_errno(r, "Failed to set socket address '%s': %m", address);
+
+        if (r < 0 && !IN_SET(r, -EAGAIN, -EINPROGRESS))
+                return log_debug_errno(r, "Failed to connect to %s: %m", address);
+
+        if (r < 0)
+                /* The connect() is still being processed in the background. Until we see EPOLLOUT a
+                 * write() on the socket fails with ENOTCONN rather than EAGAIN. Since ENOTCONN could
+                 * equally mean "already disconnected again", remember explicitly that we are still in
+                 * connect(). */
+                v->connecting = true;
+
+        varlink_set_state(v, VARLINK_IDLE_CLIENT);
+
+        *ret = TAKE_PTR(v);
+        return 0;
+}
+
+/* Forks off '_command' as a child process and returns a client connection object talking to it over one
+ * end of an AF_UNIX socket pair.
+ *
+ * The child receives the other end as fd 3, announced through the LISTEN_FDS, LISTEN_PID and
+ * LISTEN_FDNAMES environment variables. If '_argv' is empty, the command name alone is used as the
+ * argument vector. The returned object records the child's PID in 'exec_pid'.
+ *
+ * Returns 0 and stores the new object in *ret on success, a negative errno-style value on failure
+ * (-EINVAL for NULL arguments, -ENOMEM if copying the command or arguments fails). */
+int varlink_connect_exec(Varlink **ret, const char *_command, char **_argv) {
+        _cleanup_close_pair_ int pair[2] = EBADF_PAIR;
+        _cleanup_(sigkill_waitp) pid_t pid = 0;
+        _cleanup_free_ char *command = NULL;
+        _cleanup_strv_free_ char **argv = NULL;
+        int r;
+
+        assert_return(ret, -EINVAL);
+        assert_return(_command, -EINVAL);
+
+        /* Copy the strings, in case they point into our own argv[], which we'll invalidate shortly because
+         * we rename the child process */
+        command = strdup(_command);
+        if (!command)
+                return -ENOMEM;
+
+        if (strv_isempty(_argv))
+                argv = strv_new(command);
+        else
+                argv = strv_copy(_argv);
+        if (!argv)
+                return -ENOMEM;
+
+        log_debug("Forking off Varlink child process '%s'.", command);
+
+        if (socketpair(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0, pair) < 0)
+                return log_debug_errno(errno, "Failed to allocate AF_UNIX socket pair: %m");
+
+        r = safe_fork_full(
+                        "(sd-vlexec)",
+                        /* stdio_fds= */ NULL,
+                        /* except_fds= */ (int[]) { pair[1] },
+                        /* n_except_fds= */ 1,
+                        FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGTERM|FORK_REOPEN_LOG|FORK_LOG|FORK_RLIMIT_NOFILE_SAFE,
+                        &pid);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to spawn process: %m");
+        if (r == 0) {
+                char spid[DECIMAL_STR_MAX(pid_t)+1];
+                const char *setenv_list[] = {
+                        "LISTEN_FDS", "1",
+                        "LISTEN_PID", spid,
+                        "LISTEN_FDNAMES", "varlink",
+                        NULL, NULL,
+                };
+                /* Child */
+
+                /* Only pair[1] was kept open across the fork, hence just forget about the other end */
+                pair[0] = -EBADF;
+
+                r = move_fd(pair[1], 3, /* cloexec= */ false);
+                if (r < 0) {
+                        log_debug_errno(r, "Failed to move file descriptor to 3: %m");
+                        _exit(EXIT_FAILURE);
+                }
+
+                /* NOTE(review): this relies on safe_fork_full() having stored the child's own PID in 'pid'
+                 * on the child side; confirm against its implementation. */
+                xsprintf(spid, PID_FMT, pid);
+
+                STRV_FOREACH_PAIR(a, b, setenv_list) {
+                        if (setenv(*a, *b, /* override= */ true) < 0) {
+                                log_debug_errno(errno, "Failed to set environment variable '%s': %m", *a);
+                                _exit(EXIT_FAILURE);
+                        }
+                }
+
+                execvp(command, argv);
+                /* execvp() only returns on failure and reports the reason via errno. 'r' still holds the
+                 * (successful) result of move_fd() at this point, hence must not be used here. */
+                log_debug_errno(errno, "Failed to invoke process '%s': %m", command);
+                _exit(EXIT_FAILURE);
+        }
+
+        /* Parent: the child's end of the pair is not needed here */
+        pair[1] = safe_close(pair[1]);
+
+        Varlink *v;
+        r = varlink_new(&v);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to create varlink object: %m");
+
+        /* From here on the connection object owns both the socket and the child process */
+        v->fd = TAKE_FD(pair[0]);
+        v->af = AF_UNIX;
+        v->exec_pid = TAKE_PID(pid);
+        varlink_set_state(v, VARLINK_IDLE_CLIENT);
+
+        *ret = v;
+        return 0;
+}
+
+/* Creates a client connection from a Varlink URL. Supported schemes are "unix:" (connect to an AF_UNIX
+ * socket) and "exec:" (fork off the given binary and talk to it).
+ *
+ * Paths must be absolute and normalized; for "unix:" an address starting with '@' (abstract namespace) is
+ * passed on without validity checks. URL parameterization via ';', '?' or '#' is refused.
+ *
+ * Returns 0 on success with the new object in *ret, a negative errno-style value otherwise
+ * (-EPROTONOSUPPORT for unsupported schemes or parameterization, -EINVAL for bad paths or NULL
+ * arguments). */
+int varlink_connect_url(Varlink **ret, const char *url) {
+        _cleanup_free_ char *simplified = NULL;
+        const char *path;
+        bool exec = false;
+        int r;
+
+        assert_return(ret, -EINVAL);
+        assert_return(url, -EINVAL);
+
+        // FIXME: Add support for vsock:, ssh-exec:, ssh-unix: URL schemes here. (The latter with OpenSSH
+        //        9.4's -W switch for referencing remote AF_UNIX sockets.)
+
+        /* The Varlink URL scheme is a bit underdefined. We support only the unix: transport for now, plus an
+         * exec: transport we made up ourselves. Strictly speaking this shouldn't even be called URL, since
+         * it has nothing to do with Internet URLs by RFC. */
+
+        path = startswith(url, "unix:");
+        if (!path) {
+                path = startswith(url, "exec:");
+                if (!path)
+                        return log_debug_errno(SYNTHETIC_ERRNO(EPROTONOSUPPORT), "URL scheme not supported.");
+
+                exec = true;
+        }
+
+        /* The varlink.org reference C library supports more than just file system paths. We might want to
+         * support that one day too. For now simply refuse that. */
+        if (path[strcspn(path, ";?#")] != '\0')
+                return log_debug_errno(SYNTHETIC_ERRNO(EPROTONOSUPPORT), "URL parameterization with ';', '?', '#' not supported.");
+
+        /* Abstract namespace addresses (unix: only) are taken as they are, everything else is validated */
+        if (exec || path[0] != '@') {
+
+                if (!path_is_absolute(path))
+                        return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Specified path not absolute, refusing.");
+
+                r = path_simplify_alloc(path, &simplified);
+                if (r < 0)
+                        return r;
+
+                if (!path_is_normalized(simplified))
+                        return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Specified path is not normalized, refusing.");
+        }
+
+        return exec ? varlink_connect_exec(ret, simplified, NULL)
+                    : varlink_connect_address(ret, simplified ?: path);
+}
+
+/* Creates a client connection object on top of an already open (and, if it is a socket, already connected)
+ * file descriptor. The fd is switched to non-blocking mode and stored in the new object.
+ *
+ * Returns 0 and stores the new object in *ret on success; -EINVAL if 'ret' is NULL, -EBADF if 'fd' is
+ * negative, or another negative errno-style value if setting up the fd or the object fails. */
+int varlink_connect_fd(Varlink **ret, int fd) {
+        Varlink *v;
+        int r;
+
+        assert_return(ret, -EINVAL);
+        assert_return(fd >= 0, -EBADF);
+
+        r = fd_nonblock(fd, true);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to make fd %d nonblocking: %m", fd);
+
+        r = varlink_new(&v);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to create varlink object: %m");
+
+        v->fd = fd;
+        v->af = -1; /* we don't know what kind of fd this is */
+        varlink_set_state(v, VARLINK_IDLE_CLIENT);
+
+        /* Note that if this function is called we assume the passed socket (if it is one) is already
+         * properly connected, i.e. any asynchronous connect() done on it already completed. Because of that
+         * we'll not set the 'connecting' boolean here, i.e. we don't need to avoid write()ing to the socket
+         * until the connection is fully set up. Behaviour here is hence a bit different from
+         * varlink_connect_address() above, as there we do handle asynchronous connections ourselves and
+         * avoid doing write() on it before we saw EPOLLOUT for the first time. */
+
+        *ret = v;
+        return 0;
+}
+
+/* Disables and releases all four event sources (I/O, time, quit, defer) of the connection, resetting each
+ * field to whatever sd_event_source_disable_unref() returns. */
+static void varlink_detach_event_sources(Varlink *v) {
+        assert(v);
+
+        sd_event_source **slots[] = {
+                &v->io_event_source,
+                &v->time_event_source,
+                &v->quit_event_source,
+                &v->defer_event_source,
+        };
+
+        for (size_t i = 0; i < sizeof(slots) / sizeof(slots[0]); i++)
+                *slots[i] = sd_event_source_disable_unref(*slots[i]);
+}
+
+/* Forgets the incoming message that is currently being processed: closes and frees the file descriptors
+ * that were received along with it, drops the reference on the parsed message and resets the method
+ * pointer. */
+static void varlink_clear_current(Varlink *v) {
+        assert(v);
+
+        /* First the fds that came in with the message... */
+        close_many(v->input_fds, v->n_input_fds);
+        v->n_input_fds = 0;
+        v->input_fds = mfree(v->input_fds);
+
+        /* ...then the message itself */
+        v->current_method = NULL;
+        v->current = json_variant_unref(v->current);
+}
+
+/* Releases all resources held by the connection object, without freeing the object itself: event sources,
+ * the connection fd, the current message, input/output/control buffers, all queued or pushed file
+ * descriptors, the output queue and the event loop reference. If a child process was forked off for this
+ * connection (exec_pid > 0) it is passed to sigterm_wait(). */
+static void varlink_clear(Varlink *v) {
+ assert(v);
+
+ varlink_detach_event_sources(v);
+
+ v->fd = safe_close(v->fd);
+
+ varlink_clear_current(v);
+
+ v->input_buffer = mfree(v->input_buffer);
+ /* If the output buffer is marked sensitive, erase its contents before releasing it */
+ v->output_buffer = v->output_buffer_sensitive ? erase_and_free(v->output_buffer) : mfree(v->output_buffer);
+
+ v->input_control_buffer = mfree(v->input_control_buffer);
+ v->input_control_buffer_size = 0;
+
+ close_many(v->output_fds, v->n_output_fds);
+ v->output_fds = mfree(v->output_fds);
+ v->n_output_fds = 0;
+
+ close_many(v->pushed_fds, v->n_pushed_fds);
+ v->pushed_fds = mfree(v->pushed_fds);
+ v->n_pushed_fds = 0;
+
+ /* Each queue item owns its JSON message reference and its fds, see varlink_json_queue_item_free() */
+ LIST_CLEAR(queue, v->output_queue, varlink_json_queue_item_free);
+ v->output_queue_tail = NULL;
+
+ v->event = sd_event_unref(v->event);
+
+ if (v->exec_pid > 0) {
+ sigterm_wait(v->exec_pid);
+ v->exec_pid = 0;
+ }
+}
+
+/* Frees the connection object and everything it holds. Accepts NULL. Always returns NULL. */
+static Varlink* varlink_destroy(Varlink *v) {
+ if (!v)
+ return NULL;
+
+ /* If this is called the server object must already have been unreffed here. Why that? Because when we
+ * linked up the varlink connection with the server object we took one ref in each direction */
+ assert(!v->server);
+
+ varlink_clear(v);
+
+ free(v->description);
+ return mfree(v);
+}
+
+/* Generates the reference counting functions for Varlink objects, with varlink_destroy() as the destructor */
+DEFINE_TRIVIAL_REF_UNREF_FUNC(Varlink, varlink, varlink_destroy);
+
+/* Checks whether the connection should be considered terminated. If so, switches it into
+ * VARLINK_PENDING_DISCONNECT and returns 1; otherwise leaves the state alone and returns 0. */
+static int varlink_test_disconnect(Varlink *v) {
+ assert(v);
+
+ /* Tests whether the connection has been terminated. We are careful to not stop processing it
+ * prematurely, since we want to handle half-open connections as well as possible and want to flush
+ * out and read data before we close down if we can. */
+
+ /* Already disconnected? */
+ if (!VARLINK_STATE_IS_ALIVE(v->state))
+ return 0;
+
+ /* Wait until connection setup is complete, i.e. until asynchronous connect() completes */
+ if (v->connecting)
+ return 0;
+
+ /* Still something to write and we can write? Stay around */
+ if (v->output_buffer_size > 0 && !v->write_disconnected)
+ return 0;
+
+ /* Both sides gone already? Then there's no need to stick around */
+ if (v->read_disconnected && v->write_disconnected)
+ goto disconnect;
+
+ /* If we are waiting for incoming data but the read side is shut down, disconnect. */
+ if (IN_SET(v->state, VARLINK_AWAITING_REPLY, VARLINK_AWAITING_REPLY_MORE, VARLINK_CALLING, VARLINK_IDLE_SERVER) && v->read_disconnected)
+ goto disconnect;
+
+ /* Similarly, if we are a client that hasn't written anything yet but the write side is dead, also
+ * disconnect. We also explicitly check for POLLHUP here since we likely won't notice the write side
+ * being down if we never wrote anything. */
+ if (v->state == VARLINK_IDLE_CLIENT && (v->write_disconnected || v->got_pollhup))
+ goto disconnect;
+
+ /* We are on the server side and still want to send out more replies, but we saw POLLHUP already, and
+ * either got no buffered bytes to write anymore or already saw a write error. In that case we should
+ * shut down the varlink link. */
+ if (IN_SET(v->state, VARLINK_PENDING_METHOD, VARLINK_PENDING_METHOD_MORE) && (v->write_disconnected || v->output_buffer_size == 0) && v->got_pollhup)
+ goto disconnect;
+
+ return 0;
+
+disconnect:
+ varlink_set_state(v, VARLINK_PENDING_DISCONNECT);
+ return 1;
+}
+
+/* Tries to write buffered output to the connection fd.
+ *
+ * Returns 0 if there was nothing to do or the fd is not writable right now (EAGAIN), 1 if some data was
+ * written or a disconnect was noticed on the write side (recorded in 'write_disconnected'), and a negative
+ * errno-style value on any other failure. */
+static int varlink_write(Varlink *v) {
+ ssize_t n;
+ int r;
+
+ assert(v);
+
+ if (!VARLINK_STATE_IS_ALIVE(v->state))
+ return 0;
+ if (v->connecting) /* Writing while we are still waiting for a non-blocking connect() to complete will
+ * result in ENOTCONN, hence exit early here */
+ return 0;
+ if (v->write_disconnected)
+ return 0;
+
+ /* If needed let's convert some output queue json variants into text form */
+ r = varlink_format_queue(v);
+ if (r < 0)
+ return r;
+
+ if (v->output_buffer_size == 0)
+ return 0;
+
+ assert(v->fd >= 0);
+
+ if (v->n_output_fds > 0) { /* If we shall send fds along, we must use sendmsg() */
+ struct iovec iov = {
+ .iov_base = v->output_buffer + v->output_buffer_index,
+ .iov_len = v->output_buffer_size,
+ };
+ struct msghdr mh = {
+ .msg_iov = &iov,
+ .msg_iovlen = 1,
+ .msg_controllen = CMSG_SPACE(sizeof(int) * v->n_output_fds),
+ };
+
+ mh.msg_control = alloca0(mh.msg_controllen);
+
+ /* A single SCM_RIGHTS control message carrying all pending output fds */
+ struct cmsghdr *control = CMSG_FIRSTHDR(&mh);
+ control->cmsg_len = CMSG_LEN(sizeof(int) * v->n_output_fds);
+ control->cmsg_level = SOL_SOCKET;
+ control->cmsg_type = SCM_RIGHTS;
+ memcpy(CMSG_DATA(control), v->output_fds, sizeof(int) * v->n_output_fds);
+
+ n = sendmsg(v->fd, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
+ } else {
+ /* We generally prefer recv()/send() (mostly because of MSG_NOSIGNAL) but also want to be compatible
+ * with non-socket IO, hence fall back automatically.
+ *
+ * Use a local variable to help gcc figure out that we set 'n' in all cases. */
+ bool prefer_write = v->prefer_read_write;
+ if (!prefer_write) {
+ n = send(v->fd, v->output_buffer + v->output_buffer_index, v->output_buffer_size, MSG_DONTWAIT|MSG_NOSIGNAL);
+ if (n < 0 && errno == ENOTSOCK)
+ prefer_write = v->prefer_read_write = true;
+ }
+ if (prefer_write)
+ n = write(v->fd, v->output_buffer + v->output_buffer_index, v->output_buffer_size);
+ }
+ if (n < 0) {
+ if (errno == EAGAIN)
+ return 0;
+
+ if (ERRNO_IS_DISCONNECT(errno)) {
+ /* If we get informed about a disconnect on write, then let's remember that, but not
+ * act on it just yet. Let's wait for read() to report the issue first. */
+ v->write_disconnected = true;
+ return 1;
+ }
+
+ return -errno;
+ }
+
+ /* Wipe the part of a sensitive buffer that has just been written out */
+ if (v->output_buffer_sensitive)
+ explicit_bzero_safe(v->output_buffer + v->output_buffer_index, n);
+
+ v->output_buffer_size -= n;
+
+ if (v->output_buffer_size == 0) {
+ v->output_buffer_index = 0;
+ v->output_buffer_sensitive = false; /* We can reset the sensitive flag once the buffer is empty */
+ } else
+ v->output_buffer_index += n;
+
+ /* Our copies of the fds are no longer needed once the send call succeeded, even if only part of the
+ * data was written */
+ close_many(v->output_fds, v->n_output_fds);
+ v->n_output_fds = 0;
+
+ v->timestamp = now(CLOCK_MONOTONIC);
+ return 1;
+}
+
+/* Sizes the control buffer used for receiving file descriptors, i.e. room for this many fds per recvmsg() */
+#define VARLINK_FDS_MAX (16U*1024U)
+
+/* Tries to read more data (and, if enabled, file descriptors) from the connection fd into the input
+ * buffer.
+ *
+ * Returns 0 if there is nothing to do right now (wrong state, still connecting, a message is pending, data
+ * is still unscanned, read side already down, or the fd has no data), 1 if data was read or EOF/disconnect
+ * was noticed on the read side (recorded in 'read_disconnected'), and a negative errno-style value on
+ * failure: -ENOBUFS if the input buffer reached VARLINK_BUFFER_MAX, -ENOMEM on allocation failure, -EPROTO
+ * if fds arrive in the middle of a message, -EBADF if too many fds pile up, or the error of the read call. */
+static int varlink_read(Varlink *v) {
+        struct iovec iov;
+        struct msghdr mh;
+        size_t rs;
+        ssize_t n;
+        void *p;
+
+        assert(v);
+
+        if (!IN_SET(v->state, VARLINK_AWAITING_REPLY, VARLINK_AWAITING_REPLY_MORE, VARLINK_CALLING, VARLINK_IDLE_SERVER))
+                return 0;
+        if (v->connecting) /* read() on a socket while we are in connect() will fail with EINVAL, hence exit early here */
+                return 0;
+        if (v->current)
+                return 0;
+        if (v->input_buffer_unscanned > 0)
+                return 0;
+        if (v->read_disconnected)
+                return 0;
+
+        if (v->input_buffer_size >= VARLINK_BUFFER_MAX)
+                return -ENOBUFS;
+
+        assert(v->fd >= 0);
+
+        /* No free space behind the valid data? Then make some. */
+        if (MALLOC_SIZEOF_SAFE(v->input_buffer) <= v->input_buffer_index + v->input_buffer_size) {
+                size_t add;
+
+                add = MIN(VARLINK_BUFFER_MAX - v->input_buffer_size, VARLINK_READ_SIZE);
+
+                if (v->input_buffer_index == 0) {
+
+                        if (!GREEDY_REALLOC(v->input_buffer, v->input_buffer_size + add))
+                                return -ENOMEM;
+
+                } else {
+                        char *b;
+
+                        /* The valid data doesn't start at the beginning of the buffer: allocate a new
+                         * buffer and move the data to its front, dropping the consumed prefix */
+                        b = new(char, v->input_buffer_size + add);
+                        if (!b)
+                                return -ENOMEM;
+
+                        memcpy(b, v->input_buffer + v->input_buffer_index, v->input_buffer_size);
+
+                        free_and_replace(v->input_buffer, b);
+                        v->input_buffer_index = 0;
+                }
+        }
+
+        p = v->input_buffer + v->input_buffer_index + v->input_buffer_size;
+        rs = MALLOC_SIZEOF_SAFE(v->input_buffer) - (v->input_buffer_index + v->input_buffer_size);
+
+        /* All three read paths below leave a negative errno-style value in 'n' on failure, so that the
+         * error handling that follows never has to consult 'errno'. That matters for recvmsg_safe(), which
+         * reports failures through its return value and may return synthetic errors (for truncated
+         * messages) without setting 'errno' at all. */
+        if (v->allow_fd_passing_input) {
+                iov = IOVEC_MAKE(p, rs);
+
+                /* Allocate the fd buffer on the heap, since we need a lot of space potentially */
+                if (!v->input_control_buffer) {
+                        v->input_control_buffer_size = CMSG_SPACE(sizeof(int) * VARLINK_FDS_MAX);
+                        v->input_control_buffer = malloc(v->input_control_buffer_size);
+                        if (!v->input_control_buffer)
+                                return -ENOMEM;
+                }
+
+                mh = (struct msghdr) {
+                        .msg_iov = &iov,
+                        .msg_iovlen = 1,
+                        .msg_control = v->input_control_buffer,
+                        .msg_controllen = v->input_control_buffer_size,
+                };
+
+                n = recvmsg_safe(v->fd, &mh, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
+        } else {
+                /* Prefer recv(), but fall back to read() for good once we learn the fd is not a socket */
+                bool prefer_read = v->prefer_read_write;
+                if (!prefer_read) {
+                        n = recv(v->fd, p, rs, MSG_DONTWAIT);
+                        if (n < 0)
+                                n = -errno;
+                        if (n == -ENOTSOCK)
+                                prefer_read = v->prefer_read_write = true;
+                }
+                if (prefer_read) {
+                        n = read(v->fd, p, rs);
+                        if (n < 0)
+                                n = -errno;
+                }
+        }
+        if (n < 0) {
+                if (n == -EAGAIN)
+                        return 0;
+
+                if (ERRNO_IS_DISCONNECT(-n)) {
+                        v->read_disconnected = true;
+                        return 1;
+                }
+
+                return (int) n;
+        }
+        if (n == 0) { /* EOF */
+
+                if (v->allow_fd_passing_input)
+                        cmsg_close_all(&mh);
+
+                v->read_disconnected = true;
+                return 1;
+        }
+
+        if (v->allow_fd_passing_input) {
+                struct cmsghdr* cmsg;
+
+                cmsg = cmsg_find(&mh, SOL_SOCKET, SCM_RIGHTS, (socklen_t) -1);
+                if (cmsg) {
+                        size_t add;
+
+                        /* We only allow file descriptors to be passed along with the first byte of a
+                         * message. If they are passed with any other byte this is a protocol violation. */
+                        if (v->input_buffer_size != 0) {
+                                cmsg_close_all(&mh);
+                                return -EPROTO;
+                        }
+
+                        add = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
+                        if (add > INT_MAX - v->n_input_fds) {
+                                cmsg_close_all(&mh);
+                                return -EBADF;
+                        }
+
+                        if (!GREEDY_REALLOC(v->input_fds, v->n_input_fds + add)) {
+                                cmsg_close_all(&mh);
+                                return -ENOMEM;
+                        }
+
+                        memcpy_safe(v->input_fds + v->n_input_fds, CMSG_TYPED_DATA(cmsg, int), add * sizeof(int));
+                        v->n_input_fds += add;
+                }
+        }
+
+        v->input_buffer_size += n;
+        v->input_buffer_unscanned += n;
+
+        return 1;
+}
+
+/* Looks for a complete (NUL terminated) message in the not yet scanned part of the input buffer and, if
+ * one is found, parses it as JSON into v->current and removes it from the buffer.
+ *
+ * Returns 0 if a message is already pending or no complete message is buffered yet, 1 if a message was
+ * parsed, and a negative errno-style value if JSON parsing failed (in which case all buffered input is
+ * dropped). */
+static int varlink_parse_message(Varlink *v) {
+ const char *e, *begin;
+ size_t sz;
+ int r;
+
+ assert(v);
+
+ if (v->current)
+ return 0;
+ if (v->input_buffer_unscanned <= 0)
+ return 0;
+
+ assert(v->input_buffer_unscanned <= v->input_buffer_size);
+ assert(v->input_buffer_index + v->input_buffer_size <= MALLOC_SIZEOF_SAFE(v->input_buffer));
+
+ begin = v->input_buffer + v->input_buffer_index;
+
+ /* Search for the terminating NUL byte only in the tail of the data that wasn't scanned before */
+ e = memchr(begin + v->input_buffer_size - v->input_buffer_unscanned, 0, v->input_buffer_unscanned);
+ if (!e) {
+ v->input_buffer_unscanned = 0;
+ return 0;
+ }
+
+ /* Size of the message including its terminating NUL byte */
+ sz = e - begin + 1;
+
+ varlink_log(v, "New incoming message: %s", begin); /* FIXME: should we output the whole message here before validation?
+ * This may produce a non-printable journal entry if the message
+ * is invalid. We may also expose privileged information. */
+
+ r = json_parse(begin, 0, &v->current, NULL, NULL);
+ if (r < 0) {
+ /* If we encounter a parse failure flush all data. We cannot possibly recover from this,
+ * hence drop all buffered data now. */
+ v->input_buffer_index = v->input_buffer_size = v->input_buffer_unscanned = 0;
+ return varlink_log_errno(v, r, "Failed to parse JSON: %m");
+ }
+
+ v->input_buffer_size -= sz;
+
+ if (v->input_buffer_size == 0)
+ v->input_buffer_index = 0;
+ else
+ v->input_buffer_index += sz;
+
+ /* Whatever follows the message has not been searched for a terminator yet */
+ v->input_buffer_unscanned = v->input_buffer_size;
+ return 1;
+}
+
+/* Checks whether a client that is waiting for a reply has waited longer than its timeout (measured from
+ * v->timestamp). If so, switches to VARLINK_PENDING_TIMEOUT and returns 1. Returns 0 otherwise, including
+ * when not waiting for a reply or when the timeout is disabled (USEC_INFINITY). */
+static int varlink_test_timeout(Varlink *v) {
+        bool waiting;
+
+        assert(v);
+
+        waiting = IN_SET(v->state, VARLINK_AWAITING_REPLY, VARLINK_AWAITING_REPLY_MORE, VARLINK_CALLING);
+        if (!waiting || v->timeout == USEC_INFINITY)
+                return 0;
+
+        if (now(CLOCK_MONOTONIC) >= usec_add(v->timestamp, v->timeout)) {
+                varlink_set_state(v, VARLINK_PENDING_TIMEOUT);
+                return 1;
+        }
+
+        return 0;
+}
+
+static int varlink_dispatch_local_error(Varlink *v, const char *error) {
+ int r;
+
+ assert(v);
+ assert(error);
+
+ if (!v->reply_callback)
+ return 0;
+
+ r = v->reply_callback(v, NULL, error, VARLINK_REPLY_ERROR|VARLINK_REPLY_LOCAL, v->userdata);
+ if (r < 0)
+ log_debug_errno(r, "Reply callback returned error, ignoring: %m");
+
+ return 1;
+}
+
+/* If a timeout is pending: enters VARLINK_PROCESSING_TIMEOUT, reports VARLINK_ERROR_TIMEOUT to the reply
+ * callback and closes the connection. Returns 1 in that case, 0 if no timeout is pending. */
+static int varlink_dispatch_timeout(Varlink *v) {
+        assert(v);
+
+        if (v->state == VARLINK_PENDING_TIMEOUT) {
+                varlink_set_state(v, VARLINK_PROCESSING_TIMEOUT);
+                varlink_dispatch_local_error(v, VARLINK_ERROR_TIMEOUT);
+                varlink_close(v);
+
+                return 1;
+        }
+
+        return 0;
+}
+
+/* If a disconnect is pending: enters VARLINK_PROCESSING_DISCONNECT, reports VARLINK_ERROR_DISCONNECTED to
+ * the reply callback and closes the connection. Returns 1 in that case, 0 if no disconnect is pending. */
+static int varlink_dispatch_disconnect(Varlink *v) {
+        assert(v);
+
+        if (v->state == VARLINK_PENDING_DISCONNECT) {
+                varlink_set_state(v, VARLINK_PROCESSING_DISCONNECT);
+                varlink_dispatch_local_error(v, VARLINK_ERROR_DISCONNECTED);
+                varlink_close(v);
+
+                return 1;
+        }
+
+        return 0;
+}
+
+/* Makes sure *v refers to a JSON object, since Varlink always wants a parameters object.
+ *
+ * A missing (NULL) value or a JSON null is replaced by a newly allocated empty object (dropping the
+ * reference to the null variant). An existing object is left untouched. Anything else is refused.
+ *
+ * Returns 0 on success, -EINVAL if *v is neither NULL, null nor an object, or the error returned by
+ * json_variant_new_object(). */
+static int varlink_sanitize_parameters(JsonVariant **v) {
+        JsonVariant *replacement;
+        int r;
+
+        assert(v);
+
+        if (!*v)
+                return json_variant_new_object(v, NULL, 0);
+
+        if (json_variant_is_object(*v))
+                return 0;
+
+        if (!json_variant_is_null(*v))
+                return -EINVAL;
+
+        /* JSON null: swap it for an empty object */
+        r = json_variant_new_object(&replacement, NULL, 0);
+        if (r < 0)
+                return r;
+
+        json_variant_unref(*v);
+        *v = replacement;
+        return 0;
+}
+
+/* Client side: validates the reply message stored in v->current and dispatches it.
+ *
+ * Only the top-level fields "error" (string), "parameters" (object or null) and "continues" (boolean) are
+ * accepted, each at most once. In the AWAITING_REPLY states the reply callback is invoked and the state is
+ * advanced afterwards. In the CALLING state the message is left in place and the state moves to
+ * VARLINK_CALLED. Malformed replies are reported to the callback as a local VARLINK_ERROR_PROTOCOL error,
+ * and the connection is closed.
+ *
+ * Returns 0 if there is nothing to dispatch, 1 otherwise (including the protocol failure case). */
+static int varlink_dispatch_reply(Varlink *v) {
+ _cleanup_(json_variant_unrefp) JsonVariant *parameters = NULL;
+ VarlinkReplyFlags flags = 0;
+ const char *error = NULL;
+ JsonVariant *e;
+ const char *k;
+ int r;
+
+ assert(v);
+
+ if (!IN_SET(v->state, VARLINK_AWAITING_REPLY, VARLINK_AWAITING_REPLY_MORE, VARLINK_CALLING))
+ return 0;
+ if (!v->current)
+ return 0;
+
+ assert(v->n_pending > 0);
+
+ if (!json_variant_is_object(v->current))
+ goto invalid;
+
+ JSON_VARIANT_OBJECT_FOREACH(k, e, v->current) {
+
+ if (streq(k, "error")) {
+ if (error)
+ goto invalid;
+ if (!json_variant_is_string(e))
+ goto invalid;
+
+ error = json_variant_string(e);
+ flags |= VARLINK_REPLY_ERROR;
+
+ } else if (streq(k, "parameters")) {
+ if (parameters)
+ goto invalid;
+ if (!json_variant_is_object(e) && !json_variant_is_null(e))
+ goto invalid;
+
+ parameters = json_variant_ref(e);
+
+ } else if (streq(k, "continues")) {
+ if (FLAGS_SET(flags, VARLINK_REPLY_CONTINUES))
+ goto invalid;
+
+ if (!json_variant_is_boolean(e))
+ goto invalid;
+
+ if (json_variant_boolean(e))
+ flags |= VARLINK_REPLY_CONTINUES;
+ } else
+ goto invalid;
+ }
+
+ /* Replies with 'continue' set are only OK if we set 'more' when the method call was initiated */
+ if (v->state != VARLINK_AWAITING_REPLY_MORE && FLAGS_SET(flags, VARLINK_REPLY_CONTINUES))
+ goto invalid;
+
+ /* An error is final */
+ if (error && FLAGS_SET(flags, VARLINK_REPLY_CONTINUES))
+ goto invalid;
+
+ r = varlink_sanitize_parameters(&parameters);
+ if (r < 0)
+ goto invalid;
+
+ if (IN_SET(v->state, VARLINK_AWAITING_REPLY, VARLINK_AWAITING_REPLY_MORE)) {
+ varlink_set_state(v, VARLINK_PROCESSING_REPLY);
+
+ if (v->reply_callback) {
+ r = v->reply_callback(v, parameters, error, flags, v->userdata);
+ if (r < 0)
+ log_debug_errno(r, "Reply callback returned error, ignoring: %m");
+ }
+
+ varlink_clear_current(v);
+
+ /* Only advance the state if it is still the one we set before invoking the callback */
+ if (v->state == VARLINK_PROCESSING_REPLY) {
+
+ assert(v->n_pending > 0);
+
+ if (!FLAGS_SET(flags, VARLINK_REPLY_CONTINUES))
+ v->n_pending--;
+
+ varlink_set_state(v,
+ FLAGS_SET(flags, VARLINK_REPLY_CONTINUES) ? VARLINK_AWAITING_REPLY_MORE :
+ v->n_pending == 0 ? VARLINK_IDLE_CLIENT : VARLINK_AWAITING_REPLY);
+ }
+ } else {
+ assert(v->state == VARLINK_CALLING);
+ varlink_set_state(v, VARLINK_CALLED);
+ }
+
+ return 1;
+
+invalid:
+ varlink_set_state(v, VARLINK_PROCESSING_FAILURE);
+ varlink_dispatch_local_error(v, VARLINK_ERROR_PROTOCOL);
+ varlink_close(v);
+
+ return 1;
+}
+
+ /* Built-in implementation of org.varlink.service.GetInfo: replies with vendor, product, version, URL
+  * and the sorted list of interface names registered on the server. Any passed parameter is refused
+  * with VARLINK_ERROR_INVALID_PARAMETER. */
+ static int generic_method_get_info(
+                 Varlink *link,
+                 JsonVariant *parameters,
+                 VarlinkMethodFlags flags,
+                 void *userdata) {
+
+         _cleanup_strv_free_ char **interface_names = NULL;
+         _cleanup_free_ char *product_name = NULL;
+         VarlinkInterface *iface;
+         int r;
+
+         assert(link);
+
+         /* This method takes no parameters; complain about the first one we find */
+         if (json_variant_elements(parameters) != 0)
+                 return varlink_errorb(link, VARLINK_ERROR_INVALID_PARAMETER,
+                                       JSON_BUILD_OBJECT(
+                                                       JSON_BUILD_PAIR_VARIANT("parameter", json_variant_by_index(parameters, 0))));
+
+         product_name = strjoin("systemd (", program_invocation_short_name, ")");
+         if (!product_name)
+                 return -ENOMEM;
+
+         HASHMAP_FOREACH(iface, ASSERT_PTR(link->server)->interfaces) {
+                 r = strv_extend(&interface_names, iface->name);
+                 if (r < 0)
+                         return r;
+         }
+
+         strv_sort(interface_names);
+
+         return varlink_replyb(link, JSON_BUILD_OBJECT(
+                                               JSON_BUILD_PAIR_STRING("vendor", "The systemd Project"),
+                                               JSON_BUILD_PAIR_STRING("product", product_name),
+                                               JSON_BUILD_PAIR_STRING("version", STRINGIFY(PROJECT_VERSION) " (" GIT_VERSION ")"),
+                                               JSON_BUILD_PAIR_STRING("url", "https://systemd.io/"),
+                                               JSON_BUILD_PAIR_STRV("interfaces", interface_names)));
+ }
+
+ /* Built-in implementation of org.varlink.service.GetInterfaceDescription: looks up the interface named
+  * by the mandatory "interface" parameter among the server's registered interfaces and replies with its
+  * formatted IDL text, or with VARLINK_ERROR_INTERFACE_NOT_FOUND if it is unknown. */
+ static int generic_method_get_interface_description(
+                 Varlink *link,
+                 JsonVariant *parameters,
+                 VarlinkMethodFlags flags,
+                 void *userdata) {
+
+         static const struct JsonDispatch parameter_table[] = {
+                 { "interface", JSON_VARIANT_STRING, json_dispatch_const_string, 0, JSON_MANDATORY },
+                 {}
+         };
+         _cleanup_free_ char *description = NULL;
+         const VarlinkInterface *found;
+         const char *interface_name = NULL;
+         int r;
+
+         assert(link);
+
+         r = json_dispatch(parameters, parameter_table, 0, &interface_name);
+         if (r < 0)
+                 return r;
+
+         found = hashmap_get(ASSERT_PTR(link->server)->interfaces, interface_name);
+         if (!found)
+                 return varlink_errorb(link, VARLINK_ERROR_INTERFACE_NOT_FOUND,
+                                       JSON_BUILD_OBJECT(
+                                                       JSON_BUILD_PAIR_STRING("interface", interface_name)));
+
+         r = varlink_idl_format(found, &description);
+         if (r < 0)
+                 return r;
+
+         return varlink_replyb(link,
+                               JSON_BUILD_OBJECT(
+                                               JSON_BUILD_PAIR_STRING("description", description)));
+ }
+
+ /* Handles the currently pending incoming message as a method call on a server-side connection.
+  *
+  * Returns 0 if there is nothing to do (the connection is not in VARLINK_IDLE_SERVER state or no
+  * message is pending). A malformed message is reported to the reply callback as
+  * VARLINK_ERROR_PROTOCOL, the connection is closed and a negative errno is returned. Otherwise the
+  * method is looked up among the user-supplied implementations (falling back to the two built-in
+  * org.varlink.service methods), its parameters are validated against the interface description if
+  * there is one, and the implementation is invoked. Errors are sent back to the peer unless the call
+  * was marked "oneway". */
+ static int varlink_dispatch_method(Varlink *v) {
+         _cleanup_(json_variant_unrefp) JsonVariant *parameters = NULL;
+         VarlinkMethodFlags flags = 0;
+         const char *method = NULL;
+         JsonVariant *e;
+         VarlinkMethod callback;
+         const char *k;
+         int r;
+
+         assert(v);
+
+         if (v->state != VARLINK_IDLE_SERVER)
+                 return 0;
+         if (!v->current)
+                 return 0;
+
+         if (!json_variant_is_object(v->current))
+                 goto invalid;
+
+         /* Only "method", "parameters", "oneway" and "more" are accepted as top-level fields */
+         JSON_VARIANT_OBJECT_FOREACH(k, e, v->current) {
+
+                 if (streq(k, "method")) {
+                         if (method)
+                                 goto invalid;
+                         if (!json_variant_is_string(e))
+                                 goto invalid;
+
+                         method = json_variant_string(e);
+
+                 } else if (streq(k, "parameters")) {
+                         if (parameters)
+                                 goto invalid;
+                         if (!json_variant_is_object(e) && !json_variant_is_null(e))
+                                 goto invalid;
+
+                         parameters = json_variant_ref(e);
+
+                 } else if (streq(k, "oneway")) {
+
+                         /* "oneway" and "more" are mutually exclusive */
+                         if ((flags & (VARLINK_METHOD_ONEWAY|VARLINK_METHOD_MORE)) != 0)
+                                 goto invalid;
+
+                         if (!json_variant_is_boolean(e))
+                                 goto invalid;
+
+                         if (json_variant_boolean(e))
+                                 flags |= VARLINK_METHOD_ONEWAY;
+
+                 } else if (streq(k, "more")) {
+
+                         if ((flags & (VARLINK_METHOD_ONEWAY|VARLINK_METHOD_MORE)) != 0)
+                                 goto invalid;
+
+                         if (!json_variant_is_boolean(e))
+                                 goto invalid;
+
+                         if (json_variant_boolean(e))
+                                 flags |= VARLINK_METHOD_MORE;
+
+                 } else
+                         goto invalid;
+         }
+
+         if (!method)
+                 goto invalid;
+
+         r = varlink_sanitize_parameters(&parameters);
+         if (r < 0)
+                 goto fail;
+
+         varlink_set_state(v, (flags & VARLINK_METHOD_MORE)   ? VARLINK_PROCESSING_METHOD_MORE :
+                              (flags & VARLINK_METHOD_ONEWAY) ? VARLINK_PROCESSING_METHOD_ONEWAY :
+                                                                VARLINK_PROCESSING_METHOD);
+
+         assert(v->server);
+
+         /* First consult user supplied method implementations */
+         callback = hashmap_get(v->server->methods, method);
+         if (!callback) {
+                 if (streq(method, "org.varlink.service.GetInfo"))
+                         callback = generic_method_get_info;
+                 else if (streq(method, "org.varlink.service.GetInterfaceDescription"))
+                         callback = generic_method_get_interface_description;
+         }
+
+         if (callback) {
+                 bool invalid = false;
+
+                 v->current_method = hashmap_get(v->server->symbols, method);
+                 if (!v->current_method)
+                         log_debug("No interface description defined for method '%s', not validating.", method);
+                 else {
+                         /* Initialized to NULL so that the log message and the error reply below never
+                          * read an indeterminate pointer should validation fail without naming a field. */
+                         const char *bad_field = NULL;
+
+                         r = varlink_idl_validate_method_call(v->current_method, parameters, &bad_field);
+                         if (r < 0) {
+                                 log_debug_errno(r, "Parameters for method %s() didn't pass validation on field '%s': %m", method, strna(bad_field));
+
+                                 if (!FLAGS_SET(flags, VARLINK_METHOD_ONEWAY)) {
+                                         r = varlink_errorb(v, VARLINK_ERROR_INVALID_PARAMETER, JSON_BUILD_OBJECT(JSON_BUILD_PAIR_STRING("parameter", bad_field)));
+                                         if (r < 0)
+                                                 return r;
+                                 }
+                                 invalid = true;
+                         }
+                 }
+
+                 if (!invalid) {
+                         r = callback(v, parameters, flags, v->userdata);
+                         if (r < 0) {
+                                 log_debug_errno(r, "Callback for %s returned error: %m", method);
+
+                                 /* We got an error back from the callback. Propagate it to the client if the method call remains unanswered. */
+                                 if (v->state == VARLINK_PROCESSED_METHOD)
+                                         r = 0; /* already processed */
+                                 else if (!FLAGS_SET(flags, VARLINK_METHOD_ONEWAY)) {
+                                         r = varlink_error_errno(v, r);
+                                         if (r < 0)
+                                                 return r;
+                                 }
+                         }
+                 }
+         } else if (!FLAGS_SET(flags, VARLINK_METHOD_ONEWAY)) {
+                 r = varlink_errorb(v, VARLINK_ERROR_METHOD_NOT_FOUND, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("method", JSON_BUILD_STRING(method))));
+                 if (r < 0)
+                         return r;
+         } else
+                 r = 0;
+
+         switch (v->state) {
+
+         case VARLINK_PROCESSED_METHOD: /* Method call is fully processed */
+         case VARLINK_PROCESSING_METHOD_ONEWAY: /* ditto */
+                 varlink_clear_current(v);
+                 varlink_set_state(v, VARLINK_IDLE_SERVER);
+                 break;
+
+         case VARLINK_PROCESSING_METHOD: /* Method call wasn't replied to, will be replied to later */
+                 varlink_set_state(v, VARLINK_PENDING_METHOD);
+                 break;
+
+         case VARLINK_PROCESSING_METHOD_MORE:  /* No reply for a "more" message was sent, more to come */
+                 varlink_set_state(v, VARLINK_PENDING_METHOD_MORE);
+                 break;
+
+         default:
+                 assert_not_reached();
+         }
+
+         return r;
+
+ invalid:
+         r = -EINVAL;
+
+ fail:
+         varlink_set_state(v, VARLINK_PROCESSING_FAILURE);
+         varlink_dispatch_local_error(v, VARLINK_ERROR_PROTOCOL);
+         varlink_close(v);
+
+         return r;
+ }
+
+ /* Runs the individual processing steps in their fixed order and stops at the first one that reports
+  * anything other than 0, returning that value. Failures of the I/O and dispatch steps are logged. */
+ static int varlink_process_steps(Varlink *v) {
+         int r;
+
+         assert(v);
+
+         r = varlink_write(v);
+         if (r < 0)
+                 varlink_log_errno(v, r, "Write failed: %m");
+         if (r != 0)
+                 return r;
+
+         r = varlink_dispatch_reply(v);
+         if (r < 0)
+                 varlink_log_errno(v, r, "Reply dispatch failed: %m");
+         if (r != 0)
+                 return r;
+
+         r = varlink_dispatch_method(v);
+         if (r < 0)
+                 varlink_log_errno(v, r, "Method dispatch failed: %m");
+         if (r != 0)
+                 return r;
+
+         r = varlink_parse_message(v);
+         if (r < 0)
+                 varlink_log_errno(v, r, "Message parsing failed: %m");
+         if (r != 0)
+                 return r;
+
+         r = varlink_read(v);
+         if (r < 0)
+                 varlink_log_errno(v, r, "Read failed: %m");
+         if (r != 0)
+                 return r;
+
+         r = varlink_test_disconnect(v);
+         assert(r >= 0);
+         if (r != 0)
+                 return r;
+
+         r = varlink_dispatch_disconnect(v);
+         assert(r >= 0);
+         if (r != 0)
+                 return r;
+
+         r = varlink_test_timeout(v);
+         assert(r >= 0);
+         if (r != 0)
+                 return r;
+
+         r = varlink_dispatch_timeout(v);
+         assert(r >= 0);
+         return r;
+ }
+
+ /* Performs one unit of work on the connection. Returns > 0 if some step did something, 0 if there was
+  * nothing to do, and a negative errno on failure. On failure a still-alive connection is moved to
+  * VARLINK_PENDING_DISCONNECT, any other connection is closed right away. */
+ int varlink_process(Varlink *v) {
+         int r;
+
+         assert_return(v, -EINVAL);
+
+         if (v->state == VARLINK_DISCONNECTED)
+                 return varlink_log_errno(v, SYNTHETIC_ERRNO(ENOTCONN), "Not connected.");
+
+         /* Hold a reference for the duration of the call */
+         varlink_ref(v);
+
+         r = varlink_process_steps(v);
+
+         if (r >= 0 && v->defer_event_source) {
+                 int enable_result;
+
+                 /* If we did some processing, make sure we are called again soon */
+                 enable_result = sd_event_source_set_enabled(v->defer_event_source, r > 0 ? SD_EVENT_ON : SD_EVENT_OFF);
+                 if (enable_result < 0)
+                         r = varlink_log_errno(v, enable_result, "Failed to enable deferred event source: %m");
+         }
+
+         if (r < 0) {
+                 if (VARLINK_STATE_IS_ALIVE(v->state))
+                         /* Initiate disconnection */
+                         varlink_set_state(v, VARLINK_PENDING_DISCONNECT);
+                 else
+                         /* We failed while disconnecting, in that case close right away */
+                         varlink_close(v);
+         }
+
+         varlink_unref(v);
+         return r;
+ }
+
+ /* Updates the connection's bookkeeping flags from the poll events returned for its fd */
+ static void handle_revents(Varlink *v, int revents) {
+         assert(v);
+
+         if (!v->connecting) {
+                 /* Note that we don't care much about POLLIN/POLLOUT here, we'll just try reading and
+                  * writing what we can. However, we do care about POLLHUP to detect connection
+                  * termination even if we momentarily don't want to read nor write anything. */
+                 if (FLAGS_SET(revents, POLLHUP)) {
+                         varlink_log(v, "Got POLLHUP from socket.");
+                         v->got_pollhup = true;
+                 }
+
+                 return;
+         }
+
+         /* If we have seen POLLOUT or POLLHUP on a socket we are asynchronously waiting a connect() to
+          * complete on, we know we are ready. We don't read the connection error here though, we'll get
+          * the error on the next read() or write(). */
+         if ((revents & (POLLOUT|POLLHUP)) != 0) {
+                 varlink_log(v, "Asynchronous connection completed.");
+                 v->connecting = false;
+         }
+ }
+
+ /* Waits until the connection's fd signals one of the events we are interested in, bounded by both the
+  * connection's own timeout and the timeout passed in. Returns 1 if an event was seen (or the wait was
+  * interrupted by a transient error), and otherwise whatever fd_wait_for_event() reported (<= 0). */
+ int varlink_wait(Varlink *v, usec_t timeout) {
+         usec_t wait_usec;
+         int r, fd, events;
+
+         assert_return(v, -EINVAL);
+
+         if (v->state == VARLINK_DISCONNECTED)
+                 return varlink_log_errno(v, SYNTHETIC_ERRNO(ENOTCONN), "Not connected.");
+
+         r = varlink_get_timeout(v, &wait_usec);
+         if (r < 0)
+                 return r;
+
+         /* Turn the absolute deadline into a relative wait time, clamped at zero */
+         if (wait_usec != USEC_INFINITY) {
+                 usec_t current = now(CLOCK_MONOTONIC);
+
+                 wait_usec = wait_usec < current ? 0 : usec_sub_unsigned(wait_usec, current);
+         }
+
+         /* The caller's timeout wins if it is the shorter one */
+         if (timeout != USEC_INFINITY &&
+             (wait_usec == USEC_INFINITY || timeout < wait_usec))
+                 wait_usec = timeout;
+
+         fd = varlink_get_fd(v);
+         if (fd < 0)
+                 return fd;
+
+         events = varlink_get_events(v);
+         if (events < 0)
+                 return events;
+
+         r = fd_wait_for_event(fd, events, wait_usec);
+         if (ERRNO_IS_NEG_TRANSIENT(r)) /* Treat EINTR as not a timeout, but also nothing happened, and
+                                         * the caller gets a chance to call back into us */
+                 return 1;
+         if (r <= 0)
+                 return r;
+
+         handle_revents(v, r);
+         return 1;
+ }
+
+ /* Returns true if there's nothing pending on the connection anymore, i.e. we processed all incoming or
+  * outgoing messages fully, or finished disconnection. */
+ int varlink_is_idle(Varlink *v) {
+         assert_return(v, -EINVAL);
+
+         return v->state == VARLINK_DISCONNECTED ||
+                 v->state == VARLINK_IDLE_CLIENT ||
+                 v->state == VARLINK_IDLE_SERVER;
+ }
+
+ /* Returns the connection's file descriptor, -ENOTCONN if the connection is disconnected, or -EBADF if
+  * it has no valid fd. */
+ int varlink_get_fd(Varlink *v) {
+         assert_return(v, -EINVAL);
+
+         if (v->state == VARLINK_DISCONNECTED)
+                 return varlink_log_errno(v, SYNTHETIC_ERRNO(ENOTCONN), "Not connected.");
+
+         if (v->fd >= 0)
+                 return v->fd;
+
+         return varlink_log_errno(v, SYNTHETIC_ERRNO(EBADF), "No valid fd.");
+ }
+
+ /* Returns the mask of epoll events the connection currently wants to be woken up for */
+ int varlink_get_events(Varlink *v) {
+         bool want_input, want_output;
+         int mask = 0;
+
+         assert_return(v, -EINVAL);
+
+         if (v->state == VARLINK_DISCONNECTED)
+                 return varlink_log_errno(v, SYNTHETIC_ERRNO(ENOTCONN), "Not connected.");
+
+         /* When processing an asynchronous connect(), we only wait for EPOLLOUT, which tells us that
+          * the connection is now complete. Before that we should neither write() or read() from the
+          * fd. */
+         if (v->connecting)
+                 return EPOLLOUT;
+
+         /* Reading only makes sense while we expect a message and have nothing buffered or pending */
+         want_input =
+                 !v->read_disconnected &&
+                 IN_SET(v->state, VARLINK_AWAITING_REPLY, VARLINK_AWAITING_REPLY_MORE, VARLINK_CALLING, VARLINK_IDLE_SERVER) &&
+                 !v->current &&
+                 v->input_buffer_unscanned <= 0;
+
+         want_output =
+                 !v->write_disconnected &&
+                 v->output_buffer_size > 0;
+
+         if (want_input)
+                 mask |= EPOLLIN;
+         if (want_output)
+                 mask |= EPOLLOUT;
+
+         return mask;
+ }
+
+ /* Reports the point in time at which the pending method call times out. Returns 1 and stores
+  * timestamp + timeout in *ret if we are waiting for a reply and a finite timeout is configured,
+  * otherwise returns 0 and stores USEC_INFINITY. 'ret' may be NULL. */
+ int varlink_get_timeout(Varlink *v, usec_t *ret) {
+         bool has_deadline;
+
+         assert_return(v, -EINVAL);
+
+         if (v->state == VARLINK_DISCONNECTED)
+                 return varlink_log_errno(v, SYNTHETIC_ERRNO(ENOTCONN), "Not connected.");
+
+         has_deadline =
+                 IN_SET(v->state, VARLINK_AWAITING_REPLY, VARLINK_AWAITING_REPLY_MORE, VARLINK_CALLING) &&
+                 v->timeout != USEC_INFINITY;
+
+         if (ret)
+                 *ret = has_deadline ? usec_add(v->timestamp, v->timeout) : USEC_INFINITY;
+
+         return has_deadline ? 1 : 0;
+ }
+
+ /* Blocks until the whole output buffer has been written. Returns 1 if anything was written, 0 if the
+  * buffer was already empty, -ECONNRESET if the write side is disconnected while data is still
+  * buffered, or another negative errno on write/poll failure. */
+ int varlink_flush(Varlink *v) {
+         int wrote_something = 0, r;
+
+         assert_return(v, -EINVAL);
+
+         if (v->state == VARLINK_DISCONNECTED)
+                 return varlink_log_errno(v, SYNTHETIC_ERRNO(ENOTCONN), "Not connected.");
+
+         while (v->output_buffer_size != 0) {
+                 if (v->write_disconnected)
+                         return -ECONNRESET;
+
+                 r = varlink_write(v);
+                 if (r < 0)
+                         return r;
+                 if (r > 0) {
+                         wrote_something = 1;
+                         continue;
+                 }
+
+                 /* Nothing could be written right now, wait until the fd becomes writable */
+                 r = fd_wait_for_event(v->fd, POLLOUT, USEC_INFINITY);
+                 if (ERRNO_IS_NEG_TRANSIENT(r))
+                         continue;
+                 if (r < 0)
+                         return varlink_log_errno(v, r, "Poll failed on fd: %m");
+                 assert(r > 0);
+
+                 handle_revents(v, r);
+         }
+
+         return wrote_something;
+ }
+
+ /* Severs the link between a connection and the server it belongs to (if any): updates the server's
+  * per-UID and total connection counters, invokes the server's disconnect callback, and drops the
+  * references the two objects hold on each other. */
+ static void varlink_detach_server(Varlink *v) {
+         VarlinkServer *server;
+
+         assert(v);
+
+         server = v->server;
+         if (!server)
+                 return;
+
+         if (server->by_uid &&
+             v->ucred_acquired &&
+             uid_is_valid(v->ucred.uid)) {
+                 unsigned n_for_uid;
+
+                 n_for_uid = PTR_TO_UINT(hashmap_get(server->by_uid, UID_TO_PTR(v->ucred.uid)));
+                 assert(n_for_uid > 0);
+
+                 /* Drop the entry entirely when this was the UID's last connection */
+                 if (n_for_uid == 1)
+                         (void) hashmap_remove(server->by_uid, UID_TO_PTR(v->ucred.uid));
+                 else
+                         (void) hashmap_replace(server->by_uid, UID_TO_PTR(v->ucred.uid), UINT_TO_PTR(n_for_uid - 1));
+         }
+
+         assert(server->n_connections > 0);
+         server->n_connections--;
+
+         /* If this is a connection associated to a server, then let's disconnect the server and the
+          * connection from each other. This drops the dangling reference that connect_callback() set up.
+          * But before we release the references, let's call the disconnection callback if it is
+          * defined. */
+
+         v->server = NULL;
+
+         if (server->disconnect_callback)
+                 server->disconnect_callback(server, v, server->userdata);
+
+         varlink_server_test_exit_on_idle(server);
+         varlink_server_unref(server);
+         varlink_unref(v);
+ }
+
+ /* Moves the connection to VARLINK_DISCONNECTED, detaches it from its server and clears it. Returns 0
+  * if it already was disconnected, 1 otherwise. */
+ int varlink_close(Varlink *v) {
+         assert_return(v, -EINVAL);
+
+         if (v->state != VARLINK_DISCONNECTED) {
+                 varlink_set_state(v, VARLINK_DISCONNECTED);
+
+                 /* Let's take a reference first, since varlink_detach_server() might drop the final
+                  * (dangling) ref which would destroy us before we can call varlink_clear() */
+                 varlink_ref(v);
+                 varlink_detach_server(v);
+                 varlink_clear(v);
+                 varlink_unref(v);
+
+                 return 1;
+         }
+
+         return 0;
+ }
+
+ /* Closes the connection and drops one reference to it; NULL is accepted and passed through */
+ Varlink* varlink_close_unref(Varlink *v) {
+         if (v) {
+                 (void) varlink_close(v);
+                 return varlink_unref(v);
+         }
+
+         return NULL;
+ }
+
+ /* Like varlink_close_unref(), but first tries to flush the output buffer; a flush failure is ignored */
+ Varlink* varlink_flush_close_unref(Varlink *v) {
+         if (v) {
+                 (void) varlink_flush(v);
+                 return varlink_close_unref(v);
+         }
+
+         return NULL;
+ }
+
+ /* Serializes a JSON message and appends it, including its trailing NUL byte, to the connection's
+  * output buffer. Returns -ENOBUFS if that would grow the buffer beyond VARLINK_BUFFER_MAX, -ENOMEM on
+  * allocation failure, 0 on success. */
+ static int varlink_format_json(Varlink *v, JsonVariant *m) {
+         _cleanup_(erase_and_freep) char *formatted = NULL;
+         size_t chunk;
+         int r;
+
+         assert(v);
+         assert(m);
+
+         r = json_variant_format(m, 0, &formatted);
+         if (r < 0)
+                 return r;
+         assert(formatted[r] == '\0');
+
+         /* What we enqueue is the text plus its terminating NUL byte */
+         chunk = (size_t) r + 1;
+
+         if (v->output_buffer_size + chunk > VARLINK_BUFFER_MAX)
+                 return -ENOBUFS;
+
+         varlink_log(v, "Sending message: %s", formatted);
+
+         if (v->output_buffer_size == 0) {
+                 /* Nothing buffered: simply make the formatted text the new output buffer */
+
+                 free_and_replace(v->output_buffer, formatted);
+
+                 v->output_buffer_size = chunk;
+                 v->output_buffer_index = 0;
+
+         } else if (v->output_buffer_index == 0) {
+                 /* Buffered data starts at the very beginning: grow the buffer and append */
+
+                 if (!GREEDY_REALLOC(v->output_buffer, v->output_buffer_size + chunk))
+                         return -ENOMEM;
+
+                 memcpy(v->output_buffer + v->output_buffer_size, formatted, chunk);
+                 v->output_buffer_size += chunk;
+         } else {
+                 /* Buffered data starts at an offset: build a fresh buffer holding the unwritten part
+                  * followed by the new text */
+                 const size_t combined = v->output_buffer_size + chunk;
+                 char *fresh, *tail;
+
+                 fresh = new(char, combined);
+                 if (!fresh)
+                         return -ENOMEM;
+
+                 tail = mempcpy(fresh, v->output_buffer + v->output_buffer_index, v->output_buffer_size);
+                 memcpy(tail, formatted, chunk);
+
+                 free_and_replace(v->output_buffer, fresh);
+                 v->output_buffer_size = combined;
+                 v->output_buffer_index = 0;
+         }
+
+         if (json_variant_is_sensitive(m))
+                 v->output_buffer_sensitive = true; /* Propagate sensitive flag */
+         else
+                 formatted = mfree(formatted); /* No point in the erase_and_free() destructor declared above */
+
+         return 0;
+ }
+
+ /* Schedules a JSON message for sending: either formats it straight into the output buffer, or, if
+  * file descriptors were pushed or other messages are already queued, appends it to the output queue. */
+ static int varlink_enqueue_json(Varlink *v, JsonVariant *m) {
+         VarlinkJsonQueueItem *item;
+
+         assert(v);
+         assert(m);
+
+         if (v->n_pushed_fds != 0 || v->output_queue) {
+                 /* We need a queue entry for this message */
+                 item = varlink_json_queue_item_new(m, v->pushed_fds, v->n_pushed_fds);
+                 if (!item)
+                         return -ENOMEM;
+
+                 v->n_pushed_fds = 0; /* fds now belong to the queue entry */
+
+                 LIST_INSERT_AFTER(queue, v->output_queue, v->output_queue_tail, item);
+                 v->output_queue_tail = item;
+                 return 0;
+         }
+
+         /* If there are no file descriptors to be queued and no queue entries yet we can shortcut
+          * things and append this entry directly to the output buffer */
+         return varlink_format_json(v, m);
+ }
+
+ /* Takes entries out of the output queue and formats them into the output buffer. But only if this
+  * would not corrupt our fd message boundaries. Returns 0 on success (also when it had to stop early
+  * because unwritten fds are pending), negative errno on failure. */
+ static int varlink_format_queue(Varlink *v) {
+         int r;
+
+         assert(v);
+
+         while (v->output_queue) {
+                 _cleanup_free_ int *fds_copy = NULL;
+                 VarlinkJsonQueueItem *head = v->output_queue;
+
+                 /* unwritten fds? if we'd add more we'd corrupt the fd message boundaries, hence wait */
+                 if (v->n_output_fds > 0)
+                         return 0;
+
+                 /* Duplicate the fd array up front, so that nothing can fail once the message text has
+                  * been appended to the output buffer */
+                 if (head->n_fds > 0) {
+                         fds_copy = newdup(int, head->fds, head->n_fds);
+                         if (!fds_copy)
+                                 return -ENOMEM;
+                 }
+
+                 r = varlink_format_json(v, head->data);
+                 if (r < 0)
+                         return r;
+
+                 /* Take possession of the queue element's fds */
+                 free(v->output_fds);
+                 v->output_fds = TAKE_PTR(fds_copy);
+                 v->n_output_fds = head->n_fds;
+                 head->n_fds = 0;
+
+                 LIST_REMOVE(queue, v->output_queue, head);
+                 if (!v->output_queue)
+                         v->output_queue_tail = NULL;
+
+                 varlink_json_queue_item_free(head);
+         }
+
+         return 0;
+ }
+
+ /* Enqueues a one-way method call, i.e. one marked "oneway": true. The connection state is left
+  * untouched. Returns 0 on success, -ENOTCONN/-EBUSY if the connection is in the wrong state, or
+  * another negative errno. */
+ int varlink_send(Varlink *v, const char *method, JsonVariant *parameters) {
+         _cleanup_(json_variant_unrefp) JsonVariant *message = NULL;
+         int r;
+
+         assert_return(v, -EINVAL);
+         assert_return(method, -EINVAL);
+
+         if (v->state == VARLINK_DISCONNECTED)
+                 return varlink_log_errno(v, SYNTHETIC_ERRNO(ENOTCONN), "Not connected.");
+
+         /* We allow enqueuing multiple method calls at once! */
+         if (!IN_SET(v->state, VARLINK_IDLE_CLIENT, VARLINK_AWAITING_REPLY))
+                 return varlink_log_errno(v, SYNTHETIC_ERRNO(EBUSY), "Connection busy.");
+
+         r = varlink_sanitize_parameters(&parameters);
+         if (r < 0)
+                 return varlink_log_errno(v, r, "Failed to sanitize parameters: %m");
+
+         r = json_build(&message,
+                        JSON_BUILD_OBJECT(
+                                        JSON_BUILD_PAIR("method", JSON_BUILD_STRING(method)),
+                                        JSON_BUILD_PAIR("parameters", JSON_BUILD_VARIANT(parameters)),
+                                        JSON_BUILD_PAIR("oneway", JSON_BUILD_BOOLEAN(true))));
+         if (r < 0)
+                 return varlink_log_errno(v, r, "Failed to build json message: %m");
+
+         r = varlink_enqueue_json(v, message);
+         if (r < 0)
+                 return varlink_log_errno(v, r, "Failed to enqueue json message: %m");
+
+         /* No state change here, this is one-way only after all */
+         v->timestamp = now(CLOCK_MONOTONIC);
+         return 0;
+ }
+
+ /* Like varlink_send(), but builds the parameters from a json_buildv() style argument list */
+ int varlink_sendb(Varlink *v, const char *method, ...) {
+         _cleanup_(json_variant_unrefp) JsonVariant *built = NULL;
+         va_list args;
+         int r;
+
+         assert_return(v, -EINVAL);
+
+         va_start(args, method);
+         r = json_buildv(&built, args);
+         va_end(args);
+
+         if (r < 0)
+                 return varlink_log_errno(v, r, "Failed to build json message: %m");
+
+         return varlink_send(v, method, built);
+ }
+
+ /* Enqueues a method call and returns without waiting for the reply. On success the connection is put
+  * into VARLINK_AWAITING_REPLY state, the number of pending calls is increased and the timestamp is
+  * refreshed. Returns 0 on success, negative errno on failure. */
+ int varlink_invoke(Varlink *v, const char *method, JsonVariant *parameters) {
+         _cleanup_(json_variant_unrefp) JsonVariant *message = NULL;
+         int r;
+
+         assert_return(v, -EINVAL);
+         assert_return(method, -EINVAL);
+
+         if (v->state == VARLINK_DISCONNECTED)
+                 return varlink_log_errno(v, SYNTHETIC_ERRNO(ENOTCONN), "Not connected.");
+
+         /* We allow enqueuing multiple method calls at once! */
+         if (!IN_SET(v->state, VARLINK_IDLE_CLIENT, VARLINK_AWAITING_REPLY))
+                 return varlink_log_errno(v, SYNTHETIC_ERRNO(EBUSY), "Connection busy.");
+
+         r = varlink_sanitize_parameters(&parameters);
+         if (r < 0)
+                 return varlink_log_errno(v, r, "Failed to sanitize parameters: %m");
+
+         r = json_build(&message,
+                        JSON_BUILD_OBJECT(
+                                        JSON_BUILD_PAIR("method", JSON_BUILD_STRING(method)),
+                                        JSON_BUILD_PAIR("parameters", JSON_BUILD_VARIANT(parameters))));
+         if (r < 0)
+                 return varlink_log_errno(v, r, "Failed to build json message: %m");
+
+         r = varlink_enqueue_json(v, message);
+         if (r < 0)
+                 return varlink_log_errno(v, r, "Failed to enqueue json message: %m");
+
+         varlink_set_state(v, VARLINK_AWAITING_REPLY);
+         v->n_pending++;
+         v->timestamp = now(CLOCK_MONOTONIC);
+
+         return 0;
+ }
+
+ /* Like varlink_invoke(), but builds the parameters from a json_buildv() style argument list */
+ int varlink_invokeb(Varlink *v, const char *method, ...) {
+         _cleanup_(json_variant_unrefp) JsonVariant *built = NULL;
+         va_list args;
+         int r;
+
+         assert_return(v, -EINVAL);
+
+         va_start(args, method);
+         r = json_buildv(&built, args);
+         va_end(args);
+
+         if (r < 0)
+                 return varlink_log_errno(v, r, "Failed to build json message: %m");
+
+         return varlink_invoke(v, method, built);
+ }
+
+ /* Enqueues a method call marked "more": true and returns without waiting for replies. This requires
+  * an idle client connection. On success the connection is put into VARLINK_AWAITING_REPLY_MORE state.
+  * Returns 0 on success, negative errno on failure. */
+ int varlink_observe(Varlink *v, const char *method, JsonVariant *parameters) {
+         _cleanup_(json_variant_unrefp) JsonVariant *message = NULL;
+         int r;
+
+         assert_return(v, -EINVAL);
+         assert_return(method, -EINVAL);
+
+         if (v->state == VARLINK_DISCONNECTED)
+                 return varlink_log_errno(v, SYNTHETIC_ERRNO(ENOTCONN), "Not connected.");
+
+         /* Note that we don't allow enqueuing multiple method calls when we are in more/continues mode!
+          * We thus insist on an idle client here. */
+         if (v->state != VARLINK_IDLE_CLIENT)
+                 return varlink_log_errno(v, SYNTHETIC_ERRNO(EBUSY), "Connection busy.");
+
+         r = varlink_sanitize_parameters(&parameters);
+         if (r < 0)
+                 return varlink_log_errno(v, r, "Failed to sanitize parameters: %m");
+
+         r = json_build(&message,
+                        JSON_BUILD_OBJECT(
+                                        JSON_BUILD_PAIR("method", JSON_BUILD_STRING(method)),
+                                        JSON_BUILD_PAIR("parameters", JSON_BUILD_VARIANT(parameters)),
+                                        JSON_BUILD_PAIR("more", JSON_BUILD_BOOLEAN(true))));
+         if (r < 0)
+                 return varlink_log_errno(v, r, "Failed to build json message: %m");
+
+         r = varlink_enqueue_json(v, message);
+         if (r < 0)
+                 return varlink_log_errno(v, r, "Failed to enqueue json message: %m");
+
+         varlink_set_state(v, VARLINK_AWAITING_REPLY_MORE);
+         v->n_pending++;
+         v->timestamp = now(CLOCK_MONOTONIC);
+
+         return 0;
+ }
+
+ /* Like varlink_observe(), but builds the parameters from a json_buildv() style argument list */
+ int varlink_observeb(Varlink *v, const char *method, ...) {
+         _cleanup_(json_variant_unrefp) JsonVariant *built = NULL;
+         va_list args;
+         int r;
+
+         assert_return(v, -EINVAL);
+
+         va_start(args, method);
+         r = json_buildv(&built, args);
+         va_end(args);
+
+         if (r < 0)
+                 return varlink_log_errno(v, r, "Failed to build json message: %m");
+
+         return varlink_observe(v, method, built);
+ }
+
+ /* Synchronous method call: enqueues the call, then processes and waits on the connection until the
+  * reply arrived or the connection failed. Returns 1 on success, in which case the optional return
+  * parameters are filled in from the reply message pinned in v->current (they point into that
+  * message). Returns -ECONNRESET if the connection was closed, -ETIME on timeout, or another negative
+  * errno. */
+ int varlink_call(
+                 Varlink *v,
+                 const char *method,
+                 JsonVariant *parameters,
+                 JsonVariant **ret_parameters,
+                 const char **ret_error_id,
+                 VarlinkReplyFlags *ret_flags) {
+
+         _cleanup_(json_variant_unrefp) JsonVariant *request = NULL;
+         int r;
+
+         assert_return(v, -EINVAL);
+         assert_return(method, -EINVAL);
+
+         if (v->state == VARLINK_DISCONNECTED)
+                 return varlink_log_errno(v, SYNTHETIC_ERRNO(ENOTCONN), "Not connected.");
+         if (v->state != VARLINK_IDLE_CLIENT)
+                 return varlink_log_errno(v, SYNTHETIC_ERRNO(EBUSY), "Connection busy.");
+
+         assert(v->n_pending == 0); /* n_pending can't be > 0 if we are in VARLINK_IDLE_CLIENT state */
+
+         /* If there was still a reply pinned from a previous call, now it's the time to get rid of it,
+          * so that we can assign a new reply shortly. */
+         varlink_clear_current(v);
+
+         r = varlink_sanitize_parameters(&parameters);
+         if (r < 0)
+                 return varlink_log_errno(v, r, "Failed to sanitize parameters: %m");
+
+         r = json_build(&request,
+                        JSON_BUILD_OBJECT(
+                                        JSON_BUILD_PAIR("method", JSON_BUILD_STRING(method)),
+                                        JSON_BUILD_PAIR("parameters", JSON_BUILD_VARIANT(parameters))));
+         if (r < 0)
+                 return varlink_log_errno(v, r, "Failed to build json message: %m");
+
+         r = varlink_enqueue_json(v, request);
+         if (r < 0)
+                 return varlink_log_errno(v, r, "Failed to enqueue json message: %m");
+
+         varlink_set_state(v, VARLINK_CALLING);
+         v->n_pending++;
+         v->timestamp = now(CLOCK_MONOTONIC);
+
+         /* Drive the connection until the state changes; only block when there was nothing to do */
+         while (v->state == VARLINK_CALLING) {
+
+                 r = varlink_process(v);
+                 if (r < 0)
+                         return r;
+
+                 if (r == 0) {
+                         r = varlink_wait(v, USEC_INFINITY);
+                         if (r < 0)
+                                 return r;
+                 }
+         }
+
+         if (v->state == VARLINK_CALLED) {
+                 assert(v->current);
+
+                 varlink_set_state(v, VARLINK_IDLE_CLIENT);
+                 assert(v->n_pending == 1);
+                 v->n_pending--;
+
+                 if (ret_parameters)
+                         *ret_parameters = json_variant_by_key(v->current, "parameters");
+                 if (ret_error_id)
+                         *ret_error_id = json_variant_string(json_variant_by_key(v->current, "error"));
+                 if (ret_flags)
+                         *ret_flags = 0;
+
+                 return 1;
+         }
+
+         if (IN_SET(v->state, VARLINK_PENDING_DISCONNECT, VARLINK_DISCONNECTED))
+                 return varlink_log_errno(v, SYNTHETIC_ERRNO(ECONNRESET), "Connection was closed.");
+
+         if (v->state == VARLINK_PENDING_TIMEOUT)
+                 return varlink_log_errno(v, SYNTHETIC_ERRNO(ETIME), "Connection timed out.");
+
+         assert_not_reached();
+ }
+
+ /* Like varlink_call(), but builds the parameters from a json_buildv() style argument list */
+ int varlink_callb(
+                 Varlink *v,
+                 const char *method,
+                 JsonVariant **ret_parameters,
+                 const char **ret_error_id,
+                 VarlinkReplyFlags *ret_flags, ...) {
+
+         _cleanup_(json_variant_unrefp) JsonVariant *built = NULL;
+         va_list args;
+         int r;
+
+         assert_return(v, -EINVAL);
+
+         va_start(args, ret_flags);
+         r = json_buildv(&built, args);
+         va_end(args);
+
+         if (r < 0)
+                 return varlink_log_errno(v, r, "Failed to build json message: %m");
+
+         return varlink_call(v, method, built, ret_parameters, ret_error_id, ret_flags);
+ }
+
+ /* Releases what a VarlinkCollectContext owns, i.e. the reference to the collected parameters array.
+  *
+  * The error_id field is deliberately not freed: collect_callback() stores the pointer it was handed by
+  * the reply dispatcher, which is borrowed. For local errors it is a constant such as
+  * VARLINK_ERROR_TIMEOUT. Passing it to free() would be an invalid free. */
+ static void varlink_collect_context_free(VarlinkCollectContext *cc) {
+         assert(cc);
+
+         json_variant_unref(cc->parameters);
+ }
+
+ /* Reply callback used by varlink_collect(): records the flags of every reply, appends the parameters
+  * of each successful reply to the context's array, and on an error reply only remembers the error
+  * identifier. Returns 1 after appending, 0 after recording an error, negative errno if appending
+  * failed. */
+ static int collect_callback(
+                 Varlink *v,
+                 JsonVariant *parameters,
+                 const char *error_id,
+                 VarlinkReplyFlags flags,
+                 void *userdata) {
+
+         VarlinkCollectContext *cc = ASSERT_PTR(userdata);
+         int r;
+
+         assert(v);
+
+         cc->flags = flags;
+
+         if (!error_id) {
+                 r = json_variant_append_array(&cc->parameters, parameters);
+                 if (r < 0)
+                         return varlink_log_errno(v, r, "Failed to append JSON object to array: %m");
+
+                 return 1;
+         }
+
+         /* If we hit an error, we will drop all collected replies and just return the error_id and
+          * flags in varlink_collect() */
+         cc->error_id = error_id;
+         return 0;
+ }
+
+ /* Issues a method call with "more" set and synchronously collects the parameters of all replies into a
+  * JSON array.
+  *
+  * Returns 1 on success, with the array in *ret_parameters (the caller takes over the reference).
+  * Returns 0 if a reply carried an error: the collected replies are dropped and only *ret_error_id
+  * and *ret_flags are filled in. Returns -ECONNRESET if the connection was closed, -ETIME on timeout,
+  * or another negative errno. All three return pointers are optional.
+  *
+  * While collecting, the connection's reply callback and userdata are pointed at a context living on
+  * this function's stack. Both are put back to their previous values on every exit path, so that the
+  * connection never retains a pointer to the dead stack frame. */
+ int varlink_collect(
+                 Varlink *v,
+                 const char *method,
+                 JsonVariant *parameters,
+                 JsonVariant **ret_parameters,
+                 const char **ret_error_id,
+                 VarlinkReplyFlags *ret_flags) {
+
+         _cleanup_(varlink_collect_context_free) VarlinkCollectContext context = {};
+         typeof(v->reply_callback) saved_callback;
+         void *saved_userdata;
+         int r;
+
+         assert_return(v, -EINVAL);
+         assert_return(method, -EINVAL);
+
+         if (v->state == VARLINK_DISCONNECTED)
+                 return varlink_log_errno(v, SYNTHETIC_ERRNO(ENOTCONN), "Not connected.");
+         if (v->state != VARLINK_IDLE_CLIENT)
+                 return varlink_log_errno(v, SYNTHETIC_ERRNO(EBUSY), "Connection busy.");
+
+         assert(v->n_pending == 0); /* n_pending can't be > 0 if we are in VARLINK_IDLE_CLIENT state */
+
+         /* If there was still a reply pinned from a previous call, now it's the time to get rid of it,
+          * so that we can assign a new reply shortly. */
+         varlink_clear_current(v);
+
+         /* Remember what was installed before we hijack callback and userdata */
+         saved_callback = v->reply_callback;
+         saved_userdata = v->userdata;
+
+         r = varlink_bind_reply(v, collect_callback);
+         if (r < 0) {
+                 r = varlink_log_errno(v, r, "Failed to bind collect callback: %m");
+                 goto finish;
+         }
+
+         varlink_set_userdata(v, &context);
+         r = varlink_observe(v, method, parameters);
+         if (r < 0) {
+                 r = varlink_log_errno(v, r, "Failed to collect varlink method: %m");
+                 goto finish;
+         }
+
+         while (v->state == VARLINK_AWAITING_REPLY_MORE) {
+
+                 r = varlink_process(v);
+                 if (r < 0)
+                         goto finish;
+
+                 /* If we get an error from any of the replies, return immediately with just the
+                  * error_id and flags */
+                 if (context.error_id) {
+                         if (ret_error_id)
+                                 *ret_error_id = TAKE_PTR(context.error_id);
+                         if (ret_flags)
+                                 *ret_flags = context.flags;
+                         r = 0;
+                         goto finish;
+                 }
+
+                 if (r > 0)
+                         continue;
+
+                 r = varlink_wait(v, USEC_INFINITY);
+                 if (r < 0)
+                         goto finish;
+         }
+
+         switch (v->state) {
+
+         case VARLINK_IDLE_CLIENT:
+                 break;
+
+         case VARLINK_PENDING_DISCONNECT:
+         case VARLINK_DISCONNECTED:
+                 r = varlink_log_errno(v, SYNTHETIC_ERRNO(ECONNRESET), "Connection was closed.");
+                 goto finish;
+
+         case VARLINK_PENDING_TIMEOUT:
+                 r = varlink_log_errno(v, SYNTHETIC_ERRNO(ETIME), "Connection timed out.");
+                 goto finish;
+
+         default:
+                 assert_not_reached();
+         }
+
+         if (ret_parameters)
+                 *ret_parameters = TAKE_PTR(context.parameters);
+         if (ret_error_id)
+                 *ret_error_id = TAKE_PTR(context.error_id);
+         if (ret_flags)
+                 *ret_flags = context.flags;
+         r = 1;
+
+ finish:
+         /* 'context' goes out of scope when we return, hence make sure the connection does not keep
+          * pointing to it */
+         v->reply_callback = saved_callback;
+         v->userdata = saved_userdata;
+
+         return r;
+ }
+
+ /* Like varlink_collect(), but builds the parameters from a json_buildv() style argument list */
+ int varlink_collectb(
+                 Varlink *v,
+                 const char *method,
+                 JsonVariant **ret_parameters,
+                 const char **ret_error_id,
+                 VarlinkReplyFlags *ret_flags, ...) {
+
+         _cleanup_(json_variant_unrefp) JsonVariant *built = NULL;
+         va_list args;
+         int r;
+
+         assert_return(v, -EINVAL);
+
+         va_start(args, ret_flags);
+         r = json_buildv(&built, args);
+         va_end(args);
+
+         if (r < 0)
+                 return varlink_log_errno(v, r, "Failed to build json message: %m");
+
+         return varlink_collect(v, method, built, ret_parameters, ret_error_id, ret_flags);
+ }
+
+ /* Enqueues a successful reply to the method call currently being processed or left pending. Returns 1
+  * on success, -ENOTCONN if disconnected, -EBUSY if no method call is awaiting a reply, or another
+  * negative errno. If an interface description is known for the current method the reply parameters
+  * are validated against it, but a validation failure is only logged. */
+ int varlink_reply(Varlink *v, JsonVariant *parameters) {
+         _cleanup_(json_variant_unrefp) JsonVariant *reply = NULL;
+         int r;
+
+         assert_return(v, -EINVAL);
+
+         if (v->state == VARLINK_DISCONNECTED)
+                 return -ENOTCONN;
+         if (!IN_SET(v->state,
+                     VARLINK_PROCESSING_METHOD, VARLINK_PROCESSING_METHOD_MORE,
+                     VARLINK_PENDING_METHOD, VARLINK_PENDING_METHOD_MORE))
+                 return -EBUSY;
+
+         r = varlink_sanitize_parameters(&parameters);
+         if (r < 0)
+                 return varlink_log_errno(v, r, "Failed to sanitize parameters: %m");
+
+         r = json_build(&reply, JSON_BUILD_OBJECT(JSON_BUILD_PAIR("parameters", JSON_BUILD_VARIANT(parameters))));
+         if (r < 0)
+                 return varlink_log_errno(v, r, "Failed to build json message: %m");
+
+         if (v->current_method) {
+                 const char *offending_field = NULL;
+
+                 r = varlink_idl_validate_method_reply(v->current_method, parameters, &offending_field);
+                 if (r < 0)
+                         log_debug_errno(r, "Return parameters for method reply %s() didn't pass validation on field '%s', ignoring: %m", v->current_method->name, strna(offending_field));
+         }
+
+         r = varlink_enqueue_json(v, reply);
+         if (r < 0)
+                 return varlink_log_errno(v, r, "Failed to enqueue json message: %m");
+
+         if (!IN_SET(v->state, VARLINK_PENDING_METHOD, VARLINK_PENDING_METHOD_MORE)) {
+                 /* We replied to a method call from within the varlink_dispatch_method() stack frame,
+                  * which means we should let it handle the rest of the state engine. */
+                 varlink_set_state(v, VARLINK_PROCESSED_METHOD);
+                 return 1;
+         }
+
+         /* We just replied to a method call that was let hanging for a while (i.e. we were outside of
+          * the varlink_dispatch_method() stack frame), which means with this reply we are ready to
+          * process further messages. */
+         varlink_clear_current(v);
+         varlink_set_state(v, VARLINK_IDLE_SERVER);
+
+         return 1;
+ }
+
+ /* Like varlink_reply(), but builds the parameters from a json_buildv() style argument list */
+ int varlink_replyb(Varlink *v, ...) {
+         _cleanup_(json_variant_unrefp) JsonVariant *built = NULL;
+         va_list args;
+         int r;
+
+         assert_return(v, -EINVAL);
+
+         va_start(args, v);
+         r = json_buildv(&built, args);
+         va_end(args);
+
+         if (r < 0)
+                 return r;
+
+         return varlink_reply(v, built);
+ }
+
+/* Sends an error reply with the identifier 'error_id' for the method call currently being handled on 'v'.
+ *
+ * Any file descriptors pushed so far are dropped via varlink_reset_fds() before the message is built.
+ * 'parameters' is sanitized and sent under the "parameters" key next to the "error" key. If the server's
+ * symbol table knows 'error_id', the parameters are validated against it; a mismatch is only logged.
+ *
+ * Returns 1 once the message has been enqueued, or a negative errno-style value (-ENOTCONN when
+ * disconnected, -EBUSY when no method call is being processed or pending, or the error of the failing
+ * sanitize/build/enqueue step). */
+int varlink_error(Varlink *v, const char *error_id, JsonVariant *parameters) {
+ _cleanup_(json_variant_unrefp) JsonVariant *m = NULL;
+ int r;
+
+ assert_return(v, -EINVAL);
+ assert_return(error_id, -EINVAL);
+
+ if (v->state == VARLINK_DISCONNECTED)
+ return varlink_log_errno(v, SYNTHETIC_ERRNO(ENOTCONN), "Not connected.");
+ if (!IN_SET(v->state,
+ VARLINK_PROCESSING_METHOD, VARLINK_PROCESSING_METHOD_MORE,
+ VARLINK_PENDING_METHOD, VARLINK_PENDING_METHOD_MORE))
+ return varlink_log_errno(v, SYNTHETIC_ERRNO(EBUSY), "Connection busy.");
+
+ /* Reset the list of pushed file descriptors before sending an error reply. We do this here to
+ * simplify code that puts together a complex reply message with fds, and half-way something
+ * fails. In that case the pushed fds need to be flushed out again. Under the assumption that it
+ * never makes sense to send fds along with errors we simply flush them out here beforehand, so that
+ * the callers don't need to do this explicitly. */
+ varlink_reset_fds(v);
+
+ r = varlink_sanitize_parameters(&parameters);
+ if (r < 0)
+ return varlink_log_errno(v, r, "Failed to sanitize parameters: %m");
+
+ r = json_build(&m, JSON_BUILD_OBJECT(
+ JSON_BUILD_PAIR("error", JSON_BUILD_STRING(error_id)),
+ JSON_BUILD_PAIR("parameters", JSON_BUILD_VARIANT(parameters))));
+ if (r < 0)
+ return varlink_log_errno(v, r, "Failed to build json message: %m");
+
+ /* NOTE(review): v->server is dereferenced unconditionally here; presumably the states accepted above
+ * are only reachable for connections owned by a server — confirm. */
+ VarlinkSymbol *symbol = hashmap_get(v->server->symbols, error_id);
+ if (!symbol)
+ log_debug("No interface description defined for error '%s', not validating.", error_id);
+ else {
+ const char *bad_field = NULL;
+
+ /* Validation is advisory only: a mismatch is logged, the error is sent regardless. */
+ r = varlink_idl_validate_error(symbol, parameters, &bad_field);
+ if (r < 0)
+ log_debug_errno(r, "Parameters for error %s didn't pass validation on field '%s', ignoring: %m", error_id, strna(bad_field));
+ }
+
+ r = varlink_enqueue_json(v, m);
+ if (r < 0)
+ return varlink_log_errno(v, r, "Failed to enqueue json message: %m");
+
+ /* Same state transitions as for a regular reply: a pending (asynchronous) call is completed here and
+ * the connection goes back to idle, otherwise the dispatching stack frame finishes the transition. */
+ if (IN_SET(v->state, VARLINK_PENDING_METHOD, VARLINK_PENDING_METHOD_MORE)) {
+ varlink_clear_current(v);
+ varlink_set_state(v, VARLINK_IDLE_SERVER);
+ } else
+ varlink_set_state(v, VARLINK_PROCESSED_METHOD);
+
+ return 1;
+}
+
+/* Variadic convenience wrapper around varlink_error(): the arguments following 'error_id' are handed to
+ * json_buildv() to construct the error parameters. A build failure is logged and returned, otherwise the
+ * result of varlink_error() is returned. */
+int varlink_errorb(Varlink *v, const char *error_id, ...) {
+        _cleanup_(json_variant_unrefp) JsonVariant *built = NULL;
+        va_list args;
+        int k;
+
+        assert_return(v, -EINVAL);
+        assert_return(error_id, -EINVAL);
+
+        va_start(args, error_id);
+        k = json_buildv(&built, args);
+        va_end(args);
+
+        if (k >= 0)
+                return varlink_error(v, error_id, built);
+
+        return varlink_log_errno(v, k, "Failed to build json message: %m");
+}
+
+/* Replies with VARLINK_ERROR_INVALID_PARAMETER, naming the offending parameter.
+ *
+ * 'parameters' may either be a string variant (which then is the name of the invalid parameter), or a
+ * non-empty object variant, in which case its first element (index 0, i.e. the first key) is used. The
+ * latter is handy for methods that take no arguments at all. Anything else yields -EINVAL.
+ *
+ * varlink_error() wants a JSON object as parameters, hence the name is wrapped into {"parameter": …}
+ * before being passed on. */
+int varlink_error_invalid_parameter(Varlink *v, JsonVariant *parameters) {
+        _cleanup_(json_variant_unrefp) JsonVariant *wrapped = NULL;
+        JsonVariant *name;
+        int r;
+
+        assert_return(v, -EINVAL);
+        assert_return(parameters, -EINVAL);
+
+        if (json_variant_is_string(parameters))
+                name = parameters;
+        else if (json_variant_is_object(parameters) && json_variant_elements(parameters) > 0)
+                name = json_variant_by_index(parameters, 0);
+        else
+                return -EINVAL;
+
+        r = json_build(&wrapped,
+                       JSON_BUILD_OBJECT(
+                                       JSON_BUILD_PAIR("parameter", JSON_BUILD_VARIANT(name))));
+        if (r < 0)
+                return r;
+
+        return varlink_error(v, VARLINK_ERROR_INVALID_PARAMETER, wrapped);
+}
+
+int varlink_error_errno(Varlink *v, int error) {
+ return varlink_errorb(
+ v,
+ VARLINK_ERROR_SYSTEM,
+ JSON_BUILD_OBJECT(JSON_BUILD_PAIR("errno", JSON_BUILD_INTEGER(abs(error)))));
+}
+
+/* Sends an intermediate reply (marked with "continues": true) to the method call currently being handled
+ * on 'v'. The connection state is left untouched, since more replies are expected to follow.
+ *
+ * If the call is being processed or pending without "more" semantics, a VARLINK_ERROR_EXPECTED_MORE
+ * error reply is sent instead and the result of varlink_error() is returned.
+ *
+ * Returns 1 once the message has been enqueued, or a negative errno-style value (-ENOTCONN when
+ * disconnected, -EBUSY in any other unsuitable state, or the error of the failing sanitize/build/enqueue
+ * step). */
+int varlink_notify(Varlink *v, JsonVariant *parameters) {
+ _cleanup_(json_variant_unrefp) JsonVariant *m = NULL;
+ int r;
+
+ assert_return(v, -EINVAL);
+
+ if (v->state == VARLINK_DISCONNECTED)
+ return varlink_log_errno(v, SYNTHETIC_ERRNO(ENOTCONN), "Not connected.");
+
+ /* If we want to reply with a notify message but the caller didn't set "more", then return an
+ * error indicating that we expected to be called with "more" set */
+ if (IN_SET(v->state, VARLINK_PROCESSING_METHOD, VARLINK_PENDING_METHOD))
+ return varlink_error(v, VARLINK_ERROR_EXPECTED_MORE, NULL);
+
+ if (!IN_SET(v->state, VARLINK_PROCESSING_METHOD_MORE, VARLINK_PENDING_METHOD_MORE))
+ return varlink_log_errno(v, SYNTHETIC_ERRNO(EBUSY), "Connection busy.");
+
+ r = varlink_sanitize_parameters(&parameters);
+ if (r < 0)
+ return varlink_log_errno(v, r, "Failed to sanitize parameters: %m");
+
+ r = json_build(&m, JSON_BUILD_OBJECT(
+ JSON_BUILD_PAIR("parameters", JSON_BUILD_VARIANT(parameters)),
+ JSON_BUILD_PAIR("continues", JSON_BUILD_BOOLEAN(true))));
+ if (r < 0)
+ return varlink_log_errno(v, r, "Failed to build json message: %m");
+
+ if (v->current_method) {
+ const char *bad_field = NULL;
+
+ /* Validation is advisory only: a mismatch is logged, the message is sent regardless. */
+ r = varlink_idl_validate_method_reply(v->current_method, parameters, &bad_field);
+ if (r < 0)
+ log_debug_errno(r, "Return parameters for method reply %s() didn't pass validation on field '%s', ignoring: %m", v->current_method->name, strna(bad_field));
+ }
+
+ r = varlink_enqueue_json(v, m);
+ if (r < 0)
+ return varlink_log_errno(v, r, "Failed to enqueue json message: %m");
+
+ /* No state change, as more is coming */
+ return 1;
+}
+
+/* Variadic convenience wrapper around varlink_notify(): the arguments following 'v' are handed to
+ * json_buildv() to construct the parameters. A build failure is logged and returned, otherwise the result
+ * of varlink_notify() is returned. */
+int varlink_notifyb(Varlink *v, ...) {
+        _cleanup_(json_variant_unrefp) JsonVariant *built = NULL;
+        va_list args;
+        int k;
+
+        assert_return(v, -EINVAL);
+
+        va_start(args, v);
+        k = json_buildv(&built, args);
+        va_end(args);
+
+        if (k >= 0)
+                return varlink_notify(v, built);
+
+        return varlink_log_errno(v, k, "Failed to build json message: %m");
+}
+
+/* Wrapper around json_dispatch_full() that turns a dispatch failure on a specific field into a proper
+ * InvalidParameter error reply on 'v'.
+ *
+ * Returns 0 on success. If dispatching fails and the offending field is known, the result of
+ * varlink_errorb() is returned; if the field is not known the dispatch error itself is returned. */
+int varlink_dispatch(Varlink *v, JsonVariant *parameters, const JsonDispatch table[], void *userdata) {
+        const char *offending = NULL;
+        int r;
+
+        assert_return(v, -EINVAL);
+        assert_return(table, -EINVAL);
+
+        r = json_dispatch_full(parameters, table, /* bad= */ NULL, /* flags= */ 0, userdata, &offending);
+        if (r >= 0)
+                return 0;
+
+        if (!offending)
+                return r;
+
+        return varlink_errorb(v, VARLINK_ERROR_INVALID_PARAMETER,
+                              JSON_BUILD_OBJECT(JSON_BUILD_PAIR("parameter", JSON_BUILD_STRING(offending))));
+}
+
+/* Installs (or, with NULL, clears) the reply callback of 'v'. Replacing an already installed callback by
+ * a different non-NULL one is refused with -EBUSY; re-installing the same one is fine. */
+int varlink_bind_reply(Varlink *v, VarlinkReply callback) {
+        assert_return(v, -EINVAL);
+
+        if (!callback || !v->reply_callback || callback == v->reply_callback) {
+                v->reply_callback = callback;
+                return 0;
+        }
+
+        return varlink_log_errno(v, SYNTHETIC_ERRNO(EBUSY), "A different callback was already set.");
+}
+
+/* Replaces the userdata pointer of 'v' and returns the pointer that was stored before. */
+void* varlink_set_userdata(Varlink *v, void *userdata) {
+        void *previous;
+
+        assert_return(v, NULL);
+
+        previous = v->userdata;
+        v->userdata = userdata;
+        return previous;
+}
+
+/* Returns the userdata pointer currently stored in 'v' (NULL if 'v' itself is NULL). */
+void* varlink_get_userdata(Varlink *v) {
+        void *current;
+
+        assert_return(v, NULL);
+
+        current = v->userdata;
+        return current;
+}
+
+/* Makes sure v->ucred is populated: queries the peer credentials of v->fd on first use and caches the
+ * result, so that later calls are no-ops. Returns 0 on success, or the getpeercred() error. */
+static int varlink_acquire_ucred(Varlink *v) {
+        int r;
+
+        assert(v);
+
+        if (!v->ucred_acquired) {
+                r = getpeercred(v->fd, &v->ucred);
+                if (r < 0)
+                        return r;
+
+                v->ucred_acquired = true;
+        }
+
+        return 0;
+}
+
+/* Stores the UID of the connection's peer in *ret. Returns 0 on success, the credential acquisition
+ * error if that fails, or -ENODATA if the acquired UID is not valid. */
+int varlink_get_peer_uid(Varlink *v, uid_t *ret) {
+        int r;
+
+        assert_return(v, -EINVAL);
+        assert_return(ret, -EINVAL);
+
+        r = varlink_acquire_ucred(v);
+        if (r < 0)
+                return varlink_log_errno(v, r, "Failed to acquire credentials: %m");
+
+        if (uid_is_valid(v->ucred.uid)) {
+                *ret = v->ucred.uid;
+                return 0;
+        }
+
+        return varlink_log_errno(v, SYNTHETIC_ERRNO(ENODATA), "Peer uid is invalid.");
+}
+
+/* Stores the PID of the connection's peer in *ret. Returns 0 on success, the credential acquisition
+ * error if that fails, or -ENODATA if the acquired PID is not valid. */
+int varlink_get_peer_pid(Varlink *v, pid_t *ret) {
+        int r;
+
+        assert_return(v, -EINVAL);
+        assert_return(ret, -EINVAL);
+
+        r = varlink_acquire_ucred(v);
+        if (r < 0)
+                return varlink_log_errno(v, r, "Failed to acquire credentials: %m");
+
+        /* This checks the PID, hence say so in the log message (it used to claim the UID was invalid). */
+        if (!pid_is_valid(v->ucred.pid))
+                return varlink_log_errno(v, SYNTHETIC_ERRNO(ENODATA), "Peer pid is invalid.");
+
+        *ret = v->ucred.pid;
+        return 0;
+}
+
+/* Sets the relative timeout stored in 'v'. A timeout of zero is rejected with -EINVAL. */
+int varlink_set_relative_timeout(Varlink *v, usec_t timeout) {
+        assert_return(v, -EINVAL);
+        assert_return(timeout > 0, -EINVAL);
+
+        v->timeout = timeout;
+
+        return 0;
+}
+
+/* Returns the server object stored in 'v' (NULL if 'v' itself is NULL). */
+VarlinkServer *varlink_get_server(Varlink *v) {
+        VarlinkServer *owner;
+
+        assert_return(v, NULL);
+
+        owner = v->server;
+        return owner;
+}
+
+/* Replaces the description string of 'v' with a copy of 'description', via free_and_strdup(), whose
+ * result is returned. */
+int varlink_set_description(Varlink *v, const char *description) {
+        int r;
+
+        assert_return(v, -EINVAL);
+
+        r = free_and_strdup(&v->description, description);
+        return r;
+}
+
+/* IO event source handler: records the reported events on the connection and runs one processing step.
+ * Processing errors are deliberately ignored here; the handler always returns 1. */
+static int io_callback(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
+        Varlink *link = ASSERT_PTR(userdata);
+
+        assert(s);
+
+        handle_revents(link, revents);
+
+        (void) varlink_process(link);
+        return 1;
+}
+
+/* Timer event source handler: runs one processing step on the connection. Processing errors are
+ * deliberately ignored here; the handler always returns 1. */
+static int time_callback(sd_event_source *s, uint64_t usec, void *userdata) {
+        Varlink *link = ASSERT_PTR(userdata);
+
+        assert(s);
+
+        (void) varlink_process(link);
+
+        return 1;
+}
+
+/* Defer event source handler: runs one processing step on the connection. Processing errors are
+ * deliberately ignored here; the handler always returns 1. */
+static int defer_callback(sd_event_source *s, void *userdata) {
+        Varlink *link = ASSERT_PTR(userdata);
+
+        assert(s);
+
+        (void) varlink_process(link);
+
+        return 1;
+}
+
+/* Prepare handler of the IO event source: synchronizes the event sources with the connection's current
+ * needs. The IO event mask is set from varlink_get_events(); the timer is programmed from
+ * varlink_get_timeout() and switched on or off depending on whether that reported a timeout (r > 0).
+ * Returns 1 on success, or the negative error of the first step that failed. */
+static int prepare_callback(sd_event_source *s, void *userdata) {
+ Varlink *v = ASSERT_PTR(userdata);
+ int r, e;
+ usec_t until;
+ bool have_timeout;
+
+ assert(s);
+
+ e = varlink_get_events(v);
+ if (e < 0)
+ return e;
+
+ r = sd_event_source_set_io_events(v->io_event_source, e);
+ if (r < 0)
+ return varlink_log_errno(v, r, "Failed to set source events: %m");
+
+ r = varlink_get_timeout(v, &until);
+ if (r < 0)
+ return r;
+ have_timeout = r > 0;
+
+ /* 'until' is only consumed when a timeout was reported */
+ if (have_timeout) {
+ r = sd_event_source_set_time(v->time_event_source, until);
+ if (r < 0)
+ return varlink_log_errno(v, r, "Failed to set source time: %m");
+ }
+
+ r = sd_event_source_set_enabled(v->time_event_source, have_timeout ? SD_EVENT_ON : SD_EVENT_OFF);
+ if (r < 0)
+ return varlink_log_errno(v, r, "Failed to enable event source: %m");
+
+ return 1;
+}
+
+/* Exit event source handler: flushes the connection and then closes it. Always returns 1. */
+static int quit_callback(sd_event_source *event, void *userdata) {
+        Varlink *link = ASSERT_PTR(userdata);
+
+        assert(event);
+
+        varlink_flush(link);
+        varlink_close(link);
+        return 1;
+}
+
+/* Attaches the connection 'v' to an event loop: 'e' if given, otherwise the loop returned by
+ * sd_event_default(). Four event sources are created, all running at 'priority': a monotonic timer, an
+ * exit handler, an IO source on v->fd (with a prepare callback that keeps it and the timer up-to-date)
+ * and a defer source.
+ *
+ * Returns 0 on success. Fails with -EBUSY if an event loop is already attached. If any of the set-up
+ * steps fails, everything is detached again via varlink_detach_event() and the error is returned. */
+int varlink_attach_event(Varlink *v, sd_event *e, int64_t priority) {
+ int r;
+
+ assert_return(v, -EINVAL);
+ assert_return(!v->event, -EBUSY);
+
+ if (e)
+ v->event = sd_event_ref(e);
+ else {
+ r = sd_event_default(&v->event);
+ if (r < 0)
+ return varlink_log_errno(v, r, "Failed to create event source: %m");
+ }
+
+ /* Timer source; the actual time is programmed later, from prepare_callback() */
+ r = sd_event_add_time(v->event, &v->time_event_source, CLOCK_MONOTONIC, 0, 0, time_callback, v);
+ if (r < 0)
+ goto fail;
+
+ r = sd_event_source_set_priority(v->time_event_source, priority);
+ if (r < 0)
+ goto fail;
+
+ (void) sd_event_source_set_description(v->time_event_source, "varlink-time");
+
+ /* Exit source, see quit_callback() */
+ r = sd_event_add_exit(v->event, &v->quit_event_source, quit_callback, v);
+ if (r < 0)
+ goto fail;
+
+ r = sd_event_source_set_priority(v->quit_event_source, priority);
+ if (r < 0)
+ goto fail;
+
+ (void) sd_event_source_set_description(v->quit_event_source, "varlink-quit");
+
+ /* IO source, added with an empty event mask; prepare_callback() sets the real one */
+ r = sd_event_add_io(v->event, &v->io_event_source, v->fd, 0, io_callback, v);
+ if (r < 0)
+ goto fail;
+
+ r = sd_event_source_set_prepare(v->io_event_source, prepare_callback);
+ if (r < 0)
+ goto fail;
+
+ r = sd_event_source_set_priority(v->io_event_source, priority);
+ if (r < 0)
+ goto fail;
+
+ (void) sd_event_source_set_description(v->io_event_source, "varlink-io");
+
+ /* Defer source, see defer_callback() */
+ r = sd_event_add_defer(v->event, &v->defer_event_source, defer_callback, v);
+ if (r < 0)
+ goto fail;
+
+ r = sd_event_source_set_priority(v->defer_event_source, priority);
+ if (r < 0)
+ goto fail;
+
+ (void) sd_event_source_set_description(v->defer_event_source, "varlink-defer");
+
+ return 0;
+
+fail:
+ /* Roll back whatever was set up so far, including the event loop reference */
+ varlink_log_errno(v, r, "Failed to setup event source: %m");
+ varlink_detach_event(v);
+ return r;
+}
+
+/* Detaches 'v' from its event loop: the event sources are detached first, then the reference on the
+ * loop is dropped and the pointer reset. A NULL 'v' is silently accepted. */
+void varlink_detach_event(Varlink *v) {
+        if (v) {
+                varlink_detach_event_sources(v);
+                v->event = sd_event_unref(v->event);
+        }
+}
+
+/* Returns the event loop stored in 'v' (NULL if 'v' itself is NULL). */
+sd_event *varlink_get_event(Varlink *v) {
+        sd_event *loop;
+
+        assert_return(v, NULL);
+
+        loop = v->event;
+        return loop;
+}
+
+/* Queues 'fd' for sending along with the *next* message sent via 'v' and takes ownership of it (use
+ * varlink_dup_fd() to push a duplicate instead).
+ *
+ * Returns the index the fd was stored at in the list of pushed fds, -EPERM if fd passing is not enabled
+ * for output, or -ENOMEM if the list is full or cannot be enlarged. */
+int varlink_push_fd(Varlink *v, int fd) {
+        int idx;
+
+        assert_return(v, -EINVAL);
+        assert_return(fd >= 0, -EBADF);
+
+        if (!v->allow_fd_passing_output)
+                return -EPERM;
+
+        /* The index is returned as an int, hence cap the list length accordingly */
+        if (v->n_pushed_fds >= INT_MAX ||
+            !GREEDY_REALLOC(v->pushed_fds, v->n_pushed_fds + 1))
+                return -ENOMEM;
+
+        idx = (int) v->n_pushed_fds;
+        v->pushed_fds[idx] = fd;
+        v->n_pushed_fds++;
+
+        return idx;
+}
+
+/* Like varlink_push_fd(), but pushes a close-on-exec duplicate (numbered 3 or higher) of 'fd', so that
+ * the caller keeps ownership of the original. Returns what varlink_push_fd() returns, or -errno if
+ * duplicating fails. */
+int varlink_dup_fd(Varlink *v, int fd) {
+        _cleanup_close_ int copy = -1;
+        int idx;
+
+        assert_return(v, -EINVAL);
+        assert_return(fd >= 0, -EBADF);
+
+        copy = fcntl(fd, F_DUPFD_CLOEXEC, 3);
+        if (copy < 0)
+                return -errno;
+
+        idx = varlink_push_fd(v, copy);
+        if (idx >= 0)
+                TAKE_FD(copy); /* now owned by the connection, don't close it on return */
+
+        return idx;
+}
+
+/* Closes and forgets all fds that were pushed via varlink_push_fd()/varlink_dup_fd() but not yet sent.
+ * This may be used whenever the caller is in the process of putting together a message with fds, and
+ * then eventually something fails and they need to roll back the fds. Note that varlink_error() calls
+ * this implicitly before an error reply is sent. Always returns 0 (or -EINVAL if 'v' is NULL). */
+int varlink_reset_fds(Varlink *v) {
+        assert_return(v, -EINVAL);
+
+        /* The fds queued for the *next* message live in 'pushed_fds' (that's where varlink_push_fd()
+         * puts them). This must not touch 'output_fds': closing those would leave the fds collected
+         * by varlink_push_fd() in place, which is the opposite of a rollback. */
+        close_many(v->pushed_fds, v->n_pushed_fds);
+        v->n_pushed_fds = 0;
+        return 0;
+}
+
+/* Returns the i'th entry of the connection's list of input fds. The fd is neither duplicated nor
+ * invalidated, i.e. it remains owned by the connection. Returns -EPERM if fd passing is not enabled for
+ * input, and -ENXIO if 'i' is out of range. */
+int varlink_peek_fd(Varlink *v, size_t i) {
+        assert_return(v, -EINVAL);
+
+        if (!v->allow_fd_passing_input)
+                return -EPERM;
+
+        return i < v->n_input_fds ? v->input_fds[i] : -ENXIO;
+}
+
+/* Similar to varlink_peek_fd(), but ownership of the fd is passed to the caller and the connection's
+ * slot for it is invalidated; a second call for the same index hence returns -EBADF. Returns -EPERM if
+ * fd passing is not enabled for input, and -ENXIO if 'i' is out of range. */
+int varlink_take_fd(Varlink *v, size_t i) {
+        assert_return(v, -EINVAL);
+
+        if (!v->allow_fd_passing_input)
+                return -EPERM;
+
+        if (i < v->n_input_fds)
+                return TAKE_FD(v->input_fds[i]);
+
+        return -ENXIO;
+}
+
+/* Checks that the connection's fd is an AF_UNIX socket, caching the detected address family in v->af
+ * (a negative value there means "not determined yet").
+ *
+ * Returns 0 for AF_UNIX and -ENOMEDIUM for any other cached family. While determining the family,
+ * -errno is returned if fstat() fails, -ENOTSOCK if the fd is no socket at all (AF_UNSPEC is cached in
+ * that case), or the socket_get_family() error. */
+static int verify_unix_socket(Varlink *v) {
+        struct stat st;
+
+        assert(v);
+
+        if (v->af >= 0) /* already determined earlier */
+                return v->af == AF_UNIX ? 0 : -ENOMEDIUM;
+
+        if (fstat(v->fd, &st) < 0)
+                return -errno;
+
+        if (!S_ISSOCK(st.st_mode)) {
+                v->af = AF_UNSPEC;
+                return -ENOTSOCK;
+        }
+
+        v->af = socket_get_family(v->fd);
+        if (v->af < 0)
+                return v->af;
+
+        return v->af == AF_UNIX ? 0 : -ENOMEDIUM;
+}
+
+/* Enables or disables reception of file descriptors on 'v'. Enabling requires verify_unix_socket() to
+ * succeed, whose error is returned otherwise.
+ *
+ * Returns 0 if the setting already had the requested value or was just enabled, and 1 if it was just
+ * disabled. */
+int varlink_set_allow_fd_passing_input(Varlink *v, bool b) {
+        int r;
+
+        assert_return(v, -EINVAL);
+
+        if (v->allow_fd_passing_input == b)
+                return 0;
+
+        if (b) {
+                r = verify_unix_socket(v);
+                if (r < 0)
+                        return r;
+
+                v->allow_fd_passing_input = true;
+                return 0;
+        }
+
+        v->allow_fd_passing_input = false;
+        return 1;
+}
+
+/* Enables or disables sending of file descriptors on 'v'. Enabling requires verify_unix_socket() to
+ * succeed, whose error is returned otherwise.
+ *
+ * Returns 0 if the setting already had the requested value or was just enabled, and 1 if it was just
+ * disabled. */
+int varlink_set_allow_fd_passing_output(Varlink *v, bool b) {
+        int r;
+
+        assert_return(v, -EINVAL);
+
+        if (v->allow_fd_passing_output == b)
+                return 0;
+
+        if (b) {
+                r = verify_unix_socket(v);
+                if (r < 0)
+                        return r;
+
+                v->allow_fd_passing_output = true;
+                return 0;
+        }
+
+        v->allow_fd_passing_output = false;
+        return 1;
+}
+
+/* Allocates a new server object with a single reference, the given flags and the default connection
+ * limits, and registers the two built-in interface descriptions on it. On success the object is
+ * returned in *ret and 0 is returned; unknown flag bits are refused with -EINVAL. */
+int varlink_server_new(VarlinkServer **ret, VarlinkServerFlags flags) {
+        _cleanup_(varlink_server_unrefp) VarlinkServer *server = NULL;
+        int r;
+
+        assert_return(ret, -EINVAL);
+        assert_return((flags & ~_VARLINK_SERVER_FLAGS_ALL) == 0, -EINVAL);
+
+        server = new(VarlinkServer, 1);
+        if (!server)
+                return log_oom_debug();
+
+        *server = (VarlinkServer) {
+                .n_ref = 1,
+                .flags = flags,
+                .connections_max = varlink_server_connections_max(NULL),
+                .connections_per_uid_max = varlink_server_connections_per_uid_max(NULL),
+        };
+
+        r = varlink_server_add_interface_many(
+                        server,
+                        &vl_interface_io_systemd,
+                        &vl_interface_org_varlink_service);
+        if (r < 0)
+                return r;
+
+        *ret = TAKE_PTR(server);
+
+        return 0;
+}
+
+/* Releases all resources of a server object: shuts down its listening sockets, frees the method name
+ * strings serving as keys of the 'methods' hashmap, frees the hashmaps, drops the event loop reference
+ * and frees the description and the object itself. Always returns NULL. */
+static VarlinkServer* varlink_server_destroy(VarlinkServer *s) {
+        if (!s)
+                return NULL;
+
+        varlink_server_shutdown(s);
+
+        for (;;) {
+                char *name = hashmap_steal_first_key(s->methods);
+                if (!name)
+                        break;
+
+                free(name);
+        }
+
+        hashmap_free(s->methods);
+        hashmap_free(s->interfaces);
+        hashmap_free(s->symbols);
+        hashmap_free(s->by_uid);
+
+        sd_event_unref(s->event);
+        free(s->description);
+
+        return mfree(s);
+}
+
+DEFINE_TRIVIAL_REF_UNREF_FUNC(VarlinkServer, varlink_server, varlink_server_destroy);
+
+/* Decides whether a new connection from a peer with the credentials 'ucred' is acceptable.
+ *
+ * Returns 1 if the connection may proceed and 0 if it shall be refused (the reason is logged): because
+ * of the access flags, because the global connection limit is reached, or — with
+ * VARLINK_SERVER_ACCOUNT_UID — because the UID is invalid or its per-UID limit is reached. */
+static int validate_connection(VarlinkServer *server, const struct ucred *ucred) {
+ /* Tri-state: -1 = no access flag set, 0 = refused, 1 = allowed */
+ int allowed = -1;
+
+ assert(server);
+ assert(ucred);
+
+ if (FLAGS_SET(server->flags, VARLINK_SERVER_ROOT_ONLY))
+ allowed = ucred->uid == 0;
+
+ /* If both flags are set it suffices to satisfy one of them */
+ if (FLAGS_SET(server->flags, VARLINK_SERVER_MYSELF_ONLY))
+ allowed = allowed > 0 || ucred->uid == getuid();
+
+ if (allowed == 0) { /* Refuse only on explicit denial: access is granted when explicitly allowed or when
+ * neither VARLINK_SERVER_ROOT_ONLY nor VARLINK_SERVER_MYSELF_ONLY are specified. */
+ varlink_server_log(server, "Unprivileged client attempted connection, refusing.");
+ return 0;
+ }
+
+ if (server->n_connections >= server->connections_max) {
+ varlink_server_log(server, "Connection limit of %u reached, refusing.", server->connections_max);
+ return 0;
+ }
+
+ if (FLAGS_SET(server->flags, VARLINK_SERVER_ACCOUNT_UID)) {
+ unsigned c;
+
+ if (!uid_is_valid(ucred->uid)) {
+ varlink_server_log(server, "Client with invalid UID attempted connection, refusing.");
+ return 0;
+ }
+
+ /* The per-UID connection counter is stored directly in the hashmap's value pointer */
+ c = PTR_TO_UINT(hashmap_get(server->by_uid, UID_TO_PTR(ucred->uid)));
+ if (c >= server->connections_per_uid_max) {
+ varlink_server_log(server, "Per-UID connection limit of %u reached, refusing.",
+ server->connections_per_uid_max);
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+/* Accounts for a new connection of the peer described by 'ucred': bumps the per-UID counter (only with
+ * VARLINK_SERVER_ACCOUNT_UID) and the server-wide connection counter.
+ *
+ * Returns 0 on success or a negative errno-style value if the per-UID table cannot be updated. On
+ * failure no counter is modified. */
+static int count_connection(VarlinkServer *server, const struct ucred *ucred) {
+        unsigned c;
+        int r;
+
+        assert(server);
+        assert(ucred);
+
+        if (FLAGS_SET(server->flags, VARLINK_SERVER_ACCOUNT_UID)) {
+                r = hashmap_ensure_allocated(&server->by_uid, NULL);
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to allocate UID hash table: %m");
+
+                /* The counter is stored directly in the hashmap's value pointer */
+                c = PTR_TO_UINT(hashmap_get(server->by_uid, UID_TO_PTR(ucred->uid)));
+
+                varlink_server_log(server, "Connections of user " UID_FMT ": %u (of %u max)",
+                                   ucred->uid, c, server->connections_per_uid_max);
+
+                r = hashmap_replace(server->by_uid, UID_TO_PTR(ucred->uid), UINT_TO_PTR(c + 1));
+                if (r < 0)
+                        return log_debug_errno(r, "Failed to increment counter in UID hash table: %m");
+        }
+
+        /* Bump the global counter only once nothing can fail anymore. If we return an error the caller
+         * gives up before the connection is linked to the server, hence nothing would ever decrement
+         * the counter again and the server would permanently lose a connection slot. */
+        server->n_connections++;
+
+        return 0;
+}
+
+/* Turns the already connected socket 'fd' into a new connection object owned by 'server'.
+ *
+ * If any of the credential-based server flags is set, the peer credentials are acquired and the
+ * connection is validated first; -EPERM is returned if validation refuses it. On success the connection
+ * is accounted for, linked to the server, put into the idle server state and — if the server has an
+ * event loop — attached to it.
+ *
+ * On success 0 is returned and the fd is owned by the connection. If 'ret' is non-NULL it receives a
+ * borrowed pointer to the connection; the reference that keeps it alive is dropped in varlink_close().
+ * On failure a negative errno-style value is returned and the fd is left to the caller. */
+int varlink_server_add_connection(VarlinkServer *server, int fd, Varlink **ret) {
+        _cleanup_(varlink_unrefp) Varlink *v = NULL;
+        struct ucred ucred = UCRED_INVALID;
+        bool ucred_acquired;
+        int r;
+
+        assert_return(server, -EINVAL);
+        assert_return(fd >= 0, -EBADF);
+
+        /* All three of these flags are evaluated against the peer credentials. This must include
+         * VARLINK_SERVER_MYSELF_ONLY: validate_connection() is the only place that enforces it, and
+         * without the flag in this mask a server using only that flag would accept any peer without
+         * ever looking at its UID. */
+        if ((server->flags & (VARLINK_SERVER_ROOT_ONLY|VARLINK_SERVER_MYSELF_ONLY|VARLINK_SERVER_ACCOUNT_UID)) != 0) {
+                r = getpeercred(fd, &ucred);
+                if (r < 0)
+                        return varlink_server_log_errno(server, r, "Failed to acquire peer credentials of incoming socket, refusing: %m");
+
+                ucred_acquired = true;
+
+                r = validate_connection(server, &ucred);
+                if (r < 0)
+                        return r;
+                if (r == 0)
+                        return -EPERM;
+        } else
+                ucred_acquired = false;
+
+        r = varlink_new(&v);
+        if (r < 0)
+                return varlink_server_log_errno(server, r, "Failed to allocate connection object: %m");
+
+        r = count_connection(server, &ucred);
+        if (r < 0)
+                return r;
+
+        v->fd = fd;
+        if (server->flags & VARLINK_SERVER_INHERIT_USERDATA)
+                v->userdata = server->userdata;
+
+        /* Hand over the credentials we already have, so that they don't need to be queried again */
+        if (ucred_acquired) {
+                v->ucred = ucred;
+                v->ucred_acquired = true;
+        }
+
+        /* The description is only used for logging, hence failing to generate one is not fatal */
+        _cleanup_free_ char *desc = NULL;
+        if (asprintf(&desc, "%s-%i", server->description ?: "varlink", v->fd) >= 0)
+                v->description = TAKE_PTR(desc);
+
+        /* Link up the server and the connection, and take reference in both directions. Note that the
+         * reference on the connection is left dangling. It will be dropped when the connection is closed,
+         * which happens in varlink_close(), including in the event loop quit callback. */
+        v->server = varlink_server_ref(server);
+        varlink_ref(v);
+
+        varlink_set_state(v, VARLINK_IDLE_SERVER);
+
+        if (server->event) {
+                r = varlink_attach_event(v, server->event, server->event_priority);
+                if (r < 0) {
+                        varlink_log_errno(v, r, "Failed to attach new connection: %m");
+                        v->fd = -EBADF; /* take the fd out of the connection again */
+                        varlink_close(v);
+                        return r;
+                }
+        }
+
+        if (ret)
+                *ret = v;
+
+        return 0;
+}
+
+/* Frees a listening socket object that has not been linked into a server's socket list (yet), i.e. on
+ * the failure paths of the functions setting one up. The fd is not closed here; it remains owned by
+ * whoever passed it in. Always returns NULL. */
+static VarlinkServerSocket *varlink_server_socket_free(VarlinkServerSocket *ss) {
+        if (!ss)
+                return NULL;
+
+        /* The object may already carry an event source whose userdata points to it (it is registered
+         * before the remaining set-up steps run). Disable and release it, so that it is neither leaked
+         * nor able to invoke its callback with a pointer to freed memory. */
+        sd_event_source_disable_unref(ss->event_source);
+
+        free(ss->address);
+        return mfree(ss);
+}
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(VarlinkServerSocket *, varlink_server_socket_free);
+
+/* IO handler of a listening socket: accepts one incoming connection, registers it with the server and
+ * invokes the server's connect callback, if there is one.
+ *
+ * Returns 0 in all cases except when accept4() fails with an error other than the "try again" class, in
+ * which case the result of varlink_server_log_errno() is returned. */
+static int connect_callback(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
+ VarlinkServerSocket *ss = ASSERT_PTR(userdata);
+ _cleanup_close_ int cfd = -EBADF;
+ Varlink *v = NULL;
+ int r;
+
+ assert(source);
+
+ varlink_server_log(ss->server, "New incoming connection.");
+
+ cfd = accept4(fd, NULL, NULL, SOCK_NONBLOCK|SOCK_CLOEXEC);
+ if (cfd < 0) {
+ if (ERRNO_IS_ACCEPT_AGAIN(errno))
+ return 0;
+
+ return varlink_server_log_errno(ss->server, errno, "Failed to accept incoming socket: %m");
+ }
+
+ /* If the connection is refused the accepted fd is closed again by the cleanup handler */
+ r = varlink_server_add_connection(ss->server, cfd, &v);
+ if (r < 0)
+ return 0;
+
+ /* The fd is owned by the connection object now */
+ TAKE_FD(cfd);
+
+ if (ss->server->connect_callback) {
+ r = ss->server->connect_callback(ss->server, v, ss->server->userdata);
+ if (r < 0) {
+ varlink_log_errno(v, r, "Connection callback returned error, disconnecting client: %m");
+ varlink_close(v);
+ return 0;
+ }
+ }
+
+ return 0;
+}
+
+/* Allocates a listening socket object for 'fd' that refers back to server 's'. If the server has an
+ * event loop, an EPOLLIN IO event source running at the server's event priority is registered for the
+ * fd right away. On success the object is returned in *ret_ss and 0 is returned. */
+static int varlink_server_create_listen_fd_socket(VarlinkServer *s, int fd, VarlinkServerSocket **ret_ss) {
+        _cleanup_(varlink_server_socket_freep) VarlinkServerSocket *sock = NULL;
+        int r;
+
+        assert(s);
+        assert(fd >= 0);
+        assert(ret_ss);
+
+        sock = new(VarlinkServerSocket, 1);
+        if (!sock)
+                return log_oom_debug();
+
+        *sock = (VarlinkServerSocket) {
+                .server = s,
+                .fd = fd,
+        };
+
+        if (s->event) {
+                r = sd_event_add_io(s->event, &sock->event_source, fd, EPOLLIN, connect_callback, sock);
+                if (r < 0)
+                        return r;
+
+                r = sd_event_source_set_priority(sock->event_source, s->event_priority);
+                if (r < 0)
+                        return r;
+        }
+
+        *ret_ss = TAKE_PTR(sock);
+
+        return 0;
+}
+
+/* Adds the listening socket 'fd' to server 's'. The fd is switched to non-blocking and close-on-exec
+ * mode first. Returns 0 on success, or the negative error of the first step that failed. */
+int varlink_server_listen_fd(VarlinkServer *s, int fd) {
+        _cleanup_(varlink_server_socket_freep) VarlinkServerSocket *sock = NULL;
+        int r;
+
+        assert_return(s, -EINVAL);
+        assert_return(fd >= 0, -EBADF);
+
+        r = fd_nonblock(fd, true);
+        if (r < 0)
+                return r;
+
+        r = fd_cloexec(fd, true);
+        if (r < 0)
+                return r;
+
+        r = varlink_server_create_listen_fd_socket(s, fd, &sock);
+        if (r < 0)
+                return r;
+
+        /* Ownership of the socket object moves into the server's list */
+        LIST_PREPEND(sockets, s->sockets, TAKE_PTR(sock));
+
+        return 0;
+}
+
+/* Creates an AF_UNIX stream socket, binds it to 'address' (an inode possibly left at that path is
+ * unlinked first), starts listening on it and adds it to server 's'. 'm' may only contain permission
+ * bits (0777); its complement is applied as umask while binding.
+ *
+ * Returns 0 on success, or a negative errno-style value, in which case the socket is closed again. */
+int varlink_server_listen_address(VarlinkServer *s, const char *address, mode_t m) {
+ _cleanup_(varlink_server_socket_freep) VarlinkServerSocket *ss = NULL;
+ union sockaddr_union sockaddr;
+ socklen_t sockaddr_len;
+ _cleanup_close_ int fd = -EBADF;
+ int r;
+
+ assert_return(s, -EINVAL);
+ assert_return(address, -EINVAL);
+ assert_return((m & ~0777) == 0, -EINVAL);
+
+ /* On success the return value is used as the length of the socket address */
+ r = sockaddr_un_set_path(&sockaddr.un, address);
+ if (r < 0)
+ return r;
+ sockaddr_len = r;
+
+ fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
+ if (fd < 0)
+ return -errno;
+
+ fd = fd_move_above_stdio(fd);
+
+ /* Best effort: failure to remove a previous socket inode is ignored here */
+ (void) sockaddr_un_unlink(&sockaddr.un);
+
+ WITH_UMASK(~m & 0777) {
+ r = mac_selinux_bind(fd, &sockaddr.sa, sockaddr_len);
+ if (r < 0)
+ return r;
+ }
+
+ if (listen(fd, SOMAXCONN_DELUXE) < 0)
+ return -errno;
+
+ r = varlink_server_create_listen_fd_socket(s, fd, &ss);
+ if (r < 0)
+ return r;
+
+ r = free_and_strdup(&ss->address, address);
+ if (r < 0)
+ return r;
+
+ /* From here on the socket object (and with it the fd) is owned by the server's list */
+ LIST_PREPEND(sockets, s->sockets, TAKE_PTR(ss));
+ TAKE_FD(fd);
+ return 0;
+}
+
+/* Adds all passed fds marked as "varlink" to our varlink server. These fds can either refer to a
+ * listening socket or to a connection socket.
+ *
+ * See https://varlink.org/#activation for the environment variables this is backed by and the
+ * recommended "varlink" identifier in $LISTEN_FDNAMES.
+ *
+ * Returns the number of fds added, or a negative errno-style value on the first failure. */
+int varlink_server_listen_auto(VarlinkServer *s) {
+        _cleanup_strv_free_ char **names = NULL;
+        int r, n_fds, n = 0;
+
+        assert_return(s, -EINVAL);
+
+        /* Keep the number of passed fds in a variable of its own: 'r' is reused for the result of
+         * adding each fd below, and using it as the loop bound too would end the iteration right after
+         * the first matching fd (adding an fd returns 0 on success). */
+        n_fds = sd_listen_fds_with_names(/* unset_environment= */ false, &names);
+        if (n_fds < 0)
+                return n_fds;
+
+        for (int i = 0; i < n_fds; i++) {
+                int b, fd;
+                socklen_t l = sizeof(b);
+
+                if (!streq(names[i], "varlink"))
+                        continue;
+
+                fd = SD_LISTEN_FDS_START + i;
+
+                if (getsockopt(fd, SOL_SOCKET, SO_ACCEPTCONN, &b, &l) < 0)
+                        return -errno;
+
+                assert(l == sizeof(b));
+
+                if (b) /* Listening socket? */
+                        r = varlink_server_listen_fd(s, fd);
+                else /* Otherwise assume connection socket */
+                        r = varlink_server_add_connection(s, fd, NULL);
+                if (r < 0)
+                        return r;
+
+                n++;
+        }
+
+        return n;
+}
+
+/* Replaces the userdata pointer of server 's' and returns the pointer that was stored before. */
+void* varlink_server_set_userdata(VarlinkServer *s, void *userdata) {
+        void *previous;
+
+        assert_return(s, NULL);
+
+        previous = s->userdata;
+        s->userdata = userdata;
+        return previous;
+}
+
+/* Returns the userdata pointer currently stored in server 's' (NULL if 's' itself is NULL). */
+void* varlink_server_get_userdata(VarlinkServer *s) {
+        void *current;
+
+        assert_return(s, NULL);
+
+        current = s->userdata;
+        return current;
+}
+
+/* Runs a Varlink service event loop populated with the passed fds, exiting when the last connection is
+ * gone: a fresh event loop is allocated, exit-on-idle is enabled, the server is attached to the loop at
+ * priority 0 and all passed "varlink" fds are added before the loop is run. The server must not be
+ * attached to an event loop already (-EBUSY). Returns the result of sd_event_loop(), or the negative
+ * error of the set-up step that failed. */
+int varlink_server_loop_auto(VarlinkServer *server) {
+        _cleanup_(sd_event_unrefp) sd_event *loop = NULL;
+        int r;
+
+        assert_return(server, -EINVAL);
+        assert_return(!server->event, -EBUSY);
+
+        r = sd_event_new(&loop);
+        if (r < 0)
+                return r;
+
+        r = varlink_server_set_exit_on_idle(server, true);
+        if (r < 0)
+                return r;
+
+        r = varlink_server_attach_event(server, loop, 0);
+        if (r < 0)
+                return r;
+
+        r = varlink_server_listen_auto(server);
+        if (r < 0)
+                return r;
+
+        r = sd_event_loop(loop);
+        return r;
+}
+
+/* Fully tears down a listening socket object: removes it from its server's socket list (if it refers to
+ * a server), disables and releases its event source, closes the fd and frees the address string and the
+ * object. Always returns NULL. */
+static VarlinkServerSocket* varlink_server_socket_destroy(VarlinkServerSocket *ss) {
+        VarlinkServer *owner;
+
+        if (!ss)
+                return NULL;
+
+        owner = ss->server;
+        if (owner)
+                LIST_REMOVE(sockets, owner->sockets, ss);
+
+        sd_event_source_disable_unref(ss->event_source);
+        free(ss->address);
+        safe_close(ss->fd);
+
+        return mfree(ss);
+}
+
+/* Destroys all listening sockets of server 's'. Destroying a socket unlinks it from the list, hence we
+ * simply keep destroying the list head until the list is empty. Always returns 0. */
+int varlink_server_shutdown(VarlinkServer *s) {
+        assert_return(s, -EINVAL);
+
+        for (;;) {
+                VarlinkServerSocket *head = s->sockets;
+                if (!head)
+                        break;
+
+                varlink_server_socket_destroy(head);
+        }
+
+        return 0;
+}
+
+/* Requests the event loop to exit (with exit code 0) if exit-on-idle is enabled, an event loop is
+ * attached and no connection is left. A failure of the exit request is ignored. */
+static void varlink_server_test_exit_on_idle(VarlinkServer *s) {
+        assert(s);
+
+        if (!s->exit_on_idle || !s->event || s->n_connections != 0)
+                return;
+
+        (void) sd_event_exit(s->event, 0);
+}
+
+/* Turns the exit-on-idle behaviour of server 's' on or off and immediately re-evaluates the idle
+ * condition, so that enabling it on an already idle server takes effect right away. Always returns 0. */
+int varlink_server_set_exit_on_idle(VarlinkServer *s, bool b) {
+        assert_return(s, -EINVAL);
+
+        s->exit_on_idle = b;
+
+        varlink_server_test_exit_on_idle(s);
+
+        return 0;
+}
+
+/* Registers an EPOLLIN IO event source for the listening socket 'ss' on the server's event loop and
+ * sets its priority. The source is stored in the socket object only once it is fully set up; on failure
+ * it is released again and the error returned. */
+static int varlink_server_add_socket_event_source(VarlinkServer *s, VarlinkServerSocket *ss, int64_t priority) {
+        _cleanup_(sd_event_source_unrefp) sd_event_source *source = NULL;
+        int r;
+
+        assert(s);
+        assert(s->event);
+        assert(ss);
+        assert(ss->fd >= 0);
+        assert(!ss->event_source);
+
+        r = sd_event_add_io(s->event, &source, ss->fd, EPOLLIN, connect_callback, ss);
+        if (r < 0)
+                return r;
+
+        r = sd_event_source_set_priority(source, priority);
+        if (r < 0)
+                return r;
+
+        ss->event_source = TAKE_PTR(source);
+
+        return 0;
+}
+
+/* Attaches server 's' to an event loop: 'e' if given, otherwise the loop returned by sd_event_default().
+ * An event source is registered for every listening socket of the server; 'priority' is remembered as
+ * the server's event priority once that has succeeded.
+ *
+ * Returns 0 on success and -EBUSY if the server is attached already. If registering a socket fails the
+ * server is detached again and the error returned. */
+int varlink_server_attach_event(VarlinkServer *s, sd_event *e, int64_t priority) {
+        int r;
+
+        assert_return(s, -EINVAL);
+        assert_return(!s->event, -EBUSY);
+
+        if (!e) {
+                r = sd_event_default(&s->event);
+                if (r < 0)
+                        return r;
+        } else
+                s->event = sd_event_ref(e);
+
+        LIST_FOREACH(sockets, ss, s->sockets) {
+                r = varlink_server_add_socket_event_source(s, ss, priority);
+                if (r < 0) {
+                        varlink_server_detach_event(s);
+                        return r;
+                }
+        }
+
+        s->event_priority = priority;
+
+        return 0;
+}
+
+/* Detaches server 's' from its event loop: the event sources of all listening sockets are disabled and
+ * released, and the reference on the event loop is dropped. Always returns 0 (or -EINVAL if 's' is
+ * NULL). */
+int varlink_server_detach_event(VarlinkServer *s) {
+        assert_return(s, -EINVAL);
+
+        LIST_FOREACH(sockets, ss, s->sockets)
+                ss->event_source = sd_event_source_disable_unref(ss->event_source);
+
+        /* Reset the pointer along with dropping the reference. Otherwise a dangling pointer stays
+         * behind, which makes a later varlink_server_attach_event() fail with -EBUSY and gets
+         * unref'ed a second time when the server is destroyed. */
+        s->event = sd_event_unref(s->event);
+        return 0;
+}
+
+/* Returns the event loop stored in server 's' (NULL if 's' itself is NULL). */
+sd_event *varlink_server_get_event(VarlinkServer *s) {
+        sd_event *loop;
+
+        assert_return(s, NULL);
+
+        loop = s->event;
+        return loop;
+}
+
+/* Checks whether the fully qualified symbol name 'method' lives directly in 'interface', i.e. whether it
+ * has the form "<interface>.<name>" with no further dot in <name>. */
+static bool varlink_symbol_in_interface(const char *method, const char *interface) {
+        const char *rest;
+
+        assert(method);
+        assert(interface);
+
+        rest = startswith(method, interface);
+
+        return rest && *rest == '.' && !strchr(rest + 1, '.');
+}
+
+/* Registers 'callback' as the handler for the fully qualified method name 'method' on server 's'.
+ * Methods of the "org.varlink.service" and "io.systemd" interfaces cannot be bound (-EEXIST). The
+ * hashmap stores a private copy of the method name as key. Returns 0 on success. */
+int varlink_server_bind_method(VarlinkServer *s, const char *method, VarlinkMethod callback) {
+        _cleanup_free_ char *name = NULL;
+        int r;
+
+        assert_return(s, -EINVAL);
+        assert_return(method, -EINVAL);
+        assert_return(callback, -EINVAL);
+
+        bool reserved =
+                varlink_symbol_in_interface(method, "org.varlink.service") ||
+                varlink_symbol_in_interface(method, "io.systemd");
+        if (reserved)
+                return log_debug_errno(SYNTHETIC_ERRNO(EEXIST), "Cannot bind server to '%s'.", method);
+
+        name = strdup(method);
+        if (!name)
+                return log_oom_debug();
+
+        r = hashmap_ensure_put(&s->methods, &string_hash_ops, name, callback);
+        if (r == -ENOMEM)
+                return log_oom_debug();
+        if (r < 0)
+                return log_debug_errno(r, "Failed to register callback: %m");
+
+        /* Give up our copy of the name only if the put reported a positive result; otherwise it is
+         * freed on return. */
+        if (r > 0)
+                TAKE_PTR(name);
+
+        return 0;
+}
+
+/* Binds several methods at once. The variadic arguments are pairs of method name (const char*) and
+ * callback (VarlinkMethod), terminated by a NULL method name. Stops at the first failing bind and
+ * returns its error; returns 0 if nothing was bound or all binds succeeded. */
+int varlink_server_bind_method_many_internal(VarlinkServer *s, ...) {
+        va_list args;
+        int r = 0;
+
+        assert_return(s, -EINVAL);
+
+        va_start(args, s);
+        for (const char *method = va_arg(args, const char *);
+             method;
+             method = va_arg(args, const char *)) {
+                VarlinkMethod callback = va_arg(args, VarlinkMethod);
+
+                r = varlink_server_bind_method(s, method, callback);
+                if (r < 0)
+                        break;
+        }
+        va_end(args);
+
+        return r;
+}
+
+/* Installs (or, with NULL, clears) the connect callback of server 's'. Replacing an already installed
+ * callback by a different non-NULL one is refused with -EBUSY; re-installing the same one is fine. */
+int varlink_server_bind_connect(VarlinkServer *s, VarlinkConnect callback) {
+        assert_return(s, -EINVAL);
+
+        if (!callback || !s->connect_callback || callback == s->connect_callback) {
+                s->connect_callback = callback;
+                return 0;
+        }
+
+        return log_debug_errno(SYNTHETIC_ERRNO(EBUSY), "A different callback was already set.");
+}
+
+/* Installs (or, with NULL, clears) the disconnect callback of server 's'. Replacing an already installed
+ * callback by a different non-NULL one is refused with -EBUSY; re-installing the same one is fine. */
+int varlink_server_bind_disconnect(VarlinkServer *s, VarlinkDisconnect callback) {
+        assert_return(s, -EINVAL);
+
+        if (!callback || !s->disconnect_callback || callback == s->disconnect_callback) {
+                s->disconnect_callback = callback;
+                return 0;
+        }
+
+        return log_debug_errno(SYNTHETIC_ERRNO(EBUSY), "A different callback was already set.");
+}
+
+/* Registers the interface description 'interface' on server 's'. The interface is stored by name, and
+ * each of its method and error symbols is additionally stored in the 'symbols' hashmap under the key
+ * "<interface name>.<symbol name>".
+ *
+ * Returns 0 on success, -EEXIST if an interface of that name is registered already, or another negative
+ * errno-style value if a hashmap operation or allocation fails. Note that in the latter case the
+ * interface and the symbols added so far remain registered. */
+int varlink_server_add_interface(VarlinkServer *s, const VarlinkInterface *interface) {
+ int r;
+
+ assert_return(s, -EINVAL);
+ assert_return(interface, -EINVAL);
+ assert_return(interface->name, -EINVAL);
+
+ if (hashmap_contains(s->interfaces, interface->name))
+ return log_debug_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate registration of interface '%s'.", interface->name);
+
+ r = hashmap_ensure_put(&s->interfaces, &string_hash_ops, interface->name, (void*) interface);
+ if (r < 0)
+ return r;
+
+ /* interface->symbols is a NULL-terminated array of symbol pointers */
+ for (const VarlinkSymbol *const*symbol = interface->symbols; *symbol; symbol++) {
+ _cleanup_free_ char *j = NULL;
+
+ /* We only ever want to validate method calls/replies and errors against the interface
+ * definitions, hence don't bother with the type symbols */
+ if (!IN_SET((*symbol)->symbol_type, VARLINK_METHOD, VARLINK_ERROR))
+ continue;
+
+ j = strjoin(interface->name, ".", (*symbol)->name);
+ if (!j)
+ return -ENOMEM;
+
+ r = hashmap_ensure_put(&s->symbols, &string_hash_ops_free, j, (void*) *symbol);
+ if (r < 0)
+ return r;
+
+ /* The key string is owned by the hashmap now.
+ * NOTE(review): if hashmap_ensure_put() may return 0 without storing the key (entry already
+ * present), 'j' would be leaked here — confirm. */
+ TAKE_PTR(j);
+ }
+
+ return 0;
+}
+
+/* Registers several interface descriptions at once. The variadic arguments are pointers to
+ * VarlinkInterface, terminated by NULL. Stops at the first failing registration and returns its error;
+ * returns 0 if nothing was registered or all registrations succeeded. */
+int varlink_server_add_interface_many_internal(VarlinkServer *s, ...) {
+        va_list args;
+        int r = 0;
+
+        assert_return(s, -EINVAL);
+
+        va_start(args, s);
+        for (const VarlinkInterface *iface = va_arg(args, const VarlinkInterface*);
+             iface;
+             iface = va_arg(args, const VarlinkInterface*)) {
+
+                r = varlink_server_add_interface(s, iface);
+                if (r < 0)
+                        break;
+        }
+        va_end(args);
+
+        return r;
+}
+
+/* Returns the connection limit of the specified server. If no server is specified, returns the
+ * default limit instead: VARLINK_DEFAULT_CONNECTIONS_MAX, capped to ¾ of getdtablesize(). */
+unsigned varlink_server_connections_max(VarlinkServer *s) {
+        unsigned cap;
+        int dts;
+
+        if (s)
+                return s->connections_max;
+
+        dts = getdtablesize();
+        assert_se(dts > 0);
+
+        /* Never use up more than ¾th of RLIMIT_NOFILE for IPC */
+        cap = (unsigned) dts / 4 * 3;
+
+        return VARLINK_DEFAULT_CONNECTIONS_MAX > cap ? cap : VARLINK_DEFAULT_CONNECTIONS_MAX;
+}
+
+/* Returns the per-UID connection limit of the specified server. If no server is specified, returns
+ * the default limit instead: VARLINK_DEFAULT_CONNECTIONS_PER_UID_MAX, capped to ¾ of the default
+ * overall connection limit. */
+unsigned varlink_server_connections_per_uid_max(VarlinkServer *s) {
+        unsigned cap;
+
+        if (s)
+                return s->connections_per_uid_max;
+
+        /* Make sure to never use up more than ¾th of available connections for a single user. The
+         * default has to be compared against the ¾ mark itself (like varlink_server_connections_max()
+         * does), not against the full connection limit, as otherwise a default that lies between ¾
+         * and 1× of the connection limit would slip through uncapped. */
+        cap = varlink_server_connections_max(NULL) / 4 * 3;
+        if (VARLINK_DEFAULT_CONNECTIONS_PER_UID_MAX > cap)
+                return cap;
+
+        return VARLINK_DEFAULT_CONNECTIONS_PER_UID_MAX;
+}
+
+/* Overrides the per-UID connection limit of the server. A limit of zero is refused with -EINVAL. */
+int varlink_server_set_connections_per_uid_max(VarlinkServer *s, unsigned m) {
+        assert_return(s, -EINVAL);
+        assert_return(m > 0, -EINVAL);
+
+        s->connections_per_uid_max = m;
+
+        return 0;
+}
+
+/* Overrides the overall connection limit of the server. A limit of zero is refused with -EINVAL. */
+int varlink_server_set_connections_max(VarlinkServer *s, unsigned m) {
+        assert_return(s, -EINVAL);
+        assert_return(m > 0, -EINVAL);
+
+        s->connections_max = m;
+
+        return 0;
+}
+
+/* Returns the server's n_connections counter, or UINT_MAX if no server was passed. */
+unsigned varlink_server_current_connections(VarlinkServer *s) {
+        assert_return(s, UINT_MAX);
+
+        return s->n_connections;
+}
+
+/* Replaces the server's description string by a copy of 'description'; returns whatever
+ * free_and_strdup() returns. */
+int varlink_server_set_description(VarlinkServer *s, const char *description) {
+        assert_return(s, -EINVAL);
+
+        return free_and_strdup(&s->description, description);
+}
+
+/* Writes one line per listening socket of the server to 'f', in the form
+ *
+ *     varlink-server-socket-address=<address> varlink-server-socket-fd=<fd>
+ *
+ * where <fd> is a duplicate of the socket fd that has been added to 'fds'. A NULL server is
+ * accepted and results in no output. Returns 0 on success, or the negative result of
+ * fdset_put_dup() if duplicating an fd failed. */
+int varlink_server_serialize(VarlinkServer *s, FILE *f, FDSet *fds) {
+        assert(f);
+        assert(fds);
+
+        if (!s)
+                return 0;
+
+        LIST_FOREACH(sockets, ss, s->sockets) {
+                int dup_fd;
+
+                assert(ss->address);
+                assert(ss->fd >= 0);
+
+                fprintf(f, "varlink-server-socket-address=%s", ss->address);
+
+                /* If duplicating fails the line written so far stays without its fd part, which is
+                 * then considered an error during deserialization. */
+                dup_fd = fdset_put_dup(fds, ss->fd);
+                if (dup_fd < 0)
+                        return dup_fd;
+
+                fprintf(f, " varlink-server-socket-fd=%i", dup_fd);
+                fputc('\n', f);
+        }
+
+        return 0;
+}
+
+/* Restores a single listening socket from a serialized line and adds it to the server.
+ *
+ * 'value' is parsed as "<address> varlink-server-socket-fd=<fd>", i.e. the text
+ * varlink_server_serialize() emits after the "varlink-server-socket-address=" key (presumably the
+ * caller strips that key — TODO confirm against the caller). The referenced fd must be contained in
+ * 'fds'; it is removed from the set and handed to the new VarlinkServerSocket object.
+ *
+ * Returns 0 on success, -EINVAL on malformed input, -EBADF if the fd is not part of 'fds', or another
+ * negative error propagated from the helpers called here. */
+int varlink_server_deserialize_one(VarlinkServer *s, const char *value, FDSet *fds) {
+ _cleanup_(varlink_server_socket_freep) VarlinkServerSocket *ss = NULL;
+ _cleanup_free_ char *address = NULL;
+ const char *v = ASSERT_PTR(value);
+ int r, fd = -EBADF;
+ char *buf;
+ size_t n;
+
+ assert(s);
+ assert(fds);
+
+ /* The address is everything up to the first space */
+ n = strcspn(v, " ");
+ address = strndup(v, n);
+ if (!address)
+ return log_oom_debug();
+
+ /* A space must follow the address, i.e. the fd part is mandatory */
+ if (v[n] != ' ')
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+ "Failed to deserialize VarlinkServerSocket: %s: %m", value);
+ v = startswith(v + n + 1, "varlink-server-socket-fd=");
+ if (!v)
+ return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
+ "Failed to deserialize VarlinkServerSocket fd %s: %m", value);
+
+ /* The fd number extends up to the next space or the end of the string */
+ n = strcspn(v, " ");
+ buf = strndupa_safe(v, n);
+
+ fd = parse_fd(buf);
+ if (fd < 0)
+ return log_debug_errno(fd, "Unable to parse VarlinkServerSocket varlink-server-socket-fd=%s: %m", buf);
+ if (!fdset_contains(fds, fd))
+ return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
+ "VarlinkServerSocket varlink-server-socket-fd= has unknown fd %d: %m", fd);
+
+ ss = new(VarlinkServerSocket, 1);
+ if (!ss)
+ return log_oom_debug();
+
+ /* From here on the address string and the fd (taken out of the fd set) are stored in 'ss' */
+ *ss = (VarlinkServerSocket) {
+ .server = s,
+ .address = TAKE_PTR(address),
+ .fd = fdset_remove(fds, fd),
+ };
+
+ r = varlink_server_add_socket_event_source(s, ss, SD_EVENT_PRIORITY_NORMAL);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to add VarlinkServerSocket event source to the event loop: %m");
+
+ /* Success: link the socket into the server's list and disarm the cleanup handler */
+ LIST_PREPEND(sockets, s->sockets, TAKE_PTR(ss));
+ return 0;
+}
+
+/* Returns true if this is a "pure" varlink server invocation, i.e. exactly one fd was passed and it
+ * is named "varlink". Returns false if no fd was passed or the name does not match, and
+ * -ETOOMANYREFS if more than one fd was passed.
+ *
+ * 'flags' restricts which kind of socket is acceptable: if the passed socket is a listening one
+ * (SO_ACCEPTCONN is set) VARLINK_ALLOW_LISTEN is required, otherwise VARLINK_ALLOW_ACCEPT. If the
+ * socket kind is not permitted (or neither flag is set) -EISCONN is returned. */
+int varlink_invocation(VarlinkInvocationFlags flags) {
+        _cleanup_strv_free_ char **names = NULL;
+        int n_fds, is_listening, permitted;
+        socklen_t optlen = sizeof(is_listening);
+
+        n_fds = sd_listen_fds_with_names(/* unset_environment= */ false, &names);
+        if (n_fds < 0)
+                return n_fds;
+        if (n_fds == 0)
+                return false;
+        if (n_fds > 1)
+                return -ETOOMANYREFS;
+
+        if (!strv_equal(names, STRV_MAKE("varlink")))
+                return false;
+
+        permitted = flags & (VARLINK_ALLOW_LISTEN|VARLINK_ALLOW_ACCEPT);
+
+        /* Both flags set? Then allow everything, no need to look at the socket */
+        if (permitted == (VARLINK_ALLOW_LISTEN|VARLINK_ALLOW_ACCEPT))
+                return true;
+
+        /* Neither is set, then fail */
+        if (permitted == 0)
+                return -EISCONN;
+
+        if (getsockopt(SD_LISTEN_FDS_START, SOL_SOCKET, SO_ACCEPTCONN, &is_listening, &optlen) < 0)
+                return -errno;
+
+        assert(optlen == sizeof(is_listening));
+
+        if (FLAGS_SET(flags, is_listening ? VARLINK_ALLOW_LISTEN : VARLINK_ALLOW_ACCEPT))
+                return true;
+
+        return -EISCONN;
+}
diff --git a/src/shared/varlink.h b/src/shared/varlink.h
new file mode 100644
index 0000000..6ec708a
--- /dev/null
+++ b/src/shared/varlink.h
@@ -0,0 +1,224 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include "sd-event.h"
+
+#include "json.h"
+#include "time-util.h"
+#include "varlink-idl.h"
+
+/* A minimal Varlink implementation. We only implement the minimal, obvious bits here though. No validation,
+ * no introspection, no name service, just the stuff actually needed.
+ *
+ * You might wonder why we aren't using libvarlink here? Varlink is a very simple protocol, which allows us
+ * to write our own implementation relatively easily. However, the main reasons are these:
+ *
+ * • We want to use our own JSON subsystem, with all the benefits that brings (i.e. accurate unsigned+signed
+ * 64-bit integers, full fuzzing, logging during parsing and so on). If we'd want to use that with
+ * libvarlink we'd have to serialize and deserialize all the time from its own representation which is
+ * inefficient and nasty.
+ *
+ * • We want integration into sd-event, but also synchronous event-loop-less operation
+ *
+ * • We need proper per-UID accounting and access control, since we want to allow communication between
+ * unprivileged clients and privileged servers.
+ *
+ * • And of course, we don't want the name service and introspection stuff for now (though that might
+ * change).
+ */
+
+typedef struct Varlink Varlink;
+typedef struct VarlinkServer VarlinkServer;
+
+typedef enum VarlinkReplyFlags {
+ VARLINK_REPLY_ERROR = 1 << 0,
+ VARLINK_REPLY_CONTINUES = 1 << 1,
+ VARLINK_REPLY_LOCAL = 1 << 2,
+} VarlinkReplyFlags;
+
+/* Flags handed to VarlinkMethod callbacks via their 'flags' parameter. */
+typedef enum VarlinkMethodFlags {
+ VARLINK_METHOD_ONEWAY = 1 << 0,
+ /* NOTE(review): "2 << 1" evaluates to 4 (the same as 1 << 2), i.e. bit 1 is left unused. This is
+ * still a distinct single bit, but confirm whether "1 << 1" was intended. */
+ VARLINK_METHOD_MORE = 2 << 1,
+} VarlinkMethodFlags;
+
+typedef enum VarlinkServerFlags {
+ VARLINK_SERVER_ROOT_ONLY = 1 << 0, /* Only accessible by root */
+ VARLINK_SERVER_MYSELF_ONLY = 1 << 1, /* Only accessible by our own UID */
+ VARLINK_SERVER_ACCOUNT_UID = 1 << 2, /* Do per user accounting */
+ VARLINK_SERVER_INHERIT_USERDATA = 1 << 3, /* Initialize Varlink connection userdata from VarlinkServer userdata */
+ _VARLINK_SERVER_FLAGS_ALL = (1 << 4) - 1,
+} VarlinkServerFlags;
+
+typedef int (*VarlinkMethod)(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata);
+typedef int (*VarlinkReply)(Varlink *link, JsonVariant *parameters, const char *error_id, VarlinkReplyFlags flags, void *userdata);
+typedef int (*VarlinkConnect)(VarlinkServer *server, Varlink *link, void *userdata);
+typedef void (*VarlinkDisconnect)(VarlinkServer *server, Varlink *link, void *userdata);
+
+int varlink_connect_address(Varlink **ret, const char *address);
+int varlink_connect_exec(Varlink **ret, const char *command, char **argv);
+int varlink_connect_url(Varlink **ret, const char *url);
+int varlink_connect_fd(Varlink **ret, int fd);
+
+Varlink* varlink_ref(Varlink *link);
+Varlink* varlink_unref(Varlink *v);
+
+int varlink_get_fd(Varlink *v);
+int varlink_get_events(Varlink *v);
+int varlink_get_timeout(Varlink *v, usec_t *ret);
+
+int varlink_attach_event(Varlink *v, sd_event *e, int64_t priority);
+void varlink_detach_event(Varlink *v);
+sd_event *varlink_get_event(Varlink *v);
+
+int varlink_process(Varlink *v);
+int varlink_wait(Varlink *v, usec_t timeout);
+
+int varlink_is_idle(Varlink *v);
+
+int varlink_flush(Varlink *v);
+int varlink_close(Varlink *v);
+
+Varlink* varlink_flush_close_unref(Varlink *v);
+Varlink* varlink_close_unref(Varlink *v);
+
+/* Enqueue method call, not expecting a reply */
+int varlink_send(Varlink *v, const char *method, JsonVariant *parameters);
+int varlink_sendb(Varlink *v, const char *method, ...);
+
+/* Send method call and wait for reply */
+int varlink_call(Varlink *v, const char *method, JsonVariant *parameters, JsonVariant **ret_parameters, const char **ret_error_id, VarlinkReplyFlags *ret_flags);
+int varlink_callb(Varlink *v, const char *method, JsonVariant **ret_parameters, const char **ret_error_id, VarlinkReplyFlags *ret_flags, ...);
+
+/* Send method call and begin collecting all 'more' replies into an array, finishing when a final reply is sent */
+int varlink_collect(Varlink *v, const char *method, JsonVariant *parameters, JsonVariant **ret_parameters, const char **ret_error_id, VarlinkReplyFlags *ret_flags);
+int varlink_collectb(Varlink *v, const char *method, JsonVariant **ret_parameters, const char **ret_error_id, VarlinkReplyFlags *ret_flags, ...);
+
+/* Enqueue method call, expect a reply, which is eventually delivered to the reply callback */
+int varlink_invoke(Varlink *v, const char *method, JsonVariant *parameters);
+int varlink_invokeb(Varlink *v, const char *method, ...);
+
+/* Enqueue method call, expect a reply now, and possibly more later, which are all delivered to the reply callback */
+int varlink_observe(Varlink *v, const char *method, JsonVariant *parameters);
+int varlink_observeb(Varlink *v, const char *method, ...);
+
+/* Enqueue a final reply */
+int varlink_reply(Varlink *v, JsonVariant *parameters);
+int varlink_replyb(Varlink *v, ...);
+
+/* Enqueue a (final) error */
+int varlink_error(Varlink *v, const char *error_id, JsonVariant *parameters);
+int varlink_errorb(Varlink *v, const char *error_id, ...);
+int varlink_error_invalid_parameter(Varlink *v, JsonVariant *parameters);
+int varlink_error_errno(Varlink *v, int error);
+
+/* Enqueue a "more" reply */
+int varlink_notify(Varlink *v, JsonVariant *parameters);
+int varlink_notifyb(Varlink *v, ...);
+
+/* Parsing incoming data via json_dispatch() and generate a nice error on parse errors */
+int varlink_dispatch(Varlink *v, JsonVariant *parameters, const JsonDispatch table[], void *userdata);
+
+/* Write outgoing fds into the socket (to be associated with the next enqueued message) */
+int varlink_push_fd(Varlink *v, int fd);
+int varlink_dup_fd(Varlink *v, int fd);
+int varlink_reset_fds(Varlink *v);
+
+/* Read incoming fds from the socket (associated with the currently handled message) */
+int varlink_peek_fd(Varlink *v, size_t i);
+int varlink_take_fd(Varlink *v, size_t i);
+
+int varlink_set_allow_fd_passing_input(Varlink *v, bool b);
+int varlink_set_allow_fd_passing_output(Varlink *v, bool b);
+
+/* Bind a reply callback */
+int varlink_bind_reply(Varlink *v, VarlinkReply reply);
+
+void* varlink_set_userdata(Varlink *v, void *userdata);
+void* varlink_get_userdata(Varlink *v);
+
+int varlink_get_peer_uid(Varlink *v, uid_t *ret);
+int varlink_get_peer_pid(Varlink *v, pid_t *ret);
+
+int varlink_set_relative_timeout(Varlink *v, usec_t usec);
+
+VarlinkServer* varlink_get_server(Varlink *v);
+
+int varlink_set_description(Varlink *v, const char *d);
+
+/* Create a varlink server */
+int varlink_server_new(VarlinkServer **ret, VarlinkServerFlags flags);
+VarlinkServer *varlink_server_ref(VarlinkServer *s);
+VarlinkServer *varlink_server_unref(VarlinkServer *s);
+
+/* Add addresses or fds to listen on */
+int varlink_server_listen_address(VarlinkServer *s, const char *address, mode_t mode);
+int varlink_server_listen_fd(VarlinkServer *s, int fd);
+int varlink_server_listen_auto(VarlinkServer *s);
+int varlink_server_add_connection(VarlinkServer *s, int fd, Varlink **ret);
+
+/* Bind callbacks */
+int varlink_server_bind_method(VarlinkServer *s, const char *method, VarlinkMethod callback);
+int varlink_server_bind_method_many_internal(VarlinkServer *s, ...);
+#define varlink_server_bind_method_many(s, ...) varlink_server_bind_method_many_internal(s, __VA_ARGS__, NULL)
+int varlink_server_bind_connect(VarlinkServer *s, VarlinkConnect connect);
+int varlink_server_bind_disconnect(VarlinkServer *s, VarlinkDisconnect disconnect);
+
+/* Add interface definition */
+int varlink_server_add_interface(VarlinkServer *s, const VarlinkInterface *interface);
+int varlink_server_add_interface_many_internal(VarlinkServer *s, ...);
+#define varlink_server_add_interface_many(s, ...) varlink_server_add_interface_many_internal(s, __VA_ARGS__, NULL)
+
+void* varlink_server_set_userdata(VarlinkServer *s, void *userdata);
+void* varlink_server_get_userdata(VarlinkServer *s);
+
+int varlink_server_attach_event(VarlinkServer *v, sd_event *e, int64_t priority);
+int varlink_server_detach_event(VarlinkServer *v);
+sd_event *varlink_server_get_event(VarlinkServer *v);
+
+int varlink_server_loop_auto(VarlinkServer *server);
+
+int varlink_server_shutdown(VarlinkServer *server);
+
+int varlink_server_set_exit_on_idle(VarlinkServer *s, bool b);
+
+unsigned varlink_server_connections_max(VarlinkServer *s);
+unsigned varlink_server_connections_per_uid_max(VarlinkServer *s);
+
+int varlink_server_set_connections_per_uid_max(VarlinkServer *s, unsigned m);
+int varlink_server_set_connections_max(VarlinkServer *s, unsigned m);
+
+unsigned varlink_server_current_connections(VarlinkServer *s);
+
+int varlink_server_set_description(VarlinkServer *s, const char *description);
+
+/* Flags for varlink_invocation(), selecting which kind of passed socket is acceptable. */
+typedef enum VarlinkInvocationFlags {
+ VARLINK_ALLOW_LISTEN = 1 << 0, /* Accept a passed socket that has SO_ACCEPTCONN set, i.e. a listening socket */
+ VARLINK_ALLOW_ACCEPT = 1 << 1, /* Accept a passed socket that does not have SO_ACCEPTCONN set */
+ _VARLINK_SERVER_INVOCATION_FLAGS_MAX = (1 << 2) - 1, /* Mask covering both flags above */
+ _VARLINK_SERVER_INVOCATION_FLAGS_INVALID = -EINVAL,
+} VarlinkInvocationFlags;
+
+int varlink_invocation(VarlinkInvocationFlags flags);
+
+DEFINE_TRIVIAL_CLEANUP_FUNC(Varlink *, varlink_unref);
+DEFINE_TRIVIAL_CLEANUP_FUNC(Varlink *, varlink_close_unref);
+DEFINE_TRIVIAL_CLEANUP_FUNC(Varlink *, varlink_flush_close_unref);
+DEFINE_TRIVIAL_CLEANUP_FUNC(VarlinkServer *, varlink_server_unref);
+
+/* These are local errors that never cross the wire, and are our own invention */
+#define VARLINK_ERROR_DISCONNECTED "io.systemd.Disconnected"
+#define VARLINK_ERROR_TIMEOUT "io.systemd.TimedOut"
+#define VARLINK_ERROR_PROTOCOL "io.systemd.Protocol"
+
+/* This one we invented, and use for generically propagating system errors (errno) to clients */
+#define VARLINK_ERROR_SYSTEM "io.systemd.System"
+
+/* These are errors defined in the Varlink spec */
+#define VARLINK_ERROR_INTERFACE_NOT_FOUND "org.varlink.service.InterfaceNotFound"
+#define VARLINK_ERROR_METHOD_NOT_FOUND "org.varlink.service.MethodNotFound"
+#define VARLINK_ERROR_METHOD_NOT_IMPLEMENTED "org.varlink.service.MethodNotImplemented"
+#define VARLINK_ERROR_INVALID_PARAMETER "org.varlink.service.InvalidParameter"
+
+/* These are errors we came up with and squatted the namespace with */
+#define VARLINK_ERROR_PERMISSION_DENIED "org.varlink.service.PermissionDenied"
+#define VARLINK_ERROR_EXPECTED_MORE "org.varlink.service.ExpectedMore"
diff --git a/src/shared/verb-log-control.c b/src/shared/verb-log-control.c
new file mode 100644
index 0000000..555fb9f
--- /dev/null
+++ b/src/shared/verb-log-control.c
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "alloc-util.h"
+#include "bus-error.h"
+#include "log.h"
+#include "strv.h"
+#include "syslog-util.h"
+#include "verb-log-control.h"
+
+/* Shared implementation of the "log-level" and "log-target" verbs: gets or sets the LogLevel or
+ * LogTarget property of the org.freedesktop.LogControl1 interface at /org/freedesktop/LogControl1 of
+ * the specified bus destination. Which of the two properties is used depends on the suffix of
+ * 'verb'. If 'value' is NULL the current setting is queried and printed to stdout, otherwise the
+ * property is set to 'value' (which for the log level is validated locally first). */
+int verb_log_control_common(sd_bus *bus, const char *destination, const char *verb, const char *value) {
+        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
+        bool level = endswith(verb, "log-level");
+        const char *property = level ? "LogLevel" : "LogTarget";
+        const char *what = level ? "level" : "target";
+        const BusLocator bloc = {
+                .destination = destination,
+                .path = "/org/freedesktop/LogControl1",
+                .interface = "org.freedesktop.LogControl1",
+        };
+        int r;
+
+        assert(bus);
+        assert(endswith(verb, "log-level") || endswith(verb, "log-target"));
+
+        if (!value) {
+                _cleanup_free_ char *current = NULL;
+
+                /* No new value specified, hence show the current one */
+                r = bus_get_property_string(bus, &bloc, property, &error, &current);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to get log %s of %s: %s",
+                                               what, bloc.destination, bus_error_message(&error, r));
+
+                puts(current);
+                return 0;
+        }
+
+        if (level) {
+                /* Refuse invalid log levels before talking to the bus */
+                r = log_level_from_string(value);
+                if (r < 0)
+                        return log_error_errno(r, "\"%s\" is not a valid log level.", value);
+        }
+
+        r = bus_set_property(bus, &bloc, property, &error, "s", value);
+        if (r < 0)
+                return log_error_errno(r, "Failed to set log %s of %s to %s: %s",
+                                       what, bloc.destination, value, bus_error_message(&error, r));
+
+        return 0;
+}
diff --git a/src/shared/verb-log-control.h b/src/shared/verb-log-control.h
new file mode 100644
index 0000000..b9e7cdd
--- /dev/null
+++ b/src/shared/verb-log-control.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "bus-locator.h"
+
+int verb_log_control_common(sd_bus *bus, const char *destination, const char *verb, const char *value);
diff --git a/src/shared/verbs.c b/src/shared/verbs.c
new file mode 100644
index 0000000..a010952
--- /dev/null
+++ b/src/shared/verbs.c
@@ -0,0 +1,171 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <getopt.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+#include "env-util.h"
+#include "log.h"
+#include "macro.h"
+#include "process-util.h"
+#include "string-util.h"
+#include "verbs.h"
+#include "virt.h"
+
+/* Wraps running_in_chroot() which is used in various places, but also adds an environment variable check so external
+ * processes can reliably force this on.
+ */
+bool running_in_chroot_or_offline(void) {
+        int r;
+
+        /* $SYSTEMD_OFFLINE takes precedence if it is set to a valid boolean. This was added to
+         * support use cases like rpm-ostree, where from %post scripts we only want to execute
+         * "preset", but not "start"/"restart" for example.
+         *
+         * See docs/ENVIRONMENT.md for docs.
+         */
+        r = getenv_bool("SYSTEMD_OFFLINE");
+        if (r >= 0)
+                return r > 0;
+        if (r != -ENXIO) /* -ENXIO is not worth a log message, anything else is */
+                log_debug_errno(r, "Failed to parse $SYSTEMD_OFFLINE: %m");
+
+        /* Otherwise fall back to the long-standing check for the legacy chroot case like Fedora's
+         * "mock", which is used for package builds. We don't want to try to start systemd services
+         * there, since without --new-chroot we don't even have systemd running, and even if we did,
+         * adding a concept of background daemons to builds would be an enormous change, requiring
+         * considering things like how the journal output is handled, etc. And there's really not a
+         * use case today for a build talking to a service.
+         *
+         * Note this call itself also looks for a different variable SYSTEMD_IGNORE_CHROOT=1.
+         */
+        r = running_in_chroot();
+        if (r < 0)
+                log_debug_errno(r, "running_in_chroot(): %m");
+
+        /* A failure of the check is treated like "not in a chroot" */
+        return r > 0;
+}
+
+/* Looks up a verb in a table that is terminated by an entry with a NULL dispatch function. If 'name'
+ * is non-NULL the first entry whose verb string equals it is returned, otherwise the first entry
+ * flagged VERB_DEFAULT. Returns NULL if nothing matches. */
+const Verb* verbs_find_verb(const char *name, const Verb verbs[]) {
+        assert(verbs);
+
+        for (const Verb *v = verbs; v->dispatch; v++) {
+                bool match;
+
+                match = name ? streq(name, v->verb) : FLAGS_SET(v->flags, VERB_DEFAULT);
+                if (match)
+                        return v;
+        }
+
+        /* Reached the end of the list without a match */
+        return NULL;
+}
+
+/* Returns the verb that has 'name' as a prefix and the fewest characters following that prefix. On
+ * ties the entry listed first wins. Returns NULL if 'name' is NULL or is a prefix of no verb. */
+static const Verb* verbs_find_prefix_verb(const char *name, const Verb verbs[]) {
+        const Verb *candidate = NULL;
+        size_t shortest = SIZE_MAX;
+
+        assert(verbs);
+
+        if (!name)
+                return NULL;
+
+        for (const Verb *v = verbs; v->dispatch; v++) {
+                const char *suffix;
+                size_t n;
+
+                suffix = startswith(v->verb, name);
+                if (!suffix)
+                        continue;
+
+                n = strlen(suffix);
+                if (n >= shortest)
+                        continue;
+
+                shortest = n;
+                candidate = v;
+        }
+
+        return candidate;
+}
+
+/* Returns the verb with the smallest Levenshtein distance to 'name', considering only verbs with a
+ * distance of at most 5. On ties the entry listed first wins. Returns NULL if 'name' is NULL, if no
+ * verb is close enough, or if calculating a distance failed. */
+static const Verb* verbs_find_closest_verb(const char *name, const Verb verbs[]) {
+        const Verb *candidate = NULL;
+        ssize_t closest = SSIZE_MAX;
+
+        assert(verbs);
+
+        if (!name)
+                return NULL;
+
+        for (const Verb *v = verbs; v->dispatch; v++) {
+                ssize_t d;
+
+                d = strlevenshtein(v->verb, name);
+                if (d < 0) {
+                        log_debug_errno(d, "Failed to determine Levenshtein distance between %s and %s: %m", v->verb, name);
+                        return NULL;
+                }
+
+                /* If the distance is just too far off, don't make a bad suggestion */
+                if (d > 5 || d >= closest)
+                        continue;
+
+                closest = d;
+                candidate = v;
+        }
+
+        return candidate;
+}
+
+/* Picks the verb to run from the first non-option argument (argv[optind]) — or the default verb if
+ * there is none — validates the number of arguments against the verb's limits, and invokes the
+ * verb's dispatch function, whose result is returned. Fails with -EINVAL if the verb is unknown,
+ * missing, or called with too few or too many arguments. Verbs flagged VERB_ONLINE_ONLY are skipped
+ * (returning 0) when running in a chroot or offline. Resets optind to 0 as a side effect. */
+int dispatch_verb(int argc, char *argv[], const Verb verbs[], void *userdata) {
+        const Verb *verb;
+        const char *name;
+        int left;
+
+        assert(verbs);
+        assert(verbs[0].dispatch);
+        assert(argc >= 0);
+        assert(argv);
+        assert(argc >= optind);
+
+        /* Skip over everything option parsing consumed; the verb name (if any) is now argv[0] */
+        left = argc - optind;
+        argv += optind;
+        optind = 0;
+        name = argv[0];
+
+        verb = verbs_find_verb(name, verbs);
+        if (!verb) {
+                const Verb *hint;
+
+                if (!name)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Command verb required.");
+
+                /* Be helpful to the user, and give a hint what the user might have wanted to type.
+                 * We search with two mechanisms: a simple prefix match and – if that didn't yield
+                 * results – a Levenshtein word distance based match. */
+                hint = verbs_find_prefix_verb(name, verbs);
+                if (!hint)
+                        hint = verbs_find_closest_verb(name, verbs);
+                if (!hint)
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown command verb '%s'.", name);
+
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                       "Unknown command verb '%s', did you mean '%s'?", name, hint->verb);
+        }
+
+        /* The default verb is invoked as if just its own name had been specified */
+        if (!name)
+                left = 1;
+
+        if (verb->min_args != VERB_ANY && (unsigned) left < verb->min_args)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Too few arguments.");
+
+        if (verb->max_args != VERB_ANY && (unsigned) left > verb->max_args)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Too many arguments.");
+
+        if (FLAGS_SET(verb->flags, VERB_ONLINE_ONLY) && running_in_chroot_or_offline()) {
+                log_info("Running in chroot, ignoring command '%s'", name ?: verb->verb);
+                return 0;
+        }
+
+        if (name)
+                return verb->dispatch(left, argv, userdata);
+
+        /* No verb on the command line: synthesize an argument vector for the default verb */
+        return verb->dispatch(1, STRV_MAKE(verb->verb), userdata);
+}
diff --git a/src/shared/verbs.h b/src/shared/verbs.h
new file mode 100644
index 0000000..03819e3
--- /dev/null
+++ b/src/shared/verbs.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#define VERB_ANY (UINT_MAX)
+
+typedef enum VerbFlags {
+ VERB_DEFAULT = 1 << 0, /* The verb to run if no verb is specified */
+ VERB_ONLINE_ONLY = 1 << 1, /* Just do nothing when running in chroot or offline */
+} VerbFlags;
+
+/* One entry of a verb table as consumed by verbs_find_verb() and dispatch_verb(). Tables are
+ * terminated by an entry whose dispatch function is NULL. */
+typedef struct {
+ const char *verb; /* Name of the verb, compared against the first non-option argument */
+ unsigned min_args, max_args; /* Permitted argument count, counting the verb itself; VERB_ANY disables the check */
+ VerbFlags flags;
+ int (* const dispatch)(int argc, char *argv[], void *userdata); /* Invoked with argv[0] being the verb name */
+} Verb;
+
+bool running_in_chroot_or_offline(void);
+
+const Verb* verbs_find_verb(const char *name, const Verb verbs[]);
+int dispatch_verb(int argc, char *argv[], const Verb verbs[], void *userdata);
diff --git a/src/shared/vlan-util.c b/src/shared/vlan-util.c
new file mode 100644
index 0000000..17f2d39
--- /dev/null
+++ b/src/shared/vlan-util.c
@@ -0,0 +1,98 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "conf-parser.h"
+#include "parse-util.h"
+#include "string-util.h"
+#include "vlan-util.h"
+
+/* Parses a VLAN identifier. On success stores it in *ret and returns 0. Returns -ERANGE if the number
+ * parsed fine but is not a valid VLAN identifier, or the error returned by safe_atou16(). *ret is
+ * only written on success. */
+int parse_vlanid(const char *p, uint16_t *ret) {
+        uint16_t parsed;
+        int r;
+
+        assert(p);
+        assert(ret);
+
+        r = safe_atou16(p, &parsed);
+        if (r < 0)
+                return r;
+
+        if (vlanid_is_valid(parsed)) {
+                *ret = parsed;
+                return 0;
+        }
+
+        return -ERANGE;
+}
+
+/* Parses a range of VLAN identifiers via parse_range(). On success stores the lower bound in *vid
+ * and the upper bound in *vid_end and returns 0. Returns -EINVAL if either bound exceeds VLANID_MAX
+ * or the lower bound is larger than the upper one, or the error returned by parse_range(). The
+ * output parameters are only written on success. */
+int parse_vid_range(const char *p, uint16_t *vid, uint16_t *vid_end) {
+        unsigned lower, upper;
+        int r;
+
+        /* Both output pointers are dereferenced unconditionally below, hence insist on valid
+         * arguments up front, like parse_vlanid() does. */
+        assert(p);
+        assert(vid);
+        assert(vid_end);
+
+        r = parse_range(p, &lower, &upper);
+        if (r < 0)
+                return r;
+
+        if (lower > VLANID_MAX || upper > VLANID_MAX || lower > upper)
+                return -EINVAL;
+
+        /* Both values are at most VLANID_MAX at this point, hence fit into uint16_t */
+        *vid = lower;
+        *vid_end = upper;
+        return 0;
+}
+
+/* Config parser for a default port VLAN identifier: like config_parse_vlanid(), but additionally
+ * accepts the literal string "none", which is stored as 0. */
+int config_parse_default_port_vlanid(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        uint16_t *id = ASSERT_PTR(data);
+
+        assert(lvalue);
+        assert(rvalue);
+
+        /* Everything but "none" is handled by the generic parser */
+        if (!streq(rvalue, "none"))
+                return config_parse_vlanid(unit, filename, line, section, section_line,
+                                           lvalue, ltype, rvalue, data, userdata);
+
+        *id = 0;
+        return 0;
+}
+
+/* Config parser for a VLAN identifier, stored in the uint16_t that 'data' points to. Invalid values
+ * are logged as warnings and otherwise ignored, i.e. this always returns 0. */
+int config_parse_vlanid(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        uint16_t *id = ASSERT_PTR(data);
+        int r;
+
+        assert(filename);
+        assert(lvalue);
+        assert(rvalue);
+
+        r = parse_vlanid(rvalue, id);
+        if (r == -ERANGE)
+                /* A proper number, but not a valid VLAN identifier */
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "VLAN identifier outside of valid range 0…4094, ignoring: %s", rvalue);
+        else if (r < 0)
+                log_syntax(unit, LOG_WARNING, filename, line, r,
+                           "Failed to parse VLAN identifier value, ignoring: %s", rvalue);
+
+        return 0;
+}
diff --git a/src/shared/vlan-util.h b/src/shared/vlan-util.h
new file mode 100644
index 0000000..0336908
--- /dev/null
+++ b/src/shared/vlan-util.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+#include <inttypes.h>
+
+#include "conf-parser.h"
+
+#define VLANID_MAX 4094
+#define VLANID_INVALID UINT16_MAX
+
+/* Note that we permit VLAN Id 0 here, as that is apparently OK by the Linux kernel */
+static inline bool vlanid_is_valid(uint16_t id) {
+        /* The type is unsigned, hence only the upper bound needs to be checked */
+        return !(id > VLANID_MAX);
+}
+
+int parse_vlanid(const char *p, uint16_t *ret);
+int parse_vid_range(const char *p, uint16_t *vid, uint16_t *vid_end);
+
+CONFIG_PARSER_PROTOTYPE(config_parse_default_port_vlanid);
+CONFIG_PARSER_PROTOTYPE(config_parse_vlanid);
diff --git a/src/shared/volatile-util.c b/src/shared/volatile-util.c
new file mode 100644
index 0000000..5138edb
--- /dev/null
+++ b/src/shared/volatile-util.c
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+
+#include "alloc-util.h"
+#include "macro.h"
+#include "parse-util.h"
+#include "proc-cmdline.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "volatile-util.h"
+
+/* Determines the volatile mode from the "systemd.volatile" kernel command line key. Returns 0 and
+ * VOLATILE_NO if the key is not set, and 1 if it is set: with the parsed value if it carries one, or
+ * with VOLATILE_YES if it carries none. Returns a negative error if reading the key fails or its
+ * value cannot be parsed. */
+int query_volatile_mode(VolatileMode *ret) {
+        _cleanup_free_ char *mode = NULL;
+        VolatileMode m;
+        int r;
+
+        r = proc_cmdline_get_key("systemd.volatile", PROC_CMDLINE_VALUE_OPTIONAL, &mode);
+        if (r < 0)
+                return r;
+        if (r == 0) {
+                *ret = VOLATILE_NO;
+                return 0;
+        }
+
+        /* The key was specified without a value */
+        if (!mode) {
+                *ret = VOLATILE_YES;
+                return 1;
+        }
+
+        m = volatile_mode_from_string(mode);
+        if (m < 0)
+                return m;
+
+        *ret = m;
+        return 1;
+}
+
+/* String names of the VolatileMode values, indexed by the enum value. */
+static const char* const volatile_mode_table[_VOLATILE_MODE_MAX] = {
+ [VOLATILE_NO] = "no",
+ [VOLATILE_YES] = "yes",
+ [VOLATILE_STATE] = "state",
+ [VOLATILE_OVERLAY] = "overlay",
+};
+
+/* NOTE(review): presumably generates volatile_mode_from_string()/volatile_mode_to_string() from the
+ * table above, with boolean "true" strings mapping to VOLATILE_YES — confirm against string-table.h */
+DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(volatile_mode, VolatileMode, VOLATILE_YES);
diff --git a/src/shared/volatile-util.h b/src/shared/volatile-util.h
new file mode 100644
index 0000000..6e0206d
--- /dev/null
+++ b/src/shared/volatile-util.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+/* Volatile mode as configured via the "systemd.volatile" kernel command line key, see
+ * query_volatile_mode(). */
+typedef enum VolatileMode {
+ VOLATILE_NO,
+ VOLATILE_YES,
+ VOLATILE_STATE,
+ VOLATILE_OVERLAY,
+ _VOLATILE_MODE_MAX, /* Number of valid modes */
+ _VOLATILE_MODE_INVALID = -EINVAL,
+} VolatileMode;
+
+VolatileMode volatile_mode_from_string(const char *s);
+const char* volatile_mode_to_string(VolatileMode m);
+
+int query_volatile_mode(VolatileMode *ret);
diff --git a/src/shared/wall.c b/src/shared/wall.c
new file mode 100644
index 0000000..d5900ef
--- /dev/null
+++ b/src/shared/wall.c
@@ -0,0 +1,187 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "sd-login.h"
+
+#include "errno-util.h"
+#include "fd-util.h"
+#include "hostname-util.h"
+#include "io-util.h"
+#include "path-util.h"
+#include "string-util.h"
+#include "terminal-util.h"
+#include "user-util.h"
+#include "utmp-wtmp.h"
+#include "wall.h"
+
+#if ENABLE_UTMP || ENABLE_LOGIND
+
+#define TIMEOUT_USEC (50 * USEC_PER_MSEC)
+
+/* Opens the specified TTY device node without blocking and without making it the controlling
+ * terminal, and writes the message to it, giving up after TIMEOUT_USEC. Returns -ENOTTY if the path
+ * does not refer to a terminal. */
+static int write_to_terminal(const char *tty, const char *message) {
+        _cleanup_close_ int fd = -EBADF;
+
+        assert(tty);
+        assert(message);
+
+        fd = open(tty, O_WRONLY|O_NONBLOCK|O_NOCTTY|O_CLOEXEC);
+        if (fd < 0)
+                return -errno;
+
+        if (isatty(fd))
+                return loop_write_full(fd, message, SIZE_MAX, TIMEOUT_USEC);
+
+        return -ENOTTY;
+}
+
+/* Writes the message to the terminals of all users that have a USER_PROCESS entry in utmp. If
+ * 'match_tty' is non-NULL only terminals it returns true for are written to. Returns -ENOPROTOOPT if
+ * utmp support is compiled out or the utmp file does not exist, otherwise 0 or the first error
+ * encountered while writing to the individual terminals. */
+static int wall_utmp(
+                const char *message,
+                bool (*match_tty)(const char *tty, bool is_local, void *userdata),
+                void *userdata) {
+
+#if ENABLE_UTMP
+        _unused_ _cleanup_(utxent_cleanup) bool utmpx = false;
+        struct utmpx *u;
+        int r = 0;
+
+        assert(message);
+
+        /* libc's setutxent() unfortunately doesn't inform us about success, i.e. whether /var/run/utmp
+         * exists. Hence we have to check manually first. */
+        if (access(_PATH_UTMPX, F_OK) < 0) {
+                if (errno == ENOENT)
+                        return -ENOPROTOOPT;
+
+                return -errno;
+        }
+
+        utmpx = utxent_start();
+
+        while ((u = getutxent())) {
+                _cleanup_free_ char *line = NULL, *p = NULL;
+                const char *tty_path;
+                bool is_local;
+
+                if (u->ut_type != USER_PROCESS || isempty(u->ut_user))
+                        continue;
+
+                /* ut_line is a fixed-size field that carries no NUL terminator if the name fills it
+                 * completely. Hence make a bounded, properly terminated copy before treating it as
+                 * a C string, so that neither the prefix check nor open() can read past the field. */
+                line = strndup(u->ut_line, sizeof(u->ut_line));
+                if (!line)
+                        return -ENOMEM;
+
+                if (path_startswith(line, "/dev/"))
+                        tty_path = line;
+                else {
+                        p = strjoin("/dev/", line);
+                        if (!p)
+                                return -ENOMEM;
+
+                        tty_path = p;
+                }
+
+                /* It seems that the address field is always set for remote logins. For local logins and
+                 * other local entries, we get [0,0,0,0]. */
+                is_local = eqzero(u->ut_addr_v6);
+
+                if (!match_tty || match_tty(tty_path, is_local, userdata))
+                        RET_GATHER(r, write_to_terminal(tty_path, message));
+        }
+
+        return r;
+
+#else
+        return -ENOPROTOOPT;
+#endif
+}
+
+/* Writes the message to the terminals of all sessions reported by sd_get_sessions() that have a TTY.
+ * If 'match_tty' is non-NULL only terminals it returns true for are written to. Returns -ENOPROTOOPT
+ * if logind support is compiled out, otherwise 0 or the first error encountered. */
+static int wall_logind(
+                const char *message,
+                bool (*match_tty)(const char *tty, bool is_local, void *userdata),
+                void *userdata) {
+
+#if ENABLE_LOGIND
+        _cleanup_strv_free_ char **sessions = NULL;
+        int n, ret = 0;
+
+        assert(message);
+
+        n = sd_get_sessions(&sessions);
+        if (n <= 0) /* Error, or no sessions at all */
+                return n;
+
+        STRV_FOREACH(s, sessions) {
+                _cleanup_free_ char *tty_path = NULL, *tty = NULL, *rhost = NULL;
+                int q;
+
+                q = sd_session_get_tty(*s, &tty);
+                if (IN_SET(q, -ENXIO, -ENODATA)) /* These two results just mean: skip this session */
+                        continue;
+                if (q < 0)
+                        return RET_GATHER(ret, q);
+
+                tty_path = strjoin("/dev/", tty);
+                if (!tty_path)
+                        return -ENOMEM;
+
+                /* A session for which no remote host can be determined is considered local */
+                (void) sd_session_get_remote_host(*s, &rhost);
+
+                if (!match_tty || match_tty(tty_path, /* is_local= */ !rhost, userdata))
+                        RET_GATHER(ret, write_to_terminal(tty_path, message));
+        }
+
+        return ret;
+
+#else
+        return -ENOPROTOOPT;
+#endif
+}
+
+/* Broadcasts a message to the terminals of logged-in users. The message is wrapped in a banner
+ * naming the sender, the host, optionally the originating TTY, and the current time. If 'username'
+ * is NULL the login name of the caller is used, if 'origin_tty' is NULL the TTY of stdin is used if
+ * it can be determined. Terminals are enumerated via utmp, or via logind if utmp is unavailable; if
+ * neither is available this quietly returns 0. */
+int wall(
+                const char *message,
+                const char *username,
+                const char *origin_tty,
+                bool (*match_tty)(const char *tty, bool is_local, void *userdata),
+                void *userdata) {
+
+        _cleanup_free_ char *text = NULL, *hostname = NULL, *username_alloc = NULL, *stdin_tty = NULL;
+        int r;
+
+        assert(message);
+
+        hostname = gethostname_malloc();
+        if (!hostname)
+                return -ENOMEM;
+
+        if (!username) {
+                username_alloc = getlogname_malloc();
+                if (!username_alloc)
+                        return -ENOMEM;
+
+                username = username_alloc;
+        }
+
+        if (!origin_tty) {
+                /* Best effort only: if this fails the banner simply carries no TTY */
+                (void) getttyname_harder(STDIN_FILENO, &stdin_tty);
+                origin_tty = stdin_tty;
+        }
+
+        r = asprintf(&text,
+                     "\r\n"
+                     "Broadcast message from %s@%s%s%s (%s):\r\n\r\n"
+                     "%s\r\n\r\n",
+                     username, hostname,
+                     origin_tty ? " on " : "", strempty(origin_tty),
+                     FORMAT_TIMESTAMP(now(CLOCK_REALTIME)),
+                     message);
+        if (r < 0)
+                return -ENOMEM;
+
+        /* Prefer utmp, and fall back to logind only if utmp is not available */
+        r = wall_utmp(text, match_tty, userdata);
+        if (r == -ENOPROTOOPT)
+                r = wall_logind(text, match_tty, userdata);
+        if (r == -ENOPROTOOPT)
+                return 0;
+
+        return r;
+}
+
+#endif
diff --git a/src/shared/wall.h b/src/shared/wall.h
new file mode 100644
index 0000000..2964277
--- /dev/null
+++ b/src/shared/wall.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#if ENABLE_UTMP || ENABLE_LOGIND
+
+int wall(
+ const char *message,
+ const char *username,
+ const char *origin_tty,
+ bool (*match_tty)(const char *tty, bool is_local, void *userdata),
+ void *userdata);
+
+#else
+
+static inline int wall(
+ const char *message,
+ const char *username,
+ const char *origin_tty,
+ bool (*match_tty)(const char *tty, bool is_local, void *userdata),
+ void *userdata) {
+ /* Stub used when neither ENABLE_UTMP nor ENABLE_LOGIND is set: all arguments are ignored and success is reported. */
+ return 0;
+}
+
+#endif
diff --git a/src/shared/watchdog.c b/src/shared/watchdog.c
new file mode 100644
index 0000000..2d79f71
--- /dev/null
+++ b/src/shared/watchdog.c
@@ -0,0 +1,504 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <syslog.h>
+#include <unistd.h>
+#include <linux/watchdog.h>
+
+#include "devnum-util.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "log.h"
+#include "path-util.h"
+#include "string-util.h"
+#include "time-util.h"
+#include "watchdog.h"
+
+static int watchdog_fd = -EBADF; /* open watchdog device, negative while closed */
+static char *watchdog_device = NULL; /* device node to use; NULL → try /dev/watchdog0, then /dev/watchdog */
+static usec_t watchdog_timeout = 0; /* 0 → close device and USEC_INFINITY → don't change timeout */
+static usec_t watchdog_pretimeout = 0; /* 0 → disable pretimeout and USEC_INFINITY → don't change pretimeout */
+static usec_t watchdog_last_ping = USEC_INFINITY; /* CLOCK_BOOTTIME stamp of the last successful keep-alive */
+static bool watchdog_supports_pretimeout = false; /* Depends on kernel state that might change at runtime */
+static char *watchdog_pretimeout_governor = NULL; /* requested governor name; NULL or empty → nothing is written to sysfs */
+
+/* Starting from kernel version 4.5, the maximum allowable watchdog timeout is
+ * UINT_MAX/1000U seconds (since internal calculations are done in milliseconds
+ * using unsigned integers). However, the kernel's userspace API for the watchdog
+ * uses signed integers for its ioctl parameters (even for timeout values and
+ * bit flags) so this is why we must consider the maximum signed integer value
+ * as well.
+ */
+#define WATCHDOG_TIMEOUT_MAX_SEC (CONST_MIN(UINT_MAX/1000U, (unsigned)INT_MAX))
+
+#define WATCHDOG_GOV_NAME_MAXLEN 20 /* From the kernel watchdog driver */
+
+/* Converts a microsecond value to whole seconds, rounding up, and clamps the result to
+ * WATCHDOG_TIMEOUT_MAX_SEC so that it fits the int used by the watchdog ioctls. */
+static int saturated_usec_to_sec(usec_t val) {
+        const usec_t ceiling = (usec_t) WATCHDOG_TIMEOUT_MAX_SEC;
+        usec_t secs = DIV_ROUND_UP(val, USEC_PER_SEC);
+
+        if (secs > ceiling)
+                secs = ceiling;
+
+        return secs;
+}
+
+/* Builds "/sys/dev/char/<major>:<minor>/<filename>" for the currently open watchdog device and
+ * stores the newly allocated string in *ret_path. Returns 0 on success, -EBADF if no device is
+ * open or the fd is not a character device, -errno if fstat() fails, -ENOMEM on allocation
+ * failure. */
+static int get_watchdog_sysfs_path(const char *filename, char **ret_path) {
+        struct stat sb;
+        int n;
+
+        if (watchdog_fd < 0)
+                return -EBADF;
+
+        if (fstat(watchdog_fd, &sb) != 0)
+                return -errno;
+
+        if (!S_ISCHR(sb.st_mode))
+                return -EBADF;
+
+        n = asprintf(ret_path, "/sys/dev/char/"DEVNUM_FORMAT_STR"/%s", DEVNUM_FORMAT_VAL(sb.st_rdev), filename);
+
+        return n < 0 ? -ENOMEM : 0;
+}
+
+/* Reads the device's "pretimeout_governor" sysfs attribute into a newly allocated string in
+ * *ret_gov, with trailing whitespace removed. Returns 0 on success, negative errno-style
+ * value otherwise. */
+static int get_pretimeout_governor(char **ret_gov) {
+        _cleanup_free_ char *attr_path = NULL;
+        int r;
+
+        r = get_watchdog_sysfs_path("pretimeout_governor", &attr_path);
+        if (r >= 0) {
+                log_info("Watchdog: reading from %s", attr_path);
+
+                r = read_virtual_file(attr_path, WATCHDOG_GOV_NAME_MAXLEN - 1, ret_gov, NULL);
+        }
+        if (r < 0)
+                return r;
+
+        delete_trailing_chars(*ret_gov, WHITESPACE);
+
+        return 0;
+}
+
+/* Writes 'governor' to the device's "pretimeout_governor" sysfs attribute. A NULL or empty name
+ * is a no-op that returns 0. Otherwise returns the result of write_string_file(), or a
+ * negative errno-style value (logged at error level) on failure. */
+static int set_pretimeout_governor(const char *governor) {
+        _cleanup_free_ char *attr_path = NULL;
+        int r;
+
+        if (isempty(governor))
+                return 0; /* Nothing requested */
+
+        r = get_watchdog_sysfs_path("pretimeout_governor", &attr_path);
+        if (r < 0)
+                return r;
+
+        log_info("Watchdog: setting pretimeout_governor to '%s' via '%s'", governor, attr_path);
+
+        r = write_string_file(
+                        attr_path,
+                        governor,
+                        WRITE_STRING_FILE_DISABLE_BUFFER |
+                        WRITE_STRING_FILE_VERIFY_ON_FAILURE |
+                        WRITE_STRING_FILE_VERIFY_IGNORE_NEWLINE);
+        if (r >= 0)
+                return r;
+
+        return log_error_errno(r, "Failed to set pretimeout_governor to '%s': %m", governor);
+}
+
+/* Enables or disables the watchdog card through WDIOC_SETOPTIONS. Requires an open device.
+ * Returns 0 on success and also when enabling is merely "not supported"; returns a negative
+ * errno-style value if disabling fails or enabling fails for any other reason. */
+static int watchdog_set_enable(bool enable) {
+        int option;
+
+        assert(watchdog_fd >= 0);
+
+        option = enable ? WDIOS_ENABLECARD : WDIOS_DISABLECARD;
+
+        if (ioctl(watchdog_fd, WDIOC_SETOPTIONS, &option) >= 0)
+                return 0;
+
+        if (!enable)
+                return log_warning_errno(errno, "Failed to disable hardware watchdog, ignoring: %m");
+
+        /* ENOTTY means the watchdog is always enabled so we're fine */
+        log_full_errno(ERRNO_IS_NOT_SUPPORTED(errno) ? LOG_DEBUG : LOG_WARNING, errno,
+                       "Failed to enable hardware watchdog, ignoring: %m");
+
+        return ERRNO_IS_NOT_SUPPORTED(errno) ? 0 : -errno;
+}
+
+/* Queries the timeout currently programmed into the device (WDIOC_GETTIMEOUT) and stores it,
+ * converted to microseconds, in watchdog_timeout. Returns 0 or -errno. */
+static int watchdog_read_timeout(void) {
+        int secs = 0;
+
+        assert(watchdog_fd >= 0);
+
+        if (ioctl(watchdog_fd, WDIOC_GETTIMEOUT, &secs) >= 0) {
+                assert(secs > 0);
+                watchdog_timeout = secs * USEC_PER_SEC;
+                return 0;
+        }
+
+        return -errno;
+}
+
+/* Programs watchdog_timeout (saturated to whole seconds) into the device via WDIOC_SETTIMEOUT
+ * and then replaces watchdog_timeout with the value the ioctl handed back. Returns 0 or
+ * -errno. */
+static int watchdog_set_timeout(void) {
+        int secs;
+
+        assert(watchdog_fd >= 0);
+        assert(timestamp_is_set(watchdog_timeout));
+
+        secs = saturated_usec_to_sec(watchdog_timeout);
+
+        if (ioctl(watchdog_fd, WDIOC_SETTIMEOUT, &secs) >= 0) {
+                assert(secs > 0); /* buggy driver ? */
+                watchdog_timeout = secs * USEC_PER_SEC;
+                return 0;
+        }
+
+        return -errno;
+}
+
+/* Refreshes watchdog_pretimeout from the device (WDIOC_GETPRETIMEOUT). On failure the cached
+ * pretimeout is reset to 0 and the error is logged (debug level if merely unsupported) and
+ * returned. */
+static int watchdog_read_pretimeout(void) {
+        int secs = 0;
+
+        assert(watchdog_fd >= 0);
+
+        if (ioctl(watchdog_fd, WDIOC_GETPRETIMEOUT, &secs) >= 0) {
+                watchdog_pretimeout = secs * USEC_PER_SEC;
+                return 0;
+        }
+
+        watchdog_pretimeout = 0;
+
+        return log_full_errno(ERRNO_IS_NOT_SUPPORTED(errno) ? LOG_DEBUG : LOG_WARNING, errno, "Failed to get pretimeout value, ignoring: %m");
+}
+
+/* Programs watchdog_pretimeout (saturated to whole seconds) into the device via
+ * WDIOC_SETPRETIMEOUT and then re-reads the effective value into watchdog_pretimeout.
+ *
+ * Requires an open device and a concrete pretimeout (not USEC_INFINITY). Returns 0 on success
+ * and also when the driver does not support pretimeouts (watchdog_pretimeout is then 0);
+ * returns a negative errno-style value, logged at error level, on any other failure. */
+static int watchdog_set_pretimeout(void) {
+        int sec;
+
+        assert(watchdog_fd >= 0);
+        assert(watchdog_pretimeout != USEC_INFINITY);
+
+        sec = saturated_usec_to_sec(watchdog_pretimeout);
+
+        if (ioctl(watchdog_fd, WDIOC_SETPRETIMEOUT, &sec) < 0) {
+                watchdog_pretimeout = 0;
+
+                if (ERRNO_IS_NOT_SUPPORTED(errno)) {
+                        log_info("Watchdog does not support pretimeouts.");
+                        return 0;
+                }
+
+                /* 'sec' counts seconds while FORMAT_TIMESPAN() takes microseconds, hence convert
+                 * before formatting so that the message shows the value actually requested. */
+                return log_error_errno(errno, "Failed to set pretimeout to %s: %m",
+                                       FORMAT_TIMESPAN(sec * USEC_PER_SEC, USEC_PER_SEC));
+        }
+
+        /* The set ioctl does not return the actual value set so get it now. */
+        (void) watchdog_read_pretimeout();
+
+        return 0;
+}
+
+/* Returns the time of the last successful ping, mapped from CLOCK_BOOTTIME (the clock it is
+ * recorded in) to the clock requested by the caller. */
+usec_t watchdog_get_last_ping(clockid_t clock) {
+        usec_t stamp = watchdog_last_ping;
+
+        return map_clock_usec(stamp, CLOCK_BOOTTIME, clock);
+}
+
+/* Sends a keep-alive to the open device unconditionally and, on success, records the current
+ * CLOCK_BOOTTIME as the last ping time. Returns 0, or a negative errno-style value after
+ * logging a warning. */
+static int watchdog_ping_now(void) {
+        assert(watchdog_fd >= 0);
+
+        if (ioctl(watchdog_fd, WDIOC_KEEPALIVE, 0) >= 0) {
+                watchdog_last_ping = now(CLOCK_BOOTTIME);
+                return 0;
+        }
+
+        return log_warning_errno(errno, "Failed to ping hardware watchdog, ignoring: %m");
+}
+
+static int update_pretimeout(void) {
+ _cleanup_free_ char *governor = NULL;
+ int r, t_sec, pt_sec;
+ /* Applies watchdog_pretimeout (and the requested governor) to the open device and re-evaluates
+ * watchdog_supports_pretimeout. Returns 0 when there is nothing to do or on success, a negative
+ * errno-style value otherwise. */
+ if (watchdog_fd < 0)
+ return 0;
+ /* USEC_INFINITY on either value means "don't change", so there is nothing to program. */
+ if (watchdog_timeout == USEC_INFINITY || watchdog_pretimeout == USEC_INFINITY)
+ return 0;
+
+ if (!watchdog_supports_pretimeout && watchdog_pretimeout == 0)
+ return 0; /* Nothing to do */
+
+ /* The configuration changed, do not assume it can still work, as the module(s)
+ * might have been unloaded. */
+ watchdog_supports_pretimeout = false;
+
+ /* Update the pretimeout governor as well */
+ (void) set_pretimeout_governor(watchdog_pretimeout_governor);
+
+ r = get_pretimeout_governor(&governor);
+ if (r < 0)
+ return log_warning_errno(r, "Watchdog: failed to read pretimeout governor: %m");
+ if (isempty(governor))
+ return log_warning_errno(SYNTHETIC_ERRNO(EINVAL),
+ "Watchdog: no pretimeout governor detected - is the required kernel module loaded?");
+
+ /* If we have a pretimeout governor, then pretimeout is supported. Without a governor
+ * pretimeout does not work at all.
+ * Note that this might require a kernel module that is not autoloaded, so we don't
+ * cache this, but we check every time the configuration changes. */
+ watchdog_supports_pretimeout = true;
+
+ /* Determine if the pretimeout is valid for the current watchdog timeout. */
+ t_sec = saturated_usec_to_sec(watchdog_timeout);
+ pt_sec = saturated_usec_to_sec(watchdog_pretimeout);
+ if (pt_sec >= t_sec) {
+ r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+ "Cannot set watchdog pretimeout to %is (%s watchdog timeout of %is)",
+ pt_sec, pt_sec == t_sec ? "same as" : "longer than", t_sec);
+ (void) watchdog_read_pretimeout();
+ } else
+ r = watchdog_set_pretimeout();
+ /* Both branches above leave watchdog_pretimeout holding the value read back from the device,
+ * or 0 if setting or reading it failed. */
+ if (watchdog_pretimeout == 0)
+ log_info("Watchdog pretimeout is disabled.");
+ else
+ log_info("Watchdog running with a pretimeout of %s with governor '%s'.",
+ FORMAT_TIMESPAN(watchdog_pretimeout, 0),
+ governor);
+
+ return r;
+}
+
+static int update_timeout(void) {
+ int r;
+ usec_t previous_timeout;
+ /* Pushes watchdog_timeout to the open device (or adopts the device's own value), refreshes the
+ * pretimeout, enables the watchdog and sends a first ping. Returns 0 if no device is open or on
+ * success, a negative errno-style value otherwise. */
+ assert(watchdog_timeout > 0);
+
+ if (watchdog_fd < 0)
+ return 0;
+
+ previous_timeout = watchdog_timeout;
+ /* USEC_INFINITY means "keep what is programmed", so only a concrete value is written. */
+ if (watchdog_timeout != USEC_INFINITY) {
+ r = watchdog_set_timeout();
+ if (r < 0) {
+ if (!ERRNO_IS_NOT_SUPPORTED(r))
+ return log_error_errno(r, "Failed to set timeout to %s: %m",
+ FORMAT_TIMESPAN(watchdog_timeout, 0));
+
+ log_info("Modifying watchdog timeout is not supported, reusing the programmed timeout.");
+ watchdog_timeout = USEC_INFINITY;
+ }
+ }
+ /* Reached either because the caller asked to keep the programmed value or because setting it
+ * is unsupported: take over whatever the device reports. */
+ if (watchdog_timeout == USEC_INFINITY) {
+ r = watchdog_read_timeout();
+ if (r < 0) {
+ if (!ERRNO_IS_NOT_SUPPORTED(r))
+ return log_error_errno(r, "Failed to query watchdog HW timeout: %m");
+ log_info("Reading watchdog timeout is not supported, reusing the configured timeout.");
+ watchdog_timeout = previous_timeout;
+ }
+ }
+
+ /* If the watchdog timeout was changed, the pretimeout could have been
+ * changed as well by the driver or the kernel so we need to update the
+ * pretimeout now. Or if the watchdog is being configured for the first
+ * time, we want to configure the pretimeout before it is enabled. */
+ (void) update_pretimeout();
+
+ r = watchdog_set_enable(true);
+ if (r < 0)
+ return r;
+
+ log_info("Watchdog running with a timeout of %s.", FORMAT_TIMESPAN(watchdog_timeout, 0));
+
+ return watchdog_ping_now();
+}
+
+static int open_watchdog(void) {
+ struct watchdog_info ident;
+ char **try_order;
+ int r;
+ /* Opens the watchdog device if it is not open yet, remembers the path that worked in
+ * watchdog_device and applies the configured timeout. Returns 0 on success (or if already
+ * open), a negative errno-style value otherwise. */
+ if (watchdog_fd >= 0)
+ return 0;
+
+ /* Let's prefer new-style /dev/watchdog0 (i.e. kernel 3.5+) over classic /dev/watchdog. The former
+ * has the benefit that we can easily find the matching directory in sysfs from it, as the relevant
+ * sysfs attributes can only be found via /sys/dev/char/<major>:<minor> if the new-style device
+ * major/minor is used, not the old-style. */
+ try_order = !watchdog_device || PATH_IN_SET(watchdog_device, "/dev/watchdog", "/dev/watchdog0") ?
+ STRV_MAKE("/dev/watchdog0", "/dev/watchdog") : STRV_MAKE(watchdog_device);
+
+ STRV_FOREACH(wd, try_order) {
+ watchdog_fd = open(*wd, O_WRONLY|O_CLOEXEC);
+ if (watchdog_fd >= 0) {
+ if (free_and_strdup(&watchdog_device, *wd) < 0) {
+ r = log_oom_debug();
+ goto close_and_fail;
+ }
+
+ break;
+ }
+ /* Only a missing node lets us move on to the next candidate, any other open() error is final. */
+ if (errno != ENOENT)
+ return log_debug_errno(errno, "Failed to open watchdog device %s: %m", *wd);
+ }
+
+ if (watchdog_fd < 0)
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOENT), "Failed to open watchdog device %s: %m", watchdog_device ?: "auto");
+
+ if (ioctl(watchdog_fd, WDIOC_GETSUPPORT, &ident) < 0)
+ log_debug_errno(errno, "Hardware watchdog %s does not support WDIOC_GETSUPPORT ioctl, ignoring: %m", watchdog_device);
+ else
+ log_info("Using hardware watchdog '%s', version %x, device %s",
+ ident.identity,
+ ident.firmware_version,
+ watchdog_device);
+ /* update_timeout() programs the timeout, enables the device and sends the first ping. */
+ r = update_timeout();
+ if (r < 0)
+ goto close_and_fail;
+
+ return 0;
+
+close_and_fail:
+ watchdog_close(/* disarm= */ true);
+ return r;
+}
+
+/* Returns the currently configured (or last successfully opened) watchdog device path, or NULL
+ * if none has been set. The string stays owned by this module. */
+const char *watchdog_get_device(void) {
+        const char *path = watchdog_device;
+
+        return path;
+}
+
+/* Replaces the configured watchdog device path with a copy of 'path'. If the stored path
+ * actually changed, the currently open device is disarmed and closed. Returns the result of
+ * free_and_strdup(): > 0 if changed, 0 if unchanged, negative on failure. */
+int watchdog_set_device(const char *path) {
+        int changed;
+
+        changed = free_and_strdup(&watchdog_device, path);
+        if (changed > 0)
+                watchdog_close(/* disarm= */ true);
+
+        return changed;
+}
+
+/* Configures the watchdog timeout.
+ *
+ * timeout == 0             disarms and closes the device.
+ * timeout == USEC_INFINITY opens the device if needed and keeps the programmed timeout; a no-op
+ *                          when the device is already open.
+ * any other value          opens the device if needed and programs that timeout.
+ *
+ * Returns 0 on success or a negative errno-style value. If reprogramming an already open
+ * device fails, the previously stored timeout is restored. */
+int watchdog_setup(usec_t timeout) {
+        usec_t saved;
+        bool is_open;
+        int r;
+
+        if (timeout == 0) {
+                watchdog_close(true);
+                return 0;
+        }
+
+        is_open = watchdog_fd >= 0;
+
+        /* Let's shortcut duplicated requests */
+        if (is_open) {
+                if (timeout == USEC_INFINITY || timeout == watchdog_timeout)
+                        return 0;
+        }
+
+        /* Store the caller's value; update_timeout() replaces it with the closest value the
+         * driver supports. */
+        saved = watchdog_timeout;
+        watchdog_timeout = timeout;
+
+        if (!is_open)
+                return open_watchdog();
+
+        r = update_timeout();
+        if (r < 0)
+                watchdog_timeout = saved;
+
+        return r;
+}
+
+/* Configures the watchdog pretimeout. timeout == 0 disables the pretimeout, USEC_INFINITY is a
+ * no-op, as is repeating the current value while the device is open. Returns the result of
+ * update_pretimeout(). */
+int watchdog_setup_pretimeout(usec_t timeout) {
+        if (timeout == USEC_INFINITY)
+                return 0;
+
+        if (watchdog_fd >= 0 && timeout == watchdog_pretimeout)
+                return 0;
+
+        /* Store the caller's pretimeout; update_pretimeout() overwrites it with the running
+         * value, even if applying it fails. */
+        watchdog_pretimeout = timeout;
+
+        return update_pretimeout();
+}
+
+/* Remembers a copy of 'governor' as the requested pretimeout governor and tries to apply it
+ * right away. Returns -ENOMEM if the copy fails, otherwise the result of
+ * set_pretimeout_governor(). */
+int watchdog_setup_pretimeout_governor(const char *governor) {
+        int r;
+
+        r = free_and_strdup(&watchdog_pretimeout_governor, governor);
+        if (r < 0)
+                return -ENOMEM;
+
+        return set_pretimeout_governor(watchdog_pretimeout_governor);
+}
+
+/* Returns the effective timeout: the watchdog timeout minus the pretimeout when a pretimeout
+ * is supported, set, and not larger than the timeout; the plain watchdog timeout otherwise. */
+static usec_t calc_timeout(void) {
+        if (!watchdog_supports_pretimeout)
+                return watchdog_timeout;
+
+        if (!timestamp_is_set(watchdog_pretimeout))
+                return watchdog_timeout;
+
+        if (watchdog_timeout < watchdog_pretimeout)
+                return watchdog_timeout;
+
+        return watchdog_timeout - watchdog_pretimeout;
+}
+
+/* Returns how long the caller may sleep before the next ping is due: half the effective
+ * timeout, counted from the last successful ping if there was one. Returns USEC_INFINITY when
+ * no effective timeout is set. */
+usec_t watchdog_runtime_wait(void) {
+        usec_t budget, half, current;
+
+        budget = calc_timeout();
+        if (!timestamp_is_set(budget))
+                return USEC_INFINITY;
+
+        half = budget / 2;
+
+        if (!timestamp_is_set(watchdog_last_ping))
+                return half;
+
+        /* Sleep half the watchdog timeout since the last successful ping at most */
+        current = now(CLOCK_BOOTTIME);
+        assert(current >= watchdog_last_ping);
+
+        return usec_sub_unsigned(watchdog_last_ping + half, current);
+}
+
+/* Pings the watchdog if it is due. A timeout of 0 makes this a no-op; if the device is not
+ * open it is opened instead (which pings as part of setup). Pings that come earlier than a
+ * quarter of the effective timeout after the previous one are skipped. Returns 0 or a negative
+ * errno-style value. */
+int watchdog_ping(void) {
+        usec_t current, budget;
+
+        if (watchdog_timeout == 0)
+                return 0;
+
+        /* open_watchdog() will automatically ping the device for us if necessary */
+        if (watchdog_fd < 0)
+                return open_watchdog();
+
+        current = now(CLOCK_BOOTTIME);
+        budget = calc_timeout();
+
+        /* Never ping earlier than watchdog_timeout/4 and try to ping
+         * by watchdog_timeout/2 plus scheduling latencies at the latest */
+        if (timestamp_is_set(watchdog_last_ping)) {
+                usec_t elapsed;
+
+                assert(current >= watchdog_last_ping);
+
+                elapsed = current - watchdog_last_ping;
+                if (elapsed < budget / 4)
+                        return 0;
+        }
+
+        return watchdog_ping_now();
+}
+
+/* Closes the watchdog device and resets the stored timeout to 0, so that pinging becomes a
+ * no-op until watchdog_setup() is called again. With disarm=true the card is asked to disable
+ * itself and the magic 'V' byte is written before closing. */
+void watchdog_close(bool disarm) {
+
+        /* Once closed, pinging the device becomes a NOP and we request a new
+         * call to watchdog_setup() to open the device again. */
+        watchdog_timeout = 0;
+
+        if (watchdog_fd < 0)
+                return;
+
+        if (disarm) {
+                static const char magic = 'V';
+                bool retry;
+
+                (void) watchdog_set_enable(false);
+
+                /* To be sure, use magic close logic, too. Retry only if interrupted. */
+                do {
+                        retry = false;
+
+                        if (write(watchdog_fd, &magic, 1) > 0)
+                                break;
+
+                        if (errno == EINTR)
+                                retry = true;
+                        else
+                                log_warning_errno(errno, "Failed to disarm watchdog timer, ignoring: %m");
+                } while (retry);
+        }
+
+        watchdog_fd = safe_close(watchdog_fd);
+}
diff --git a/src/shared/watchdog.h b/src/shared/watchdog.h
new file mode 100644
index 0000000..a490183
--- /dev/null
+++ b/src/shared/watchdog.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "time-util.h"
+
+const char *watchdog_get_device(void);
+usec_t watchdog_get_last_ping(clockid_t clock);
+
+int watchdog_set_device(const char *path);
+int watchdog_setup(usec_t timeout);
+int watchdog_setup_pretimeout(usec_t usec);
+int watchdog_setup_pretimeout_governor(const char *governor);
+int watchdog_ping(void);
+void watchdog_close(bool disarm);
+usec_t watchdog_runtime_wait(void);
+
+static inline void watchdog_free_device(void) {
+ (void) watchdog_set_device(NULL); /* drops the stored device path; the result is deliberately ignored */
+}
diff --git a/src/shared/web-util.c b/src/shared/web-util.c
new file mode 100644
index 0000000..39a300f
--- /dev/null
+++ b/src/shared/web-util.c
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <stdbool.h>
+
+#include "string-util.h"
+#include "strv.h"
+#include "utf8.h"
+#include "web-util.h"
+
+/* Checks whether 'etag' has the outer shape of an HTTP entity tag: an optional weakness marker
+ * "W/", an opening double quote and a closing double quote. The characters in between are
+ * not inspected. NULL and empty strings are rejected.
+ *
+ * The closing quote is looked for only behind the opening one, so that a lone '"' or 'W/"',
+ * where one character would otherwise serve as both, is rejected. The empty tag '""'
+ * remains valid. */
+bool http_etag_is_valid(const char *etag) {
+        const char *rest;
+
+        if (isempty(etag))
+                return false;
+
+        /* STARTSWITH_SET() yields a pointer just past the matched prefix, or NULL */
+        rest = STARTSWITH_SET(etag, "\"", "W/\"");
+        if (!rest)
+                return false;
+
+        if (!endswith(rest, "\""))
+                return false;
+
+        return true;
+}
+
+/* Returns true if 'url' starts with "http://" or "https://", has at least one character after
+ * that prefix, and the remainder passes ascii_is_valid(). */
+bool http_url_is_valid(const char *url) {
+        const char *rest;
+
+        if (isempty(url))
+                return false;
+
+        rest = STARTSWITH_SET(url, "http://", "https://");
+
+        return rest && !isempty(rest) && ascii_is_valid(rest);
+}
+
+/* Returns true if 'url' starts with "file:/", has at least one character after that prefix,
+ * and the remainder passes ascii_is_valid(). */
+bool file_url_is_valid(const char *url) {
+        const char *rest;
+
+        if (isempty(url))
+                return false;
+
+        rest = startswith(url, "file:/");
+
+        return !isempty(rest) && ascii_is_valid(rest);
+}
+
+/* Returns true if 'url' is acceptable as a documentation reference: a valid http(s) URL, a
+ * valid file URL, or "info:"/"man:" followed by a non-empty remainder that passes
+ * ascii_is_valid(). */
+bool documentation_url_is_valid(const char *url) {
+        const char *rest;
+
+        if (isempty(url))
+                return false;
+
+        if (http_url_is_valid(url))
+                return true;
+
+        if (file_url_is_valid(url))
+                return true;
+
+        rest = STARTSWITH_SET(url, "info:", "man:");
+
+        return !isempty(rest) && ascii_is_valid(rest);
+}
diff --git a/src/shared/web-util.h b/src/shared/web-util.h
new file mode 100644
index 0000000..88b4897
--- /dev/null
+++ b/src/shared/web-util.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+
+#include "macro.h"
+
+bool http_url_is_valid(const char *url) _pure_;
+bool file_url_is_valid(const char *url) _pure_;
+
+bool documentation_url_is_valid(const char *url) _pure_;
+
+bool http_etag_is_valid(const char *etag);
diff --git a/src/shared/wifi-util.c b/src/shared/wifi-util.c
new file mode 100644
index 0000000..d4e6dca
--- /dev/null
+++ b/src/shared/wifi-util.c
@@ -0,0 +1,306 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "log.h"
+#include "string-table.h"
+#include "string-util.h"
+#include "wifi-util.h"
+
+int wifi_get_interface(sd_netlink *genl, int ifindex, enum nl80211_iftype *ret_iftype, char **ret_ssid) {
+ _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL, *reply = NULL;
+ _cleanup_free_ char *ssid = NULL;
+ const char *family;
+ uint32_t iftype;
+ size_t len;
+ int r;
+ /* Sends NL80211_CMD_GET_INTERFACE for 'ifindex' and reports the interface type and SSID.
+ * Returns 1 with the outputs filled in (*ret_ssid may be NULL if no usable SSID was reported),
+ * 0 with the outputs zeroed if no data is available, or a negative errno-style value on error.
+ * Both output pointers are optional. */
+ assert(genl);
+ assert(ifindex > 0);
+
+ r = sd_genl_message_new(genl, NL80211_GENL_NAME, NL80211_CMD_GET_INTERFACE, &m);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to create generic netlink message: %m");
+
+ r = sd_netlink_message_append_u32(m, NL80211_ATTR_IFINDEX, ifindex);
+ if (r < 0)
+ return log_debug_errno(r, "Could not append NL80211_ATTR_IFINDEX attribute: %m");
+
+ r = sd_netlink_call(genl, m, 0, &reply);
+ if (r == -ENODEV) {
+ /* For obsolete WEXT driver. */
+ log_debug_errno(r, "Failed to request information about wifi interface %d. "
+ "The device doesn't seem to have nl80211 interface. Ignoring.",
+ ifindex);
+ goto nodata;
+ }
+ if (r < 0)
+ return log_debug_errno(r, "Failed to request information about wifi interface %d: %m", ifindex);
+ if (!reply) {
+ log_debug("No reply received to request for information about wifi interface %d, ignoring.", ifindex);
+ goto nodata;
+ }
+
+ r = sd_netlink_message_get_errno(reply);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to get information about wifi interface %d: %m", ifindex);
+
+ r = sd_genl_message_get_family_name(genl, reply, &family);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to determine genl family: %m");
+ if (!streq(family, NL80211_GENL_NAME)) {
+ log_debug("Received message of unexpected genl family '%s', ignoring.", family);
+ goto nodata;
+ }
+
+ r = sd_netlink_message_read_u32(reply, NL80211_ATTR_IFTYPE, &iftype);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to get NL80211_ATTR_IFTYPE attribute: %m");
+ /* The SSID is optional: -ENODATA simply leaves ssid NULL. A reported SSID is dropped if it is
+ * empty or if its string length differs from the attribute length (embedded NUL). */
+ r = sd_netlink_message_read_data_suffix0(reply, NL80211_ATTR_SSID, &len, (void**) &ssid);
+ if (r < 0 && r != -ENODATA)
+ return log_debug_errno(r, "Failed to get NL80211_ATTR_SSID attribute: %m");
+ if (r >= 0) {
+ if (len == 0) {
+ log_debug("SSID has zero length, ignoring it.");
+ ssid = mfree(ssid);
+ } else if (strlen_ptr(ssid) != len) {
+ log_debug("SSID contains NUL characters, ignoring it.");
+ ssid = mfree(ssid);
+ }
+ }
+
+ if (ret_iftype)
+ *ret_iftype = iftype;
+
+ if (ret_ssid)
+ *ret_ssid = TAKE_PTR(ssid);
+
+ return 1;
+
+nodata:
+ if (ret_iftype)
+ *ret_iftype = 0;
+ if (ret_ssid)
+ *ret_ssid = NULL;
+ return 0;
+}
+
+int wifi_get_station(sd_netlink *genl, int ifindex, struct ether_addr *ret_bssid) {
+ _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL, *reply = NULL;
+ const char *family;
+ int r;
+ /* Sends NL80211_CMD_GET_STATION (as a dump request) for 'ifindex' and reports the
+ * NL80211_ATTR_MAC address of the reply in *ret_bssid. Returns 1 on success, 0 with *ret_bssid
+ * set to ETHER_ADDR_NULL if no data is available, or a negative errno-style value on error. */
+ assert(genl);
+ assert(ifindex > 0);
+ assert(ret_bssid);
+
+ r = sd_genl_message_new(genl, NL80211_GENL_NAME, NL80211_CMD_GET_STATION, &m);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to create generic netlink message: %m");
+
+ r = sd_netlink_message_set_flags(m, NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to set dump flag: %m");
+
+ r = sd_netlink_message_append_u32(m, NL80211_ATTR_IFINDEX, ifindex);
+ if (r < 0)
+ return log_debug_errno(r, "Could not append NL80211_ATTR_IFINDEX attribute: %m");
+
+ r = sd_netlink_call(genl, m, 0, &reply);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to request information about wifi station: %m");
+ if (!reply) {
+ log_debug("No reply received to request for information about wifi station, ignoring.");
+ goto nodata;
+ }
+
+ r = sd_netlink_message_get_errno(reply);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to get information about wifi station: %m");
+
+ r = sd_genl_message_get_family_name(genl, reply, &family);
+ if (r < 0)
+ return log_debug_errno(r, "Failed to determine genl family: %m");
+ if (!streq(family, NL80211_GENL_NAME)) {
+ log_debug("Received message of unexpected genl family '%s', ignoring.", family);
+ goto nodata;
+ }
+ /* A reply without a MAC attribute is treated as "no data" rather than as an error. */
+ r = sd_netlink_message_read_ether_addr(reply, NL80211_ATTR_MAC, ret_bssid);
+ if (r == -ENODATA)
+ goto nodata;
+ if (r < 0)
+ return log_debug_errno(r, "Failed to get NL80211_ATTR_MAC attribute: %m");
+
+ return 1;
+
+nodata:
+ *ret_bssid = ETHER_ADDR_NULL;
+ return 0;
+}
+
+static const char * const nl80211_iftype_table[NUM_NL80211_IFTYPES] = {
+ [NL80211_IFTYPE_ADHOC] = "ad-hoc",
+ [NL80211_IFTYPE_STATION] = "station",
+ [NL80211_IFTYPE_AP] = "ap",
+ [NL80211_IFTYPE_AP_VLAN] = "ap-vlan",
+ [NL80211_IFTYPE_WDS] = "wds",
+ [NL80211_IFTYPE_MONITOR] = "monitor",
+ [NL80211_IFTYPE_MESH_POINT] = "mesh-point",
+ [NL80211_IFTYPE_P2P_CLIENT] = "p2p-client",
+ [NL80211_IFTYPE_P2P_GO] = "p2p-go",
+ [NL80211_IFTYPE_P2P_DEVICE] = "p2p-device",
+ [NL80211_IFTYPE_OCB] = "ocb",
+ [NL80211_IFTYPE_NAN] = "nan",
+};
+
+DEFINE_STRING_TABLE_LOOKUP(nl80211_iftype, enum nl80211_iftype);
+
+static const char * const nl80211_cmd_table[__NL80211_CMD_AFTER_LAST] = {
+ [NL80211_CMD_GET_WIPHY] = "get_wiphy",
+ [NL80211_CMD_SET_WIPHY] = "set_wiphy",
+ [NL80211_CMD_NEW_WIPHY] = "new_wiphy",
+ [NL80211_CMD_DEL_WIPHY] = "del_wiphy",
+ [NL80211_CMD_GET_INTERFACE] = "get_interface",
+ [NL80211_CMD_SET_INTERFACE] = "set_interface",
+ [NL80211_CMD_NEW_INTERFACE] = "new_interface",
+ [NL80211_CMD_DEL_INTERFACE] = "del_interface",
+ [NL80211_CMD_GET_KEY] = "get_key",
+ [NL80211_CMD_SET_KEY] = "set_key",
+ [NL80211_CMD_NEW_KEY] = "new_key",
+ [NL80211_CMD_DEL_KEY] = "del_key",
+ [NL80211_CMD_GET_BEACON] = "get_beacon",
+ [NL80211_CMD_SET_BEACON] = "set_beacon",
+ [NL80211_CMD_START_AP] = "start_ap",
+ [NL80211_CMD_STOP_AP] = "stop_ap",
+ [NL80211_CMD_GET_STATION] = "get_station",
+ [NL80211_CMD_SET_STATION] = "set_station",
+ [NL80211_CMD_NEW_STATION] = "new_station",
+ [NL80211_CMD_DEL_STATION] = "del_station",
+ [NL80211_CMD_GET_MPATH] = "get_mpath",
+ [NL80211_CMD_SET_MPATH] = "set_mpath",
+ [NL80211_CMD_NEW_MPATH] = "new_mpath",
+ [NL80211_CMD_DEL_MPATH] = "del_mpath",
+ [NL80211_CMD_SET_BSS] = "set_bss",
+ [NL80211_CMD_SET_REG] = "set_reg",
+ [NL80211_CMD_REQ_SET_REG] = "req_set_reg",
+ [NL80211_CMD_GET_MESH_CONFIG] = "get_mesh_config",
+ [NL80211_CMD_SET_MESH_CONFIG] = "set_mesh_config",
+ [NL80211_CMD_SET_MGMT_EXTRA_IE] = "set_mgmt_extra_ie",
+ [NL80211_CMD_GET_REG] = "get_reg",
+ [NL80211_CMD_GET_SCAN] = "get_scan",
+ [NL80211_CMD_TRIGGER_SCAN] = "trigger_scan",
+ [NL80211_CMD_NEW_SCAN_RESULTS] = "new_scan_results",
+ [NL80211_CMD_SCAN_ABORTED] = "scan_aborted",
+ [NL80211_CMD_REG_CHANGE] = "reg_change",
+ [NL80211_CMD_AUTHENTICATE] = "authenticate",
+ [NL80211_CMD_ASSOCIATE] = "associate",
+ [NL80211_CMD_DEAUTHENTICATE] = "deauthenticate",
+ [NL80211_CMD_DISASSOCIATE] = "disassociate",
+ [NL80211_CMD_MICHAEL_MIC_FAILURE] = "michael_mic_failure",
+ [NL80211_CMD_REG_BEACON_HINT] = "reg_beacon_hint",
+ [NL80211_CMD_JOIN_IBSS] = "join_ibss",
+ [NL80211_CMD_LEAVE_IBSS] = "leave_ibss",
+ [NL80211_CMD_TESTMODE] = "testmode",
+ [NL80211_CMD_CONNECT] = "connect",
+ [NL80211_CMD_ROAM] = "roam",
+ [NL80211_CMD_DISCONNECT] = "disconnect",
+ [NL80211_CMD_SET_WIPHY_NETNS] = "set_wiphy_netns",
+ [NL80211_CMD_GET_SURVEY] = "get_survey",
+ [NL80211_CMD_NEW_SURVEY_RESULTS] = "new_survey_results",
+ [NL80211_CMD_SET_PMKSA] = "set_pmksa",
+ [NL80211_CMD_DEL_PMKSA] = "del_pmksa",
+ [NL80211_CMD_FLUSH_PMKSA] = "flush_pmksa",
+ [NL80211_CMD_REMAIN_ON_CHANNEL] = "remain_on_channel",
+ [NL80211_CMD_CANCEL_REMAIN_ON_CHANNEL] = "cancel_remain_on_channel",
+ [NL80211_CMD_SET_TX_BITRATE_MASK] = "set_tx_bitrate_mask",
+ [NL80211_CMD_REGISTER_FRAME] = "register_frame",
+ [NL80211_CMD_FRAME] = "frame",
+ [NL80211_CMD_FRAME_TX_STATUS] = "frame_tx_status",
+ [NL80211_CMD_SET_POWER_SAVE] = "set_power_save",
+ [NL80211_CMD_GET_POWER_SAVE] = "get_power_save",
+ [NL80211_CMD_SET_CQM] = "set_cqm",
+ [NL80211_CMD_NOTIFY_CQM] = "notify_cqm",
+ [NL80211_CMD_SET_CHANNEL] = "set_channel",
+ [NL80211_CMD_SET_WDS_PEER] = "set_wds_peer",
+ [NL80211_CMD_FRAME_WAIT_CANCEL] = "frame_wait_cancel",
+ [NL80211_CMD_JOIN_MESH] = "join_mesh",
+ [NL80211_CMD_LEAVE_MESH] = "leave_mesh",
+ [NL80211_CMD_UNPROT_DEAUTHENTICATE] = "unprot_deauthenticate",
+ [NL80211_CMD_UNPROT_DISASSOCIATE] = "unprot_disassociate",
+ [NL80211_CMD_NEW_PEER_CANDIDATE] = "new_peer_candidate",
+ [NL80211_CMD_GET_WOWLAN] = "get_wowlan",
+ [NL80211_CMD_SET_WOWLAN] = "set_wowlan",
+ [NL80211_CMD_START_SCHED_SCAN] = "start_sched_scan",
+ [NL80211_CMD_STOP_SCHED_SCAN] = "stop_sched_scan",
+ [NL80211_CMD_SCHED_SCAN_RESULTS] = "sched_scan_results",
+ [NL80211_CMD_SCHED_SCAN_STOPPED] = "sched_scan_stopped",
+ [NL80211_CMD_SET_REKEY_OFFLOAD] = "set_rekey_offload",
+ [NL80211_CMD_PMKSA_CANDIDATE] = "pmksa_candidate",
+ [NL80211_CMD_TDLS_OPER] = "tdls_oper",
+ [NL80211_CMD_TDLS_MGMT] = "tdls_mgmt",
+ [NL80211_CMD_UNEXPECTED_FRAME] = "unexpected_frame",
+ [NL80211_CMD_PROBE_CLIENT] = "probe_client",
+ [NL80211_CMD_REGISTER_BEACONS] = "register_beacons",
+ [NL80211_CMD_UNEXPECTED_4ADDR_FRAME] = "unexpected_4addr_frame",
+ [NL80211_CMD_SET_NOACK_MAP] = "set_noack_map",
+ [NL80211_CMD_CH_SWITCH_NOTIFY] = "ch_switch_notify",
+ [NL80211_CMD_START_P2P_DEVICE] = "start_p2p_device",
+ [NL80211_CMD_STOP_P2P_DEVICE] = "stop_p2p_device",
+ [NL80211_CMD_CONN_FAILED] = "conn_failed",
+ [NL80211_CMD_SET_MCAST_RATE] = "set_mcast_rate",
+ [NL80211_CMD_SET_MAC_ACL] = "set_mac_acl",
+ [NL80211_CMD_RADAR_DETECT] = "radar_detect",
+ [NL80211_CMD_GET_PROTOCOL_FEATURES] = "get_protocol_features",
+ [NL80211_CMD_UPDATE_FT_IES] = "update_ft_ies",
+ [NL80211_CMD_FT_EVENT] = "ft_event",
+ [NL80211_CMD_CRIT_PROTOCOL_START] = "crit_protocol_start",
+ [NL80211_CMD_CRIT_PROTOCOL_STOP] = "crit_protocol_stop",
+ [NL80211_CMD_GET_COALESCE] = "get_coalesce",
+ [NL80211_CMD_SET_COALESCE] = "set_coalesce",
+ [NL80211_CMD_CHANNEL_SWITCH] = "channel_switch",
+ [NL80211_CMD_VENDOR] = "vendor",
+ [NL80211_CMD_SET_QOS_MAP] = "set_qos_map",
+ [NL80211_CMD_ADD_TX_TS] = "add_tx_ts",
+ [NL80211_CMD_DEL_TX_TS] = "del_tx_ts",
+ [NL80211_CMD_GET_MPP] = "get_mpp",
+ [NL80211_CMD_JOIN_OCB] = "join_ocb",
+ [NL80211_CMD_LEAVE_OCB] = "leave_ocb",
+ [NL80211_CMD_CH_SWITCH_STARTED_NOTIFY] = "ch_switch_started_notify",
+ [NL80211_CMD_TDLS_CHANNEL_SWITCH] = "tdls_channel_switch",
+ [NL80211_CMD_TDLS_CANCEL_CHANNEL_SWITCH] = "tdls_cancel_channel_switch",
+ [NL80211_CMD_WIPHY_REG_CHANGE] = "wiphy_reg_change",
+ [NL80211_CMD_ABORT_SCAN] = "abort_scan",
+ [NL80211_CMD_START_NAN] = "start_nan",
+ [NL80211_CMD_STOP_NAN] = "stop_nan",
+ [NL80211_CMD_ADD_NAN_FUNCTION] = "add_nan_function",
+ [NL80211_CMD_DEL_NAN_FUNCTION] = "del_nan_function",
+ [NL80211_CMD_CHANGE_NAN_CONFIG] = "change_nan_config",
+ [NL80211_CMD_NAN_MATCH] = "nan_match",
+ [NL80211_CMD_SET_MULTICAST_TO_UNICAST] = "set_multicast_to_unicast",
+ [NL80211_CMD_UPDATE_CONNECT_PARAMS] = "update_connect_params",
+ [NL80211_CMD_SET_PMK] = "set_pmk",
+ [NL80211_CMD_DEL_PMK] = "del_pmk",
+ [NL80211_CMD_PORT_AUTHORIZED] = "port_authorized",
+ [NL80211_CMD_RELOAD_REGDB] = "reload_regdb",
+ [NL80211_CMD_EXTERNAL_AUTH] = "external_auth",
+ [NL80211_CMD_STA_OPMODE_CHANGED] = "sta_opmode_changed",
+ [NL80211_CMD_CONTROL_PORT_FRAME] = "control_port_frame",
+ [NL80211_CMD_GET_FTM_RESPONDER_STATS] = "get_ftm_responder_stats",
+ [NL80211_CMD_PEER_MEASUREMENT_START] = "peer_measurement_start",
+ [NL80211_CMD_PEER_MEASUREMENT_RESULT] = "peer_measurement_result",
+ [NL80211_CMD_PEER_MEASUREMENT_COMPLETE] = "peer_measurement_complete",
+ [NL80211_CMD_NOTIFY_RADAR] = "notify_radar",
+ [NL80211_CMD_UPDATE_OWE_INFO] = "update_owe_info",
+ [NL80211_CMD_PROBE_MESH_LINK] = "probe_mesh_link",
+ [NL80211_CMD_SET_TID_CONFIG] = "set_tid_config",
+ [NL80211_CMD_UNPROT_BEACON] = "unprot_beacon",
+ [NL80211_CMD_CONTROL_PORT_FRAME_TX_STATUS] = "control_port_frame_tx_status",
+ [NL80211_CMD_SET_SAR_SPECS] = "set_sar_specs",
+ [NL80211_CMD_OBSS_COLOR_COLLISION] = "obss_color_collision",
+ [NL80211_CMD_COLOR_CHANGE_REQUEST] = "color_change_request",
+ [NL80211_CMD_COLOR_CHANGE_STARTED] = "color_change_started",
+ [NL80211_CMD_COLOR_CHANGE_ABORTED] = "color_change_aborted",
+ [NL80211_CMD_COLOR_CHANGE_COMPLETED] = "color_change_completed",
+};
+
+DEFINE_STRING_TABLE_LOOKUP_TO_STRING(nl80211_cmd, int);
diff --git a/src/shared/wifi-util.h b/src/shared/wifi-util.h
new file mode 100644
index 0000000..a762fbc
--- /dev/null
+++ b/src/shared/wifi-util.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#pragma once
+
+#include <linux/nl80211.h>
+
+#include "sd-netlink.h"
+
+#include "ether-addr-util.h"
+
+int wifi_get_interface(sd_netlink *genl, int ifindex, enum nl80211_iftype *ret_iftype, char **ret_ssid);
+int wifi_get_station(sd_netlink *genl, int ifindex, struct ether_addr *ret_bssid);
+
+const char *nl80211_iftype_to_string(enum nl80211_iftype iftype) _const_;
+enum nl80211_iftype nl80211_iftype_from_string(const char *s) _pure_;
+const char *nl80211_cmd_to_string(int cmd) _const_;
diff --git a/src/shared/xml.c b/src/shared/xml.c
new file mode 100644
index 0000000..3b1fb41
--- /dev/null
+++ b/src/shared/xml.c
@@ -0,0 +1,237 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <errno.h>
+#include <stddef.h>
+
+#include "macro.h"
+#include "string-util.h"
+#include "xml.h"
+
+enum { /* Tokenizer modes; xml_tokenize() stores the current one in *state between calls */
+ STATE_NULL, /* nothing tokenized yet: xml_tokenize() sets the line counter to 1 and switches to STATE_TEXT */
+ STATE_TEXT, /* outside of tags: text, comments, processing instructions, DTDs, or the '<' that opens a tag */
+ STATE_TAG, /* inside a tag, after the tag name or after a complete attribute */
+ STATE_ATTRIBUTE, /* right after an attribute name, where an optional "=value" may follow */
+};
+
+static void inc_lines(unsigned *line, const char *s, size_t n) {
+        /* Bump *line once for each '\n' among the first n bytes of s. */
+        const char *cursor = s, *newline;
+
+        if (!line) /* the caller is not tracking line numbers */
+                return;
+
+        while ((newline = memchr(cursor, '\n', n))) {
+                /* Drop everything up to and including this newline from the remaining window. */
+                n -= (newline - cursor) + 1;
+                cursor = newline + 1;
+                ++*line;
+        }
+}
+
+/* We don't actually do real XML here. We only read a simplistic
+ * subset, that is a bit less strict than XML and lacks all the more
+ * complex features, like entities, or namespaces. However, we do
+ * support some HTML5-like simplifications */
+
+int xml_tokenize(const char **p, char **name, void **state, unsigned *line) { /* Returns the next XML_* token found at *p, or -EINVAL/-ENOMEM; *p and *state carry the position and mode between calls */
+ const char *c, *e, *b; /* c: read cursor; b and e: scratch pointers delimiting the span being examined */
+ char *ret;
+ int t;
+ /* p, *p, name and state are mandatory; line may be NULL, which turns line counting off */
+ assert(p);
+ assert(*p);
+ assert(name);
+ assert(state);
+ /* Pick up the mode and position the previous call left behind */
+ t = PTR_TO_INT(*state);
+ c = *p;
+ /* STATE_NULL means nothing was tokenized yet: start counting lines at 1 and begin in text mode */
+ if (t == STATE_NULL) {
+ if (line)
+ *line = 1;
+ t = STATE_TEXT;
+ }
+ /* Each pass either returns a token (payload duplicated into *name with strndup()) or skips input and goes around again */
+ for (;;) {
+ if (*c == 0)
+ return XML_END;
+ /* Note: on XML_END none of *p, *name and *state is written */
+ switch (t) {
+
+ case STATE_TEXT: { /* Outside of any tag: character data, or the start of markup */
+ int x;
+ /* Everything up to the next '<' (or the end of the string) is returned as one text token */
+ e = strchrnul(c, '<');
+ if (e > c) {
+ /* More text... */
+ ret = strndup(c, e - c);
+ if (!ret)
+ return -ENOMEM;
+
+ inc_lines(line, c, e - c); /* text may span several lines */
+
+ *name = ret;
+ *p = e;
+ *state = INT_TO_PTR(STATE_TEXT);
+
+ return XML_TEXT;
+ }
+ /* e == c here and *c != 0, so the cursor sits on a '<'; b points just past it */
+ assert(*e == '<');
+ b = c + 1;
+
+ if (startswith(b, "!--")) {
+ /* A comment */
+ e = strstrafter(b + 3, "-->"); /* presumably the position just past "-->", or NULL if absent (helper defined elsewhere) */
+ if (!e)
+ return -EINVAL;
+
+ inc_lines(line, b, e - b);
+ /* Comments produce no token: skip them and keep scanning */
+ c = e;
+ continue;
+ }
+
+ if (*b == '?') {
+ /* Processing instruction */
+
+ e = strstrafter(b + 1, "?>");
+ if (!e)
+ return -EINVAL;
+
+ inc_lines(line, b, e - b);
+ /* Skipped without a token, like comments */
+ c = e;
+ continue;
+ }
+
+ if (*b == '!') {
+ /* DTD */
+ /* Skipped up to the first '>', so declarations that contain a '>' themselves are not handled */
+ e = strchr(b + 1, '>');
+ if (!e)
+ return -EINVAL;
+
+ inc_lines(line, b, e + 1 - b);
+
+ c = e + 1;
+ continue;
+ }
+ /* Anything else starts a tag; x records whether it is an opening or a closing one */
+ if (*b == '/') {
+ /* A closing tag */
+ x = XML_TAG_CLOSE;
+ b++;
+ } else
+ x = XML_TAG_OPEN;
+ /* The tag name runs up to the first WHITESPACE character, '/' or '>'; a tag with no such terminator is an error */
+ e = strpbrk(b, WHITESPACE "/>");
+ if (!e)
+ return -EINVAL;
+
+ ret = strndup(b, e - b);
+ if (!ret)
+ return -ENOMEM;
+
+ *name = ret;
+ *p = e; /* the terminator is left in place for STATE_TAG to consume */
+ *state = INT_TO_PTR(STATE_TAG);
+
+ return x;
+ }
+
+ case STATE_TAG: /* Inside a tag, after its name or after an attribute */
+ /* Skip leading whitespace; running out of input inside a tag is an error */
+ b = c + strspn(c, WHITESPACE);
+ if (*b == 0)
+ return -EINVAL;
+
+ inc_lines(line, c, b - c);
+ /* A non-empty run up to whitespace, '=', '/' or '>' is an attribute name */
+ e = b + strcspn(b, WHITESPACE "=/>");
+ if (e > b) {
+ /* An attribute */
+
+ ret = strndup(b, e - b);
+ if (!ret)
+ return -ENOMEM;
+
+ *name = ret;
+ *p = e;
+ *state = INT_TO_PTR(STATE_ATTRIBUTE);
+
+ return XML_ATTRIBUTE_NAME;
+ }
+
+ if (startswith(b, "/>")) {
+ /* An empty tag */
+
+ *name = NULL; /* For empty tags we return a NULL name, the caller must be prepared for that */
+ *p = b + 2;
+ *state = INT_TO_PTR(STATE_TEXT);
+
+ return XML_TAG_CLOSE_EMPTY;
+ }
+ /* Otherwise only '>' may end the tag; a stray '/' or '=' ends up here and is rejected */
+ if (*b != '>')
+ return -EINVAL;
+ /* Tag finished without a token to report: go on with the text that follows */
+ c = b + 1;
+ t = STATE_TEXT;
+ continue;
+
+ case STATE_ATTRIBUTE: /* Right after an attribute name */
+ /* A value is only recognized if '=' follows immediately; whitespace around '=' is not skipped */
+ if (*c == '=') {
+ c++;
+
+ if (IN_SET(*c, '\'', '"')) {
+ /* Tag with a quoted value */
+ /* The value ends at the next occurrence of the same quote character; there is no escaping */
+ e = strchr(c+1, *c);
+ if (!e)
+ return -EINVAL;
+
+ inc_lines(line, c, e - c); /* quoted values may contain newlines */
+
+ ret = strndup(c+1, e - c - 1);
+ if (!ret)
+ return -ENOMEM;
+
+ *name = ret;
+ *p = e + 1; /* continue after the closing quote */
+ *state = INT_TO_PTR(STATE_TAG);
+
+ return XML_ATTRIBUTE_VALUE;
+
+ }
+
+ /* Tag with a value without quotes */
+ /* Runs up to whitespace or '>'; if neither follows, an empty value is returned and *p stays where it is */
+ b = strpbrk(c, WHITESPACE ">");
+ if (!b)
+ b = c;
+
+ ret = strndup(c, b - c);
+ if (!ret)
+ return -ENOMEM;
+
+ *name = ret;
+ *p = b;
+ *state = INT_TO_PTR(STATE_TAG);
+ return XML_ATTRIBUTE_VALUE;
+ }
+ /* No '=': the attribute has no value, carry on parsing the rest of the tag */
+ t = STATE_TAG;
+ continue;
+ }
+
+ }
+
+ assert_not_reached();
+}
diff --git a/src/shared/xml.h b/src/shared/xml.h
new file mode 100644
index 0000000..217b3b0
--- /dev/null
+++ b/src/shared/xml.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+enum { /* Token types returned by xml_tokenize(); errors are returned as negative values instead */
+ XML_END, /* the input string is exhausted */
+ XML_TEXT, /* text up to the next '<'; the text is returned in *name */
+ XML_TAG_OPEN, /* "<name": start of an opening tag; the tag name is returned in *name */
+ XML_TAG_CLOSE, /* "</name": start of a closing tag; the tag name is returned in *name */
+ XML_TAG_CLOSE_EMPTY, /* "/>" ending a tag; *name is set to NULL for this token */
+ XML_ATTRIBUTE_NAME, /* attribute name inside a tag, returned in *name */
+ XML_ATTRIBUTE_VALUE, /* quoted or unquoted value following "name=", returned in *name without the quotes */
+};
+
+int xml_tokenize(const char **p, char **name, void **state, unsigned *line); /* one XML_* token per call, or -EINVAL/-ENOMEM; *p and *state are advanced across calls, line may be NULL */