summary | refs | log | tree | commit | diff | stats
path: root/src/vmspawn
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-10 20:49:52 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-10 20:49:52 +0000
commit55944e5e40b1be2afc4855d8d2baf4b73d1876b5 (patch)
tree33f869f55a1b149e9b7c2b7e201867ca5dd52992 /src/vmspawn
parentInitial commit. (diff)
downloadsystemd-55944e5e40b1be2afc4855d8d2baf4b73d1876b5.tar.xz
systemd-55944e5e40b1be2afc4855d8d2baf4b73d1876b5.zip
Adding upstream version 255.4. [upstream/255.4]
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/vmspawn')
-rw-r--r--src/vmspawn/meson.build27
-rw-r--r--src/vmspawn/vmspawn-settings.c3
-rw-r--r--src/vmspawn/vmspawn-settings.h11
-rw-r--r--src/vmspawn/vmspawn-util.c344
-rw-r--r--src/vmspawn/vmspawn-util.h26
-rw-r--r--src/vmspawn/vmspawn.c766
6 files changed, 1177 insertions, 0 deletions
diff --git a/src/vmspawn/meson.build b/src/vmspawn/meson.build
new file mode 100644
index 0000000..800d7c3
--- /dev/null
+++ b/src/vmspawn/meson.build
@@ -0,0 +1,27 @@
+# SPDX-License-Identifier: LGPL-2.1-or-later
+
+# Helper sources shared by systemd-vmspawn; they are compiled into a static
+# library that is not built by default.
+libvmspawn_core_sources = files(
+ 'vmspawn-settings.c',
+ 'vmspawn-util.c',
+)
+libvmspawn_core = static_library(
+ 'vmspawn-core',
+ libvmspawn_core_sources,
+ include_directories : includes,
+ dependencies : [userspace],
+ build_by_default : false)
+
+# Libraries the vmspawn executable links against.
+vmspawn_libs = [
+ libvmspawn_core,
+ libshared,
+]
+
+# Register the systemd-vmspawn executable, listed under the ENABLE_VMSPAWN
+# condition. NOTE(review): how 'public' and 'conditions' are evaluated is
+# decided by executable_template, which is defined outside this file.
+executables += [
+ executable_template + {
+ 'name' : 'systemd-vmspawn',
+ 'public' : true,
+ 'conditions': ['ENABLE_VMSPAWN'],
+ 'sources' : files('vmspawn.c'),
+ 'link_with' : vmspawn_libs,
+ }
+]
diff --git a/src/vmspawn/vmspawn-settings.c b/src/vmspawn/vmspawn-settings.c
new file mode 100644
index 0000000..cb1a463
--- /dev/null
+++ b/src/vmspawn/vmspawn-settings.c
@@ -0,0 +1,3 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include "vmspawn-settings.h"
diff --git a/src/vmspawn/vmspawn-settings.h b/src/vmspawn/vmspawn-settings.h
new file mode 100644
index 0000000..268a874
--- /dev/null
+++ b/src/vmspawn/vmspawn-settings.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdint.h>
+
+/* Bit flags naming individual vmspawn settings. vmspawn.c ORs them into arg_settings_mask while
+ * parsing the command line. */
+typedef enum SettingsMask {
+ /* set when extra positional arguments are given on the command line */
+ SETTING_START_MODE = UINT64_C(1) << 0,
+ /* set when --image= is given */
+ SETTING_DIRECTORY = UINT64_C(1) << 26,
+ /* set when --set-credential= or --load-credential= is given */
+ SETTING_CREDENTIALS = UINT64_C(1) << 30,
+ /* NOTE(review): presumably present only to force a 64-bit wide enum type — confirm */
+ _SETTING_FORCE_ENUM_WIDTH = UINT64_MAX
+} SettingsMask;
diff --git a/src/vmspawn/vmspawn-util.c b/src/vmspawn/vmspawn-util.c
new file mode 100644
index 0000000..b5b5eaf
--- /dev/null
+++ b/src/vmspawn/vmspawn-util.c
@@ -0,0 +1,344 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <unistd.h>
+#include <linux/vhost.h>
+#include <sys/ioctl.h>
+
+#include "architecture.h"
+#include "conf-files.h"
+#include "errno-util.h"
+#include "fd-util.h"
+#include "fileio.h"
+#include "json.h"
+#include "log.h"
+#include "macro.h"
+#include "memory-util.h"
+#include "path-lookup.h"
+#include "path-util.h"
+#include "random-util.h"
+#include "recurse-dir.h"
+#include "siphash24.h"
+#include "socket-util.h"
+#include "sort-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "vmspawn-util.h"
+
+/* Frees an OvmfConfig together with the two path strings it owns. A NULL argument is accepted.
+ * The return value is meant to be assigned back to the caller's pointer. */
+OvmfConfig* ovmf_config_free(OvmfConfig *config) {
+        if (config) {
+                free(config->vars);
+                free(config->path);
+        }
+
+        return mfree(config);
+}
+
+/* Checks whether /dev/kvm is present.
+ *
+ * Returns true if the device node can be looked up, false if it does not exist or the lookup is
+ * denied (in both cases KVM acceleration is simply not used), and a negative errno-style value for
+ * any other access() failure. */
+int qemu_check_kvm_support(void) {
+        if (access("/dev/kvm", F_OK) >= 0)
+                return true;
+        if (errno == ENOENT) {
+                log_debug_errno(errno, "/dev/kvm not found. Not using KVM acceleration.");
+                return false;
+        }
+        /* access() reports denied permission as EACCES; EPERM is still accepted as before */
+        if (IN_SET(errno, EACCES, EPERM)) {
+                log_debug_errno(errno, "Permission denied to access /dev/kvm. Not using KVM acceleration.");
+                return false;
+        }
+
+        return -errno;
+}
+
+/* Checks whether a usable vhost-vsock device is available.
+ *
+ * Returns true if /dev/vhost-vsock can be opened read/write, false if the device is absent or
+ * access is denied (no vsock device is added to the virtual machine then), and a negative
+ * errno-style value for any other open() failure. */
+int qemu_check_vsock_support(void) {
+        _cleanup_close_ int fd = -EBADF;
+        /* Just using access() will just check if the device node exists, but not whether a
+         * device driver is behind it (this is a common case since systemd-tmpfiles creates
+         * the device node on boot, typically).
+         *
+         * Hence we open() the path to see if there's actually something behind.
+         *
+         * If not this should return ENODEV (or ENXIO, which is what POSIX specifies for a device
+         * node without a device). If the node was never created at all we get ENOENT instead.
+         */
+
+        fd = open("/dev/vhost-vsock", O_RDWR|O_CLOEXEC);
+        if (fd >= 0)
+                return true;
+        if (IN_SET(errno, ENODEV, ENXIO, ENOENT)) {
+                log_debug_errno(errno, "/dev/vhost-vsock device doesn't exist. Not adding a vsock device to the virtual machine.");
+                return false;
+        }
+        /* open() reports denied permission as EACCES; EPERM is still accepted as before */
+        if (IN_SET(errno, EACCES, EPERM)) {
+                log_debug_errno(errno, "Permission denied to access /dev/vhost-vsock. Not adding a vsock device to the virtual machine.");
+                return false;
+        }
+
+        return -errno;
+}
+
+/* Holds the fields extracted from one QEMU firmware interop JSON file. All members are heap
+ * allocated and released by firmware_data_free(). */
+typedef struct FirmwareData {
+ /* strings of the top-level "features" array */
+ char **features;
+ /* "filename" of the "executable" object inside "mapping" */
+ char *firmware;
+ /* "filename" of the "nvram-template" object inside "mapping" */
+ char *vars;
+} FirmwareData;
+
+/* Releases a FirmwareData object including its feature list and both file name strings. A NULL
+ * argument is accepted. Also defines firmware_data_freep() for use with _cleanup_(). */
+static FirmwareData* firmware_data_free(FirmwareData *fwd) {
+        if (fwd) {
+                strv_free(fwd->features);
+                free(fwd->firmware);
+                free(fwd->vars);
+        }
+
+        return mfree(fwd);
+}
+DEFINE_TRIVIAL_CLEANUP_FUNC(FirmwareData*, firmware_data_free);
+
+/* JSON dispatch callback for the "executable" object: stores its "filename" string in
+ * FirmwareData.firmware and requires a "format" string to be present. 'name' and 'flags' are
+ * not used; the nested dispatch always runs with flags 0. */
+static int firmware_executable(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
+        static const JsonDispatch executable_fields[] = {
+                { "filename", JSON_VARIANT_STRING, json_dispatch_string, offsetof(FirmwareData, firmware), JSON_MANDATORY },
+                { "format",   JSON_VARIANT_STRING, NULL,                 0,                                JSON_MANDATORY },
+                {}
+        };
+
+        return json_dispatch(v, executable_fields, 0, userdata);
+}
+
+/* JSON dispatch callback for the "nvram-template" object: stores its "filename" string in
+ * FirmwareData.vars and requires a "format" string to be present. 'name' and 'flags' are not
+ * used; the nested dispatch always runs with flags 0. */
+static int firmware_nvram_template(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
+        static const JsonDispatch nvram_fields[] = {
+                { "filename", JSON_VARIANT_STRING, json_dispatch_string, offsetof(FirmwareData, vars), JSON_MANDATORY },
+                { "format",   JSON_VARIANT_STRING, NULL,                 0,                            JSON_MANDATORY },
+                {}
+        };
+
+        return json_dispatch(v, nvram_fields, 0, userdata);
+}
+
+/* JSON dispatch callback for the "mapping" object: requires a "device" string and hands the
+ * "executable" and "nvram-template" sub-objects to their dedicated callbacks, which fill in the
+ * FirmwareData passed as userdata. 'name' and 'flags' are not used. */
+static int firmware_mapping(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
+        static const JsonDispatch mapping_fields[] = {
+                { "device",         JSON_VARIANT_STRING, NULL,                    0, JSON_MANDATORY },
+                { "executable",     JSON_VARIANT_OBJECT, firmware_executable,     0, JSON_MANDATORY },
+                { "nvram-template", JSON_VARIANT_OBJECT, firmware_nvram_template, 0, JSON_MANDATORY },
+                {}
+        };
+
+        return json_dispatch(v, mapping_fields, 0, userdata);
+}
+
+/* Looks for a QEMU firmware descriptor that matches the requested Secure Boot setting.
+ *
+ * search_sb: > 0 only accepts firmware advertising the "secure-boot" feature, 0 only accepts
+ *            firmware without it, < 0 accepts either.
+ * ret:       if non-NULL, receives a newly allocated OvmfConfig describing the first match.
+ *
+ * Returns 0 on success, -ENOENT if no descriptor matched, -ENOMEM on allocation failure, or the
+ * error of the directory lookup/listing. Descriptors that cannot be read, parsed or dispatched
+ * are logged at debug level and skipped. */
+int find_ovmf_config(int search_sb, OvmfConfig **ret) {
+        static const JsonDispatch descriptor_fields[] = {
+                { "description",     JSON_VARIANT_STRING, NULL,               0,                                JSON_MANDATORY },
+                { "interface-types", JSON_VARIANT_ARRAY,  NULL,               0,                                JSON_MANDATORY },
+                { "mapping",         JSON_VARIANT_OBJECT, firmware_mapping,   0,                                JSON_MANDATORY },
+                { "targets",         JSON_VARIANT_ARRAY,  NULL,               0,                                JSON_MANDATORY },
+                { "features",        JSON_VARIANT_ARRAY,  json_dispatch_strv, offsetof(FirmwareData, features), JSON_MANDATORY },
+                { "tags",            JSON_VARIANT_ARRAY,  NULL,               0,                                JSON_MANDATORY },
+                {}
+        };
+        _cleanup_strv_free_ char **conf_files = NULL;
+        _cleanup_free_ char *user_firmware_dir = NULL;
+        int r;
+
+        /* Search in:
+         * - $XDG_CONFIG_HOME/qemu/firmware
+         * - /etc/qemu/firmware
+         * - /usr/share/qemu/firmware
+         *
+         * Prioritising entries in "more specific" directories
+         */
+
+        r = xdg_user_config_dir(&user_firmware_dir, "/qemu/firmware");
+        if (r < 0)
+                return r;
+
+        r = conf_files_list_strv(&conf_files, ".json", NULL, CONF_FILES_FILTER_MASKED|CONF_FILES_REGULAR,
+                                 STRV_MAKE_CONST(user_firmware_dir, "/etc/qemu/firmware", "/usr/share/qemu/firmware"));
+        if (r < 0)
+                return log_debug_errno(r, "Failed to list config files: %m");
+
+        STRV_FOREACH(file, conf_files) {
+                _cleanup_(ovmf_config_freep) OvmfConfig *config = NULL;
+                _cleanup_(firmware_data_freep) FirmwareData *fwd = NULL;
+                _cleanup_(json_variant_unrefp) JsonVariant *config_json = NULL;
+                _cleanup_free_ char *contents = NULL;
+                size_t contents_sz = 0;
+                int sb_present;
+
+                /* Out-of-memory is fatal everywhere below; any other failure only skips this file. */
+                r = read_full_file(*file, &contents, &contents_sz);
+                if (r < 0) {
+                        if (r == -ENOMEM)
+                                return r;
+
+                        log_debug_errno(r, "Failed to read contents of %s - ignoring: %m", *file);
+                        continue;
+                }
+
+                r = json_parse(contents, 0, &config_json, NULL, NULL);
+                if (r < 0) {
+                        if (r == -ENOMEM)
+                                return r;
+
+                        log_debug_errno(r, "Failed to parse the JSON in %s - ignoring: %m", *file);
+                        continue;
+                }
+
+                fwd = new0(FirmwareData, 1);
+                if (!fwd)
+                        return -ENOMEM;
+
+                r = json_dispatch(config_json, descriptor_fields, 0, fwd);
+                if (r < 0) {
+                        if (r == -ENOMEM)
+                                return r;
+
+                        log_debug_errno(r, "Failed to extract the required fields from the JSON in %s - ignoring: %m", *file);
+                        continue;
+                }
+
+                sb_present = !!strv_find(fwd->features, "secure-boot");
+
+                /* exclude firmware which doesn't match our Secure Boot requirements */
+                if (search_sb >= 0 && search_sb != sb_present) {
+                        log_debug("Skipping %s, firmware doesn't fit required Secure Boot configuration", *file);
+                        continue;
+                }
+
+                /* First acceptable descriptor wins: move the paths over and stop searching. */
+                config = new0(OvmfConfig, 1);
+                if (!config)
+                        return -ENOMEM;
+
+                config->path = TAKE_PTR(fwd->firmware);
+                config->vars = TAKE_PTR(fwd->vars);
+                config->supports_sb = sb_present;
+
+                if (ret)
+                        *ret = TAKE_PTR(config);
+
+                return 0;
+        }
+
+        return -ENOENT;
+}
+
+/* Locates the QEMU executable to use for the native architecture.
+ *
+ * The generic names "qemu" and "qemu-kvm" are tried first; if neither exists the
+ * architecture-specific "qemu-system-<arch>" binary is looked up. */
+int find_qemu_binary(char **ret_qemu_binary) {
+        int r;
+
+        /*
+         * On success the path to the qemu binary will be stored in `ret_qemu_binary`
+         *
+         * If the qemu binary cannot be found -ENOENT will be returned.
+         * If the native architecture is not supported by qemu -EOPNOTSUPP will be returned;
+         */
+
+        /* Maps our architecture identifiers to the suffix of QEMU's qemu-system-* binaries. */
+        static const char *architecture_to_qemu_table[_ARCHITECTURE_MAX] = {
+                [ARCHITECTURE_ARM64]       = "aarch64",     /* differs from our name */
+                [ARCHITECTURE_ARM]         = "arm",
+                [ARCHITECTURE_ALPHA]       = "alpha",
+                [ARCHITECTURE_X86_64]      = "x86_64",      /* differs from our name */
+                [ARCHITECTURE_X86]         = "i386",        /* differs from our name */
+                [ARCHITECTURE_LOONGARCH64] = "loongarch64",
+                [ARCHITECTURE_MIPS64_LE]   = "mips64el",    /* differs from our name */
+                [ARCHITECTURE_MIPS_LE]     = "mipsel",      /* differs from our name */
+                [ARCHITECTURE_PARISC]      = "hppa",        /* differs from our name */
+                [ARCHITECTURE_PPC64_LE]    = "ppc64",       /* differs from our name, one binary for both endiannesses */
+                [ARCHITECTURE_PPC64]       = "ppc64",
+                [ARCHITECTURE_PPC]         = "ppc",
+                [ARCHITECTURE_RISCV32]     = "riscv32",
+                [ARCHITECTURE_RISCV64]     = "riscv64",
+                [ARCHITECTURE_S390X]       = "s390x",
+        };
+
+        FOREACH_STRING(s, "qemu", "qemu-kvm") {
+                r = find_executable(s, ret_qemu_binary);
+                if (r == 0)
+                        return 0;
+
+                /* only "not found" lets us fall through to the next candidate */
+                if (r != -ENOENT)
+                        return r;
+        }
+
+        const char *arch_qemu = architecture_to_qemu_table[native_architecture()];
+        if (!arch_qemu)
+                return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Architecture %s not supported by qemu", architecture_to_string(native_architecture()));
+
+        _cleanup_free_ char *qemu_arch_specific = NULL;
+        qemu_arch_specific = strjoin("qemu-system-", arch_qemu);
+        if (!qemu_arch_specific)
+                return -ENOMEM;
+
+        return find_executable(qemu_arch_specific, ret_qemu_binary);
+}
+
+/* Tries to assign 'cid' as guest CID on the vhost-vsock fd 'vfd'. Returns 1 on success, 0 if the
+ * CID is already in use (EADDRINUSE), and a negative errno-style value on any other failure. */
+static int vsock_try_guest_cid(int vfd, uint64_t cid) {
+        if (ioctl(vfd, VHOST_VSOCK_SET_GUEST_CID, &cid) >= 0)
+                return 1;
+
+        return errno == EADDRINUSE ? 0 : -errno;
+}
+
+/* Opens /dev/vhost-vsock and pins a guest CID on it.
+ *
+ * machine_cid:    in: requested CID or VMADDR_CID_ANY; out: the CID picked when none was requested.
+ * machine:        machine name, used to derive candidate CIDs deterministically.
+ * ret_child_sock: receives the vhost-vsock fd on success (still O_CLOEXEC).
+ *
+ * Returns 0 on success or a negative errno-style value. */
+int vsock_fix_child_cid(unsigned *machine_cid, const char *machine, int *ret_child_sock) {
+        /* this is an arbitrary value picked from /dev/urandom */
+        static const uint8_t sip_key[HASH_KEY_SIZE] = {
+                0x03, 0xad, 0xf0, 0xa4,
+                0x59, 0x2c, 0x77, 0x11,
+                0xda, 0x39, 0x0c, 0xba,
+                0xf5, 0x4c, 0x80, 0x52
+        };
+        struct siphash machine_hash_state;
+        _cleanup_close_ int vfd = -EBADF;
+        int r;
+
+        /* uint64_t is required here for the ioctl call, but valid CIDs are only 32 bits */
+        uint64_t cid = *ASSERT_PTR(machine_cid);
+
+        assert(machine);
+        assert(ret_child_sock);
+
+        /* Fix the CID of the AF_VSOCK socket passed to qemu
+         *
+         * If the user has passed us a CID (machine_cid != VMADDR_CID_ANY), then attempt to bind to that CID
+         * and error if we cannot.
+         *
+         * Otherwise hash the machine name to get a random CID and attempt to bind to that.
+         * If it is occupied add more information into the hash and try again.
+         * If after 64 attempts this hasn't worked fallback to truly random CIDs.
+         * If after another 64 attempts this hasn't worked then give up and return EADDRNOTAVAIL.
+         */
+
+        /* remove O_CLOEXEC before this fd is passed to QEMU */
+        vfd = open("/dev/vhost-vsock", O_RDWR|O_CLOEXEC);
+        if (vfd < 0)
+                return log_debug_errno(errno, "Failed to open /dev/vhost-vsock as read/write: %m");
+
+        if (cid != VMADDR_CID_ANY) {
+                /* explicit request: a single attempt, no fallback */
+                if (ioctl(vfd, VHOST_VSOCK_SET_GUEST_CID, &cid) < 0)
+                        return log_debug_errno(errno, "Failed to set CID for child vsock with user provided CID %" PRIu64 ": %m", cid);
+
+                *ret_child_sock = TAKE_FD(vfd);
+                return 0;
+        }
+
+        /* Phase 1: candidates derived from the machine name plus the attempt counter */
+        siphash24_init(&machine_hash_state, sip_key);
+        siphash24_compress_string(machine, &machine_hash_state);
+        for (unsigned i = 0; i < 64; i++) {
+                struct siphash attempt_state = machine_hash_state;
+
+                siphash24_compress_safe(&i, sizeof i, &attempt_state);
+                uint64_t hash = siphash24_finalize(&attempt_state);
+
+                cid = 3 + (hash % (UINT_MAX - 4));
+                r = vsock_try_guest_cid(vfd, cid);
+                if (r < 0)
+                        return r;
+                if (r > 0) {
+                        *machine_cid = cid;
+                        *ret_child_sock = TAKE_FD(vfd);
+                        return 0;
+                }
+        }
+
+        /* Phase 2: random candidates */
+        for (unsigned i = 0; i < 64; i++) {
+                cid = 3 + random_u64_range(UINT_MAX - 4);
+                r = vsock_try_guest_cid(vfd, cid);
+                if (r < 0)
+                        return r;
+                if (r > 0) {
+                        *machine_cid = cid;
+                        *ret_child_sock = TAKE_FD(vfd);
+                        return 0;
+                }
+        }
+
+        return log_debug_errno(SYNTHETIC_ERRNO(EADDRNOTAVAIL), "Failed to assign a CID to the guest vsock");
+}
diff --git a/src/vmspawn/vmspawn-util.h b/src/vmspawn/vmspawn-util.h
new file mode 100644
index 0000000..53ad7dd
--- /dev/null
+++ b/src/vmspawn/vmspawn-util.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+#include "macro.h"
+
+/* 1 when building for x86 (32/64 bit) or ARM (32/64 bit), 0 otherwise. vmspawn.c uses it to decide
+ * whether data can be handed to the guest via QEMU's -smbios option. */
+#if defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__)
+#define ARCHITECTURE_SUPPORTS_SMBIOS 1
+#else
+#define ARCHITECTURE_SUPPORTS_SMBIOS 0
+#endif
+
+/* Describes one firmware found by find_ovmf_config(). The strings are owned by the structure and
+ * released by ovmf_config_free(). */
+typedef struct OvmfConfig {
+ /* file name of the firmware executable */
+ char *path;
+ /* file name of the NVRAM variables template */
+ char *vars;
+ /* true if the firmware descriptor lists the "secure-boot" feature */
+ bool supports_sb;
+} OvmfConfig;
+
+/* Frees an OvmfConfig and its strings; ovmf_config_freep() is generated for _cleanup_() use. */
+OvmfConfig* ovmf_config_free(OvmfConfig *ovmf_config);
+DEFINE_TRIVIAL_CLEANUP_FUNC(OvmfConfig*, ovmf_config_free);
+
+/* Both return true/false for "usable"/"not usable", or a negative errno-style value on error. */
+int qemu_check_kvm_support(void);
+int qemu_check_vsock_support(void);
+/* search_sb is a tristate (< 0: don't care). Returns 0 on success, -ENOENT if nothing matched. */
+int find_ovmf_config(int search_sb, OvmfConfig **ret_ovmf_config);
+/* Stores the path of the QEMU executable in *ret_qemu_binary; -EOPNOTSUPP for unsupported architectures. */
+int find_qemu_binary(char **ret_qemu_binary);
+/* Opens /dev/vhost-vsock, assigns a guest CID and returns the fd in *ret_child_sock. */
+int vsock_fix_child_cid(unsigned *machine_cid, const char *machine, int *ret_child_sock);
diff --git a/src/vmspawn/vmspawn.c b/src/vmspawn/vmspawn.c
new file mode 100644
index 0000000..ebae681
--- /dev/null
+++ b/src/vmspawn/vmspawn.c
@@ -0,0 +1,766 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+
+#include <getopt.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "alloc-util.h"
+#include "architecture.h"
+#include "build.h"
+#include "common-signal.h"
+#include "copy.h"
+#include "creds-util.h"
+#include "escape.h"
+#include "fileio.h"
+#include "format-util.h"
+#include "fs-util.h"
+#include "hexdecoct.h"
+#include "hostname-util.h"
+#include "log.h"
+#include "machine-credential.h"
+#include "main-func.h"
+#include "pager.h"
+#include "parse-argument.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "pretty-print.h"
+#include "process-util.h"
+#include "sd-event.h"
+#include "signal-util.h"
+#include "socket-util.h"
+#include "strv.h"
+#include "tmpfile-util.h"
+#include "vmspawn-settings.h"
+#include "vmspawn-util.h"
+
+/* Command line state, filled in by parse_argv(). */
+static PagerFlags arg_pager_flags = 0;
+static char *arg_image = NULL;
+static char *arg_machine = NULL;
+static char *arg_qemu_smp = NULL;
+/* guest RAM size in bytes, 2 GiB unless --qemu-mem= is given */
+static uint64_t arg_qemu_mem = 2ULL * 1024ULL * 1024ULL * 1024ULL;
+/* tristates: negative means "not specified, probe at runtime" */
+static int arg_qemu_kvm = -1;
+static int arg_qemu_vsock = -1;
+/* values below UINT_MAX are treated as an explicitly requested CID */
+static uint64_t arg_vsock_cid = UINT64_MAX;
+static bool arg_qemu_gui = false;
+/* tristate passed to find_ovmf_config(): negative accepts firmware with or without Secure Boot */
+static int arg_secure_boot = -1;
+static MachineCredential *arg_credentials = NULL;
+static size_t arg_n_credentials = 0;
+static SettingsMask arg_settings_mask = 0;
+/* positional arguments left over after option parsing */
+static char **arg_parameters = NULL;
+
+/* NOTE(review): arg_credentials has no destructor registered here — confirm it is released elsewhere */
+STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_machine, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_qemu_smp, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_parameters, strv_freep);
+
+/* Prints the usage text (through the pager unless disabled). Returns 0 on success or the result
+ * of log_oom() if the man page link cannot be built. */
+static int help(void) {
+        _cleanup_free_ char *link = NULL;
+        int r;
+
+        pager_open(arg_pager_flags);
+
+        r = terminal_urlify_man("systemd-vmspawn", "1", &link);
+        if (r < 0)
+                return log_oom();
+
+        /* Positional arguments: 1 = program name, 2 = man page link, 3/4 = underline on/off,
+         * 5/6 = highlight on/off. Every section ends in an empty line. */
+        printf("%1$s [OPTIONS...] [ARGUMENTS...]\n\n"
+               "%5$sSpawn a command or OS in a virtual machine.%6$s\n\n"
+               " -h --help Show this help\n"
+               " --version Print version string\n"
+               " --no-pager Do not pipe output into a pager\n\n"
+               "%3$sImage:%4$s\n"
+               " -i --image=PATH Root file system disk image (or device node) for\n"
+               " the virtual machine\n\n"
+               "%3$sHost Configuration:%4$s\n"
+               " --qemu-smp=SMP Configure guest's SMP settings\n"
+               " --qemu-mem=MEM Configure guest's RAM size\n"
+               " --qemu-kvm=BOOL Configure whether to use KVM or not\n"
+               " --qemu-vsock=BOOL Configure whether to use qemu with a vsock or not\n"
+               " --vsock-cid= Specify the CID to use for the qemu guest's vsock\n"
+               " --qemu-gui Start QEMU in graphical mode\n"
+               " --secure-boot=BOOL Configure whether to search for firmware which\n"
+               " supports Secure Boot\n\n"
+               "%3$sSystem Identity:%4$s\n"
+               " -M --machine=NAME Set the machine name for the virtual machine\n\n"
+               "%3$sCredentials:%4$s\n"
+               " --set-credential=ID:VALUE\n"
+               " Pass a credential with literal value to virtual machine.\n"
+               " --load-credential=ID:PATH\n"
+               " Load credential to pass to virtual machine from file or\n"
+               " AF_UNIX stream socket.\n"
+               "\nSee the %2$s for details.\n",
+               program_invocation_short_name,
+               link,
+               ansi_underline(),
+               ansi_normal(),
+               ansi_highlight(),
+               ansi_normal());
+
+        return 0;
+}
+
+/* Parses the command line into the arg_* globals.
+ *
+ * Returns 1 when parsing completed and the caller should continue, the return value of
+ * help()/version() when only informational output was requested, and a negative errno-style value
+ * on invalid input or allocation failure. Arguments left over after the options are copied to
+ * arg_parameters. */
+static int parse_argv(int argc, char *argv[]) {
+        enum {
+                ARG_VERSION = 0x100,
+                ARG_NO_PAGER,
+                ARG_QEMU_SMP,
+                ARG_QEMU_MEM,
+                ARG_QEMU_KVM,
+                ARG_QEMU_VSOCK,
+                ARG_VSOCK_CID,
+                ARG_QEMU_GUI,
+                ARG_SECURE_BOOT,
+                ARG_SET_CREDENTIAL,
+                ARG_LOAD_CREDENTIAL,
+        };
+
+        static const struct option options[] = {
+                { "help",            no_argument,       NULL, 'h'                 },
+                { "version",         no_argument,       NULL, ARG_VERSION         },
+                { "no-pager",        no_argument,       NULL, ARG_NO_PAGER        },
+                { "image",           required_argument, NULL, 'i'                 },
+                { "machine",         required_argument, NULL, 'M'                 },
+                { "qemu-smp",        required_argument, NULL, ARG_QEMU_SMP        },
+                { "qemu-mem",        required_argument, NULL, ARG_QEMU_MEM        },
+                { "qemu-kvm",        required_argument, NULL, ARG_QEMU_KVM        },
+                { "qemu-vsock",      required_argument, NULL, ARG_QEMU_VSOCK      },
+                { "vsock-cid",       required_argument, NULL, ARG_VSOCK_CID       },
+                { "qemu-gui",        no_argument,       NULL, ARG_QEMU_GUI        },
+                { "secure-boot",     required_argument, NULL, ARG_SECURE_BOOT     },
+                { "set-credential",  required_argument, NULL, ARG_SET_CREDENTIAL  },
+                { "load-credential", required_argument, NULL, ARG_LOAD_CREDENTIAL },
+                {}
+        };
+
+        int c, r;
+
+        assert(argc >= 0);
+        assert(argv);
+
+        optind = 0;
+        /* The leading '+' stops option parsing at the first non-option argument. Both -i and -M take
+         * an argument, hence the colon after each of them (matching the long options above). */
+        while ((c = getopt_long(argc, argv, "+hi:M:", options, NULL)) >= 0)
+                switch (c) {
+                case 'h':
+                        return help();
+
+                case ARG_VERSION:
+                        return version();
+
+                case 'i':
+                        r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_image);
+                        if (r < 0)
+                                return r;
+
+                        arg_settings_mask |= SETTING_DIRECTORY;
+                        break;
+
+                case 'M':
+                        /* an empty name resets the machine name */
+                        if (isempty(optarg))
+                                arg_machine = mfree(arg_machine);
+                        else {
+                                if (!hostname_is_valid(optarg, 0))
+                                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                               "Invalid machine name: %s", optarg);
+
+                                r = free_and_strdup(&arg_machine, optarg);
+                                if (r < 0)
+                                        return log_oom();
+                        }
+                        break;
+
+                case ARG_NO_PAGER:
+                        arg_pager_flags |= PAGER_DISABLE;
+                        break;
+
+                case ARG_QEMU_SMP:
+                        r = free_and_strdup_warn(&arg_qemu_smp, optarg);
+                        if (r < 0)
+                                return r;
+                        break;
+
+                case ARG_QEMU_MEM:
+                        r = parse_size(optarg, 1024, &arg_qemu_mem);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse --qemu-mem=%s: %m", optarg);
+                        break;
+
+                case ARG_QEMU_KVM:
+                        r = parse_tristate(optarg, &arg_qemu_kvm);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse --qemu-kvm=%s: %m", optarg);
+                        break;
+
+                case ARG_QEMU_VSOCK:
+                        r = parse_tristate(optarg, &arg_qemu_vsock);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse --qemu-vsock=%s: %m", optarg);
+                        break;
+
+                case ARG_VSOCK_CID: {
+                        unsigned cid;
+                        /* an empty value means "pick a CID automatically" */
+                        if (isempty(optarg))
+                                cid = VMADDR_CID_ANY;
+                        else {
+                                r = safe_atou_bounded(optarg, 3, UINT_MAX - 1, &cid);
+                                if (r == -ERANGE)
+                                        return log_error_errno(r, "Invalid value for --vsock-cid=: %m");
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to parse --vsock-cid=%s: %m", optarg);
+                        }
+                        arg_vsock_cid = (uint64_t)cid;
+                        break;
+                }
+
+                case ARG_QEMU_GUI:
+                        arg_qemu_gui = true;
+                        break;
+
+                case ARG_SECURE_BOOT:
+                        r = parse_tristate(optarg, &arg_secure_boot);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse --secure-boot=%s: %m", optarg);
+                        break;
+
+                case ARG_SET_CREDENTIAL: {
+                        r = machine_credential_set(&arg_credentials, &arg_n_credentials, optarg);
+                        if (r < 0)
+                                return r;
+                        arg_settings_mask |= SETTING_CREDENTIALS;
+                        break;
+                }
+
+                case ARG_LOAD_CREDENTIAL: {
+                        r = machine_credential_load(&arg_credentials, &arg_n_credentials, optarg);
+                        if (r < 0)
+                                return r;
+
+                        arg_settings_mask |= SETTING_CREDENTIALS;
+                        break;
+                }
+
+                case '?':
+                        return -EINVAL;
+
+                default:
+                        assert_not_reached();
+                }
+
+        /* everything after the options is kept as extra parameters */
+        if (argc > optind) {
+                strv_free(arg_parameters);
+                arg_parameters = strv_copy(argv + optind);
+                if (!arg_parameters)
+                        return log_oom();
+
+                arg_settings_mask |= SETTING_START_MODE;
+        }
+
+        return 1;
+}
+
+/* Creates a listening AF_VSOCK stream socket bound to any CID and any port.
+ * Returns the socket fd (O_CLOEXEC) on success or a negative errno-style value after logging. */
+static int open_vsock(void) {
+        static const union sockaddr_union bind_addr = {
+                .vm.svm_family = AF_VSOCK,
+                .vm.svm_cid = VMADDR_CID_ANY,
+                .vm.svm_port = VMADDR_PORT_ANY,
+        };
+        _cleanup_close_ int vsock_fd = -EBADF;
+
+        vsock_fd = socket(AF_VSOCK, SOCK_STREAM|SOCK_CLOEXEC, 0);
+        if (vsock_fd < 0)
+                return log_error_errno(errno, "Failed to open AF_VSOCK socket: %m");
+
+        if (bind(vsock_fd, &bind_addr.sa, sizeof(bind_addr.vm)) < 0)
+                return log_error_errno(errno, "Failed to bind to vsock to address %u:%u: %m", bind_addr.vm.svm_cid, bind_addr.vm.svm_port);
+
+        if (listen(vsock_fd, SOMAXCONN_DELUXE) < 0)
+                return log_error_errno(errno, "Failed to listen on vsock: %m");
+
+        /* hand the fd to the caller, disarming the automatic close */
+        return TAKE_FD(vsock_fd);
+}
+
+/* I/O callback for one accepted notify connection.
+ *
+ * Reads a single message, splits it into tags at newlines/carriage returns and reacts to
+ * READY=1 (forwarded via sd_notify()), STATUS= (forwarded with a "VM running: " prefix) and
+ * EXIT_STATUS= (parsed into the int that userdata points to). After a message has been processed
+ * the event source is disabled and unreferenced. */
+static int vmspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
+        int *exit_status = ASSERT_PTR(userdata);
+        char buf[NOTIFY_BUFFER_MAX+1];
+        struct iovec iovec = {
+                .iov_base = buf,
+                .iov_len = sizeof(buf)-1,
+        };
+        struct msghdr msghdr = {
+                .msg_iov = &iovec,
+                .msg_iovlen = 1,
+        };
+        _cleanup_strv_free_ char **tags = NULL;
+        const char *value;
+        ssize_t n;
+        int r;
+
+        n = recvmsg_safe(fd, &msghdr, MSG_DONTWAIT);
+        if (ERRNO_IS_NEG_TRANSIENT(n))
+                return 0;
+        if (n == -EXFULL) {
+                log_warning_errno(n, "Got message with truncated control data, ignoring: %m");
+                return 0;
+        }
+        if (n < 0)
+                return log_warning_errno(n, "Couldn't read notification socket: %m");
+
+        if ((size_t) n >= sizeof(buf)) {
+                log_warning("Received notify message exceeded maximum size. Ignoring.");
+                return 0;
+        }
+
+        /* NUL-terminate so that the payload can be treated as a string */
+        buf[n] = 0;
+        tags = strv_split(buf, "\n\r");
+        if (!tags)
+                return log_oom();
+
+        STRV_FOREACH(tag, tags)
+                log_debug("Received tag %s from notify socket", *tag);
+
+        if (strv_contains(tags, "READY=1")) {
+                r = sd_notify(false, "READY=1\n");
+                if (r < 0)
+                        log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
+        }
+
+        value = strv_find_startswith(tags, "STATUS=");
+        if (value)
+                (void) sd_notifyf(false, "STATUS=VM running: %s", value);
+
+        value = strv_find_startswith(tags, "EXIT_STATUS=");
+        if (value) {
+                r = safe_atoi(value, exit_status);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to parse exit status from %s, ignoring: %m", value);
+        }
+
+        /* we will only receive one message from each connection so disable this source once one is received */
+        (void) sd_event_source_disable_unref(source);
+
+        return 0;
+}
+
+/* I/O callback for the listening vsock socket.
+ *
+ * Accepts one connection and registers a floating event source that lets
+ * vmspawn_dispatch_notify_fd() read from it; userdata is passed through unchanged. The new event
+ * source owns the connection fd, so the fd is closed when that source is freed. Failures to
+ * accept are logged and ignored (returns 0); failures to set up the event source are returned. */
+static int vmspawn_dispatch_vsock_connections(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
+        int r;
+        sd_event *event;
+        _cleanup_close_ int conn_fd = -EBADF;
+        /* declared after conn_fd so that it is released first and the fd is still open while the
+         * source is being torn down */
+        _cleanup_(sd_event_source_unrefp) sd_event_source *conn_source = NULL;
+
+        assert(userdata);
+
+        if (revents != EPOLLIN) {
+                log_warning("Got unexpected poll event for vsock fd.");
+                return 0;
+        }
+
+        conn_fd = accept4(fd, NULL, NULL, SOCK_CLOEXEC|SOCK_NONBLOCK);
+        if (conn_fd < 0) {
+                log_warning_errno(errno, "Failed to accept connection from vsock fd (%m), ignoring...");
+                return 0;
+        }
+
+        event = sd_event_source_get_event(source);
+        if (!event)
+                return log_error_errno(SYNTHETIC_ERRNO(ENOENT), "Failed to retrieve event from event source, exiting task");
+
+        /* add a new task to read from the connection */
+        r = sd_event_add_io(event, &conn_source, conn_fd, revents, vmspawn_dispatch_notify_fd, userdata);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate notify connection event source: %m");
+
+        /* sd_event_add_io() does not take ownership of the fd on its own, request that explicitly so
+         * that the connection is closed together with the event source */
+        r = sd_event_source_set_io_fd_own(conn_source, true);
+        if (r < 0)
+                return log_error_errno(r, "Failed to pass ownership of notify connection to event source: %m");
+
+        /* conn_fd is now owned by the event source so don't clean it up */
+        TAKE_FD(conn_fd);
+
+        /* make the source floating: the event loop keeps it alive after we drop our reference */
+        r = sd_event_source_set_floating(conn_source, true);
+        if (r < 0)
+                return log_error_errno(r, "Failed to make notify connection event source floating: %m");
+
+        return 0;
+}
+
+/* Registers the listening vsock fd with the event loop so that incoming connections are handled
+ * by vmspawn_dispatch_vsock_connections(); exit_status is handed to the callbacks as userdata.
+ * The new source is returned in *notify_event_source. Returns 0 or a negative errno-style value. */
+static int setup_notify_parent(sd_event *event, int fd, int *exit_status, sd_event_source **notify_event_source) {
+        int r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, vmspawn_dispatch_vsock_connections, exit_status);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate notify socket event source: %m");
+
+        /* purely informational, failure to set the description is ignored */
+        (void) sd_event_source_set_description(*notify_event_source, "vmspawn-notify-sock");
+
+        return 0;
+}
+
+/* Signal handler for shutdown requests; userdata carries the child PID.
+ *
+ * On the first invocation the child is sent SIGKILL and the stored PID is cleared. If there is
+ * no PID (anymore) or kill() fails, the event loop is asked to exit instead. */
+static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
+        pid_t pid = PTR_TO_PID(userdata);
+
+        /* TODO: actually talk to qemu and ask the guest to shutdown here */
+        if (pid > 0 && kill(pid, SIGKILL) >= 0) {
+                log_info("Trying to halt qemu. Send SIGTERM again to trigger vmspawn to immediately terminate.");
+                /* forget the PID so that the next signal takes the exit path below */
+                sd_event_source_set_userdata(s, NULL);
+                return 0;
+        }
+
+        sd_event_exit(sd_event_source_get_event(s), 0);
+        return 0;
+}
+
+/* Child event callback: asks the event loop the source belongs to to exit with code 0.
+ * The siginfo and userdata arguments are not used. */
+static int on_child_exit(sd_event_source *s, const siginfo_t *si, void *userdata) {
+        sd_event *loop = sd_event_source_get_event(s);
+
+        sd_event_exit(loop, 0);
+        return 0;
+}
+
+/* Appends a "-smbios" option to the QEMU command line which passes the address of our notify
+ * socket (host CID plus the port vsock_fd is bound to) as the vmm.notify_socket credential.
+ * Returns 0 on success, -errno if getsockname() fails, or the error of extending the vector. */
+static int cmdline_add_vsock(char ***cmdline, int vsock_fd) {
+        union sockaddr_union bound;
+        socklen_t bound_len = sizeof bound.vm;
+        int r;
+
+        r = strv_extend(cmdline, "-smbios");
+        if (r < 0)
+                return r;
+
+        /* find out which port the kernel picked for the listening socket */
+        if (getsockname(vsock_fd, &bound.sa, &bound_len) < 0)
+                return -errno;
+        assert(bound_len >= sizeof bound.vm);
+        assert(bound.vm.svm_family == AF_VSOCK);
+
+        log_info("Using vsock-stream:%u:%u", (unsigned) VMADDR_CID_HOST, bound.vm.svm_port);
+        r = strv_extendf(cmdline, "type=11,value=io.systemd.credential:vmm.notify_socket=vsock-stream:%u:%u", (unsigned) VMADDR_CID_HOST, bound.vm.svm_port);
+        if (r < 0)
+                return r;
+
+        return 0;
+}
+
+static int run_virtual_machine(void) {
+ _cleanup_(ovmf_config_freep) OvmfConfig *ovmf_config = NULL;
+ _cleanup_strv_free_ char **cmdline = NULL;
+ _cleanup_free_ char *machine = NULL, *qemu_binary = NULL, *mem = NULL;
+ int r;
+ _cleanup_close_ int vsock_fd = -EBADF;
+
+ bool use_kvm = arg_qemu_kvm > 0;
+ if (arg_qemu_kvm < 0) {
+ r = qemu_check_kvm_support();
+ if (r < 0)
+ return log_error_errno(r, "Failed to check for KVM support: %m");
+ use_kvm = r;
+ }
+
+ r = find_ovmf_config(arg_secure_boot, &ovmf_config);
+ if (r < 0)
+ return log_error_errno(r, "Failed to find OVMF config: %m");
+
+ /* only warn if the user hasn't disabled secureboot */
+ if (!ovmf_config->supports_sb && arg_secure_boot)
+ log_warning("Couldn't find OVMF firmware blob with Secure Boot support, "
+ "falling back to OVMF firmware blobs without Secure Boot support.");
+
+ const char *accel = use_kvm ? "kvm" : "tcg";
+ if (IN_SET(native_architecture(), ARCHITECTURE_ARM64, ARCHITECTURE_ARM64_BE))
+ machine = strjoin("type=virt,accel=", accel);
+ else
+ machine = strjoin("type=q35,accel=", accel, ",smm=", on_off(ovmf_config->supports_sb));
+ if (!machine)
+ return log_oom();
+
+ r = find_qemu_binary(&qemu_binary);
+ if (r == -EOPNOTSUPP)
+ return log_error_errno(r, "Native architecture is not supported by qemu.");
+ if (r < 0)
+ return log_error_errno(r, "Failed to find QEMU binary: %m");
+
+ if (asprintf(&mem, "%.4fM", (double)arg_qemu_mem / (1024.0 * 1024.0)) < 0)
+ return log_oom();
+
+ cmdline = strv_new(
+ qemu_binary,
+ "-machine", machine,
+ "-smp", arg_qemu_smp ?: "1",
+ "-m", mem,
+ "-object", "rng-random,filename=/dev/urandom,id=rng0",
+ "-device", "virtio-rng-pci,rng=rng0,id=rng-device0",
+ "-nic", "user,model=virtio-net-pci"
+ );
+ if (!cmdline)
+ return log_oom();
+
+ bool use_vsock = arg_qemu_vsock > 0 && ARCHITECTURE_SUPPORTS_SMBIOS;
+ if (arg_qemu_vsock < 0) {
+ r = qemu_check_vsock_support();
+ if (r < 0)
+ return log_error_errno(r, "Failed to check for VSock support: %m");
+
+ use_vsock = r;
+ }
+
+ unsigned child_cid = VMADDR_CID_ANY;
+ _cleanup_close_ int child_vsock_fd = -EBADF;
+ if (use_vsock) {
+ if (arg_vsock_cid < UINT_MAX)
+ child_cid = (unsigned)arg_vsock_cid;
+
+ r = vsock_fix_child_cid(&child_cid, arg_machine, &child_vsock_fd);
+ if (r < 0)
+ return log_error_errno(r, "Failed to fix CID for the guest vsock socket: %m");
+
+ r = strv_extend(&cmdline, "-device");
+ if (r < 0)
+ return log_oom();
+
+ log_debug("vhost-vsock-pci,guest-cid=%u,vhostfd=%d", child_cid, child_vsock_fd);
+ r = strv_extendf(&cmdline, "vhost-vsock-pci,guest-cid=%u,vhostfd=%d", child_cid, child_vsock_fd);
+ if (r < 0)
+ return log_oom();
+ }
+
+ r = strv_extend_strv(&cmdline, STRV_MAKE("-cpu", "max"), /* filter_duplicates= */ false);
+ if (r < 0)
+ return log_oom();
+
+ if (arg_qemu_gui) {
+ r = strv_extend_strv(&cmdline, STRV_MAKE("-vga", "virtio"), /* filter_duplicates= */ false);
+ if (r < 0)
+ return log_oom();
+ } else {
+ r = strv_extend_strv(&cmdline, STRV_MAKE(
+ "-nographic",
+ "-nodefaults",
+ "-chardev", "stdio,mux=on,id=console,signal=off",
+ "-serial", "chardev:console",
+ "-mon", "console"
+ ), /* filter_duplicates= */ false);
+ if (r < 0)
+ return log_oom();
+ }
+
+ if (ARCHITECTURE_SUPPORTS_SMBIOS) {
+ ssize_t n;
+ FOREACH_ARRAY(cred, arg_credentials, arg_n_credentials) {
+ _cleanup_free_ char *cred_data_b64 = NULL;
+
+ n = base64mem(cred->data, cred->size, &cred_data_b64);
+ if (n < 0)
+ return log_oom();
+
+ r = strv_extend(&cmdline, "-smbios");
+ if (r < 0)
+ return log_oom();
+
+ r = strv_extendf(&cmdline, "type=11,value=io.systemd.credential.binary:%s=%s", cred->id, cred_data_b64);
+ if (r < 0)
+ return log_oom();
+ }
+ }
+
+ r = strv_extend(&cmdline, "-drive");
+ if (r < 0)
+ return log_oom();
+
+ r = strv_extendf(&cmdline, "if=pflash,format=raw,readonly=on,file=%s", ovmf_config->path);
+ if (r < 0)
+ return log_oom();
+
+ _cleanup_(unlink_and_freep) char *ovmf_vars_to = NULL;
+ if (ovmf_config->supports_sb) {
+ const char *ovmf_vars_from = ovmf_config->vars;
+ _cleanup_close_ int source_fd = -EBADF, target_fd = -EBADF;
+
+ r = tempfn_random_child(NULL, "vmspawn-", &ovmf_vars_to);
+ if (r < 0)
+ return r;
+
+ source_fd = open(ovmf_vars_from, O_RDONLY|O_CLOEXEC);
+ if (source_fd < 0)
+ return log_error_errno(source_fd, "Failed to open OVMF vars file %s: %m", ovmf_vars_from);
+
+ target_fd = open(ovmf_vars_to, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC, 0600);
+ if (target_fd < 0)
+ return log_error_errno(errno, "Failed to create regular file for OVMF vars at %s: %m", ovmf_vars_to);
+
+ r = copy_bytes(source_fd, target_fd, UINT64_MAX, COPY_REFLINK);
+ if (r < 0)
+ return log_error_errno(r, "Failed to copy bytes from %s to %s: %m", ovmf_vars_from, ovmf_vars_to);
+
+ /* These aren't always available so don't raise an error if they fail */
+ (void) copy_xattr(source_fd, NULL, target_fd, NULL, 0);
+ (void) copy_access(source_fd, target_fd);
+ (void) copy_times(source_fd, target_fd, 0);
+
+ r = strv_extend_strv(&cmdline, STRV_MAKE(
+ "-global", "ICH9-LPC.disable_s3=1",
+ "-global", "driver=cfi.pflash01,property=secure,value=on",
+ "-drive"
+ ), /* filter_duplicates= */ false);
+ if (r < 0)
+ return log_oom();
+
+ r = strv_extendf(&cmdline, "file=%s,if=pflash,format=raw", ovmf_vars_to);
+ if (r < 0)
+ return log_oom();
+ }
+
+ r = strv_extend(&cmdline, "-drive");
+ if (r < 0)
+ return log_oom();
+
+ r = strv_extendf(&cmdline, "if=none,id=mkosi,file=%s,format=raw", arg_image);
+ if (r < 0)
+ return log_oom();
+
+ r = strv_extend_strv(&cmdline, STRV_MAKE(
+ "-device", "virtio-scsi-pci,id=scsi",
+ "-device", "scsi-hd,drive=mkosi,bootindex=1"
+ ), /* filter_duplicates= */ false);
+ if (r < 0)
+ return log_oom();
+
+ if (!strv_isempty(arg_parameters)) {
+ if (ARCHITECTURE_SUPPORTS_SMBIOS) {
+ _cleanup_free_ char *kcl = strv_join(arg_parameters, " ");
+ if (!kcl)
+ return log_oom();
+
+ r = strv_extend(&cmdline, "-smbios");
+ if (r < 0)
+ return log_oom();
+
+ r = strv_extendf(&cmdline, "type=11,value=io.systemd.stub.kernel-cmdline-extra=%s", kcl);
+ if (r < 0)
+ return log_oom();
+ } else
+ log_warning("Cannot append extra args to kernel cmdline, native architecture doesn't support SMBIOS");
+ }
+
+ if (use_vsock) {
+ vsock_fd = open_vsock();
+ if (vsock_fd < 0)
+ return log_error_errno(vsock_fd, "Failed to open vsock: %m");
+
+ r = cmdline_add_vsock(&cmdline, vsock_fd);
+ if (r == -ENOMEM)
+ return log_oom();
+ if (r < 0)
+ return log_error_errno(r, "Failed to call getsockname on vsock: %m");
+ }
+
+ _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
+ _cleanup_(sd_event_unrefp) sd_event *event = NULL;
+ r = sd_event_new(&event);
+ if (r < 0)
+ return log_error_errno(r, "Failed to get default event source: %m");
+
+ (void) sd_event_set_watchdog(event, true);
+
+ pid_t child_pid;
+ r = safe_fork_full(
+ qemu_binary,
+ NULL,
+ &child_vsock_fd, 1, /* pass the vsock fd to qemu */
+ FORK_CLOEXEC_OFF,
+ &child_pid);
+ if (r < 0)
+ return log_error_errno(r, "Failed to fork off %s: %m", qemu_binary);
+ if (r == 0) {
+ /* set TERM and LANG if they are missing */
+ if (setenv("TERM", "vt220", 0) < 0)
+ return log_oom();
+
+ if (setenv("LANG", "C.UTF-8", 0) < 0)
+ return log_oom();
+
+ execve(qemu_binary, cmdline, environ);
+ log_error_errno(errno, "Failed to execve %s: %m", qemu_binary);
+ _exit(EXIT_FAILURE);
+ }
+
+
+ int exit_status = INT_MAX;
+ if (use_vsock) {
+ r = setup_notify_parent(event, vsock_fd, &exit_status, &notify_event_source);
+ if (r < 0)
+ return log_error_errno(r, "Failed to setup event loop to handle vsock notify events: %m");
+ }
+
+ /* shutdown qemu when we are shutdown */
+ (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(child_pid));
+ (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(child_pid));
+
+ (void) sd_event_add_signal(event, NULL, SIGRTMIN+18, sigrtmin18_handler, NULL);
+
+ /* Exit when the child exits */
+ (void) sd_event_add_child(event, NULL, child_pid, WEXITED, on_child_exit, NULL);
+
+ r = sd_event_loop(event);
+ if (r < 0)
+ return log_error_errno(r, "Failed to run event loop: %m");
+
+ if (use_vsock) {
+ if (exit_status == INT_MAX) {
+ log_debug("Couldn't retrieve inner EXIT_STATUS from vsock");
+ return EXIT_SUCCESS;
+ }
+ if (exit_status != 0)
+ log_warning("Non-zero exit code received: %d", exit_status);
+ return exit_status;
+ }
+
+ return 0;
+}
+
+/* Checks that an image was configured and, when no machine name was given explicitly, derives one from
+ * the file name of the image (with a trailing ".raw" removed).
+ *
+ * Reads arg_image and may set arg_machine. Returns 0 on success, or the result of log_error_errno() on
+ * failure, which the caller checks for as a negative value. */
+static int determine_names(void) {
+        int r;
+
+        /* SYNTHETIC_ERRNO() is given the positive errno constant, same as the invocation further down in
+         * this function; a negated value must not be passed here. */
+        if (!arg_image)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing required argument -i/--image=, quitting");
+
+        if (!arg_machine) {
+                char *e;
+
+                r = path_extract_filename(arg_image, &arg_machine);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_image);
+
+                /* Truncate suffix if there is one */
+                e = endswith(arg_machine, ".raw");
+                if (e)
+                        *e = 0;
+
+                /* NOTE(review): hostname_cleanup() presumably sanitizes the string in place; whatever it
+                 * leaves behind still has to pass validation before we accept it as machine name. */
+                hostname_cleanup(arg_machine);
+                if (!hostname_is_valid(arg_machine, 0))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine machine name automatically, please use -M.");
+        }
+
+        return 0;
+}
+
+/* Program logic behind main(): parses the command line, settles on the machine name, blocks the signals
+ * that are later consumed through the event loop and finally runs the virtual machine.
+ *
+ * Returns a negative value if any step failed, the positive value reported by run_virtual_machine() if
+ * there was one, and EXIT_SUCCESS otherwise. Credentials are released on every path. */
+static int run(int argc, char *argv[]) {
+        int exit_code = EXIT_SUCCESS, rc;
+
+        log_setup();
+
+        /* A non-positive result from parse_argv() means there is nothing to run: either an error
+         * (negative) or a request that has been fully handled already (zero). */
+        rc = parse_argv(argc, argv);
+        if (rc > 0) {
+                rc = determine_names();
+                if (rc >= 0) {
+                        /* These signals are picked up via the event loop in run_virtual_machine(), hence
+                         * block them for regular delivery first. */
+                        assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGTERM, SIGINT, SIGRTMIN+18, -1) >= 0);
+
+                        rc = run_virtual_machine();
+                        if (rc > 0)
+                                exit_code = rc;
+                }
+        }
+
+        machine_credential_free_all(arg_credentials, arg_n_credentials);
+
+        /* Errors take precedence over any exit code collected above */
+        return rc < 0 ? rc : exit_code;
+}
+
+DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run); /* Hooks run() up as the program's main logic. NOTE(review): presumably positive return values of run() are used as the process exit status, confirm against the macro definition. */