diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-10 20:49:52 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-10 20:49:52 +0000 |
commit | 55944e5e40b1be2afc4855d8d2baf4b73d1876b5 (patch) | |
tree | 33f869f55a1b149e9b7c2b7e201867ca5dd52992 /src/vmspawn | |
parent | Initial commit. (diff) | |
download | systemd-55944e5e40b1be2afc4855d8d2baf4b73d1876b5.tar.xz systemd-55944e5e40b1be2afc4855d8d2baf4b73d1876b5.zip |
Adding upstream version 255.4. (ref: upstream/255.4)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/vmspawn')
-rw-r--r-- | src/vmspawn/meson.build | 27 | ||||
-rw-r--r-- | src/vmspawn/vmspawn-settings.c | 3 | ||||
-rw-r--r-- | src/vmspawn/vmspawn-settings.h | 11 | ||||
-rw-r--r-- | src/vmspawn/vmspawn-util.c | 344 | ||||
-rw-r--r-- | src/vmspawn/vmspawn-util.h | 26 | ||||
-rw-r--r-- | src/vmspawn/vmspawn.c | 766 |
6 files changed, 1177 insertions, 0 deletions
diff --git a/src/vmspawn/meson.build b/src/vmspawn/meson.build new file mode 100644 index 0000000..800d7c3 --- /dev/null +++ b/src/vmspawn/meson.build @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +libvmspawn_core_sources = files( + 'vmspawn-settings.c', + 'vmspawn-util.c', +) +libvmspawn_core = static_library( + 'vmspawn-core', + libvmspawn_core_sources, + include_directories : includes, + dependencies : [userspace], + build_by_default : false) + +vmspawn_libs = [ + libvmspawn_core, + libshared, +] + +executables += [ + executable_template + { + 'name' : 'systemd-vmspawn', + 'public' : true, + 'conditions': ['ENABLE_VMSPAWN'], + 'sources' : files('vmspawn.c'), + 'link_with' : vmspawn_libs, + } +] diff --git a/src/vmspawn/vmspawn-settings.c b/src/vmspawn/vmspawn-settings.c new file mode 100644 index 0000000..cb1a463 --- /dev/null +++ b/src/vmspawn/vmspawn-settings.c @@ -0,0 +1,3 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "vmspawn-settings.h" diff --git a/src/vmspawn/vmspawn-settings.h b/src/vmspawn/vmspawn-settings.h new file mode 100644 index 0000000..268a874 --- /dev/null +++ b/src/vmspawn/vmspawn-settings.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <stdint.h> + +typedef enum SettingsMask { /* bit flags; vmspawn.c ORs these into arg_settings_mask while parsing the command line */ + SETTING_START_MODE = UINT64_C(1) << 0, + SETTING_DIRECTORY = UINT64_C(1) << 26, + SETTING_CREDENTIALS = UINT64_C(1) << 30, + _SETTING_FORCE_ENUM_WIDTH = UINT64_MAX /* NOTE(review): presumably only present to widen the enum to 64 bits - confirm */ +} SettingsMask; diff --git a/src/vmspawn/vmspawn-util.c b/src/vmspawn/vmspawn-util.c new file mode 100644 index 0000000..b5b5eaf --- /dev/null +++ b/src/vmspawn/vmspawn-util.c @@ -0,0 +1,344 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <unistd.h> +#include <linux/vhost.h> +#include <sys/ioctl.h> + +#include "architecture.h" +#include "conf-files.h" +#include "errno-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "json.h" +#include "log.h" +#include "macro.h" +#include "memory-util.h" 
+#include "path-lookup.h"
+#include "path-util.h"
+#include "random-util.h"
+#include "recurse-dir.h"
+#include "siphash24.h"
+#include "socket-util.h"
+#include "sort-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "vmspawn-util.h"
+
+OvmfConfig* ovmf_config_free(OvmfConfig *config) { /* frees both path strings and the struct itself; always returns NULL */
+        if (!config)
+                return NULL;
+
+        free(config->path);
+        free(config->vars);
+        return mfree(config);
+}
+
+int qemu_check_kvm_support(void) { /* true if /dev/kvm exists, false if it is missing or inaccessible, negative errno otherwise */
+        if (access("/dev/kvm", F_OK) >= 0)
+                return true;
+        if (errno == ENOENT) {
+                log_debug_errno(errno, "/dev/kvm not found. Not using KVM acceleration.");
+                return false;
+        }
+        if (errno == EPERM || errno == EACCES) { /* missing permission is normally reported as EACCES, not EPERM */
+                log_debug_errno(errno, "Permission denied to access /dev/kvm. Not using KVM acceleration.");
+                return false;
+        }
+
+        return -errno;
+}
+
+int qemu_check_vsock_support(void) { /* true if /dev/vhost-vsock can be opened, false if absent or inaccessible, negative errno otherwise */
+        _cleanup_close_ int fd = -EBADF;
+        /* Just using access() will just check if the device node exists, but not whether a
+         * device driver is behind it (this is a common case since systemd-tmpfiles creates
+         * the device node on boot, typically).
+         *
+         * Hence we open() the path to see if there's actually something behind.
+         *
+         * If not this should return ENODEV (or ENOENT if the device node itself is missing).
+         */
+
+        fd = open("/dev/vhost-vsock", O_RDWR|O_CLOEXEC);
+        if (fd >= 0)
+                return true;
+        if (errno == ENODEV || errno == ENOENT) {
+                log_debug_errno(errno, "/dev/vhost-vsock device doesn't exist. Not adding a vsock device to the virtual machine.");
+                return false;
+        }
+        if (errno == EPERM || errno == EACCES) { /* open() reports missing file permission as EACCES */
+                log_debug_errno(errno, "Permission denied to access /dev/vhost-vsock. Not adding a vsock device to the virtual machine.");
+                return false;
+        }
+
+        return -errno;
+}
+
+/* holds the data retrieved from the QEMU firmware interop JSON data */
+typedef struct FirmwareData {
+        char **features;
+        char *firmware;
+        char *vars;
+} FirmwareData;
+
+static FirmwareData* firmware_data_free(FirmwareData *fwd) { /* releases all members and the struct; always returns NULL */
+        if (!fwd)
+                return NULL;
+
+        fwd->features = strv_free(fwd->features);
+        fwd->firmware = mfree(fwd->firmware);
+        fwd->vars = mfree(fwd->vars);
+
+        return mfree(fwd);
+}
+DEFINE_TRIVIAL_CLEANUP_FUNC(FirmwareData*, firmware_data_free);
+
+static int firmware_executable(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { /* stores "filename" in FirmwareData.firmware */
+        static const JsonDispatch table[] = {
+                { "filename", JSON_VARIANT_STRING, json_dispatch_string, offsetof(FirmwareData, firmware), JSON_MANDATORY },
+                { "format",   JSON_VARIANT_STRING, NULL,                 0,                                JSON_MANDATORY },
+                {}
+        };
+
+        return json_dispatch(v, table, 0, userdata);
+}
+
+static int firmware_nvram_template(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { /* stores "filename" in FirmwareData.vars */
+        static const JsonDispatch table[] = {
+                { "filename", JSON_VARIANT_STRING, json_dispatch_string, offsetof(FirmwareData, vars), JSON_MANDATORY },
+                { "format",   JSON_VARIANT_STRING, NULL,                 0,                            JSON_MANDATORY },
+                {}
+        };
+
+        return json_dispatch(v, table, 0, userdata);
+}
+
+static int firmware_mapping(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { /* dispatches the "executable" and "nvram-template" sub-objects */
+        static const JsonDispatch table[] = {
+                { "device",         JSON_VARIANT_STRING, NULL,                    0, JSON_MANDATORY },
+                { "executable",     JSON_VARIANT_OBJECT, firmware_executable,     0, JSON_MANDATORY },
+                { "nvram-template", JSON_VARIANT_OBJECT, firmware_nvram_template, 0, JSON_MANDATORY },
+                {}
+        };
+
+        return json_dispatch(v, table, 0, userdata);
+}
+
+int find_ovmf_config(int search_sb, OvmfConfig **ret) { /* search_sb < 0 accepts any firmware; returns -ENOENT if no descriptor matches */
+        _cleanup_(ovmf_config_freep) OvmfConfig *config = NULL;
+        _cleanup_free_ char *user_firmware_dir = NULL;
+        _cleanup_strv_free_ char **conf_files = NULL;
+        int r;
+
+        /* Search in:
+         * - $XDG_CONFIG_HOME/qemu/firmware
+         * - /etc/qemu/firmware
+         * - /usr/share/qemu/firmware
+         *
+         * Prioritising entries in "more specific" directories
+         */
+
+        r = xdg_user_config_dir(&user_firmware_dir, "/qemu/firmware");
+        if (r < 0)
+                return r;
+
+        r = conf_files_list_strv(&conf_files, ".json", NULL, CONF_FILES_FILTER_MASKED|CONF_FILES_REGULAR,
+                                 STRV_MAKE_CONST(user_firmware_dir, "/etc/qemu/firmware", "/usr/share/qemu/firmware"));
+        if (r < 0)
+                return log_debug_errno(r, "Failed to list config files: %m");
+
+        STRV_FOREACH(file, conf_files) {
+                _cleanup_(firmware_data_freep) FirmwareData *fwd = NULL;
+                _cleanup_(json_variant_unrefp) JsonVariant *config_json = NULL;
+                _cleanup_free_ char *contents = NULL;
+                size_t contents_sz = 0;
+
+                r = read_full_file(*file, &contents, &contents_sz);
+                if (r == -ENOMEM)
+                        return r;
+                if (r < 0) {
+                        log_debug_errno(r, "Failed to read contents of %s - ignoring: %m", *file);
+                        continue;
+                }
+
+                r = json_parse(contents, 0, &config_json, NULL, NULL);
+                if (r == -ENOMEM)
+                        return r;
+                if (r < 0) {
+                        log_debug_errno(r, "Failed to parse the JSON in %s - ignoring: %m", *file);
+                        continue;
+                }
+
+                static const JsonDispatch table[] = {
+                        { "description",     JSON_VARIANT_STRING, NULL,               0,                                 JSON_MANDATORY },
+                        { "interface-types", JSON_VARIANT_ARRAY,  NULL,               0,                                 JSON_MANDATORY },
+                        { "mapping",         JSON_VARIANT_OBJECT, firmware_mapping,   0,                                 JSON_MANDATORY },
+                        { "targets",         JSON_VARIANT_ARRAY,  NULL,               0,                                 JSON_MANDATORY },
+                        { "features",        JSON_VARIANT_ARRAY,  json_dispatch_strv, offsetof(FirmwareData, features), JSON_MANDATORY },
+                        { "tags",            JSON_VARIANT_ARRAY,  NULL,               0,                                 JSON_MANDATORY },
+                        {}
+                };
+
+                fwd = new0(FirmwareData, 1);
+                if (!fwd)
+                        return -ENOMEM;
+
+                r = json_dispatch(config_json, table, 0, fwd);
+                if (r == -ENOMEM)
+                        return r;
+                if (r < 0) {
+                        log_debug_errno(r, "Failed to extract the required fields from the JSON in %s - ignoring: %m", *file);
+                        continue;
+                }
+
+                int sb_present = !!strv_find(fwd->features, "secure-boot");
+
+                /* exclude firmware which doesn't match our Secure Boot requirements */
+                if (search_sb >= 0 && search_sb != sb_present) {
+                        log_debug("Skipping %s, firmware doesn't fit required Secure Boot configuration", *file);
+                        continue;
+                }
+
+                config = new0(OvmfConfig, 1);
+                if (!config)
+                        return -ENOMEM;
+
+                config->path = TAKE_PTR(fwd->firmware);
+                config->vars = TAKE_PTR(fwd->vars);
+                config->supports_sb = sb_present;
+                break;
+        }
+
+        if (!config)
+                return -ENOENT;
+
+        if (ret)
+                *ret = TAKE_PTR(config);
+
+        return 0;
+}
+
+int find_qemu_binary(char **ret_qemu_binary) {
+        int r;
+
+        /*
+         * On success the path to the qemu binary will be stored in `ret_qemu_binary`
+         *
+         * If the qemu binary cannot be found -ENOENT will be returned.
+         * If the native architecture is not supported by qemu -EOPNOTSUPP will be returned;
+         */
+
+        static const char *architecture_to_qemu_table[_ARCHITECTURE_MAX] = {
+                [ARCHITECTURE_ARM64]       = "aarch64",       /* differs from our name */
+                [ARCHITECTURE_ARM]         = "arm",
+                [ARCHITECTURE_ALPHA]       = "alpha",
+                [ARCHITECTURE_X86_64]      = "x86_64",        /* differs from our name */
+                [ARCHITECTURE_X86]         = "i386",          /* differs from our name */
+                [ARCHITECTURE_LOONGARCH64] = "loongarch64",
+                [ARCHITECTURE_MIPS64_LE]   = "mips",          /* differs from our name */
+                [ARCHITECTURE_MIPS_LE]     = "mips",          /* differs from our name */
+                [ARCHITECTURE_PARISC]      = "hppa",          /* differs from our name */
+                [ARCHITECTURE_PPC64_LE]    = "ppc",           /* differs from our name */
+                [ARCHITECTURE_PPC64]       = "ppc",           /* differs from our name */
+                [ARCHITECTURE_PPC]         = "ppc",
+                [ARCHITECTURE_RISCV32]     = "riscv32",
+                [ARCHITECTURE_RISCV64]     = "riscv64",
+                [ARCHITECTURE_S390X]       = "s390x",
+        };
+
+        FOREACH_STRING(s, "qemu", "qemu-kvm") {
+                r = find_executable(s, ret_qemu_binary);
+                if (r == 0)
+                        return 0;
+
+                if (r != -ENOENT)
+                        return r;
+        }
+
+        const char *arch_qemu = architecture_to_qemu_table[native_architecture()];
+        if (!arch_qemu)
+                return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Architecture %s not supported by qemu", architecture_to_string(native_architecture()));
+
+        _cleanup_free_ char *qemu_arch_specific = NULL;
+        qemu_arch_specific = strjoin("qemu-system-", arch_qemu);
+        if (!qemu_arch_specific)
+                return -ENOMEM;
+
+        return find_executable(qemu_arch_specific, ret_qemu_binary);
+}
+
+int vsock_fix_child_cid(unsigned *machine_cid, const char *machine, int *ret_child_sock) { /* on success *ret_child_sock owns the vhost-vsock fd */
+        /* this is an arbitrary value picked from /dev/urandom */
+        static const uint8_t sip_key[HASH_KEY_SIZE] = {
+                0x03, 0xad, 0xf0, 0xa4,
+                0x59, 0x2c, 0x77, 0x11,
+                0xda, 0x39, 0x0c, 0xba,
+                0xf5, 0x4c, 0x80, 0x52
+        };
+        struct siphash machine_hash_state, state;
+        _cleanup_close_ int vfd = -EBADF;
+        int r;
+
+        /* uint64_t is required here for the ioctl call, but valid CIDs are only 32 bits */
+        uint64_t cid = *ASSERT_PTR(machine_cid);
+
+        assert(machine);
+        assert(ret_child_sock);
+
+        /* Fix the CID of the AF_VSOCK socket passed to qemu
+         *
+         * If the user has passed us a CID (machine_cid != VMADDR_CID_ANY), then attempt to bind to that CID
+         * and error if we cannot.
+         *
+         * Otherwise hash the machine name to get a random CID and attempt to bind to that.
+         * If it is occupied add more information into the hash and try again.
+         * If after 64 attempts this hasn't worked fallback to truly random CIDs.
+         * If after another 64 attempts this hasn't worked then give up and return EADDRNOTAVAIL.
+         */
+
+        /* remove O_CLOEXEC before this fd is passed to QEMU */
+        vfd = open("/dev/vhost-vsock", O_RDWR|O_CLOEXEC);
+        if (vfd < 0)
+                return log_debug_errno(errno, "Failed to open /dev/vhost-vsock as read/write: %m");
+
+        if (cid != VMADDR_CID_ANY) {
+                r = ioctl(vfd, VHOST_VSOCK_SET_GUEST_CID, &cid);
+                if (r < 0)
+                        return log_debug_errno(errno, "Failed to set CID for child vsock with user provided CID %" PRIu64 ": %m", cid);
+                *ret_child_sock = TAKE_FD(vfd);
+                return 0;
+        }
+
+        siphash24_init(&machine_hash_state, sip_key);
+        siphash24_compress_string(machine, &machine_hash_state);
+        for (unsigned i = 0; i < 64; i++) {
+                state = machine_hash_state;
+                siphash24_compress_safe(&i, sizeof i, &state);
+                uint64_t hash = siphash24_finalize(&state);
+
+                cid = 3 + (hash % (UINT_MAX - 4));
+                r = ioctl(vfd, VHOST_VSOCK_SET_GUEST_CID, &cid);
+                if (r >= 0) {
+                        *machine_cid = cid;
+                        *ret_child_sock = TAKE_FD(vfd);
+                        return 0;
+                }
+                if (errno != EADDRINUSE)
+                        return -errno;
+        }
+
+        for (unsigned i = 0; i < 64; i++) {
+                cid = 3 + random_u64_range(UINT_MAX - 4);
+                r = ioctl(vfd, VHOST_VSOCK_SET_GUEST_CID, &cid);
+                if (r >= 0) {
+                        *machine_cid = cid;
+                        *ret_child_sock = TAKE_FD(vfd);
+                        return 0;
+                }
+
+                if (errno != EADDRINUSE)
+                        return -errno;
+        }
+
+        return log_debug_errno(SYNTHETIC_ERRNO(EADDRNOTAVAIL), "Failed to assign a CID to the guest vsock");
+}
diff --git a/src/vmspawn/vmspawn-util.h b/src/vmspawn/vmspawn-util.h
new file mode 100644
index 0000000..53ad7dd
--- /dev/null
+++ b/src/vmspawn/vmspawn-util.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: LGPL-2.1-or-later */
+#pragma once
+
+#include <stdbool.h>
+#include "macro.h"
+
+#if defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__)
+#define ARCHITECTURE_SUPPORTS_SMBIOS 1
+#else
+#define ARCHITECTURE_SUPPORTS_SMBIOS 0
+#endif
+
+typedef struct OvmfConfig {
+        char *path;
+        char *vars;
+        bool supports_sb;
+} OvmfConfig;
+
+OvmfConfig* ovmf_config_free(OvmfConfig *ovmf_config);
+DEFINE_TRIVIAL_CLEANUP_FUNC(OvmfConfig*, ovmf_config_free); + +int qemu_check_kvm_support(void); +int qemu_check_vsock_support(void); +int find_ovmf_config(int search_sb, OvmfConfig **ret_ovmf_config); +int find_qemu_binary(char **ret_qemu_binary); +int vsock_fix_child_cid(unsigned *machine_cid, const char *machine, int *ret_child_sock); diff --git a/src/vmspawn/vmspawn.c b/src/vmspawn/vmspawn.c new file mode 100644 index 0000000..ebae681 --- /dev/null +++ b/src/vmspawn/vmspawn.c @@ -0,0 +1,766 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <getopt.h> +#include <stdint.h> +#include <stdlib.h> +#include <sys/wait.h> +#include <unistd.h> + +#include "alloc-util.h" +#include "architecture.h" +#include "build.h" +#include "common-signal.h" +#include "copy.h" +#include "creds-util.h" +#include "escape.h" +#include "fileio.h" +#include "format-util.h" +#include "fs-util.h" +#include "hexdecoct.h" +#include "hostname-util.h" +#include "log.h" +#include "machine-credential.h" +#include "main-func.h" +#include "pager.h" +#include "parse-argument.h" +#include "parse-util.h" +#include "path-util.h" +#include "pretty-print.h" +#include "process-util.h" +#include "sd-event.h" +#include "signal-util.h" +#include "socket-util.h" +#include "strv.h" +#include "tmpfile-util.h" +#include "vmspawn-settings.h" +#include "vmspawn-util.h" + +static PagerFlags arg_pager_flags = 0; +static char *arg_image = NULL; +static char *arg_machine = NULL; +static char *arg_qemu_smp = NULL; +static uint64_t arg_qemu_mem = 2ULL * 1024ULL * 1024ULL * 1024ULL; /* default guest RAM in bytes (2 GiB); converted to MiB for qemu's -m */ +static int arg_qemu_kvm = -1; /* tristate: negative makes run_virtual_machine() probe via qemu_check_kvm_support() */ +static int arg_qemu_vsock = -1; /* tristate: negative makes run_virtual_machine() probe via qemu_check_vsock_support() */ +static uint64_t arg_vsock_cid = UINT64_MAX; /* values >= UINT_MAX leave the CID choice to vsock_fix_child_cid() */ +static bool arg_qemu_gui = false; +static int arg_secure_boot = -1; /* tristate handed to find_ovmf_config(); negative accepts firmware with or without Secure Boot */ +static MachineCredential *arg_credentials = NULL; +static size_t arg_n_credentials = 0; +static SettingsMask arg_settings_mask = 0; +static char **arg_parameters = NULL; + +STATIC_DESTRUCTOR_REGISTER(arg_image, freep); 
+STATIC_DESTRUCTOR_REGISTER(arg_machine, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_qemu_smp, freep);
+STATIC_DESTRUCTOR_REGISTER(arg_parameters, strv_freep);
+
+static int help(void) { /* prints the usage text (through the pager unless disabled); returns 0 so that run() exits successfully */
+        _cleanup_free_ char *link = NULL;
+        int r;
+
+        pager_open(arg_pager_flags);
+
+        r = terminal_urlify_man("systemd-vmspawn", "1", &link);
+        if (r < 0)
+                return log_oom();
+
+        printf("%1$s [OPTIONS...] [ARGUMENTS...]\n\n"
+               "%5$sSpawn a command or OS in a virtual machine.%6$s\n\n"
+               "  -h --help                 Show this help\n"
+               "     --version              Print version string\n"
+               "     --no-pager             Do not pipe output into a pager\n\n"
+               "%3$sImage:%4$s\n"
+               "  -i --image=PATH           Root file system disk image (or device node) for\n"
+               "                            the virtual machine\n\n"
+               "%3$sHost Configuration:%4$s\n"
+               "     --qemu-smp=SMP         Configure guest's SMP settings\n"
+               "     --qemu-mem=MEM         Configure guest's RAM size\n"
+               "     --qemu-kvm=BOOL        Configure whether to use KVM or not\n"
+               "     --qemu-vsock=BOOL      Configure whether to use qemu with a vsock or not\n"
+               "     --vsock-cid=           Specify the CID to use for the qemu guest's vsock\n"
+               "     --qemu-gui             Start QEMU in graphical mode\n"
+               "     --secure-boot=BOOL     Configure whether to search for firmware which\n"
+               "                            supports Secure Boot\n\n"
+               "%3$sSystem Identity:%4$s\n"
+               "  -M --machine=NAME         Set the machine name for the container\n\n"
+               "%3$sCredentials:%4$s\n"
+               "     --set-credential=ID:VALUE\n"
+               "                            Pass a credential with literal value to container.\n"
+               "     --load-credential=ID:PATH\n"
+               "                            Load credential to pass to container from file or\n"
+               "                            AF_UNIX stream socket.\n"
+               "\nSee the %2$s for details.\n",
+               program_invocation_short_name,
+               link,
+               ansi_underline(),
+               ansi_normal(),
+               ansi_highlight(),
+               ansi_normal());
+
+        return 0;
+}
+
+static int parse_argv(int argc, char *argv[]) { /* returns 1 to continue, 0 after --help/--version, negative errno on bad usage */
+        enum {
+                ARG_VERSION = 0x100,
+                ARG_NO_PAGER,
+                ARG_QEMU_SMP,
+                ARG_QEMU_MEM,
+                ARG_QEMU_KVM,
+                ARG_QEMU_VSOCK,
+                ARG_VSOCK_CID,
+                ARG_QEMU_GUI,
+                ARG_SECURE_BOOT,
+                ARG_SET_CREDENTIAL,
+                ARG_LOAD_CREDENTIAL,
+        };
+
+        static const struct option options[] = {
+                { "help",            no_argument,       NULL, 'h'                 },
+                { "version",         no_argument,       NULL, ARG_VERSION         },
+                { "no-pager",        no_argument,       NULL, ARG_NO_PAGER        },
+                { "image",           required_argument, NULL, 'i'                 },
+                { "machine",         required_argument, NULL, 'M'                 },
+                { "qemu-smp",        required_argument, NULL, ARG_QEMU_SMP        },
+                { "qemu-mem",        required_argument, NULL, ARG_QEMU_MEM        },
+                { "qemu-kvm",        required_argument, NULL, ARG_QEMU_KVM        },
+                { "qemu-vsock",      required_argument, NULL, ARG_QEMU_VSOCK      },
+                { "vsock-cid",       required_argument, NULL, ARG_VSOCK_CID       },
+                { "qemu-gui",        no_argument,       NULL, ARG_QEMU_GUI        },
+                { "secure-boot",     required_argument, NULL, ARG_SECURE_BOOT     },
+                { "set-credential",  required_argument, NULL, ARG_SET_CREDENTIAL  },
+                { "load-credential", required_argument, NULL, ARG_LOAD_CREDENTIAL },
+                {}
+        };
+
+        int c, r;
+
+        assert(argc >= 0);
+        assert(argv);
+
+        optind = 0;
+        while ((c = getopt_long(argc, argv, "+hi:M:", options, NULL)) >= 0) /* "M:" - the short form takes an argument just like --machine= */
+                switch (c) {
+                case 'h':
+                        return help();
+
+                case ARG_VERSION:
+                        return version();
+
+                case 'i':
+                        r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_image);
+                        if (r < 0)
+                                return r;
+
+                        arg_settings_mask |= SETTING_DIRECTORY;
+                        break;
+
+                case 'M':
+                        if (isempty(optarg))
+                                arg_machine = mfree(arg_machine);
+                        else {
+                                if (!hostname_is_valid(optarg, 0))
+                                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
+                                                               "Invalid machine name: %s", optarg);
+
+                                r = free_and_strdup(&arg_machine, optarg);
+                                if (r < 0)
+                                        return log_oom();
+                        }
+                        break;
+
+                case ARG_NO_PAGER:
+                        arg_pager_flags |= PAGER_DISABLE;
+                        break;
+
+                case ARG_QEMU_SMP:
+                        r = free_and_strdup_warn(&arg_qemu_smp, optarg);
+                        if (r < 0)
+                                return r;
+                        break;
+
+                case ARG_QEMU_MEM:
+                        r = parse_size(optarg, 1024, &arg_qemu_mem);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse --qemu-mem=%s: %m", optarg);
+                        break;
+
+                case ARG_QEMU_KVM:
+                        r = parse_tristate(optarg, &arg_qemu_kvm);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse --qemu-kvm=%s: %m", optarg);
+                        break;
+
+                case ARG_QEMU_VSOCK:
+                        r = parse_tristate(optarg, &arg_qemu_vsock);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse --qemu-vsock=%s: %m", optarg);
+                        break;
+
+                case ARG_VSOCK_CID: {
+                        unsigned cid;
+                        if (isempty(optarg))
+                                cid = VMADDR_CID_ANY;
+                        else {
+                                r = safe_atou_bounded(optarg, 3, UINT_MAX - 1, &cid);
+                                if (r == -ERANGE)
+                                        return log_error_errno(r, "Invalid value for --vsock-cid=: %m");
+                                if (r < 0)
+                                        return log_error_errno(r, "Failed to parse --vsock-cid=%s: %m", optarg);
+                        }
+                        arg_vsock_cid = (uint64_t)cid;
+                        break;
+                }
+
+                case ARG_QEMU_GUI:
+                        arg_qemu_gui = true;
+                        break;
+
+                case ARG_SECURE_BOOT:
+                        r = parse_tristate(optarg, &arg_secure_boot);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to parse --secure-boot=%s: %m", optarg);
+                        break;
+
+                case ARG_SET_CREDENTIAL: {
+                        r = machine_credential_set(&arg_credentials, &arg_n_credentials, optarg);
+                        if (r < 0)
+                                return r;
+                        arg_settings_mask |= SETTING_CREDENTIALS;
+                        break;
+                }
+
+                case ARG_LOAD_CREDENTIAL: {
+                        r = machine_credential_load(&arg_credentials, &arg_n_credentials, optarg);
+                        if (r < 0)
+                                return r;
+
+                        arg_settings_mask |= SETTING_CREDENTIALS;
+                        break;
+                }
+
+                case '?':
+                        return -EINVAL;
+
+                default:
+                        assert_not_reached();
+                }
+
+        if (argc > optind) {
+                strv_free(arg_parameters);
+                arg_parameters = strv_copy(argv + optind);
+                if (!arg_parameters)
+                        return log_oom();
+
+                arg_settings_mask |= SETTING_START_MODE;
+        }
+
+        return 1;
+}
+
+static int open_vsock(void) { /* returns a listening AF_VSOCK stream socket bound to any CID/port, or a negative errno */
+        _cleanup_close_ int vsock_fd = -EBADF;
+        int r;
+        static const union sockaddr_union bind_addr = {
+                .vm.svm_family = AF_VSOCK,
+                .vm.svm_cid = VMADDR_CID_ANY,
+                .vm.svm_port = VMADDR_PORT_ANY,
+        };
+
+        vsock_fd = socket(AF_VSOCK, SOCK_STREAM|SOCK_CLOEXEC, 0);
+        if (vsock_fd < 0)
+                return log_error_errno(errno, "Failed to open AF_VSOCK socket: %m");
+
+        r = bind(vsock_fd, &bind_addr.sa, sizeof(bind_addr.vm));
+        if (r < 0)
+                return log_error_errno(errno, "Failed to bind to vsock to address %u:%u: %m", bind_addr.vm.svm_cid, bind_addr.vm.svm_port);
+
+        r = listen(vsock_fd, SOMAXCONN_DELUXE);
+        if (r < 0)
+                return log_error_errno(errno, "Failed to listen on vsock: %m");
+
+        return TAKE_FD(vsock_fd);
+}
+
+static int vmspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) { /* userdata: int* receiving EXIT_STATUS= */
+        char buf[NOTIFY_BUFFER_MAX+1];
+        const char *p = NULL;
+        struct iovec iovec = {
+                .iov_base = buf,
+                .iov_len = sizeof(buf)-1,
+        };
+        struct msghdr msghdr = {
+                .msg_iov = &iovec,
+                .msg_iovlen = 1,
+        };
+        ssize_t n;
+        _cleanup_strv_free_ char **tags = NULL;
+        int r, *exit_status = ASSERT_PTR(userdata);
+
+        n = recvmsg_safe(fd, &msghdr, MSG_DONTWAIT);
+        if (ERRNO_IS_NEG_TRANSIENT(n))
+                return 0;
+        if (n == -EXFULL) {
+                log_warning_errno(n, "Got message with truncated control data, ignoring: %m");
+                return 0;
+        }
+        if (n < 0)
+                return log_warning_errno(n, "Couldn't read notification socket: %m");
+
+        if ((size_t) n >= sizeof(buf)) {
+                log_warning("Received notify message exceeded maximum size. Ignoring.");
+                return 0;
+        }
+
+        buf[n] = 0;
+        tags = strv_split(buf, "\n\r");
+        if (!tags)
+                return log_oom();
+
+        STRV_FOREACH(s, tags)
+                log_debug("Received tag %s from notify socket", *s);
+
+        if (strv_contains(tags, "READY=1")) {
+                r = sd_notify(false, "READY=1\n");
+                if (r < 0)
+                        log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
+        }
+
+        p = strv_find_startswith(tags, "STATUS=");
+        if (p)
+                (void) sd_notifyf(false, "STATUS=VM running: %s", p);
+
+        p = strv_find_startswith(tags, "EXIT_STATUS=");
+        if (p) {
+                r = safe_atoi(p, exit_status);
+                if (r < 0)
+                        log_warning_errno(r, "Failed to parse exit status from %s, ignoring: %m", p);
+        }
+
+        /* we will only receive one message from each connection so disable this source once one is received */
+        source = sd_event_source_disable_unref(source);
+
+        return 0;
+}
+
+static int vmspawn_dispatch_vsock_connections(sd_event_source *source, int fd, uint32_t revents, void *userdata) { /* accepts one connection per call */
+        int r;
+        sd_event *event;
+        _cleanup_close_ int conn_fd = -EBADF;
+
+        assert(userdata);
+
+        if (revents != EPOLLIN) {
+                log_warning("Got unexpected poll event for vsock fd.");
+                return 0;
+        }
+
+        conn_fd = accept4(fd, NULL, NULL, SOCK_CLOEXEC|SOCK_NONBLOCK);
+        if (conn_fd < 0) {
+                log_warning_errno(errno, "Failed to accept connection from vsock fd (%m), ignoring...");
+                return 0;
+        }
+
+        event = sd_event_source_get_event(source);
+        if (!event)
+                return log_error_errno(SYNTHETIC_ERRNO(ENOENT), "Failed to retrieve event from event source, exiting task");
+
+        /* add a new floating task to read from the connection */
+        r = sd_event_add_io(event, NULL, conn_fd, revents, vmspawn_dispatch_notify_fd, userdata);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate notify connection event source: %m");
+
+        /* conn_fd is now owned by the event loop so don't clean it up */
+        TAKE_FD(conn_fd);
+
+        return 0;
+}
+
+static int setup_notify_parent(sd_event *event, int fd, int *exit_status, sd_event_source **notify_event_source) { /* watches the listening vsock fd */
+        int r;
+
+        r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, vmspawn_dispatch_vsock_connections, exit_status);
+        if (r < 0)
+                return log_error_errno(r, "Failed to allocate notify socket event source: %m");
+
+        (void) sd_event_source_set_description(*notify_event_source, "vmspawn-notify-sock");
+
+        return 0;
+}
+
+static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) { /* userdata carries the qemu PID */
+        pid_t pid;
+
+        pid = PTR_TO_PID(userdata);
+        if (pid > 0) {
+                /* TODO: actually talk to qemu and ask the guest to shutdown here */
+                if (kill(pid, SIGKILL) >= 0) {
+                        log_info("Trying to halt qemu. Send SIGTERM again to trigger vmspawn to immediately terminate.");
+                        sd_event_source_set_userdata(s, NULL);
+                        return 0;
+                }
+        }
+
+        sd_event_exit(sd_event_source_get_event(s), 0);
+        return 0;
+}
+
+static int on_child_exit(sd_event_source *s, const siginfo_t *si, void *userdata) { /* stops the event loop once qemu has exited */
+        sd_event_exit(sd_event_source_get_event(s), 0);
+        return 0;
+}
+
+static int cmdline_add_vsock(char ***cmdline, int vsock_fd) { /* appends the -smbios credential that tells the guest where to send notifications */
+        int r;
+
+        r = strv_extend(cmdline, "-smbios");
+        if (r < 0)
+                return r;
+
+        union sockaddr_union addr;
+        socklen_t addr_len = sizeof addr.vm;
+        r = getsockname(vsock_fd, &addr.sa, &addr_len);
+        if (r < 0)
+                return -errno;
+        assert(addr_len >= sizeof addr.vm);
+        assert(addr.vm.svm_family == AF_VSOCK);
+
+        log_info("Using vsock-stream:%u:%u", (unsigned) VMADDR_CID_HOST, addr.vm.svm_port);
+        r = strv_extendf(cmdline, "type=11,value=io.systemd.credential:vmm.notify_socket=vsock-stream:%u:%u", (unsigned) VMADDR_CID_HOST, addr.vm.svm_port);
+        if (r < 0)
+                return r;
+
+        return 0;
+}
+
+static int run_virtual_machine(void) { /* builds the qemu command line, forks qemu and runs the event loop until it exits */
+        _cleanup_(ovmf_config_freep) OvmfConfig *ovmf_config = NULL;
+        _cleanup_strv_free_ char **cmdline = NULL;
+        _cleanup_free_ char *machine = NULL, *qemu_binary = NULL, *mem = NULL;
+        int r;
+        _cleanup_close_ int vsock_fd = -EBADF;
+
+        bool use_kvm = arg_qemu_kvm > 0;
+        if (arg_qemu_kvm < 0) {
+                r = qemu_check_kvm_support();
+                if (r < 0)
+                        return log_error_errno(r, "Failed to check for KVM support: %m");
+                use_kvm = r;
+        }
+
+        r = find_ovmf_config(arg_secure_boot, &ovmf_config);
+        if (r < 0)
+                return log_error_errno(r, "Failed to find OVMF config: %m");
+
+        /* only warn if the user hasn't disabled secureboot */
+        if (!ovmf_config->supports_sb && arg_secure_boot)
+                log_warning("Couldn't find OVMF firmware blob with Secure Boot support, "
+                            "falling back to OVMF firmware blobs without Secure Boot support.");
+
+        const char *accel = use_kvm ? "kvm" : "tcg";
+        if (IN_SET(native_architecture(), ARCHITECTURE_ARM64, ARCHITECTURE_ARM64_BE))
+                machine = strjoin("type=virt,accel=", accel);
+        else
+                machine = strjoin("type=q35,accel=", accel, ",smm=", on_off(ovmf_config->supports_sb));
+        if (!machine)
+                return log_oom();
+
+        r = find_qemu_binary(&qemu_binary);
+        if (r == -EOPNOTSUPP)
+                return log_error_errno(r, "Native architecture is not supported by qemu.");
+        if (r < 0)
+                return log_error_errno(r, "Failed to find QEMU binary: %m");
+
+        if (asprintf(&mem, "%.4fM", (double)arg_qemu_mem / (1024.0 * 1024.0)) < 0)
+                return log_oom();
+
+        cmdline = strv_new(
+                qemu_binary,
+                "-machine", machine,
+                "-smp", arg_qemu_smp ?: "1",
+                "-m", mem,
+                "-object", "rng-random,filename=/dev/urandom,id=rng0",
+                "-device", "virtio-rng-pci,rng=rng0,id=rng-device0",
+                "-nic", "user,model=virtio-net-pci"
+        );
+        if (!cmdline)
+                return log_oom();
+
+        bool use_vsock = arg_qemu_vsock > 0 && ARCHITECTURE_SUPPORTS_SMBIOS;
+        if (arg_qemu_vsock < 0) {
+                r = qemu_check_vsock_support();
+                if (r < 0)
+                        return log_error_errno(r, "Failed to check for VSock support: %m");
+
+                use_vsock = r;
+        }
+
+        unsigned child_cid = VMADDR_CID_ANY;
+        _cleanup_close_ int child_vsock_fd = -EBADF;
+        if (use_vsock) {
+                if (arg_vsock_cid < UINT_MAX)
+                        child_cid = (unsigned)arg_vsock_cid;
+
+                r = vsock_fix_child_cid(&child_cid, arg_machine, &child_vsock_fd);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to fix CID for the guest vsock socket: %m");
+
+                r = strv_extend(&cmdline, "-device");
+                if (r < 0)
+                        return log_oom();
+
+                log_debug("vhost-vsock-pci,guest-cid=%u,vhostfd=%d", child_cid, child_vsock_fd);
+                r = strv_extendf(&cmdline, "vhost-vsock-pci,guest-cid=%u,vhostfd=%d", child_cid, child_vsock_fd);
+                if (r < 0)
+                        return log_oom();
+        }
+
+        r = strv_extend_strv(&cmdline, STRV_MAKE("-cpu", "max"), /* filter_duplicates= */ false);
+        if (r < 0)
+                return log_oom();
+
+        if (arg_qemu_gui) {
+                r = strv_extend_strv(&cmdline, STRV_MAKE("-vga", "virtio"), /* filter_duplicates= */ false);
+                if (r < 0)
+                        return log_oom();
+        } else {
+                r = strv_extend_strv(&cmdline, STRV_MAKE(
+                        "-nographic",
+                        "-nodefaults",
+                        "-chardev", "stdio,mux=on,id=console,signal=off",
+                        "-serial", "chardev:console",
+                        "-mon", "console"
+                ), /* filter_duplicates= */ false);
+                if (r < 0)
+                        return log_oom();
+        }
+
+        if (ARCHITECTURE_SUPPORTS_SMBIOS) {
+                ssize_t n;
+                FOREACH_ARRAY(cred, arg_credentials, arg_n_credentials) {
+                        _cleanup_free_ char *cred_data_b64 = NULL;
+
+                        n = base64mem(cred->data, cred->size, &cred_data_b64);
+                        if (n < 0)
+                                return log_oom();
+
+                        r = strv_extend(&cmdline, "-smbios");
+                        if (r < 0)
+                                return log_oom();
+
+                        r = strv_extendf(&cmdline, "type=11,value=io.systemd.credential.binary:%s=%s", cred->id, cred_data_b64);
+                        if (r < 0)
+                                return log_oom();
+                }
+        }
+
+        r = strv_extend(&cmdline, "-drive");
+        if (r < 0)
+                return log_oom();
+
+        r = strv_extendf(&cmdline, "if=pflash,format=raw,readonly=on,file=%s", ovmf_config->path);
+        if (r < 0)
+                return log_oom();
+
+        _cleanup_(unlink_and_freep) char *ovmf_vars_to = NULL;
+        if (ovmf_config->supports_sb) {
+                const char *ovmf_vars_from = ovmf_config->vars;
+                _cleanup_close_ int source_fd = -EBADF, target_fd = -EBADF;
+
+                r = tempfn_random_child(NULL, "vmspawn-", &ovmf_vars_to);
+                if (r < 0)
+                        return r;
+
+                source_fd = open(ovmf_vars_from, O_RDONLY|O_CLOEXEC);
+                if (source_fd < 0) /* open() returns -1 on failure; the error code is in errno, not in the fd */
+                        return log_error_errno(errno, "Failed to open OVMF vars file %s: %m", ovmf_vars_from);
+
+                target_fd = open(ovmf_vars_to, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC, 0600);
+                if (target_fd < 0)
+                        return log_error_errno(errno, "Failed to create regular file for OVMF vars at %s: %m", ovmf_vars_to);
+
+                r = copy_bytes(source_fd, target_fd, UINT64_MAX, COPY_REFLINK);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to copy bytes from %s to %s: %m", ovmf_vars_from, ovmf_vars_to);
+
+                /* These aren't always available so don't raise an error if they fail */
+                (void) copy_xattr(source_fd, NULL, target_fd, NULL, 0);
+                (void) copy_access(source_fd, target_fd);
+                (void) copy_times(source_fd, target_fd, 0);
+
+                r = strv_extend_strv(&cmdline, STRV_MAKE(
+                        "-global", "ICH9-LPC.disable_s3=1",
+                        "-global", "driver=cfi.pflash01,property=secure,value=on",
+                        "-drive"
+                ), /* filter_duplicates= */ false);
+                if (r < 0)
+                        return log_oom();
+
+                r = strv_extendf(&cmdline, "file=%s,if=pflash,format=raw", ovmf_vars_to);
+                if (r < 0)
+                        return log_oom();
+        }
+
+        r = strv_extend(&cmdline, "-drive");
+        if (r < 0)
+                return log_oom();
+
+        r = strv_extendf(&cmdline, "if=none,id=mkosi,file=%s,format=raw", arg_image);
+        if (r < 0)
+                return log_oom();
+
+        r = strv_extend_strv(&cmdline, STRV_MAKE(
+                "-device", "virtio-scsi-pci,id=scsi",
+                "-device", "scsi-hd,drive=mkosi,bootindex=1"
+        ), /* filter_duplicates= */ false);
+        if (r < 0)
+                return log_oom();
+
+        if (!strv_isempty(arg_parameters)) {
+                if (ARCHITECTURE_SUPPORTS_SMBIOS) {
+                        _cleanup_free_ char *kcl = strv_join(arg_parameters, " ");
+                        if (!kcl)
+                                return log_oom();
+
+                        r = strv_extend(&cmdline, "-smbios");
+                        if (r < 0)
+                                return log_oom();
+
+                        r = strv_extendf(&cmdline, "type=11,value=io.systemd.stub.kernel-cmdline-extra=%s", kcl);
+                        if (r < 0)
+                                return log_oom();
+                } else
+                        log_warning("Cannot append extra args to kernel cmdline, native architecture doesn't support SMBIOS");
+        }
+
+        if (use_vsock) {
+                vsock_fd = open_vsock();
+                if (vsock_fd < 0)
+                        return log_error_errno(vsock_fd, "Failed to open vsock: %m");
+
+                r = cmdline_add_vsock(&cmdline, vsock_fd);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0)
+                        return log_error_errno(r, "Failed to call getsockname on vsock: %m");
+        }
+
+        _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
+        _cleanup_(sd_event_unrefp) sd_event *event = NULL;
+        r = sd_event_new(&event);
+        if (r < 0)
+                return log_error_errno(r, "Failed to get default event source: %m");
+
+        (void) sd_event_set_watchdog(event, true);
+
+        pid_t child_pid;
+        r = safe_fork_full(
+                        qemu_binary,
+                        NULL,
+                        &child_vsock_fd, 1, /* pass the vsock fd to qemu */
+                        FORK_CLOEXEC_OFF,
+                        &child_pid);
+        if (r < 0)
+                return log_error_errno(r, "Failed to fork off %s: %m", qemu_binary);
+        if (r == 0) {
+                /* set TERM and LANG if they are missing */
+                if (setenv("TERM", "vt220", 0) < 0)
+                        return log_oom();
+
+                if (setenv("LANG", "C.UTF-8", 0) < 0)
+                        return log_oom();
+
+                execve(qemu_binary, cmdline, environ);
+                log_error_errno(errno, "Failed to execve %s: %m", qemu_binary);
+                _exit(EXIT_FAILURE);
+        }
+
+
+        int exit_status = INT_MAX;
+        if (use_vsock) {
+                r = setup_notify_parent(event, vsock_fd, &exit_status, &notify_event_source);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to setup event loop to handle vsock notify events: %m");
+        }
+
+        /* shutdown qemu when we are shutdown */
+        (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(child_pid));
+        (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(child_pid));
+
+        (void) sd_event_add_signal(event, NULL, SIGRTMIN+18, sigrtmin18_handler, NULL);
+
+        /* Exit when the child exits */
+        (void) sd_event_add_child(event, NULL, child_pid, WEXITED, on_child_exit, NULL);
+
+        r = sd_event_loop(event);
+        if (r < 0)
+                return log_error_errno(r, "Failed to run event loop: %m");
+
+        if (use_vsock) {
+                if (exit_status == INT_MAX) {
+                        log_debug("Couldn't retrieve inner EXIT_STATUS from vsock");
+                        return EXIT_SUCCESS;
+                }
+                if (exit_status != 0)
+                        log_warning("Non-zero exit code received: %d", exit_status);
+                return exit_status;
+        }
+
+        return 0;
+}
+
+static int determine_names(void) { /* requires arg_image; derives arg_machine from the image file name when -M was not given */
+        int r;
+
+        if (!arg_image)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing required argument -i/--image=, quitting");
+
+        if (!arg_machine) {
+                char *e;
+
+                r = path_extract_filename(arg_image, &arg_machine);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_image);
+
+                /* Truncate suffix if there is one */
+                e = endswith(arg_machine, ".raw");
+                if (e)
+                        *e = 0;
+
+                hostname_cleanup(arg_machine);
+                if (!hostname_is_valid(arg_machine, 0))
+                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine machine name automatically, please use -M.");
+        }
+
+        return 0;
+}
+
+static int run(int argc, char *argv[]) { /* negative return = error; positive return = exit status reported by run_virtual_machine() */
+        int r, ret = EXIT_SUCCESS;
+
+        log_setup();
+
+        r = parse_argv(argc, argv);
+        if (r <= 0)
+                goto finish;
+
+        r = determine_names();
+        if (r < 0)
+                goto finish;
+
+        assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGTERM, SIGINT, SIGRTMIN+18, -1) >= 0);
+
+        r = run_virtual_machine();
+        if (r > 0)
+                ret = r;
+finish:
+        machine_credential_free_all(arg_credentials, arg_n_credentials);
+
+        if (r < 0)
+                return r;
+
+        return ret;
+}
+
+DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run); |