diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-06-12 03:50:45 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-06-12 03:50:45 +0000 |
commit | efeb864cb547a2cbf96dc0053a8bdb4d9190b364 (patch) | |
tree | c0b83368f18be983fcc763200c4c24d633244588 /src/vmspawn | |
parent | Releasing progress-linux version 255.5-1~progress7.99u1. (diff) | |
download | systemd-efeb864cb547a2cbf96dc0053a8bdb4d9190b364.tar.xz systemd-efeb864cb547a2cbf96dc0053a8bdb4d9190b364.zip |
Merging upstream version 256.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/vmspawn')
-rw-r--r-- | src/vmspawn/meson.build | 14 | ||||
-rw-r--r-- | src/vmspawn/test-vmspawn-util.c | 28 | ||||
-rw-r--r-- | src/vmspawn/vmspawn-mount.c | 67 | ||||
-rw-r--r-- | src/vmspawn/vmspawn-mount.h | 19 | ||||
-rw-r--r-- | src/vmspawn/vmspawn-register.c | 86 | ||||
-rw-r--r-- | src/vmspawn/vmspawn-register.h | 15 | ||||
-rw-r--r-- | src/vmspawn/vmspawn-scope.c | 310 | ||||
-rw-r--r-- | src/vmspawn/vmspawn-scope.h | 23 | ||||
-rw-r--r-- | src/vmspawn/vmspawn-settings.c | 10 | ||||
-rw-r--r-- | src/vmspawn/vmspawn-settings.h | 17 | ||||
-rw-r--r-- | src/vmspawn/vmspawn-util.c | 369 | ||||
-rw-r--r-- | src/vmspawn/vmspawn-util.h | 73 | ||||
-rw-r--r-- | src/vmspawn/vmspawn.c | 1960 |
13 files changed, 2676 insertions, 315 deletions
diff --git a/src/vmspawn/meson.build b/src/vmspawn/meson.build index 800d7c3..3cd9a3b 100644 --- a/src/vmspawn/meson.build +++ b/src/vmspawn/meson.build @@ -3,6 +3,9 @@ libvmspawn_core_sources = files( 'vmspawn-settings.c', 'vmspawn-util.c', + 'vmspawn-scope.c', + 'vmspawn-mount.c', + 'vmspawn-register.c', ) libvmspawn_core = static_library( 'vmspawn-core', @@ -16,6 +19,10 @@ vmspawn_libs = [ libshared, ] +vmspawn_test_template = test_template + { + 'link_with' : [vmspawn_libs], +} + executables += [ executable_template + { 'name' : 'systemd-vmspawn', @@ -23,5 +30,10 @@ executables += [ 'conditions': ['ENABLE_VMSPAWN'], 'sources' : files('vmspawn.c'), 'link_with' : vmspawn_libs, - } + 'dependencies' : [libblkid] + }, + vmspawn_test_template + { + 'conditions': ['ENABLE_VMSPAWN'], + 'sources' : files('test-vmspawn-util.c'), + }, ] diff --git a/src/vmspawn/test-vmspawn-util.c b/src/vmspawn/test-vmspawn-util.c new file mode 100644 index 0000000..67e5c4c --- /dev/null +++ b/src/vmspawn/test-vmspawn-util.c @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <stdbool.h> + +#include "alloc-util.h" +#include "string-util.h" +#include "vmspawn-util.h" +#include "tests.h" + +#define _ESCAPE_QEMU_VALUE_CHECK(str, correct, varname) \ + do { \ + _cleanup_free_ char* varname = NULL; \ + varname = escape_qemu_value(str); \ + assert(varname); \ + assert_se(streq(varname, correct)); \ + } while (0) + +#define ESCAPE_QEMU_VALUE_CHECK(str, correct) \ + _ESCAPE_QEMU_VALUE_CHECK(str, correct, conf##__COUNTER__) + +TEST(escape_qemu_value) { + ESCAPE_QEMU_VALUE_CHECK("abcde", "abcde"); + ESCAPE_QEMU_VALUE_CHECK("a,bcde", "a,,bcde"); + ESCAPE_QEMU_VALUE_CHECK(",,,", ",,,,,,"); + ESCAPE_QEMU_VALUE_CHECK("", ""); +} + +DEFINE_TEST_MAIN(LOG_INFO); diff --git a/src/vmspawn/vmspawn-mount.c b/src/vmspawn/vmspawn-mount.c new file mode 100644 index 0000000..ee63bda --- /dev/null +++ b/src/vmspawn/vmspawn-mount.c @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: 
LGPL-2.1-or-later */ + +#include "alloc-util.h" +#include "extract-word.h" +#include "macro.h" +#include "parse-argument.h" +#include "path-util.h" +#include "string-util.h" +#include "vmspawn-mount.h" + +static void runtime_mount_done(RuntimeMount *mount) { + assert(mount); + + mount->source = mfree(mount->source); + mount->target = mfree(mount->target); +} + +void runtime_mount_context_done(RuntimeMountContext *ctx) { + assert(ctx); + + FOREACH_ARRAY(mount, ctx->mounts, ctx->n_mounts) + runtime_mount_done(mount); + + free(ctx->mounts); +} + +int runtime_mount_parse(RuntimeMountContext *ctx, const char *s, bool read_only) { + _cleanup_(runtime_mount_done) RuntimeMount mount = { .read_only = read_only }; + _cleanup_free_ char *source_rel = NULL; + int r; + + assert(ctx); + + r = extract_first_word(&s, &source_rel, ":", EXTRACT_DONT_COALESCE_SEPARATORS); + if (r < 0) + return r; + if (r == 0) + return -EINVAL; + + if (isempty(source_rel)) + return -EINVAL; + + r = path_make_absolute_cwd(source_rel, &mount.source); + if (r < 0) + return r; + + /* virtiofsd only supports directories */ + r = is_dir(mount.source, /* follow= */ true); + if (r < 0) + return r; + if (!r) + return -ENOTDIR; + + mount.target = s ? 
strdup(s) : TAKE_PTR(source_rel); + if (!mount.target) + return -ENOMEM; + + if (!path_is_absolute(mount.target)) + return -EINVAL; + + if (!GREEDY_REALLOC(ctx->mounts, ctx->n_mounts + 1)) + return log_oom(); + + ctx->mounts[ctx->n_mounts++] = TAKE_STRUCT(mount); + + return 0; +} diff --git a/src/vmspawn/vmspawn-mount.h b/src/vmspawn/vmspawn-mount.h new file mode 100644 index 0000000..2ea24fd --- /dev/null +++ b/src/vmspawn/vmspawn-mount.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <stdbool.h> +#include <stddef.h> + +typedef struct RuntimeMount { + bool read_only; + char *source; + char *target; +} RuntimeMount; + +typedef struct RuntimeMountContext { + RuntimeMount *mounts; + size_t n_mounts; +} RuntimeMountContext; + +void runtime_mount_context_done(RuntimeMountContext *ctx); +int runtime_mount_parse(RuntimeMountContext *ctx, const char *s, bool read_only); diff --git a/src/vmspawn/vmspawn-register.c b/src/vmspawn/vmspawn-register.c new file mode 100644 index 0000000..42650b8 --- /dev/null +++ b/src/vmspawn/vmspawn-register.c @@ -0,0 +1,86 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "sd-bus.h" +#include "sd-id128.h" + +#include "bus-error.h" +#include "bus-locator.h" +#include "json.h" +#include "macro.h" +#include "process-util.h" +#include "socket-util.h" +#include "string-util.h" +#include "varlink.h" +#include "vmspawn-register.h" + +int register_machine( + sd_bus *bus, + const char *machine_name, + sd_id128_t uuid, + const char *service, + const char *directory, + unsigned cid, + const char *address, + const char *key_path) { + + _cleanup_(varlink_unrefp) Varlink *vl = NULL; + int r; + + assert(machine_name); + assert(service); + + /* First try to use varlink, as it provides more features (such as SSH support). 
*/ + r = varlink_connect_address(&vl, "/run/systemd/machine/io.systemd.Machine"); + if (r == -ENOENT || ERRNO_IS_DISCONNECT(r)) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + + assert(bus); + + /* In case we are running with an older machined, fallback to the existing D-Bus method. */ + r = bus_call_method( + bus, + bus_machine_mgr, + "RegisterMachine", + &error, + NULL, + "sayssus", + machine_name, + SD_BUS_MESSAGE_APPEND_ID128(uuid), + service, + "vm", + (uint32_t) getpid_cached(), + strempty(directory)); + if (r < 0) + return log_error_errno(r, "Failed to register machine: %s", bus_error_message(&error, r)); + + return 0; + } + if (r < 0) + return log_error_errno(r, "Failed to connect to machined on /run/systemd/machine/io.systemd.Machine: %m"); + + return varlink_callb_and_log(vl, + "io.systemd.Machine.Register", + NULL, + JSON_BUILD_OBJECT( + JSON_BUILD_PAIR_STRING("name", machine_name), + JSON_BUILD_PAIR_CONDITION(!sd_id128_is_null(uuid), "id", JSON_BUILD_ID128(uuid)), + JSON_BUILD_PAIR_STRING("service", service), + JSON_BUILD_PAIR_STRING("class", "vm"), + JSON_BUILD_PAIR_CONDITION(VSOCK_CID_IS_REGULAR(cid), "vSockCid", JSON_BUILD_UNSIGNED(cid)), + JSON_BUILD_PAIR_CONDITION(directory, "rootDirectory", JSON_BUILD_STRING(directory)), + JSON_BUILD_PAIR_CONDITION(address, "sshAddress", JSON_BUILD_STRING(address)), + JSON_BUILD_PAIR_CONDITION(key_path, "sshPrivateKeyPath", JSON_BUILD_STRING(key_path)))); +} + +int unregister_machine(sd_bus *bus, const char *machine_name) { + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + int r; + + assert(bus); + + r = bus_call_method(bus, bus_machine_mgr, "UnregisterMachine", &error, NULL, "s", machine_name); + if (r < 0) + log_debug("Failed to unregister machine: %s", bus_error_message(&error, r)); + + return 0; +} diff --git a/src/vmspawn/vmspawn-register.h b/src/vmspawn/vmspawn-register.h new file mode 100644 index 0000000..69f5671 --- /dev/null +++ 
b/src/vmspawn/vmspawn-register.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "sd-bus.h" +#include "sd-id128.h" + +int register_machine( + sd_bus *bus, + const char *machine_name, + sd_id128_t uuid, + const char *service, + const char *directory, + unsigned cid, + const char *address, + const char *key_path); +int unregister_machine(sd_bus *bus, const char *machine_name); diff --git a/src/vmspawn/vmspawn-scope.c b/src/vmspawn/vmspawn-scope.c new file mode 100644 index 0000000..58f6781 --- /dev/null +++ b/src/vmspawn/vmspawn-scope.c @@ -0,0 +1,310 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <stdio.h> + +#include "sd-bus.h" + +#include "bus-error.h" +#include "bus-locator.h" +#include "bus-unit-util.h" +#include "bus-util.h" +#include "bus-wait-for-jobs.h" +#include "escape.h" +#include "macro.h" +#include "process-util.h" +#include "random-util.h" +#include "socket-util.h" +#include "strv.h" +#include "unit-def.h" +#include "unit-name.h" +#include "vmspawn-scope.h" + +int start_transient_scope(sd_bus *bus, const char *machine_name, bool allow_pidfd, char **ret_scope) { + _cleanup_(bus_wait_for_jobs_freep) BusWaitForJobs *w = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL, *m = NULL; + _cleanup_free_ char *scope = NULL, *description = NULL; + const char *object; + int r; + + assert(bus); + assert(machine_name); + + /* Creates a transient scope unit which tracks the lifetime of the current process */ + + r = bus_wait_for_jobs_new(bus, &w); + if (r < 0) + return log_error_errno(r, "Could not watch job: %m"); + + if (asprintf(&scope, "machine-%"PRIu64"-%s.scope", random_u64(), machine_name) < 0) + return log_oom(); + + description = strjoin("Virtual Machine ", machine_name); + if (!description) + return log_oom(); + + r = bus_message_new_method_call(bus, &m, bus_systemd_mgr, "StartTransientUnit"); + if (r < 0) + return 
bus_log_create_error(r); + + r = sd_bus_message_append(m, "ss", /* name */ scope, /* mode */ "fail"); + if (r < 0) + return bus_log_create_error(r); + + /* Properties */ + r = sd_bus_message_open_container(m, 'a', "(sv)"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(m, "(sv)(sv)(sv)", + "Description", "s", description, + "AddRef", "b", 1, + "CollectMode", "s", "inactive-or-failed"); + if (r < 0) + return bus_log_create_error(r); + + _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; + r = pidref_set_self(&pidref); + if (r < 0) + return log_error_errno(r, "Failed to allocate PID reference: %m"); + + r = bus_append_scope_pidref(m, &pidref, allow_pidfd); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + /* No auxiliary units */ + r = sd_bus_message_append( + m, + "a(sa(sv))", + 0); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_call(bus, m, 0, &error, &reply); + if (r < 0) { + /* If this failed with a property we couldn't write, this is quite likely because the server + * doesn't support PIDFDs yet, let's try without. */ + if (allow_pidfd && + sd_bus_error_has_names(&error, SD_BUS_ERROR_UNKNOWN_PROPERTY, SD_BUS_ERROR_PROPERTY_READ_ONLY)) + return start_transient_scope(bus, machine_name, false, ret_scope); + + return log_error_errno(r, "Failed to start transient scope unit: %s", bus_error_message(&error, r)); + } + + r = sd_bus_message_read(reply, "o", &object); + if (r < 0) + return bus_log_parse_error(r); + + r = bus_wait_for_jobs_one(w, object, /* quiet */ false, NULL); + if (r < 0) + return r; + + if (ret_scope) + *ret_scope = TAKE_PTR(scope); + + return 0; +} + +static int message_add_commands(sd_bus_message *m, const char *exec_type, char ***commands, size_t n_commands) { + int r; + + assert(m); + assert(exec_type); + assert(commands || n_commands == 0); + + /* A small helper for adding an ExecStart / ExecStopPost / etc.. 
property to an sd_bus_message */ + + r = sd_bus_message_open_container(m, 'r', "sv"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(m, "s", exec_type); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(m, 'v', "a(sasb)"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(m, 'a', "(sasb)"); + if (r < 0) + return bus_log_create_error(r); + + FOREACH_ARRAY(cmd, commands, n_commands) { + char **cmdline = *cmd; + + r = sd_bus_message_open_container(m, 'r', "sasb"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(m, "s", cmdline[0]); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append_strv(m, cmdline); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(m, "b", 0); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + } + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + return 0; +} + +void socket_service_pair_done(SocketServicePair *p) { + assert(p); + + p->exec_start_pre = strv_free(p->exec_start_pre); + p->exec_start = strv_free(p->exec_start); + p->exec_stop_post = strv_free(p->exec_stop_post); + p->unit_name_prefix = mfree(p->unit_name_prefix); + p->runtime_directory = mfree(p->runtime_directory); + p->listen_address = mfree(p->listen_address); + p->socket_type = 0; +} + +int start_socket_service_pair(sd_bus *bus, const char *scope, SocketServicePair *p) { + _cleanup_(bus_wait_for_jobs_freep) BusWaitForJobs *w = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL, *reply = NULL; + _cleanup_free_ char 
*service_desc = NULL, *service_name = NULL, *socket_name = NULL; + const char *object, *socket_type_str; + int r; + + /* Starts a socket/service unit pair bound to the given scope. */ + + assert(bus); + assert(scope); + assert(p); + assert(p->unit_name_prefix); + assert(p->exec_start); + assert(p->listen_address); + + r = bus_wait_for_jobs_new(bus, &w); + if (r < 0) + return log_error_errno(r, "Could not watch job: %m"); + + socket_name = strjoin(p->unit_name_prefix, ".socket"); + if (!socket_name) + return log_oom(); + + service_name = strjoin(p->unit_name_prefix, ".service"); + if (!service_name) + return log_oom(); + + service_desc = quote_command_line(p->exec_start, SHELL_ESCAPE_EMPTY); + if (!service_desc) + return log_oom(); + + socket_type_str = socket_address_type_to_string(p->socket_type); + if (!socket_type_str) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Invalid socket type: %d", p->socket_type); + + r = bus_message_new_method_call(bus, &m, bus_systemd_mgr, "StartTransientUnit"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(m, "ssa(sv)", + /* ss - name, mode */ + socket_name, "fail", + /* a(sv) - Properties */ + 5, + "Description", "s", p->listen_address, + "AddRef", "b", 1, + "BindsTo", "as", 1, scope, + "Listen", "a(ss)", 1, socket_type_str, p->listen_address, + "CollectMode", "s", "inactive-or-failed"); + if (r < 0) + return bus_log_create_error(r); + + /* aux */ + r = sd_bus_message_open_container(m, 'a', "(sa(sv))"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(m, 'r', "sa(sv)"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(m, "s", service_name); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(m, 'a', "(sv)"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(m, "(sv)(sv)(sv)(sv)", + "Description", "s", service_desc, + "AddRef", "b", 1, + "BindsTo", "as", 1, scope, + 
"CollectMode", "s", "inactive-or-failed"); + if (r < 0) + return bus_log_create_error(r); + + if (p->runtime_directory) { + r = sd_bus_message_append(m, "(sv)", "RuntimeDirectory", "as", 1, p->runtime_directory); + if (r < 0) + return bus_log_create_error(r); + } + + if (p->exec_start_pre) { + r = message_add_commands(m, "ExecStartPre", &p->exec_start_pre, 1); + if (r < 0) + return r; + } + + r = message_add_commands(m, "ExecStart", &p->exec_start, 1); + if (r < 0) + return r; + + if (p->exec_stop_post) { + r = message_add_commands(m, "ExecStopPost", &p->exec_stop_post, 1); + if (r < 0) + return r; + } + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(m); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_call(bus, m, 0, &error, &reply); + if (r < 0) + return log_error_errno(r, "Failed to start %s as transient unit: %s", p->exec_start[0], bus_error_message(&error, r)); + + r = sd_bus_message_read(reply, "o", &object); + if (r < 0) + return bus_log_parse_error(r); + + return bus_wait_for_jobs_one(w, object, /* quiet */ false, NULL); +} diff --git a/src/vmspawn/vmspawn-scope.h b/src/vmspawn/vmspawn-scope.h new file mode 100644 index 0000000..74c7511 --- /dev/null +++ b/src/vmspawn/vmspawn-scope.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <stdbool.h> + +#include "sd-bus.h" + +#include "macro.h" + +typedef struct SocketServicePair { + char **exec_start_pre; + char **exec_start; + char **exec_stop_post; + char *unit_name_prefix; + char *runtime_directory; + char *listen_address; + int socket_type; +} SocketServicePair; + +void socket_service_pair_done(SocketServicePair *p); + +int start_transient_scope(sd_bus *bus, const char *machine_name, bool allow_pidfd, char **ret_scope); +int start_socket_service_pair(sd_bus *bus, const char *scope, 
SocketServicePair *p); diff --git a/src/vmspawn/vmspawn-settings.c b/src/vmspawn/vmspawn-settings.c index cb1a463..780df55 100644 --- a/src/vmspawn/vmspawn-settings.c +++ b/src/vmspawn/vmspawn-settings.c @@ -1,3 +1,13 @@ /* SPDX-License-Identifier: LGPL-2.1-or-later */ +#include "string-table.h" #include "vmspawn-settings.h" + +static const char *const console_mode_table[_CONSOLE_MODE_MAX] = { + [CONSOLE_INTERACTIVE] = "interactive", + [CONSOLE_READ_ONLY] = "read-only", + [CONSOLE_NATIVE] = "native", + [CONSOLE_GUI] = "gui", +}; + +DEFINE_STRING_TABLE_LOOKUP(console_mode, ConsoleMode); diff --git a/src/vmspawn/vmspawn-settings.h b/src/vmspawn/vmspawn-settings.h index 268a874..5446c20 100644 --- a/src/vmspawn/vmspawn-settings.h +++ b/src/vmspawn/vmspawn-settings.h @@ -1,11 +1,28 @@ /* SPDX-License-Identifier: LGPL-2.1-or-later */ #pragma once +#include <errno.h> #include <stdint.h> +#include "macro.h" + +typedef enum ConsoleMode { + CONSOLE_INTERACTIVE, /* ptyfwd */ + CONSOLE_READ_ONLY, /* ptyfwd, but in read-only mode */ + CONSOLE_NATIVE, /* qemu's native TTY handling */ + CONSOLE_GUI, /* qemu's graphical UI */ + _CONSOLE_MODE_MAX, + _CONSOLE_MODE_INVALID = -EINVAL, +} ConsoleMode; + typedef enum SettingsMask { SETTING_START_MODE = UINT64_C(1) << 0, + SETTING_MACHINE_ID = UINT64_C(1) << 6, + SETTING_BIND_MOUNTS = UINT64_C(1) << 11, SETTING_DIRECTORY = UINT64_C(1) << 26, SETTING_CREDENTIALS = UINT64_C(1) << 30, _SETTING_FORCE_ENUM_WIDTH = UINT64_MAX } SettingsMask; + +const char *console_mode_to_string(ConsoleMode m) _const_; +ConsoleMode console_mode_from_string(const char *s) _pure_; diff --git a/src/vmspawn/vmspawn-util.c b/src/vmspawn/vmspawn-util.c index b5b5eaf..472dd92 100644 --- a/src/vmspawn/vmspawn-util.c +++ b/src/vmspawn/vmspawn-util.c @@ -7,6 +7,7 @@ #include "architecture.h" #include "conf-files.h" #include "errno-util.h" +#include "escape.h" #include "fd-util.h" #include "fileio.h" #include "json.h" @@ -20,19 +21,53 @@ #include "siphash24.h" #include 
"socket-util.h" #include "sort-util.h" +#include "string-table.h" #include "string-util.h" #include "strv.h" #include "vmspawn-util.h" +static const char* const architecture_to_qemu_table[_ARCHITECTURE_MAX] = { + [ARCHITECTURE_ARM64] = "aarch64", /* differs from our name */ + [ARCHITECTURE_ARM] = "arm", + [ARCHITECTURE_ALPHA] = "alpha", + [ARCHITECTURE_X86_64] = "x86_64", /* differs from our name */ + [ARCHITECTURE_X86] = "i386", /* differs from our name */ + [ARCHITECTURE_LOONGARCH64] = "loongarch64", + [ARCHITECTURE_MIPS64_LE] = "mips", /* differs from our name */ + [ARCHITECTURE_MIPS_LE] = "mips", /* differs from our name */ + [ARCHITECTURE_PARISC] = "hppa", /* differs from our name */ + [ARCHITECTURE_PPC64_LE] = "ppc", /* differs from our name */ + [ARCHITECTURE_PPC64] = "ppc", /* differs from our name */ + [ARCHITECTURE_PPC] = "ppc", + [ARCHITECTURE_RISCV32] = "riscv32", + [ARCHITECTURE_RISCV64] = "riscv64", + [ARCHITECTURE_S390X] = "s390x", +}; + +static int native_arch_as_qemu(const char **ret) { + const char *s = architecture_to_qemu_table[native_architecture()]; + if (!s) + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Architecture %s not supported by qemu", architecture_to_string(native_architecture())); + + if (ret) + *ret = s; + + return 0; +} + OvmfConfig* ovmf_config_free(OvmfConfig *config) { if (!config) return NULL; free(config->path); + free(config->format); free(config->vars); + free(config->vars_format); return mfree(config); } +DEFINE_STRING_TABLE_LOOKUP(network_stack, NetworkStack); + int qemu_check_kvm_support(void) { if (access("/dev/kvm", F_OK) >= 0) return true; @@ -40,7 +75,7 @@ int qemu_check_kvm_support(void) { log_debug_errno(errno, "/dev/kvm not found. Not using KVM acceleration."); return false; } - if (errno == EPERM) { + if (ERRNO_IS_PRIVILEGE(errno)) { log_debug_errno(errno, "Permission denied to access /dev/kvm. 
Not using KVM acceleration."); return false; } @@ -62,11 +97,11 @@ int qemu_check_vsock_support(void) { fd = open("/dev/vhost-vsock", O_RDWR|O_CLOEXEC); if (fd >= 0) return true; - if (errno == ENODEV) { + if (ERRNO_IS_DEVICE_ABSENT(errno)) { log_debug_errno(errno, "/dev/vhost-vsock device doesn't exist. Not adding a vsock device to the virtual machine."); return false; } - if (errno == EPERM) { + if (ERRNO_IS_PRIVILEGE(errno)) { log_debug_errno(errno, "Permission denied to access /dev/vhost-vsock. Not adding a vsock device to the virtual machine."); return false; } @@ -78,16 +113,28 @@ int qemu_check_vsock_support(void) { typedef struct FirmwareData { char **features; char *firmware; + char *firmware_format; char *vars; + char *vars_format; + char **architectures; } FirmwareData; +static bool firmware_data_supports_sb(const FirmwareData *fwd) { + assert(fwd); + + return strv_contains(fwd->features, "secure-boot"); +} + static FirmwareData* firmware_data_free(FirmwareData *fwd) { if (!fwd) return NULL; - fwd->features = strv_free(fwd->features); - fwd->firmware = mfree(fwd->firmware); - fwd->vars = mfree(fwd->vars); + strv_free(fwd->features); + free(fwd->firmware); + free(fwd->firmware_format); + free(fwd->vars); + free(fwd->vars_format); + strv_free(fwd->architectures); return mfree(fwd); } @@ -95,22 +142,22 @@ DEFINE_TRIVIAL_CLEANUP_FUNC(FirmwareData*, firmware_data_free); static int firmware_executable(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { static const JsonDispatch table[] = { - { "filename", JSON_VARIANT_STRING, json_dispatch_string, offsetof(FirmwareData, firmware), JSON_MANDATORY }, - { "format", JSON_VARIANT_STRING, NULL, 0, JSON_MANDATORY }, + { "filename", JSON_VARIANT_STRING, json_dispatch_string, offsetof(FirmwareData, firmware), JSON_MANDATORY }, + { "format", JSON_VARIANT_STRING, json_dispatch_string, offsetof(FirmwareData, firmware_format), JSON_MANDATORY }, {} }; - return json_dispatch(v, table, 0, userdata); + 
return json_dispatch(v, table, flags, userdata); } static int firmware_nvram_template(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { static const JsonDispatch table[] = { - { "filename", JSON_VARIANT_STRING, json_dispatch_string, offsetof(FirmwareData, vars), JSON_MANDATORY }, - { "format", JSON_VARIANT_STRING, NULL, 0, JSON_MANDATORY }, + { "filename", JSON_VARIANT_STRING, json_dispatch_string, offsetof(FirmwareData, vars), JSON_MANDATORY }, + { "format", JSON_VARIANT_STRING, json_dispatch_string, offsetof(FirmwareData, vars_format), JSON_MANDATORY }, {} }; - return json_dispatch(v, table, 0, userdata); + return json_dispatch(v, table, flags, userdata); } static int firmware_mapping(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { @@ -121,15 +168,170 @@ static int firmware_mapping(const char *name, JsonVariant *v, JsonDispatchFlags {} }; - return json_dispatch(v, table, 0, userdata); + return json_dispatch(v, table, flags, userdata); +} + +static int target_architecture(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + int r; + JsonVariant *e; + char ***supported_architectures = ASSERT_PTR(userdata); + + static const JsonDispatch table[] = { + { "architecture", JSON_VARIANT_STRING, json_dispatch_string, 0, JSON_MANDATORY }, + { "machines", JSON_VARIANT_ARRAY, NULL, 0, JSON_MANDATORY }, + {} + }; + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + _cleanup_free_ char *arch = NULL; + + r = json_dispatch(e, table, flags, &arch); + if (r < 0) + return r; + + r = strv_consume(supported_architectures, TAKE_PTR(arch)); + if (r < 0) + return r; + } + + return 0; +} + +static int get_firmware_search_dirs(char ***ret) { + int r; + + assert(ret); + + /* Search in: + * - $XDG_CONFIG_HOME/qemu/firmware + * - /etc/qemu/firmware + * - /usr/share/qemu/firmware + * + * Prioritising entries in "more specific" directories */ + + _cleanup_free_ char *user_firmware_dir = NULL; + r = 
xdg_user_config_dir(&user_firmware_dir, "/qemu/firmware"); + if (r < 0) + return r; + + _cleanup_strv_free_ char **l = NULL; + l = strv_new(user_firmware_dir, "/etc/qemu/firmware", "/usr/share/qemu/firmware"); + if (!l) + return log_oom_debug(); + + *ret = TAKE_PTR(l); + return 0; +} + +int list_ovmf_config(char ***ret) { + _cleanup_strv_free_ char **search_dirs = NULL; + int r; + + assert(ret); + + r = get_firmware_search_dirs(&search_dirs); + if (r < 0) + return r; + + r = conf_files_list_strv( + ret, + ".json", + /* root= */ NULL, + CONF_FILES_FILTER_MASKED|CONF_FILES_REGULAR, + (const char *const*) search_dirs); + if (r < 0) + return log_debug_errno(r, "Failed to list firmware files: %m"); + + return 0; +} + +static int load_firmware_data(const char *path, FirmwareData **ret) { + int r; + + assert(path); + assert(ret); + + _cleanup_(json_variant_unrefp) JsonVariant *json = NULL; + r = json_parse_file( + /* f= */ NULL, + path, + /* flags= */ 0, + &json, + /* ret_line= */ NULL, + /* ret_column= */ NULL); + if (r < 0) + return r; + + static const JsonDispatch table[] = { + { "description", JSON_VARIANT_STRING, NULL, 0, JSON_MANDATORY }, + { "interface-types", JSON_VARIANT_ARRAY, NULL, 0, JSON_MANDATORY }, + { "mapping", JSON_VARIANT_OBJECT, firmware_mapping, 0, JSON_MANDATORY }, + { "targets", JSON_VARIANT_ARRAY, target_architecture, offsetof(FirmwareData, architectures), JSON_MANDATORY }, + { "features", JSON_VARIANT_ARRAY, json_dispatch_strv, offsetof(FirmwareData, features), JSON_MANDATORY }, + { "tags", JSON_VARIANT_ARRAY, NULL, 0, JSON_MANDATORY }, + {} + }; + + _cleanup_(firmware_data_freep) FirmwareData *fwd = NULL; + fwd = new0(FirmwareData, 1); + if (!fwd) + return -ENOMEM; + + r = json_dispatch(json, table, JSON_ALLOW_EXTENSIONS, fwd); + if (r < 0) + return r; + + *ret = TAKE_PTR(fwd); + return 0; +} + +static int ovmf_config_make(FirmwareData *fwd, OvmfConfig **ret) { + assert(fwd); + assert(ret); + + _cleanup_free_ OvmfConfig *config = NULL; + config = 
new(OvmfConfig, 1); + if (!config) + return -ENOMEM; + + *config = (OvmfConfig) { + .path = TAKE_PTR(fwd->firmware), + .format = TAKE_PTR(fwd->firmware_format), + .vars = TAKE_PTR(fwd->vars), + .vars_format = TAKE_PTR(fwd->vars_format), + .supports_sb = firmware_data_supports_sb(fwd), + }; + + *ret = TAKE_PTR(config); + return 0; +} + +int load_ovmf_config(const char *path, OvmfConfig **ret) { + _cleanup_(firmware_data_freep) FirmwareData *fwd = NULL; + int r; + + assert(path); + assert(ret); + + r = load_firmware_data(path, &fwd); + if (r < 0) + return r; + + return ovmf_config_make(fwd, ret); } int find_ovmf_config(int search_sb, OvmfConfig **ret) { _cleanup_(ovmf_config_freep) OvmfConfig *config = NULL; - _cleanup_free_ char *user_firmware_dir = NULL; _cleanup_strv_free_ char **conf_files = NULL; + const char* native_arch_qemu; int r; + assert(ret); + + r = native_arch_as_qemu(&native_arch_qemu); + if (r < 0) + return r; + /* Search in: * - $XDG_CONFIG_HOME/qemu/firmware * - /etc/qemu/firmware @@ -138,74 +340,40 @@ int find_ovmf_config(int search_sb, OvmfConfig **ret) { * Prioritising entries in "more specific" directories */ - r = xdg_user_config_dir(&user_firmware_dir, "/qemu/firmware"); + r = list_ovmf_config(&conf_files); if (r < 0) return r; - r = conf_files_list_strv(&conf_files, ".json", NULL, CONF_FILES_FILTER_MASKED|CONF_FILES_REGULAR, - STRV_MAKE_CONST(user_firmware_dir, "/etc/qemu/firmware", "/usr/share/qemu/firmware")); - if (r < 0) - return log_debug_errno(r, "Failed to list config files: %m"); - STRV_FOREACH(file, conf_files) { _cleanup_(firmware_data_freep) FirmwareData *fwd = NULL; - _cleanup_(json_variant_unrefp) JsonVariant *config_json = NULL; - _cleanup_free_ char *contents = NULL; - size_t contents_sz = 0; - r = read_full_file(*file, &contents, &contents_sz); - if (r == -ENOMEM) - return r; + r = load_firmware_data(*file, &fwd); if (r < 0) { - log_debug_errno(r, "Failed to read contents of %s - ignoring: %m", *file); + log_debug_errno(r, 
"Failed to load JSON file '%s', skipping: %m", *file); continue; } - r = json_parse(contents, 0, &config_json, NULL, NULL); - if (r == -ENOMEM) - return r; - if (r < 0) { - log_debug_errno(r, "Failed to parse the JSON in %s - ignoring: %m", *file); + if (strv_contains(fwd->features, "enrolled-keys")) { + log_debug("Skipping %s, firmware has enrolled keys which has been known to cause issues.", *file); continue; } - static const JsonDispatch table[] = { - { "description", JSON_VARIANT_STRING, NULL, 0, JSON_MANDATORY }, - { "interface-types", JSON_VARIANT_ARRAY, NULL, 0, JSON_MANDATORY }, - { "mapping", JSON_VARIANT_OBJECT, firmware_mapping, 0, JSON_MANDATORY }, - { "targets", JSON_VARIANT_ARRAY, NULL, 0, JSON_MANDATORY }, - { "features", JSON_VARIANT_ARRAY, json_dispatch_strv, offsetof(FirmwareData, features), JSON_MANDATORY }, - { "tags", JSON_VARIANT_ARRAY, NULL, 0, JSON_MANDATORY }, - {} - }; - - fwd = new0(FirmwareData, 1); - if (!fwd) - return -ENOMEM; - - r = json_dispatch(config_json, table, 0, fwd); - if (r == -ENOMEM) - return r; - if (r < 0) { - log_debug_errno(r, "Failed to extract the required fields from the JSON in %s - ignoring: %m", *file); + if (!strv_contains(fwd->architectures, native_arch_qemu)) { + log_debug("Skipping %s, firmware doesn't support the native architecture.", *file); continue; } - int sb_present = !!strv_find(fwd->features, "secure-boot"); - /* exclude firmware which doesn't match our Secure Boot requirements */ - if (search_sb >= 0 && search_sb != sb_present) { - log_debug("Skipping %s, firmware doesn't fit required Secure Boot configuration", *file); + if (search_sb >= 0 && !!search_sb != firmware_data_supports_sb(fwd)) { + log_debug("Skipping %s, firmware doesn't fit required Secure Boot configuration.", *file); continue; } - config = new0(OvmfConfig, 1); - if (!config) - return -ENOMEM; + r = ovmf_config_make(fwd, &config); + if (r < 0) + return r; - config->path = TAKE_PTR(fwd->firmware); - config->vars = TAKE_PTR(fwd->vars); 
- config->supports_sb = sb_present; + log_debug("Selected firmware definition %s.", *file); break; } @@ -219,6 +387,7 @@ int find_ovmf_config(int search_sb, OvmfConfig **ret) { } int find_qemu_binary(char **ret_qemu_binary) { + const char *native_arch_qemu; int r; /* @@ -228,24 +397,6 @@ int find_qemu_binary(char **ret_qemu_binary) { * If the native architecture is not supported by qemu -EOPNOTSUPP will be returned; */ - static const char *architecture_to_qemu_table[_ARCHITECTURE_MAX] = { - [ARCHITECTURE_ARM64] = "aarch64", /* differs from our name */ - [ARCHITECTURE_ARM] = "arm", - [ARCHITECTURE_ALPHA] = "alpha", - [ARCHITECTURE_X86_64] = "x86_64", /* differs from our name */ - [ARCHITECTURE_X86] = "i386", /* differs from our name */ - [ARCHITECTURE_LOONGARCH64] = "loongarch64", - [ARCHITECTURE_MIPS64_LE] = "mips", /* differs from our name */ - [ARCHITECTURE_MIPS_LE] = "mips", /* differs from our name */ - [ARCHITECTURE_PARISC] = "hppa", /* differs from our name */ - [ARCHITECTURE_PPC64_LE] = "ppc", /* differs from our name */ - [ARCHITECTURE_PPC64] = "ppc", /* differs from our name */ - [ARCHITECTURE_PPC] = "ppc", - [ARCHITECTURE_RISCV32] = "riscv32", - [ARCHITECTURE_RISCV64] = "riscv64", - [ARCHITECTURE_S390X] = "s390x", - }; - FOREACH_STRING(s, "qemu", "qemu-kvm") { r = find_executable(s, ret_qemu_binary); if (r == 0) @@ -255,19 +406,19 @@ int find_qemu_binary(char **ret_qemu_binary) { return r; } - const char *arch_qemu = architecture_to_qemu_table[native_architecture()]; - if (!arch_qemu) - return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Architecture %s not supported by qemu", architecture_to_string(native_architecture())); + r = native_arch_as_qemu(&native_arch_qemu); + if (r < 0) + return r; _cleanup_free_ char *qemu_arch_specific = NULL; - qemu_arch_specific = strjoin("qemu-system-", arch_qemu); + qemu_arch_specific = strjoin("qemu-system-", native_arch_qemu); if (!qemu_arch_specific) return -ENOMEM; return find_executable(qemu_arch_specific, 
ret_qemu_binary); } -int vsock_fix_child_cid(unsigned *machine_cid, const char *machine, int *ret_child_sock) { +int vsock_fix_child_cid(int vhost_device_fd, unsigned *machine_cid, const char *machine) { /* this is an arbitrary value picked from /dev/urandom */ static const uint8_t sip_key[HASH_KEY_SIZE] = { 0x03, 0xad, 0xf0, 0xa4, @@ -276,14 +427,13 @@ int vsock_fix_child_cid(unsigned *machine_cid, const char *machine, int *ret_chi 0xf5, 0x4c, 0x80, 0x52 }; struct siphash machine_hash_state, state; - _cleanup_close_ int vfd = -EBADF; int r; /* uint64_t is required here for the ioctl call, but valid CIDs are only 32 bits */ uint64_t cid = *ASSERT_PTR(machine_cid); assert(machine); - assert(ret_child_sock); + assert(vhost_device_fd >= 0); /* Fix the CID of the AF_VSOCK socket passed to qemu * @@ -296,16 +446,10 @@ int vsock_fix_child_cid(unsigned *machine_cid, const char *machine, int *ret_chi * If after another 64 attempts this hasn't worked then give up and return EADDRNOTAVAIL. */ - /* remove O_CLOEXEC before this fd is passed to QEMU */ - vfd = open("/dev/vhost-vsock", O_RDWR|O_CLOEXEC); - if (vfd < 0) - return log_debug_errno(errno, "Failed to open /dev/vhost-vsock as read/write: %m"); - if (cid != VMADDR_CID_ANY) { - r = ioctl(vfd, VHOST_VSOCK_SET_GUEST_CID, &cid); + r = ioctl(vhost_device_fd, VHOST_VSOCK_SET_GUEST_CID, &cid); if (r < 0) return log_debug_errno(errno, "Failed to set CID for child vsock with user provided CID %" PRIu64 ": %m", cid); - *ret_child_sock = TAKE_FD(vfd); return 0; } @@ -317,10 +461,9 @@ int vsock_fix_child_cid(unsigned *machine_cid, const char *machine, int *ret_chi uint64_t hash = siphash24_finalize(&state); cid = 3 + (hash % (UINT_MAX - 4)); - r = ioctl(vfd, VHOST_VSOCK_SET_GUEST_CID, &cid); + r = ioctl(vhost_device_fd, VHOST_VSOCK_SET_GUEST_CID, &cid); if (r >= 0) { *machine_cid = cid; - *ret_child_sock = TAKE_FD(vfd); return 0; } if (errno != EADDRINUSE) @@ -329,10 +472,9 @@ int vsock_fix_child_cid(unsigned *machine_cid, const 
char *machine, int *ret_chi for (unsigned i = 0; i < 64; i++) { cid = 3 + random_u64_range(UINT_MAX - 4); - r = ioctl(vfd, VHOST_VSOCK_SET_GUEST_CID, &cid); + r = ioctl(vhost_device_fd, VHOST_VSOCK_SET_GUEST_CID, &cid); if (r >= 0) { *machine_cid = cid; - *ret_child_sock = TAKE_FD(vfd); return 0; } @@ -342,3 +484,36 @@ int vsock_fix_child_cid(unsigned *machine_cid, const char *machine, int *ret_chi return log_debug_errno(SYNTHETIC_ERRNO(EADDRNOTAVAIL), "Failed to assign a CID to the guest vsock"); } + +char* escape_qemu_value(const char *s) { + const char *f; + char *e, *t; + size_t n; + + assert(s); + + /* QEMU requires that commas in arguments to be escaped by doubling up the commas. See + * https://www.qemu.org/docs/master/system/qemu-manpage.html#options for more information. + * + * This function performs this escaping, returning an allocated string with the escaped value, or + * NULL if allocation failed. */ + + n = strlen(s); + + if (n > (SIZE_MAX - 1) / 2) + return NULL; + + e = new(char, n*2 + 1); + if (!e) + return NULL; + + for (f = s, t = e; f < s + n; f++) { + *t++ = *f; + if (*f == ',') + *t++ = ','; + } + + *t = 0; + + return e; +} diff --git a/src/vmspawn/vmspawn-util.h b/src/vmspawn/vmspawn-util.h index 53ad7dd..fed0996 100644 --- a/src/vmspawn/vmspawn-util.h +++ b/src/vmspawn/vmspawn-util.h @@ -5,22 +5,87 @@ #include "macro.h" #if defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__) -#define ARCHITECTURE_SUPPORTS_SMBIOS 1 +# define ARCHITECTURE_SUPPORTS_SMBIOS 1 #else -#define ARCHITECTURE_SUPPORTS_SMBIOS 0 +# define ARCHITECTURE_SUPPORTS_SMBIOS 0 +#endif + +#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) +# define ARCHITECTURE_SUPPORTS_TPM 1 +#else +# define ARCHITECTURE_SUPPORTS_TPM 0 +#endif + +#if defined(__x86_64__) || defined(__i386__) +# define ARCHITECTURE_SUPPORTS_SMM 1 +#else +# define ARCHITECTURE_SUPPORTS_SMM 0 +#endif + +#if defined(__arm__) || defined(__aarch64__) +# define 
DEFAULT_SERIAL_TTY "ttyAMA0" +#elif defined(__s390__) || defined(__s390x__) +# define DEFAULT_SERIAL_TTY "ttysclp0" +#elif defined(__powerpc__) || defined(__powerpc64__) +# define DEFAULT_SERIAL_TTY "hvc0" +#else +# define DEFAULT_SERIAL_TTY "ttyS0" +#endif + +#if defined(__x86_64__) || defined(__i386__) +# define QEMU_MACHINE_TYPE "q35" +#elif defined(__arm__) || defined(__aarch64__) +# define QEMU_MACHINE_TYPE "virt" +#elif defined(__s390__) || defined(__s390x__) +# define QEMU_MACHINE_TYPE "s390-ccw-virtio" +#elif defined(__powerpc__) || defined(__powerpc64__) +# define QEMU_MACHINE_TYPE "pseries" +#else +# error "No qemu machine defined for this architecture" #endif typedef struct OvmfConfig { char *path; + char *format; char *vars; + char *vars_format; bool supports_sb; } OvmfConfig; +static inline const char *ovmf_config_format(const OvmfConfig *c) { + return ASSERT_PTR(c)->format ?: "raw"; +} + +static inline const char *ovmf_config_vars_format(const OvmfConfig *c) { + return ASSERT_PTR(c)->vars_format ?: "raw"; +} + OvmfConfig* ovmf_config_free(OvmfConfig *ovmf_config); DEFINE_TRIVIAL_CLEANUP_FUNC(OvmfConfig*, ovmf_config_free); +typedef enum NetworkStack { + NETWORK_STACK_TAP, + NETWORK_STACK_USER, + NETWORK_STACK_NONE, + _NETWORK_STACK_MAX, + _NETWORK_STACK_INVALID = -EINVAL, +} NetworkStack; + +static const char* const network_stack_table[_NETWORK_STACK_MAX] = { + [NETWORK_STACK_TAP] = "tap", + [NETWORK_STACK_USER] = "user", + [NETWORK_STACK_NONE] = "none", +}; + +const char* network_stack_to_string(NetworkStack type) _const_; +NetworkStack network_stack_from_string(const char *s) _pure_; + int qemu_check_kvm_support(void); int qemu_check_vsock_support(void); -int find_ovmf_config(int search_sb, OvmfConfig **ret_ovmf_config); +int list_ovmf_config(char ***ret); +int load_ovmf_config(const char *path, OvmfConfig **ret); +int find_ovmf_config(int search_sb, OvmfConfig **ret); int find_qemu_binary(char **ret_qemu_binary); -int vsock_fix_child_cid(unsigned 
*machine_cid, const char *machine, int *ret_child_sock); +int vsock_fix_child_cid(int vsock_fd, unsigned *machine_cid, const char *machine); + +char* escape_qemu_value(const char *s); diff --git a/src/vmspawn/vmspawn.c b/src/vmspawn/vmspawn.c index ebae681..326722d 100644 --- a/src/vmspawn/vmspawn.c +++ b/src/vmspawn/vmspawn.c @@ -1,59 +1,136 @@ /* SPDX-License-Identifier: LGPL-2.1-or-later */ +#include <net/if.h> +#include <linux/if.h> #include <getopt.h> #include <stdint.h> +#include <stdio.h> #include <stdlib.h> -#include <sys/wait.h> +#include <string.h> +#include <sys/stat.h> #include <unistd.h> +#include "sd-daemon.h" +#include "sd-event.h" +#include "sd-id128.h" + #include "alloc-util.h" #include "architecture.h" +#include "bootspec.h" #include "build.h" +#include "bus-internal.h" +#include "bus-locator.h" +#include "bus-wait-for-jobs.h" +#include "chase.h" #include "common-signal.h" #include "copy.h" #include "creds-util.h" +#include "dirent-util.h" +#include "discover-image.h" +#include "dissect-image.h" #include "escape.h" +#include "ether-addr-util.h" +#include "event-util.h" +#include "extract-word.h" +#include "fd-util.h" #include "fileio.h" #include "format-util.h" #include "fs-util.h" +#include "gpt.h" #include "hexdecoct.h" #include "hostname-util.h" +#include "io-util.h" +#include "kernel-image.h" #include "log.h" #include "machine-credential.h" +#include "macro.h" #include "main-func.h" +#include "mkdir.h" +#include "netif-util.h" #include "pager.h" #include "parse-argument.h" #include "parse-util.h" +#include "path-lookup.h" #include "path-util.h" +#include "pidref.h" #include "pretty-print.h" #include "process-util.h" -#include "sd-event.h" +#include "ptyfwd.h" +#include "random-util.h" +#include "rm-rf.h" #include "signal-util.h" #include "socket-util.h" +#include "stat-util.h" +#include "stdio-util.h" +#include "string-util.h" #include "strv.h" +#include "time-util.h" #include "tmpfile-util.h" +#include "unit-name.h" +#include 
"vmspawn-mount.h" +#include "vmspawn-register.h" +#include "vmspawn-scope.h" #include "vmspawn-settings.h" #include "vmspawn-util.h" +#define VM_TAP_HASH_KEY SD_ID128_MAKE(01,d0,c6,4c,2b,df,24,fb,c0,f8,b2,09,7d,59,b2,93) + +typedef struct SSHInfo { + unsigned cid; + char *private_key_path; + unsigned port; +} SSHInfo; + +static bool arg_quiet = false; static PagerFlags arg_pager_flags = 0; +static char *arg_directory = NULL; static char *arg_image = NULL; static char *arg_machine = NULL; -static char *arg_qemu_smp = NULL; -static uint64_t arg_qemu_mem = 2ULL * 1024ULL * 1024ULL * 1024ULL; -static int arg_qemu_kvm = -1; -static int arg_qemu_vsock = -1; -static uint64_t arg_vsock_cid = UINT64_MAX; -static bool arg_qemu_gui = false; +static char *arg_cpus = NULL; +static uint64_t arg_ram = UINT64_C(2) * U64_GB; +static int arg_kvm = -1; +static int arg_vsock = -1; +static unsigned arg_vsock_cid = VMADDR_CID_ANY; +static int arg_tpm = -1; +static char *arg_linux = NULL; +static char **arg_initrds = NULL; +static ConsoleMode arg_console_mode = CONSOLE_INTERACTIVE; +static NetworkStack arg_network_stack = NETWORK_STACK_NONE; static int arg_secure_boot = -1; -static MachineCredential *arg_credentials = NULL; -static size_t arg_n_credentials = 0; +static MachineCredentialContext arg_credentials = {}; +static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U; +static RuntimeMountContext arg_runtime_mounts = {}; static SettingsMask arg_settings_mask = 0; -static char **arg_parameters = NULL; - +static char *arg_firmware = NULL; +static char *arg_runtime_directory = NULL; +static char *arg_forward_journal = NULL; +static bool arg_runtime_directory_created = false; +static bool arg_privileged = false; +static bool arg_register = false; +static sd_id128_t arg_uuid = {}; +static char **arg_kernel_cmdline_extra = NULL; +static char **arg_extra_drives = NULL; +static char *arg_background = NULL; +static bool arg_pass_ssh_key = true; +static char *arg_ssh_key_type = NULL; 
+static bool arg_discard_disk = true; +struct ether_addr arg_network_provided_mac = {}; + +STATIC_DESTRUCTOR_REGISTER(arg_directory, freep); STATIC_DESTRUCTOR_REGISTER(arg_image, freep); STATIC_DESTRUCTOR_REGISTER(arg_machine, freep); -STATIC_DESTRUCTOR_REGISTER(arg_qemu_smp, freep); -STATIC_DESTRUCTOR_REGISTER(arg_parameters, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_cpus, freep); +STATIC_DESTRUCTOR_REGISTER(arg_runtime_directory, freep); +STATIC_DESTRUCTOR_REGISTER(arg_credentials, machine_credential_context_done); +STATIC_DESTRUCTOR_REGISTER(arg_firmware, freep); +STATIC_DESTRUCTOR_REGISTER(arg_linux, freep); +STATIC_DESTRUCTOR_REGISTER(arg_initrds, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_runtime_mounts, runtime_mount_context_done); +STATIC_DESTRUCTOR_REGISTER(arg_forward_journal, freep); +STATIC_DESTRUCTOR_REGISTER(arg_kernel_cmdline_extra, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_extra_drives, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_background, freep); +STATIC_DESTRUCTOR_REGISTER(arg_ssh_key_type, freep); static int help(void) { _cleanup_free_ char *link = NULL; @@ -67,29 +144,56 @@ static int help(void) { printf("%1$s [OPTIONS...] 
[ARGUMENTS...]\n\n" "%5$sSpawn a command or OS in a virtual machine.%6$s\n\n" - " -h --help Show this help\n" - " --version Print version string\n" - " --no-pager Do not pipe output into a pager\n\n" - "%3$sImage:%4$s\n" - " -i --image=PATH Root file system disk image (or device node) for\n" - " the virtual machine\n\n" - "%3$sHost Configuration:%4$s\n" - " --qemu-smp=SMP Configure guest's SMP settings\n" - " --qemu-mem=MEM Configure guest's RAM size\n" - " --qemu-kvm=BOOL Configure whether to use KVM or not\n" - " --qemu-vsock=BOOL Configure whether to use qemu with a vsock or not\n" - " --vsock-cid= Specify the CID to use for the qemu guest's vsock\n" - " --qemu-gui Start QEMU in graphical mode\n" - " --secure-boot=BOOL Configure whether to search for firmware which\n" - " supports Secure Boot\n\n" - "%3$sSystem Identity:%4$s\n" - " -M --machine=NAME Set the machine name for the container\n" - "%3$sCredentials:%4$s\n" + " -h --help Show this help\n" + " --version Print version string\n" + " -q --quiet Do not show status information\n" + " --no-pager Do not pipe output into a pager\n" + "\n%3$sImage:%4$s\n" + " -D --directory=PATH Root directory for the VM\n" + " -i --image=FILE|DEVICE Root file system disk image or device for the VM\n" + "\n%3$sHost Configuration:%4$s\n" + " --cpus=CPUS Configure number of CPUs in guest\n" + " --ram=BYTES Configure guest's RAM size\n" + " --kvm=BOOL Enable use of KVM\n" + " --vsock=BOOL Override autodetection of VSOCK support\n" + " --vsock-cid=CID Specify the CID to use for the guest's VSOCK support\n" + " --tpm=BOOL Enable use of a virtual TPM\n" + " --linux=PATH Specify the linux kernel for direct kernel boot\n" + " --initrd=PATH Specify the initrd for direct kernel boot\n" + " -n --network-tap Create a TAP device for networking\n" + " --network-user-mode Use user mode networking\n" + " --secure-boot=BOOL Enable searching for firmware supporting SecureBoot\n" + " --firmware=PATH|list Select firmware definition file (or list 
available)\n" + " --discard-disk=BOOL Control processing of discard requests\n" + "\n%3$sSystem Identity:%4$s\n" + " -M --machine=NAME Set the machine name for the VM\n" + " --uuid=UUID Set a specific machine UUID for the VM\n" + "\n%3$sProperties:%4$s\n" + " --register=BOOLEAN Register VM with systemd-machined\n" + "\n%3$sUser Namespacing:%4$s\n" + " --private-users=UIDBASE[:NUIDS]\n" + " Configure the UID/GID range to map into the\n" + " virtiofsd namespace\n" + "\n%3$sMounts:%4$s\n" + " --bind=SOURCE[:TARGET]\n" + " Mount a file or directory from the host into the VM\n" + " --bind-ro=SOURCE[:TARGET]\n" + " Mount a file or directory, but read-only\n" + " --extra-drive=PATH Adds an additional disk to the virtual machine\n" + "\n%3$sIntegration:%4$s\n" + " --forward-journal=FILE|DIR\n" + " Forward the VM's journal to the host\n" + " --pass-ssh-key=BOOL Create an SSH key to access the VM\n" + " --ssh-key-type=TYPE Choose what type of SSH key to pass\n" + "\n%3$sInput/Output:%4$s\n" + " --console=MODE Console mode (interactive, native, gui)\n" + " --background=COLOR Set ANSI color for background\n" + "\n%3$sCredentials:%4$s\n" " --set-credential=ID:VALUE\n" - " Pass a credential with literal value to container.\n" + " Pass a credential with literal value to the VM\n" " --load-credential=ID:PATH\n" - " Load credential to pass to container from file or\n" - " AF_UNIX stream socket.\n" + " Load credential for the VM from file or AF_UNIX\n" + " stream socket.\n" "\nSee the %2$s for details.\n", program_invocation_short_name, link, @@ -101,36 +205,91 @@ static int help(void) { return 0; } +static int parse_environment(void) { + const char *e; + int r; + + e = getenv("SYSTEMD_VMSPAWN_NETWORK_MAC"); + if (e) { + r = parse_ether_addr(e, &arg_network_provided_mac); + if (r < 0) + return log_error_errno(r, "Failed to parse provided MAC address via environment variable"); + } + + return 0; +} + static int parse_argv(int argc, char *argv[]) { enum { ARG_VERSION = 0x100, 
ARG_NO_PAGER, - ARG_QEMU_SMP, - ARG_QEMU_MEM, - ARG_QEMU_KVM, - ARG_QEMU_VSOCK, + ARG_CPUS, + ARG_RAM, + ARG_KVM, + ARG_VSOCK, ARG_VSOCK_CID, + ARG_TPM, + ARG_LINUX, + ARG_INITRD, ARG_QEMU_GUI, + ARG_NETWORK_USER_MODE, + ARG_UUID, + ARG_REGISTER, + ARG_BIND, + ARG_BIND_RO, + ARG_EXTRA_DRIVE, ARG_SECURE_BOOT, + ARG_PRIVATE_USERS, + ARG_FORWARD_JOURNAL, + ARG_PASS_SSH_KEY, + ARG_SSH_KEY_TYPE, ARG_SET_CREDENTIAL, ARG_LOAD_CREDENTIAL, + ARG_FIRMWARE, + ARG_DISCARD_DISK, + ARG_CONSOLE, + ARG_BACKGROUND, }; static const struct option options[] = { - { "help", no_argument, NULL, 'h' }, - { "version", no_argument, NULL, ARG_VERSION }, - { "no-pager", no_argument, NULL, ARG_NO_PAGER }, - { "image", required_argument, NULL, 'i' }, - { "machine", required_argument, NULL, 'M' }, - { "qemu-smp", required_argument, NULL, ARG_QEMU_SMP }, - { "qemu-mem", required_argument, NULL, ARG_QEMU_MEM }, - { "qemu-kvm", required_argument, NULL, ARG_QEMU_KVM }, - { "qemu-vsock", required_argument, NULL, ARG_QEMU_VSOCK }, - { "vsock-cid", required_argument, NULL, ARG_VSOCK_CID }, - { "qemu-gui", no_argument, NULL, ARG_QEMU_GUI }, - { "secure-boot", required_argument, NULL, ARG_SECURE_BOOT }, - { "set-credential", required_argument, NULL, ARG_SET_CREDENTIAL }, - { "load-credential", required_argument, NULL, ARG_LOAD_CREDENTIAL }, + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "quiet", no_argument, NULL, 'q' }, + { "no-pager", no_argument, NULL, ARG_NO_PAGER }, + { "image", required_argument, NULL, 'i' }, + { "directory", required_argument, NULL, 'D' }, + { "machine", required_argument, NULL, 'M' }, + { "cpus", required_argument, NULL, ARG_CPUS }, + { "qemu-smp", required_argument, NULL, ARG_CPUS }, /* Compat alias */ + { "ram", required_argument, NULL, ARG_RAM }, + { "qemu-mem", required_argument, NULL, ARG_RAM }, /* Compat alias */ + { "kvm", required_argument, NULL, ARG_KVM }, + { "qemu-kvm", required_argument, NULL, ARG_KVM }, /* Compat alias */ + 
{ "vsock", required_argument, NULL, ARG_VSOCK }, + { "qemu-vsock", required_argument, NULL, ARG_VSOCK }, /* Compat alias */ + { "vsock-cid", required_argument, NULL, ARG_VSOCK_CID }, + { "tpm", required_argument, NULL, ARG_TPM }, + { "linux", required_argument, NULL, ARG_LINUX }, + { "initrd", required_argument, NULL, ARG_INITRD }, + { "console", required_argument, NULL, ARG_CONSOLE }, + { "qemu-gui", no_argument, NULL, ARG_QEMU_GUI }, /* compat option */ + { "network-tap", no_argument, NULL, 'n' }, + { "network-user-mode", no_argument, NULL, ARG_NETWORK_USER_MODE }, + { "uuid", required_argument, NULL, ARG_UUID }, + { "register", required_argument, NULL, ARG_REGISTER }, + { "bind", required_argument, NULL, ARG_BIND }, + { "bind-ro", required_argument, NULL, ARG_BIND_RO }, + { "extra-drive", required_argument, NULL, ARG_EXTRA_DRIVE }, + { "secure-boot", required_argument, NULL, ARG_SECURE_BOOT }, + { "private-users", required_argument, NULL, ARG_PRIVATE_USERS }, + { "forward-journal", required_argument, NULL, ARG_FORWARD_JOURNAL }, + { "pass-ssh-key", required_argument, NULL, ARG_PASS_SSH_KEY }, + { "ssh-key-type", required_argument, NULL, ARG_SSH_KEY_TYPE }, + { "set-credential", required_argument, NULL, ARG_SET_CREDENTIAL }, + { "load-credential", required_argument, NULL, ARG_LOAD_CREDENTIAL }, + { "firmware", required_argument, NULL, ARG_FIRMWARE }, + { "discard-disk", required_argument, NULL, ARG_DISCARD_DISK }, + { "background", required_argument, NULL, ARG_BACKGROUND }, {} }; @@ -140,7 +299,7 @@ static int parse_argv(int argc, char *argv[]) { assert(argv); optind = 0; - while ((c = getopt_long(argc, argv, "+hi:M", options, NULL)) >= 0) + while ((c = getopt_long(argc, argv, "+hD:i:M:nq", options, NULL)) >= 0) switch (c) { case 'h': return help(); @@ -148,6 +307,18 @@ static int parse_argv(int argc, char *argv[]) { case ARG_VERSION: return version(); + case 'q': + arg_quiet = true; + break; + + case 'D': + r = parse_path_argument(optarg, /* suppress_root= */ 
false, &arg_directory); + if (r < 0) + return r; + + arg_settings_mask |= SETTING_DIRECTORY; + break; + case 'i': r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_image); if (r < 0) @@ -174,57 +345,163 @@ static int parse_argv(int argc, char *argv[]) { arg_pager_flags |= PAGER_DISABLE; break; - case ARG_QEMU_SMP: - r = free_and_strdup_warn(&arg_qemu_smp, optarg); + case ARG_CPUS: + r = free_and_strdup_warn(&arg_cpus, optarg); if (r < 0) return r; break; - case ARG_QEMU_MEM: - r = parse_size(optarg, 1024, &arg_qemu_mem); + case ARG_RAM: + r = parse_size(optarg, 1024, &arg_ram); if (r < 0) - return log_error_errno(r, "Failed to parse --qemu-mem=%s: %m", optarg); + return log_error_errno(r, "Failed to parse --ram=%s: %m", optarg); break; - case ARG_QEMU_KVM: - r = parse_tristate(optarg, &arg_qemu_kvm); + case ARG_KVM: + r = parse_tristate(optarg, &arg_kvm); if (r < 0) - return log_error_errno(r, "Failed to parse --qemu-kvm=%s: %m", optarg); + return log_error_errno(r, "Failed to parse --kvm=%s: %m", optarg); break; - case ARG_QEMU_VSOCK: - r = parse_tristate(optarg, &arg_qemu_vsock); + case ARG_VSOCK: + r = parse_tristate(optarg, &arg_vsock); if (r < 0) - return log_error_errno(r, "Failed to parse --qemu-vsock=%s: %m", optarg); + return log_error_errno(r, "Failed to parse --vsock=%s: %m", optarg); break; - case ARG_VSOCK_CID: { - unsigned cid; + case ARG_VSOCK_CID: if (isempty(optarg)) - cid = VMADDR_CID_ANY; + arg_vsock_cid = VMADDR_CID_ANY; else { - r = safe_atou_bounded(optarg, 3, UINT_MAX - 1, &cid); - if (r == -ERANGE) - return log_error_errno(r, "Invalid value for --vsock-cid=: %m"); + unsigned cid; + + r = vsock_parse_cid(optarg, &cid); if (r < 0) - return log_error_errno(r, "Failed to parse --vsock-cid=%s: %m", optarg); + return log_error_errno(r, "Failed to parse --vsock-cid: %s", optarg); + if (!VSOCK_CID_IS_REGULAR(cid)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Specified CID is not regular, refusing: %u", cid); + + arg_vsock_cid = 
cid; } - arg_vsock_cid = (uint64_t)cid; + break; + + case ARG_TPM: + r = parse_tristate(optarg, &arg_tpm); + if (r < 0) + return log_error_errno(r, "Failed to parse --tpm=%s: %m", optarg); + break; + + case ARG_LINUX: + r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_linux); + if (r < 0) + return r; + break; + + case ARG_INITRD: { + _cleanup_free_ char *initrd_path = NULL; + r = parse_path_argument(optarg, /* suppress_root= */ false, &initrd_path); + if (r < 0) + return r; + + r = strv_consume(&arg_initrds, TAKE_PTR(initrd_path)); + if (r < 0) + return log_oom(); + break; } + case ARG_CONSOLE: + arg_console_mode = console_mode_from_string(optarg); + if (arg_console_mode < 0) + return log_error_errno(arg_console_mode, "Failed to parse specified console mode: %s", optarg); + + break; + case ARG_QEMU_GUI: - arg_qemu_gui = true; + arg_console_mode = CONSOLE_GUI; + break; + + case 'n': + arg_network_stack = NETWORK_STACK_TAP; break; + case ARG_NETWORK_USER_MODE: + arg_network_stack = NETWORK_STACK_USER; + break; + + case ARG_UUID: + r = id128_from_string_nonzero(optarg, &arg_uuid); + if (r == -ENXIO) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Machine UUID may not be all zeroes."); + if (r < 0) + return log_error_errno(r, "Invalid UUID: %s", optarg); + + arg_settings_mask |= SETTING_MACHINE_ID; + break; + + case ARG_REGISTER: + r = parse_boolean_argument("--register=", optarg, &arg_register); + if (r < 0) + return r; + break; + + case ARG_BIND: + case ARG_BIND_RO: + r = runtime_mount_parse(&arg_runtime_mounts, optarg, c == ARG_BIND_RO); + if (r < 0) + return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg); + + arg_settings_mask |= SETTING_BIND_MOUNTS; + break; + + case ARG_EXTRA_DRIVE: { + _cleanup_free_ char *drive_path = NULL; + + r = parse_path_argument(optarg, /* suppress_root= */ false, &drive_path); + if (r < 0) + return r; + + r = strv_consume(&arg_extra_drives, TAKE_PTR(drive_path)); + if (r < 0) + return 
log_oom(); + break; + } + case ARG_SECURE_BOOT: r = parse_tristate(optarg, &arg_secure_boot); if (r < 0) return log_error_errno(r, "Failed to parse --secure-boot=%s: %m", optarg); break; + case ARG_PRIVATE_USERS: + r = parse_userns_uid_range(optarg, &arg_uid_shift, &arg_uid_range); + if (r < 0) + return r; + break; + + case ARG_FORWARD_JOURNAL: + r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_forward_journal); + if (r < 0) + return r; + break; + + case ARG_PASS_SSH_KEY: + r = parse_boolean_argument("--pass-ssh-key=", optarg, &arg_pass_ssh_key); + if (r < 0) + return r; + break; + + case ARG_SSH_KEY_TYPE: + if (!string_is_safe(optarg)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid value for --arg-ssh-key-type=: %s", optarg); + + r = free_and_strdup_warn(&arg_ssh_key_type, optarg); + if (r < 0) + return r; + break; + case ARG_SET_CREDENTIAL: { - r = machine_credential_set(&arg_credentials, &arg_n_credentials, optarg); + r = machine_credential_set(&arg_credentials, optarg); if (r < 0) return r; arg_settings_mask |= SETTING_CREDENTIALS; @@ -232,7 +509,7 @@ static int parse_argv(int argc, char *argv[]) { } case ARG_LOAD_CREDENTIAL: { - r = machine_credential_load(&arg_credentials, &arg_n_credentials, optarg); + r = machine_credential_load(&arg_credentials, optarg); if (r < 0) return r; @@ -240,6 +517,43 @@ static int parse_argv(int argc, char *argv[]) { break; } + case ARG_FIRMWARE: + if (streq(optarg, "list")) { + _cleanup_strv_free_ char **l = NULL; + + r = list_ovmf_config(&l); + if (r < 0) + return log_error_errno(r, "Failed to list firmwares: %m"); + + bool nl = false; + fputstrv(stdout, l, "\n", &nl); + if (nl) + putchar('\n'); + + return 0; + } + + if (!isempty(optarg) && !path_is_absolute(optarg) && !startswith(optarg, "./")) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Absolute path or path starting with './' required."); + + r = parse_path_argument(optarg, /* suppress_root= */ false, &arg_firmware); + if (r < 0) + return r; 
+ + break; + + case ARG_DISCARD_DISK: + r = parse_boolean_argument("--discard-disk=", optarg, &arg_discard_disk); + if (r < 0) + return r; + break; + + case ARG_BACKGROUND: + r = free_and_strdup_warn(&arg_background, optarg); + if (r < 0) + return r; + break; + case '?': return -EINVAL; @@ -248,9 +562,8 @@ static int parse_argv(int argc, char *argv[]) { } if (argc > optind) { - strv_free(arg_parameters); - arg_parameters = strv_copy(argv + optind); - if (!arg_parameters) + arg_kernel_cmdline_extra = strv_copy(argv + optind); + if (!arg_kernel_cmdline_extra) return log_oom(); arg_settings_mask |= SETTING_START_MODE; @@ -274,11 +587,11 @@ static int open_vsock(void) { r = bind(vsock_fd, &bind_addr.sa, sizeof(bind_addr.vm)); if (r < 0) - return log_error_errno(errno, "Failed to bind to vsock to address %u:%u: %m", bind_addr.vm.svm_cid, bind_addr.vm.svm_port); + return log_error_errno(errno, "Failed to bind to VSOCK address %u:%u: %m", bind_addr.vm.svm_cid, bind_addr.vm.svm_port); r = listen(vsock_fd, SOMAXCONN_DELUXE); if (r < 0) - return log_error_errno(errno, "Failed to listen on vsock: %m"); + return log_error_errno(errno, "Failed to listen on VSOCK: %m"); return TAKE_FD(vsock_fd); } @@ -352,13 +665,13 @@ static int vmspawn_dispatch_vsock_connections(sd_event_source *source, int fd, u assert(userdata); if (revents != EPOLLIN) { - log_warning("Got unexpected poll event for vsock fd."); + log_warning("Got unexpected poll event for VSOCK fd."); return 0; } conn_fd = accept4(fd, NULL, NULL, SOCK_CLOEXEC|SOCK_NONBLOCK); if (conn_fd < 0) { - log_warning_errno(errno, "Failed to accept connection from vsock fd (%m), ignoring..."); + log_warning_errno(errno, "Failed to accept connection from VSOCK fd (%m), ignoring..."); return 0; } @@ -377,25 +690,84 @@ static int vmspawn_dispatch_vsock_connections(sd_event_source *source, int fd, u return 0; } -static int setup_notify_parent(sd_event *event, int fd, int *exit_status, sd_event_source **notify_event_source) { +static int 
setup_notify_parent(sd_event *event, int fd, int *exit_status, sd_event_source **ret_notify_event_source) { int r; - r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, vmspawn_dispatch_vsock_connections, exit_status); + assert(event); + assert(fd >= 0); + assert(exit_status); + assert(ret_notify_event_source); + + r = sd_event_add_io(event, ret_notify_event_source, fd, EPOLLIN, vmspawn_dispatch_vsock_connections, exit_status); if (r < 0) return log_error_errno(r, "Failed to allocate notify socket event source: %m"); - (void) sd_event_source_set_description(*notify_event_source, "vmspawn-notify-sock"); + (void) sd_event_source_set_description(*ret_notify_event_source, "vmspawn-notify-sock"); + + return 0; +} + +static int bus_open_in_machine(sd_bus **ret, unsigned cid, unsigned port, const char *private_key_path) { + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_free_ char *ssh_escaped = NULL, *bus_address = NULL; + char port_str[DECIMAL_STR_MAX(unsigned)], cid_str[DECIMAL_STR_MAX(unsigned)]; + int r; + + assert(ret); + assert(private_key_path); + + r = sd_bus_new(&bus); + if (r < 0) + return r; + + const char *ssh = secure_getenv("SYSTEMD_SSH") ?: "ssh"; + ssh_escaped = bus_address_escape(ssh); + if (!ssh_escaped) + return -ENOMEM; + + xsprintf(port_str, "%u", port); + xsprintf(cid_str, "%u", cid); + + bus_address = strjoin( + "unixexec:path=", ssh_escaped, + /* -x: Disable X11 forwarding + * -T: Disable PTY allocation */ + ",argv1=-xT", + ",argv2=-o,argv3=IdentitiesOnly yes", + ",argv4=-o,argv5=IdentityFile=", private_key_path, + ",argv6=-p,argv7=", port_str, + ",argv8=--", + ",argv9=root@vsock/", cid_str, + ",argv10=systemd-stdio-bridge" + ); + if (!bus_address) + return -ENOMEM; + free_and_replace(bus->address, bus_address); + bus->bus_client = true; + bus->trusted = true; + bus->runtime_scope = RUNTIME_SCOPE_SYSTEM; + bus->is_local = false; + + r = sd_bus_start(bus); + if (r < 0) + return r; + + *ret = TAKE_PTR(bus); return 0; } 
static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) { - pid_t pid; + PidRef *pidref = userdata; + int r; - pid = PTR_TO_PID(userdata); - if (pid > 0) { - /* TODO: actually talk to qemu and ask the guest to shutdown here */ - if (kill(pid, SIGKILL) >= 0) { + /* Backup method to shut down the VM when D-BUS access over SSH is not available */ + + if (pidref) { + r = pidref_kill(pidref, SIGKILL); + if (r < 0) + log_warning_errno(r, "Failed to kill qemu, terminating: %m"); + else { log_info("Trying to halt qemu. Send SIGTERM again to trigger vmspawn to immediately terminate."); sd_event_source_set_userdata(s, NULL); return 0; @@ -406,6 +778,61 @@ static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo return 0; } +static int forward_signal_to_vm_pid1(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) { + _cleanup_(bus_wait_for_jobs_freep) BusWaitForJobs *w = NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_bus_message_unrefp) sd_bus_message *reply = NULL; + SSHInfo *ssh_info = ASSERT_PTR(userdata); + const char *vm_pid1; + int r; + + assert(s); + assert(si); + + r = bus_open_in_machine(&bus, ssh_info->cid, ssh_info->port, ssh_info->private_key_path); + if (r < 0) + return log_error_errno(r, "Failed to connect to VM to forward signal: %m"); + + r = bus_wait_for_jobs_new(bus, &w); + if (r < 0) + return log_error_errno(r, "Could not watch job: %m"); + + r = bus_call_method( + bus, + bus_systemd_mgr, + "GetUnitByPID", + &error, + NULL, + ""); + if (r < 0) + return log_error_errno(r, "Failed to get init process of VM: %s", bus_error_message(&error, r)); + + r = sd_bus_message_read(reply, "o", &vm_pid1); + if (r < 0) + return bus_log_parse_error(r); + + r = bus_wait_for_jobs_one(w, vm_pid1, /* quiet */ false, NULL); + if (r < 0) + return r; + + r = bus_call_method( + bus, + 
bus_systemd_mgr, + "KillUnit", + &error, + NULL, + "ssi", + vm_pid1, + "leader", + si->ssi_signo); + if (r < 0) + return log_error_errno(r, "Failed to forward signal to PID 1 of the VM: %s", bus_error_message(&error, r)); + log_info("Sent signal %"PRIu32" to the VM's PID 1.", si->ssi_signo); + + return 0; +} + static int on_child_exit(sd_event_source *s, const siginfo_t *si, void *userdata) { sd_event_exit(sd_event_source_get_event(s), 0); return 0; @@ -426,7 +853,6 @@ static int cmdline_add_vsock(char ***cmdline, int vsock_fd) { assert(addr_len >= sizeof addr.vm); assert(addr.vm.svm_family == AF_VSOCK); - log_info("Using vsock-stream:%u:%u", (unsigned) VMADDR_CID_HOST, addr.vm.svm_port); r = strv_extendf(cmdline, "type=11,value=io.systemd.credential:vmm.notify_socket=vsock-stream:%u:%u", (unsigned) VMADDR_CID_HOST, addr.vm.svm_port); if (r < 0) return r; @@ -434,22 +860,497 @@ static int cmdline_add_vsock(char ***cmdline, int vsock_fd) { return 0; } -static int run_virtual_machine(void) { +static int start_tpm( + sd_bus *bus, + const char *scope, + const char *swtpm, + char **ret_state_tempdir) { + + _cleanup_(rm_rf_physical_and_freep) char *state_dir = NULL; + _cleanup_free_ char *scope_prefix = NULL; + _cleanup_(socket_service_pair_done) SocketServicePair ssp = { + .socket_type = SOCK_STREAM, + }; + int r; + + assert(bus); + assert(scope); + assert(swtpm); + assert(ret_state_tempdir); + + r = unit_name_to_prefix(scope, &scope_prefix); + if (r < 0) + return log_error_errno(r, "Failed to strip .scope suffix from scope: %m"); + + ssp.unit_name_prefix = strjoin(scope_prefix, "-tpm"); + if (!ssp.unit_name_prefix) + return log_oom(); + + state_dir = path_join(arg_runtime_directory, ssp.unit_name_prefix); + if (!state_dir) + return log_oom(); + + if (arg_runtime_directory_created) { + ssp.runtime_directory = path_join("systemd/vmspawn", ssp.unit_name_prefix); + if (!ssp.runtime_directory) + return log_oom(); + } + + ssp.listen_address = path_join(state_dir, "sock"); + 
if (!ssp.listen_address) + return log_oom(); + + _cleanup_free_ char *swtpm_setup = NULL; + r = find_executable("swtpm_setup", &swtpm_setup); + if (r < 0) + return log_error_errno(r, "Failed to find swtpm_setup binary: %m"); + + ssp.exec_start_pre = strv_new(swtpm_setup, "--tpm-state", state_dir, "--tpm2", "--pcr-banks", "sha256"); + if (!ssp.exec_start_pre) + return log_oom(); + + ssp.exec_start = strv_new(swtpm, "socket", "--tpm2", "--tpmstate"); + if (!ssp.exec_start) + return log_oom(); + + r = strv_extendf(&ssp.exec_start, "dir=%s", state_dir); + if (r < 0) + return log_oom(); + + r = strv_extend_many(&ssp.exec_start, "--ctrl", "type=unixio,fd=3"); + if (r < 0) + return log_oom(); + + r = start_socket_service_pair(bus, scope, &ssp); + if (r < 0) + return r; + + *ret_state_tempdir = TAKE_PTR(state_dir); + return 0; +} + +static int start_systemd_journal_remote(sd_bus *bus, const char *scope, unsigned port, const char *sd_journal_remote, char **ret_listen_address) { + _cleanup_free_ char *scope_prefix = NULL; + _cleanup_(socket_service_pair_done) SocketServicePair ssp = { + .socket_type = SOCK_STREAM, + }; + int r; + + assert(bus); + assert(scope); + assert(sd_journal_remote); + + r = unit_name_to_prefix(scope, &scope_prefix); + if (r < 0) + return log_error_errno(r, "Failed to strip .scope suffix from scope: %m"); + + ssp.unit_name_prefix = strjoin(scope_prefix, "-forward-journal"); + if (!ssp.unit_name_prefix) + return log_oom(); + + r = asprintf(&ssp.listen_address, "vsock:2:%u", port); + if (r < 0) + return log_oom(); + + ssp.exec_start = strv_new( + sd_journal_remote, + "--output", arg_forward_journal, + "--split-mode", endswith(arg_forward_journal, ".journal") ? 
"none" : "host"); + if (!ssp.exec_start) + return log_oom(); + + r = start_socket_service_pair(bus, scope, &ssp); + if (r < 0) + return r; + + if (ret_listen_address) + *ret_listen_address = TAKE_PTR(ssp.listen_address); + + return 0; +} + +static int discover_root(char **ret) { + int r; + _cleanup_(dissected_image_unrefp) DissectedImage *image = NULL; + _cleanup_free_ char *root = NULL; + + assert(ret); + + r = dissect_image_file_and_warn( + arg_image, + /* verity= */ NULL, + /* mount_options= */ NULL, + /* image_policy= */ NULL, + /* flags= */ 0, + &image); + if (r < 0) + return r; + + if (image->partitions[PARTITION_ROOT].found) + root = strjoin("root=PARTUUID=", SD_ID128_TO_UUID_STRING(image->partitions[PARTITION_ROOT].uuid)); + else if (image->partitions[PARTITION_USR].found) + root = strjoin("mount.usr=PARTUUID=", SD_ID128_TO_UUID_STRING(image->partitions[PARTITION_USR].uuid)); + else + return log_error_errno(SYNTHETIC_ERRNO(ENOENT), "Cannot perform a direct kernel boot without a root or usr partition, refusing"); + + if (!root) + return log_oom(); + + *ret = TAKE_PTR(root); + return 0; +} + +static int find_virtiofsd(char **ret) { + int r; + _cleanup_free_ char *virtiofsd = NULL; + + assert(ret); + + r = find_executable("virtiofsd", &virtiofsd); + if (r < 0 && r != -ENOENT) + return log_error_errno(r, "Error while searching for virtiofsd: %m"); + + if (!virtiofsd) { + FOREACH_STRING(file, "/usr/libexec/virtiofsd", "/usr/lib/virtiofsd") { + if (access(file, X_OK) >= 0) { + virtiofsd = strdup(file); + if (!virtiofsd) + return log_oom(); + break; + } + + if (!IN_SET(errno, ENOENT, EACCES)) + return log_error_errno(errno, "Error while searching for virtiofsd: %m"); + } + } + + if (!virtiofsd) + return log_error_errno(SYNTHETIC_ERRNO(ENOENT), "Failed to find virtiofsd binary."); + + *ret = TAKE_PTR(virtiofsd); + return 0; +} + +static int start_virtiofsd(sd_bus *bus, const char *scope, const char *directory, bool uidmap, char **ret_state_tempdir, char 
**ret_sock_name) { + _cleanup_(rm_rf_physical_and_freep) char *state_dir = NULL; + _cleanup_free_ char *virtiofsd = NULL, *sock_name = NULL, *scope_prefix = NULL; + _cleanup_(socket_service_pair_done) SocketServicePair ssp = { + .socket_type = SOCK_STREAM, + }; + static unsigned virtiofsd_instance = 0; + int r; + + assert(bus); + assert(scope); + assert(directory); + assert(ret_state_tempdir); + assert(ret_sock_name); + + r = find_virtiofsd(&virtiofsd); + if (r < 0) + return r; + + r = unit_name_to_prefix(scope, &scope_prefix); + if (r < 0) + return log_error_errno(r, "Failed to strip .scope suffix from scope: %m"); + + if (asprintf(&ssp.unit_name_prefix, "%s-virtiofsd-%u", scope_prefix, virtiofsd_instance++) < 0) + return log_oom(); + + state_dir = path_join(arg_runtime_directory, ssp.unit_name_prefix); + if (!state_dir) + return log_oom(); + + if (arg_runtime_directory_created) { + ssp.runtime_directory = strjoin("systemd/vmspawn/", ssp.unit_name_prefix); + if (!ssp.runtime_directory) + return log_oom(); + } + + if (asprintf(&sock_name, "sock-%"PRIx64, random_u64()) < 0) + return log_oom(); + + ssp.listen_address = path_join(state_dir, sock_name); + if (!ssp.listen_address) + return log_oom(); + + /* QEMU doesn't support submounts so don't announce them */ + ssp.exec_start = strv_new(virtiofsd, "--shared-dir", directory, "--xattr", "--fd", "3", "--no-announce-submounts"); + if (!ssp.exec_start) + return log_oom(); + + if (uidmap && arg_uid_shift != UID_INVALID) { + r = strv_extend(&ssp.exec_start, "--uid-map"); + if (r < 0) + return log_oom(); + + r = strv_extendf(&ssp.exec_start, ":0:" UID_FMT ":" UID_FMT ":", arg_uid_shift, arg_uid_range); + if (r < 0) + return log_oom(); + + r = strv_extend(&ssp.exec_start, "--gid-map"); + if (r < 0) + return log_oom(); + + r = strv_extendf(&ssp.exec_start, ":0:" GID_FMT ":" GID_FMT ":", arg_uid_shift, arg_uid_range); + if (r < 0) + return log_oom(); + } + + r = start_socket_service_pair(bus, scope, &ssp); + if (r < 0) + 
return r; + + *ret_state_tempdir = TAKE_PTR(state_dir); + *ret_sock_name = TAKE_PTR(sock_name); + + return 0; +} + +static int kernel_cmdline_maybe_append_root(void) { + int r; + bool cmdline_contains_root = strv_find_startswith(arg_kernel_cmdline_extra, "root=") + || strv_find_startswith(arg_kernel_cmdline_extra, "mount.usr="); + + if (!cmdline_contains_root) { + _cleanup_free_ char *root = NULL; + + r = discover_root(&root); + if (r < 0) + return r; + + log_debug("Determined root file system %s from dissected image", root); + + r = strv_consume(&arg_kernel_cmdline_extra, TAKE_PTR(root)); + if (r < 0) + return log_oom(); + } + + return 0; +} + +static int discover_boot_entry(const char *root, char **ret_linux, char ***ret_initrds) { + _cleanup_(boot_config_free) BootConfig config = BOOT_CONFIG_NULL; + _cleanup_free_ char *esp_path = NULL, *xbootldr_path = NULL; + int r; + + assert(root); + assert(ret_linux); + assert(ret_initrds); + + esp_path = path_join(root, "efi"); + if (!esp_path) + return log_oom(); + + xbootldr_path = path_join(root, "boot"); + if (!xbootldr_path) + return log_oom(); + + r = boot_config_load(&config, esp_path, xbootldr_path); + if (r < 0) + return r; + + r = boot_config_select_special_entries(&config, /* skip_efivars= */ true); + if (r < 0) + return log_error_errno(r, "Failed to find special boot config entries: %m"); + + const BootEntry *boot_entry = boot_config_default_entry(&config); + + if (boot_entry && !IN_SET(boot_entry->type, BOOT_ENTRY_UNIFIED, BOOT_ENTRY_CONF)) + boot_entry = NULL; + + /* If we cannot determine a default entry search for UKIs (Type #2 EFI Unified Kernel Images) + * then .conf files (Type #1 Boot Loader Specification Entries). 
+ * https://uapi-group.org/specifications/specs/boot_loader_specification */ + if (!boot_entry) + FOREACH_ARRAY(entry, config.entries, config.n_entries) + if (entry->type == BOOT_ENTRY_UNIFIED) { + boot_entry = entry; + break; + } + + if (!boot_entry) + FOREACH_ARRAY(entry, config.entries, config.n_entries) + if (entry->type == BOOT_ENTRY_CONF) { + boot_entry = entry; + break; + } + + if (!boot_entry) + return log_error_errno(SYNTHETIC_ERRNO(ENOENT), "Failed to discover any boot entries."); + + log_debug("Discovered boot entry %s (%s)", boot_entry->id, boot_entry_type_to_string(boot_entry->type)); + + _cleanup_free_ char *linux_kernel = NULL; + _cleanup_strv_free_ char **initrds = NULL; + if (boot_entry->type == BOOT_ENTRY_UNIFIED) { + linux_kernel = path_join(boot_entry->root, boot_entry->kernel); + if (!linux_kernel) + return log_oom(); + } else if (boot_entry->type == BOOT_ENTRY_CONF) { + linux_kernel = path_join(boot_entry->root, boot_entry->kernel); + if (!linux_kernel) + return log_oom(); + + STRV_FOREACH(initrd, boot_entry->initrd) { + _cleanup_free_ char *initrd_path = path_join(boot_entry->root, *initrd); + if (!initrd_path) + return log_oom(); + + r = strv_consume(&initrds, TAKE_PTR(initrd_path)); + if (r < 0) + return log_oom(); + } + } else + assert_not_reached(); + + *ret_linux = TAKE_PTR(linux_kernel); + *ret_initrds = TAKE_PTR(initrds); + + return 0; +} + +static int merge_initrds(char **ret) { + _cleanup_(rm_rf_physical_and_freep) char *merged_initrd = NULL; + _cleanup_close_ int ofd = -EBADF; + int r; + + assert(ret); + + r = tempfn_random_child(NULL, "vmspawn-initrd-", &merged_initrd); + if (r < 0) + return log_error_errno(r, "Failed to create temporary file: %m"); + + ofd = open(merged_initrd, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC, 0600); + if (ofd < 0) + return log_error_errno(errno, "Failed to create regular file %s: %m", merged_initrd); + + STRV_FOREACH(i, arg_initrds) { + _cleanup_close_ int ifd = -EBADF; + off_t off, to_seek; + + off = 
lseek(ofd, 0, SEEK_CUR); + if (off < 0) + return log_error_errno(errno, "Failed to get file offset of %s: %m", merged_initrd); + + to_seek = (4 - (off % 4)) % 4; + + /* seek to assure 4 byte alignment for each initrd */ + if (to_seek != 0 && lseek(ofd, to_seek, SEEK_CUR) < 0) + return log_error_errno(errno, "Failed to seek %s: %m", merged_initrd); + + ifd = open(*i, O_RDONLY|O_CLOEXEC); + if (ifd < 0) + return log_error_errno(errno, "Failed to open %s: %m", *i); + + r = copy_bytes(ifd, ofd, UINT64_MAX, COPY_REFLINK); + if (r < 0) + return log_error_errno(r, "Failed to copy bytes from %s to %s: %m", *i, merged_initrd); + } + + *ret = TAKE_PTR(merged_initrd); + return 0; +} + +static void set_window_title(PTYForward *f) { + _cleanup_free_ char *hn = NULL, *dot = NULL; + + assert(f); + + (void) gethostname_strict(&hn); + + if (emoji_enabled()) + dot = strjoin(special_glyph(SPECIAL_GLYPH_GREEN_CIRCLE), " "); + + if (hn) + (void) pty_forward_set_titlef(f, "%sVirtual Machine %s on %s", strempty(dot), arg_machine, hn); + else + (void) pty_forward_set_titlef(f, "%sVirtual Machine %s", strempty(dot), arg_machine); + + if (dot) + (void) pty_forward_set_title_prefix(f, dot); +} + +static int generate_ssh_keypair(const char *key_path, const char *key_type) { + _cleanup_free_ char *ssh_keygen = NULL; + _cleanup_strv_free_ char **cmdline = NULL; + int r; + + assert(key_path); + + r = find_executable("ssh-keygen", &ssh_keygen); + if (r < 0) + return log_error_errno(r, "Failed to find ssh-keygen: %m"); + + cmdline = strv_new(ssh_keygen, "-f", key_path, /* don't encrypt the key */ "-N", ""); + if (!cmdline) + return log_oom(); + + if (key_type) { + r = strv_extend_many(&cmdline, "-t", key_type); + if (r < 0) + return log_oom(); + } + + if (DEBUG_LOGGING) { + _cleanup_free_ char *joined = quote_command_line(cmdline, SHELL_ESCAPE_EMPTY); + if (!joined) + return log_oom(); + + log_debug("Executing: %s", joined); + } + + r = safe_fork( + ssh_keygen, + 
FORK_WAIT|FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGTERM|FORK_LOG|FORK_RLIMIT_NOFILE_SAFE|FORK_REARRANGE_STDIO, + NULL); + if (r < 0) + return r; + if (r == 0) { + execv(ssh_keygen, cmdline); + log_error_errno(errno, "Failed to execve %s: %m", ssh_keygen); + _exit(EXIT_FAILURE); + } + + return 0; +} + +static int run_virtual_machine(int kvm_device_fd, int vhost_device_fd) { + SSHInfo ssh_info; /* Used when talking to pid1 via SSH, but must survive until the function ends. */ _cleanup_(ovmf_config_freep) OvmfConfig *ovmf_config = NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_free_ char *machine = NULL, *qemu_binary = NULL, *mem = NULL, *trans_scope = NULL, *kernel = NULL; + _cleanup_(rm_rf_physical_and_freep) char *ssh_private_key_path = NULL, *ssh_public_key_path = NULL; + _cleanup_close_ int notify_sock_fd = -EBADF; _cleanup_strv_free_ char **cmdline = NULL; - _cleanup_free_ char *machine = NULL, *qemu_binary = NULL, *mem = NULL; + _cleanup_free_ int *pass_fds = NULL; + size_t n_pass_fds = 0; + const char *accel, *shm; int r; - _cleanup_close_ int vsock_fd = -EBADF; - bool use_kvm = arg_qemu_kvm > 0; - if (arg_qemu_kvm < 0) { + if (arg_privileged) + r = sd_bus_default_system(&bus); + else + r = sd_bus_default_user(&bus); + if (r < 0) + return log_error_errno(r, "Failed to connect to systemd bus: %m"); + + r = start_transient_scope(bus, arg_machine, /* allow_pidfd= */ true, &trans_scope); + if (r < 0) + return r; + + bool use_kvm = arg_kvm > 0; + if (arg_kvm < 0) { r = qemu_check_kvm_support(); if (r < 0) return log_error_errno(r, "Failed to check for KVM support: %m"); use_kvm = r; } - r = find_ovmf_config(arg_secure_boot, &ovmf_config); + if (arg_firmware) + r = load_ovmf_config(arg_firmware, &ovmf_config); + else + r = find_ovmf_config(arg_secure_boot, &ovmf_config); if (r < 0) return log_error_errno(r, "Failed to find OVMF config: %m"); @@ -458,114 +1359,279 @@ static int run_virtual_machine(void) { 
log_warning("Couldn't find OVMF firmware blob with Secure Boot support, " "falling back to OVMF firmware blobs without Secure Boot support."); - const char *accel = use_kvm ? "kvm" : "tcg"; - if (IN_SET(native_architecture(), ARCHITECTURE_ARM64, ARCHITECTURE_ARM64_BE)) - machine = strjoin("type=virt,accel=", accel); + shm = arg_directory || arg_runtime_mounts.n_mounts != 0 ? ",memory-backend=mem" : ""; + if (ARCHITECTURE_SUPPORTS_SMM) + machine = strjoin("type=" QEMU_MACHINE_TYPE ",smm=", on_off(ovmf_config->supports_sb), shm); else - machine = strjoin("type=q35,accel=", accel, ",smm=", on_off(ovmf_config->supports_sb)); + machine = strjoin("type=" QEMU_MACHINE_TYPE, shm); if (!machine) return log_oom(); + if (arg_linux) { + kernel = strdup(arg_linux); + if (!kernel) + return log_oom(); + } else if (arg_directory) { + /* a kernel is required for directory type images so attempt to locate a UKI under /boot and /efi */ + r = discover_boot_entry(arg_directory, &kernel, &arg_initrds); + if (r < 0) + return log_error_errno(r, "Failed to locate UKI in directory type image, please specify one with --linux=."); + + log_debug("Discovered UKI image at %s", kernel); + } + r = find_qemu_binary(&qemu_binary); if (r == -EOPNOTSUPP) return log_error_errno(r, "Native architecture is not supported by qemu."); if (r < 0) return log_error_errno(r, "Failed to find QEMU binary: %m"); - if (asprintf(&mem, "%.4fM", (double)arg_qemu_mem / (1024.0 * 1024.0)) < 0) + if (asprintf(&mem, "%" PRIu64 "M", DIV_ROUND_UP(arg_ram, U64_MB)) < 0) return log_oom(); cmdline = strv_new( qemu_binary, "-machine", machine, - "-smp", arg_qemu_smp ?: "1", + "-smp", arg_cpus ?: "1", "-m", mem, "-object", "rng-random,filename=/dev/urandom,id=rng0", "-device", "virtio-rng-pci,rng=rng0,id=rng-device0", - "-nic", "user,model=virtio-net-pci" + "-device", "virtio-balloon,free-page-reporting=on" ); if (!cmdline) return log_oom(); - bool use_vsock = arg_qemu_vsock > 0 && ARCHITECTURE_SUPPORTS_SMBIOS; - if 
(arg_qemu_vsock < 0) { - r = qemu_check_vsock_support(); + if (!sd_id128_is_null(arg_uuid)) + if (strv_extend_many(&cmdline, "-uuid", SD_ID128_TO_UUID_STRING(arg_uuid)) < 0) + return log_oom(); + + /* Derive a vmgenid automatically from the invocation ID, in a deterministic way. */ + sd_id128_t vmgenid; + r = sd_id128_get_invocation_app_specific(SD_ID128_MAKE(bd,84,6d,e3,e4,7d,4b,6c,a6,85,4a,87,0f,3c,a3,a0), &vmgenid); + if (r < 0) { + log_debug_errno(r, "Failed to get invocation ID, making up randomized vmgenid: %m"); + + r = sd_id128_randomize(&vmgenid); if (r < 0) - return log_error_errno(r, "Failed to check for VSock support: %m"); + return log_error_errno(r, "Failed to make up randomized vmgenid: %m"); + } - use_vsock = r; + _cleanup_free_ char *vmgenid_device = NULL; + if (asprintf(&vmgenid_device, "vmgenid,guid=" SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(vmgenid)) < 0) + return log_oom(); + + if (strv_extend_many(&cmdline, "-device", vmgenid_device) < 0) + return log_oom(); + + /* if we are going to be starting any units with state then create our runtime dir */ + if (arg_tpm != 0 || arg_directory || arg_runtime_mounts.n_mounts != 0) { + r = runtime_directory(&arg_runtime_directory, arg_privileged ? 
RUNTIME_SCOPE_SYSTEM : RUNTIME_SCOPE_USER, "systemd/vmspawn"); + if (r < 0) + return log_error_errno(r, "Failed to lookup runtime directory: %m"); + if (r) { + /* r > 0 means we need to create our own runtime dir */ + r = mkdir_p(arg_runtime_directory, 0755); + if (r < 0) + return log_error_errno(r, "Failed to create runtime directory: %m"); + arg_runtime_directory_created = true; + } } - unsigned child_cid = VMADDR_CID_ANY; - _cleanup_close_ int child_vsock_fd = -EBADF; - if (use_vsock) { - if (arg_vsock_cid < UINT_MAX) - child_cid = (unsigned)arg_vsock_cid; + if (arg_network_stack == NETWORK_STACK_TAP) { + _cleanup_free_ char *tap_name = NULL; + struct ether_addr mac_vm = {}; + + tap_name = strjoin("tp-", arg_machine); + if (!tap_name) + return log_oom(); + + (void) net_shorten_ifname(tap_name, /* check_naming_scheme= */ false); + + if (ether_addr_is_null(&arg_network_provided_mac)){ + r = net_generate_mac(arg_machine, &mac_vm, VM_TAP_HASH_KEY, 0); + if (r < 0) + return log_error_errno(r, "Failed to generate predictable MAC address for VM side: %m"); + } else + mac_vm = arg_network_provided_mac; - r = vsock_fix_child_cid(&child_cid, arg_machine, &child_vsock_fd); + r = strv_extend(&cmdline, "-nic"); if (r < 0) - return log_error_errno(r, "Failed to fix CID for the guest vsock socket: %m"); + return log_oom(); - r = strv_extend(&cmdline, "-device"); + r = strv_extendf(&cmdline, "tap,ifname=%s,script=no,model=virtio-net-pci,mac=%s", tap_name, ETHER_ADDR_TO_STR(&mac_vm)); if (r < 0) return log_oom(); + } else if (arg_network_stack == NETWORK_STACK_USER) + r = strv_extend_many(&cmdline, "-nic", "user,model=virtio-net-pci"); + else + r = strv_extend_many(&cmdline, "-nic", "none"); + if (r < 0) + return log_oom(); - log_debug("vhost-vsock-pci,guest-cid=%u,vhostfd=%d", child_cid, child_vsock_fd); - r = strv_extendf(&cmdline, "vhost-vsock-pci,guest-cid=%u,vhostfd=%d", child_cid, child_vsock_fd); + /* A shared memory backend might increase ram usage so only add one if 
actually necessary for virtiofsd. */ + if (arg_directory || arg_runtime_mounts.n_mounts != 0) { + r = strv_extend(&cmdline, "-object"); if (r < 0) return log_oom(); + + r = strv_extendf(&cmdline, "memory-backend-memfd,id=mem,size=%s,share=on", mem); + if (r < 0) + return log_oom(); + } + + bool use_vsock = arg_vsock > 0 && ARCHITECTURE_SUPPORTS_SMBIOS; + if (arg_vsock < 0) { + r = qemu_check_vsock_support(); + if (r < 0) + return log_error_errno(r, "Failed to check for VSOCK support: %m"); + + use_vsock = r; } - r = strv_extend_strv(&cmdline, STRV_MAKE("-cpu", "max"), /* filter_duplicates= */ false); + if (!use_kvm && kvm_device_fd >= 0) { + log_warning("KVM is disabled but fd for /dev/kvm was passed, closing fd and ignoring"); + kvm_device_fd = safe_close(kvm_device_fd); + } + + if (use_kvm && kvm_device_fd >= 0) { + /* /dev/fdset/1 is magic string to tell qemu where to find the fd for /dev/kvm + * we use this so that we can take a fd to /dev/kvm and then give qemu that fd */ + accel = "kvm,device=/dev/fdset/1"; + + r = strv_extend(&cmdline, "--add-fd"); + if (r < 0) + return log_oom(); + + r = strv_extendf(&cmdline, "fd=%d,set=1,opaque=/dev/kvm", kvm_device_fd); + if (r < 0) + return log_oom(); + + if (!GREEDY_REALLOC(pass_fds, n_pass_fds + 1)) + return log_oom(); + + pass_fds[n_pass_fds++] = kvm_device_fd; + } else if (use_kvm) + accel = "kvm"; + else + accel = "tcg"; + + r = strv_extend_many(&cmdline, "-accel", accel); if (r < 0) return log_oom(); - if (arg_qemu_gui) { - r = strv_extend_strv(&cmdline, STRV_MAKE("-vga", "virtio"), /* filter_duplicates= */ false); + _cleanup_close_ int child_vsock_fd = -EBADF; + unsigned child_cid = arg_vsock_cid; + if (use_vsock) { + int device_fd = vhost_device_fd; + + if (device_fd < 0) { + child_vsock_fd = open("/dev/vhost-vsock", O_RDWR|O_CLOEXEC); + if (child_vsock_fd < 0) + return log_error_errno(errno, "Failed to open /dev/vhost-vsock as read/write: %m"); + + device_fd = child_vsock_fd; + } + + r = 
vsock_fix_child_cid(device_fd, &child_cid, arg_machine); + if (r < 0) + return log_error_errno(r, "Failed to fix CID for the guest VSOCK socket: %m"); + + r = strv_extend(&cmdline, "-device"); if (r < 0) return log_oom(); - } else { - r = strv_extend_strv(&cmdline, STRV_MAKE( - "-nographic", - "-nodefaults", - "-chardev", "stdio,mux=on,id=console,signal=off", - "-serial", "chardev:console", - "-mon", "console" - ), /* filter_duplicates= */ false); + + r = strv_extendf(&cmdline, "vhost-vsock-pci,guest-cid=%u,vhostfd=%d", child_cid, device_fd); if (r < 0) return log_oom(); + + if (!GREEDY_REALLOC(pass_fds, n_pass_fds + 1)) + return log_oom(); + + pass_fds[n_pass_fds++] = device_fd; } - if (ARCHITECTURE_SUPPORTS_SMBIOS) { - ssize_t n; - FOREACH_ARRAY(cred, arg_credentials, arg_n_credentials) { - _cleanup_free_ char *cred_data_b64 = NULL; + r = strv_extend_many(&cmdline, "-cpu", +#ifdef __x86_64__ + "max,hv_relaxed,hv-vapic,hv-time" +#else + "max" +#endif + ); + if (r < 0) + return log_oom(); - n = base64mem(cred->data, cred->size, &cred_data_b64); - if (n < 0) - return log_oom(); + _cleanup_close_ int master = -EBADF; + PTYForwardFlags ptyfwd_flags = 0; + switch (arg_console_mode) { - r = strv_extend(&cmdline, "-smbios"); - if (r < 0) - return log_oom(); + case CONSOLE_READ_ONLY: + ptyfwd_flags |= PTY_FORWARD_READ_ONLY; - r = strv_extendf(&cmdline, "type=11,value=io.systemd.credential.binary:%s=%s", cred->id, cred_data_b64); - if (r < 0) - return log_oom(); - } + _fallthrough_; + + case CONSOLE_INTERACTIVE: { + _cleanup_free_ char *pty_path = NULL; + + master = openpt_allocate(O_RDWR|O_NONBLOCK, &pty_path); + if (master < 0) + return log_error_errno(master, "Failed to setup pty: %m"); + + if (strv_extend_many( + &cmdline, + "-nographic", + "-nodefaults", + "-chardev") < 0) + return log_oom(); + + if (strv_extendf(&cmdline, + "serial,id=console,path=%s", pty_path) < 0) + return log_oom(); + + r = strv_extend_many( + &cmdline, + "-serial", "chardev:console"); + break; + 
} + + case CONSOLE_GUI: + r = strv_extend_many( + &cmdline, + "-vga", + "virtio"); + break; + + case CONSOLE_NATIVE: + r = strv_extend_many( + &cmdline, + "-nographic", + "-nodefaults", + "-chardev", "stdio,mux=on,id=console,signal=off", + "-serial", "chardev:console", + "-mon", "console"); + break; + + default: + assert_not_reached(); } + if (r < 0) + return log_oom(); r = strv_extend(&cmdline, "-drive"); if (r < 0) return log_oom(); - r = strv_extendf(&cmdline, "if=pflash,format=raw,readonly=on,file=%s", ovmf_config->path); + _cleanup_free_ char *escaped_ovmf_config_path = escape_qemu_value(ovmf_config->path); + if (!escaped_ovmf_config_path) + return log_oom(); + + r = strv_extendf(&cmdline, "if=pflash,format=%s,readonly=on,file=%s", ovmf_config_format(ovmf_config), escaped_ovmf_config_path); if (r < 0) return log_oom(); _cleanup_(unlink_and_freep) char *ovmf_vars_to = NULL; if (ovmf_config->supports_sb) { const char *ovmf_vars_from = ovmf_config->vars; + _cleanup_free_ char *escaped_ovmf_vars_to = NULL; _cleanup_close_ int source_fd = -EBADF, target_fd = -EBADF; r = tempfn_random_child(NULL, "vmspawn-", &ovmf_vars_to); @@ -589,63 +1655,413 @@ static int run_virtual_machine(void) { (void) copy_access(source_fd, target_fd); (void) copy_times(source_fd, target_fd, 0); - r = strv_extend_strv(&cmdline, STRV_MAKE( - "-global", "ICH9-LPC.disable_s3=1", - "-global", "driver=cfi.pflash01,property=secure,value=on", - "-drive" - ), /* filter_duplicates= */ false); + r = strv_extend_many( + &cmdline, + "-global", "ICH9-LPC.disable_s3=1", + "-global", "driver=cfi.pflash01,property=secure,value=on", + "-drive"); if (r < 0) return log_oom(); - r = strv_extendf(&cmdline, "file=%s,if=pflash,format=raw", ovmf_vars_to); + escaped_ovmf_vars_to = escape_qemu_value(ovmf_vars_to); + if (!escaped_ovmf_vars_to) + return log_oom(); + + r = strv_extendf(&cmdline, "file=%s,if=pflash,format=%s", escaped_ovmf_vars_to, ovmf_config_format(ovmf_config)); if (r < 0) return log_oom(); } - r = 
strv_extend(&cmdline, "-drive"); - if (r < 0) - return log_oom(); + STRV_FOREACH(drive, arg_extra_drives) { + _cleanup_free_ char *escaped_drive = NULL; - r = strv_extendf(&cmdline, "if=none,id=mkosi,file=%s,format=raw", arg_image); - if (r < 0) - return log_oom(); + r = strv_extend(&cmdline, "-drive"); + if (r < 0) + return log_oom(); - r = strv_extend_strv(&cmdline, STRV_MAKE( - "-device", "virtio-scsi-pci,id=scsi", - "-device", "scsi-hd,drive=mkosi,bootindex=1" - ), /* filter_duplicates= */ false); + escaped_drive = escape_qemu_value(*drive); + if (!escaped_drive) + return log_oom(); + + r = strv_extendf(&cmdline, "format=raw,cache=unsafe,file=%s", escaped_drive); + if (r < 0) + return log_oom(); + } + + if (kernel) { + r = strv_extend_many(&cmdline, "-kernel", kernel); + if (r < 0) + return log_oom(); + + /* We can't rely on gpt-auto-generator when direct kernel booting so synthesize a root= + * kernel argument instead. */ + if (arg_image) { + r = kernel_cmdline_maybe_append_root(); + if (r < 0) + return r; + } + } + + if (arg_image) { + _cleanup_free_ char *escaped_image = NULL; + + assert(!arg_directory); + + r = strv_extend(&cmdline, "-drive"); + if (r < 0) + return log_oom(); + + escaped_image = escape_qemu_value(arg_image); + if (!escaped_image) + log_oom(); + + r = strv_extendf(&cmdline, "if=none,id=mkosi,file=%s,format=raw,discard=%s", escaped_image, on_off(arg_discard_disk)); + if (r < 0) + return log_oom(); + + r = strv_extend_many(&cmdline, + "-device", "virtio-scsi-pci,id=scsi", + "-device", "scsi-hd,drive=mkosi,bootindex=1"); + if (r < 0) + return log_oom(); + } + + if (arg_directory) { + _cleanup_free_ char *sock_path = NULL, *sock_name = NULL, *escaped_sock_path = NULL; + + r = start_virtiofsd(bus, trans_scope, arg_directory, /* uidmap= */ true, &sock_path, &sock_name); + if (r < 0) + return r; + + escaped_sock_path = escape_qemu_value(sock_path); + if (!escaped_sock_path) + log_oom(); + + r = strv_extend(&cmdline, "-chardev"); + if (r < 0) + 
return log_oom(); + + r = strv_extendf(&cmdline, "socket,id=%1$s,path=%2$s/%1$s", sock_name, escaped_sock_path); + if (r < 0) + return log_oom(); + + r = strv_extend(&cmdline, "-device"); + if (r < 0) + return log_oom(); + + r = strv_extendf(&cmdline, "vhost-user-fs-pci,queue-size=1024,chardev=%s,tag=root", sock_name); + if (r < 0) + return log_oom(); + + r = strv_extend(&arg_kernel_cmdline_extra, "root=root rootfstype=virtiofs rw"); + if (r < 0) + return log_oom(); + } + + r = strv_prepend(&arg_kernel_cmdline_extra, "console=" DEFAULT_SERIAL_TTY); if (r < 0) return log_oom(); - if (!strv_isempty(arg_parameters)) { - if (ARCHITECTURE_SUPPORTS_SMBIOS) { - _cleanup_free_ char *kcl = strv_join(arg_parameters, " "); - if (!kcl) + FOREACH_ARRAY(mount, arg_runtime_mounts.mounts, arg_runtime_mounts.n_mounts) { + _cleanup_free_ char *sock_path = NULL, *sock_name = NULL, *clean_target = NULL, *escaped_sock_path = NULL; + r = start_virtiofsd(bus, trans_scope, mount->source, /* uidmap= */ false, &sock_path, &sock_name); + if (r < 0) + return r; + + escaped_sock_path = escape_qemu_value(sock_path); + if (!escaped_sock_path) + log_oom(); + + r = strv_extend(&cmdline, "-chardev"); + if (r < 0) + return log_oom(); + + r = strv_extendf(&cmdline, "socket,id=%1$s,path=%2$s/%1$s", sock_name, escaped_sock_path); + if (r < 0) + return log_oom(); + + r = strv_extend(&cmdline, "-device"); + if (r < 0) + return log_oom(); + + r = strv_extendf(&cmdline, "vhost-user-fs-pci,queue-size=1024,chardev=%1$s,tag=%1$s", sock_name); + if (r < 0) + return log_oom(); + + clean_target = xescape(mount->target, "\":"); + if (!clean_target) + return log_oom(); + + r = strv_extendf(&arg_kernel_cmdline_extra, "systemd.mount-extra=\"%s:%s:virtiofs:%s\"", + sock_name, clean_target, mount->read_only ? 
"ro" : "rw"); + if (r < 0) + return log_oom(); + } + + if (ARCHITECTURE_SUPPORTS_SMBIOS) { + _cleanup_free_ char *kcl = strv_join(arg_kernel_cmdline_extra, " "), *escaped_kcl = NULL; + if (!kcl) + return log_oom(); + + if (kernel) { + r = strv_extend_many(&cmdline, "-append", kcl); + if (r < 0) + return log_oom(); + } else { + if (ARCHITECTURE_SUPPORTS_SMBIOS) { + escaped_kcl = escape_qemu_value(kcl); + if (!escaped_kcl) + log_oom(); + + r = strv_extend(&cmdline, "-smbios"); + if (r < 0) + return log_oom(); + + r = strv_extendf(&cmdline, "type=11,value=io.systemd.stub.kernel-cmdline-extra=%s", escaped_kcl); + if (r < 0) + return log_oom(); + + r = strv_extend(&cmdline, "-smbios"); + if (r < 0) + return log_oom(); + + r = strv_extendf(&cmdline, "type=11,value=io.systemd.boot.kernel-cmdline-extra=%s", escaped_kcl); + if (r < 0) + return log_oom(); + } else + log_warning("Cannot append extra args to kernel cmdline, native architecture doesn't support SMBIOS, ignoring"); + } + } else + log_warning("Cannot append extra args to kernel cmdline, native architecture doesn't support SMBIOS"); + + /* disable TPM autodetection if the user's hardware doesn't support it */ + if (!ARCHITECTURE_SUPPORTS_TPM) { + if (arg_tpm < 0) { + arg_tpm = 0; + log_debug("TPM not support on %s, disabling tpm autodetection and continuing", architecture_to_string(native_architecture())); + } else if (arg_tpm > 0) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "TPM not supported on %s, refusing", architecture_to_string(native_architecture())); + } + + _cleanup_free_ char *swtpm = NULL; + if (arg_tpm != 0) { + r = find_executable("swtpm", &swtpm); + if (r < 0) { + /* log if the user asked for swtpm and we cannot find it */ + if (arg_tpm > 0) + return log_error_errno(r, "Failed to find swtpm binary: %m"); + /* also log if we got an error other than ENOENT from find_executable */ + if (r != -ENOENT && arg_tpm < 0) + return log_error_errno(r, "Error detecting swtpm: %m"); + } + } + + 
_cleanup_free_ char *tpm_state_tempdir = NULL; + if (swtpm) { + r = start_tpm(bus, trans_scope, swtpm, &tpm_state_tempdir); + if (r < 0) { + /* only bail if the user asked for a tpm */ + if (arg_tpm > 0) + return log_error_errno(r, "Failed to start tpm: %m"); + log_debug_errno(r, "Failed to start tpm, ignoring: %m"); + } + } + + if (tpm_state_tempdir) { + _cleanup_free_ char *escaped_state_dir = NULL; + + escaped_state_dir = escape_qemu_value(tpm_state_tempdir); + if (!escaped_state_dir) + log_oom(); + + r = strv_extend(&cmdline, "-chardev"); + if (r < 0) + return log_oom(); + + r = strv_extendf(&cmdline, "socket,id=chrtpm,path=%s/sock", escaped_state_dir); + if (r < 0) + return log_oom(); + + r = strv_extend_many(&cmdline, "-tpmdev", "emulator,id=tpm0,chardev=chrtpm"); + if (r < 0) + return log_oom(); + + if (native_architecture() == ARCHITECTURE_X86_64) + r = strv_extend_many(&cmdline, "-device", "tpm-tis,tpmdev=tpm0"); + else if (IN_SET(native_architecture(), ARCHITECTURE_ARM64, ARCHITECTURE_ARM64_BE)) + r = strv_extend_many(&cmdline, "-device", "tpm-tis-device,tpmdev=tpm0"); + if (r < 0) + return log_oom(); + } + + char *initrd = NULL; + _cleanup_(rm_rf_physical_and_freep) char *merged_initrd = NULL; + size_t n_initrds = strv_length(arg_initrds); + + if (n_initrds == 1) + initrd = arg_initrds[0]; + else if (n_initrds > 1) { + r = merge_initrds(&merged_initrd); + if (r < 0) + return r; + + initrd = merged_initrd; + } + + if (initrd) { + r = strv_extend_many(&cmdline, "-initrd", initrd); + if (r < 0) + return log_oom(); + } + + if (arg_forward_journal) { + _cleanup_free_ char *sd_journal_remote = NULL, *listen_address = NULL, *cred = NULL; + + r = find_executable_full( + "systemd-journal-remote", + /* root = */ NULL, + STRV_MAKE(LIBEXECDIR), + /* use_path_envvar = */ true, /* systemd-journal-remote should be installed in + * LIBEXECDIR, but for supporting fancy setups. 
*/ + &sd_journal_remote, + /* ret_fd = */ NULL); + if (r < 0) + return log_error_errno(r, "Failed to find systemd-journal-remote binary: %m"); + + r = start_systemd_journal_remote(bus, trans_scope, child_cid, sd_journal_remote, &listen_address); + if (r < 0) + return r; + + cred = strjoin("journal.forward_to_socket:", listen_address); + if (!cred) + return log_oom(); + + r = machine_credential_set(&arg_credentials, cred); + if (r < 0) + return r; + } + + if (arg_pass_ssh_key) { + _cleanup_free_ char *scope_prefix = NULL, *privkey_path = NULL, *pubkey_path = NULL; + const char *key_type = arg_ssh_key_type ?: "ed25519"; + + r = unit_name_to_prefix(trans_scope, &scope_prefix); + if (r < 0) + return log_error_errno(r, "Failed to strip .scope suffix from scope: %m"); + + privkey_path = strjoin(arg_runtime_directory, "/", scope_prefix, "-", key_type); + if (!privkey_path) + return log_oom(); + + pubkey_path = strjoin(privkey_path, ".pub"); + if (!pubkey_path) + return log_oom(); + + r = generate_ssh_keypair(privkey_path, key_type); + if (r < 0) + return r; + + ssh_private_key_path = TAKE_PTR(privkey_path); + ssh_public_key_path = TAKE_PTR(pubkey_path); + } + + if (ssh_public_key_path && ssh_private_key_path) { + _cleanup_free_ char *scope_prefix = NULL, *cred_path = NULL; + + cred_path = strjoin("ssh.ephemeral-authorized_keys-all:", ssh_public_key_path); + if (!cred_path) + return log_oom(); + + r = machine_credential_load(&arg_credentials, cred_path); + if (r < 0) + return log_error_errno(r, "Failed to load credential %s: %m", cred_path); + + r = unit_name_to_prefix(trans_scope, &scope_prefix); + if (r < 0) + return log_error_errno(r, "Failed to strip .scope suffix from scope: %m"); + + /* on distros that provide their own sshd@.service file we need to provide a dropin which + * picks up our public key credential */ + r = machine_credential_set( + &arg_credentials, + "systemd.unit-dropin.sshd-vsock@.service:" + "[Service]\n" + "ExecStart=\n" + "ExecStart=sshd -i -o 
'AuthorizedKeysFile=%d/ssh.ephemeral-authorized_keys-all .ssh/authorized_keys'\n" + "ImportCredential=ssh.ephemeral-authorized_keys-all\n"); + if (r < 0) + return log_error_errno(r, "Failed to set credential systemd.unit-dropin.sshd-vsock@.service: %m"); + } + + if (ARCHITECTURE_SUPPORTS_SMBIOS) + FOREACH_ARRAY(cred, arg_credentials.credentials, arg_credentials.n_credentials) { + _cleanup_free_ char *cred_data_b64 = NULL; + ssize_t n; + + n = base64mem(cred->data, cred->size, &cred_data_b64); + if (n < 0) return log_oom(); r = strv_extend(&cmdline, "-smbios"); if (r < 0) return log_oom(); - r = strv_extendf(&cmdline, "type=11,value=io.systemd.stub.kernel-cmdline-extra=%s", kcl); + r = strv_extendf(&cmdline, "type=11,value=io.systemd.credential.binary:%s=%s", cred->id, cred_data_b64); if (r < 0) return log_oom(); - } else - log_warning("Cannot append extra args to kernel cmdline, native architecture doesn't support SMBIOS"); - } + } if (use_vsock) { - vsock_fd = open_vsock(); - if (vsock_fd < 0) - return log_error_errno(vsock_fd, "Failed to open vsock: %m"); + notify_sock_fd = open_vsock(); + if (notify_sock_fd < 0) + return log_error_errno(notify_sock_fd, "Failed to open VSOCK: %m"); - r = cmdline_add_vsock(&cmdline, vsock_fd); + r = cmdline_add_vsock(&cmdline, notify_sock_fd); if (r == -ENOMEM) return log_oom(); if (r < 0) - return log_error_errno(r, "Failed to call getsockname on vsock: %m"); + return log_error_errno(r, "Failed to call getsockname on VSOCK: %m"); + } + + const char *e = secure_getenv("SYSTEMD_VMSPAWN_QEMU_EXTRA"); + if (e) { + _cleanup_strv_free_ char **extra = NULL; + + r = strv_split_full(&extra, e, /* separator= */ NULL, EXTRACT_CUNESCAPE|EXTRACT_UNQUOTE); + if (r < 0) + return log_error_errno(r, "Failed to split $SYSTEMD_VMSPAWN_QEMU_EXTRA environment variable: %m"); + + if (strv_extend_strv(&cmdline, extra, /* filter_duplicates= */ false) < 0) + return log_oom(); + } + + if (DEBUG_LOGGING) { + _cleanup_free_ char *joined = 
quote_command_line(cmdline, SHELL_ESCAPE_EMPTY); + if (!joined) + return log_oom(); + + log_debug("Executing: %s", joined); + } + + if (arg_register) { + char vm_address[STRLEN("vsock/") + DECIMAL_STR_MAX(unsigned)]; + + xsprintf(vm_address, "vsock/%u", child_cid); + r = register_machine( + bus, + arg_machine, + arg_uuid, + trans_scope, + arg_directory, + child_cid, + child_cid != VMADDR_CID_ANY ? vm_address : NULL, + ssh_private_key_path); + if (r < 0) + return r; } + assert_se(sigprocmask_many(SIG_BLOCK, /* old_sigset=*/ NULL, SIGCHLD, SIGWINCH) >= 0); + _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL; _cleanup_(sd_event_unrefp) sd_event *event = NULL; r = sd_event_new(&event); @@ -654,15 +2070,16 @@ static int run_virtual_machine(void) { (void) sd_event_set_watchdog(event, true); - pid_t child_pid; - r = safe_fork_full( + _cleanup_(pidref_done) PidRef child_pidref = PIDREF_NULL; + + r = pidref_safe_fork_full( qemu_binary, - NULL, - &child_vsock_fd, 1, /* pass the vsock fd to qemu */ - FORK_CLOEXEC_OFF, - &child_pid); + /* stdio_fds= */ NULL, + pass_fds, n_pass_fds, + FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_DEATHSIG_SIGTERM|FORK_LOG|FORK_CLOEXEC_OFF|FORK_RLIMIT_NOFILE_SAFE, + &child_pidref); if (r < 0) - return log_error_errno(r, "Failed to fork off %s: %m", qemu_binary); + return r; if (r == 0) { /* set TERM and LANG if they are missing */ if (setenv("TERM", "vt220", 0) < 0) @@ -671,35 +2088,72 @@ static int run_virtual_machine(void) { if (setenv("LANG", "C.UTF-8", 0) < 0) return log_oom(); - execve(qemu_binary, cmdline, environ); + execv(qemu_binary, cmdline); log_error_errno(errno, "Failed to execve %s: %m", qemu_binary); _exit(EXIT_FAILURE); } + /* Close the vsock fd we passed to qemu in the parent. We don't need it anymore. 
*/ + child_vsock_fd = safe_close(child_vsock_fd); int exit_status = INT_MAX; if (use_vsock) { - r = setup_notify_parent(event, vsock_fd, &exit_status, &notify_event_source); + r = setup_notify_parent(event, notify_sock_fd, &exit_status, &notify_event_source); if (r < 0) - return log_error_errno(r, "Failed to setup event loop to handle vsock notify events: %m"); + return log_error_errno(r, "Failed to setup event loop to handle VSOCK notify events: %m"); } - /* shutdown qemu when we are shutdown */ - (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(child_pid)); - (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(child_pid)); + /* If we have the vsock address and the SSH key, ask pid1 inside the guest to shutdown. */ + if (child_cid != VMADDR_CID_ANY && ssh_private_key_path) { + ssh_info = (SSHInfo) { + .cid = child_cid, + .private_key_path = ssh_private_key_path, + .port = 22, + }; - (void) sd_event_add_signal(event, NULL, SIGRTMIN+18, sigrtmin18_handler, NULL); + (void) sd_event_add_signal(event, NULL, SIGINT | SD_EVENT_SIGNAL_PROCMASK, forward_signal_to_vm_pid1, &ssh_info); + (void) sd_event_add_signal(event, NULL, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, forward_signal_to_vm_pid1, &ssh_info); + } else { + /* As a fallback in case SSH cannot be used, send a shutdown signal to the VMM instead. 
*/ + (void) sd_event_add_signal(event, NULL, SIGINT | SD_EVENT_SIGNAL_PROCMASK, on_orderly_shutdown, &child_pidref); + (void) sd_event_add_signal(event, NULL, SIGTERM | SD_EVENT_SIGNAL_PROCMASK, on_orderly_shutdown, &child_pidref); + } + + (void) sd_event_add_signal(event, NULL, (SIGRTMIN+18) | SD_EVENT_SIGNAL_PROCMASK, sigrtmin18_handler, NULL); /* Exit when the child exits */ - (void) sd_event_add_child(event, NULL, child_pid, WEXITED, on_child_exit, NULL); + (void) event_add_child_pidref(event, NULL, &child_pidref, WEXITED, on_child_exit, NULL); + + _cleanup_(pty_forward_freep) PTYForward *forward = NULL; + if (master >= 0) { + r = pty_forward_new(event, master, ptyfwd_flags, &forward); + if (r < 0) + return log_error_errno(r, "Failed to create PTY forwarder: %m"); + + if (!arg_background && shall_tint_background()) { + _cleanup_free_ char *bg = NULL; + + r = terminal_tint_color(130 /* green */, &bg); + if (r < 0) + log_debug_errno(r, "Failed to determine terminal background color, not tinting."); + else + (void) pty_forward_set_background_color(forward, bg); + } else if (!isempty(arg_background)) + (void) pty_forward_set_background_color(forward, arg_background); + + set_window_title(forward); + } r = sd_event_loop(event); if (r < 0) return log_error_errno(r, "Failed to run event loop: %m"); + if (arg_register) + (void) unregister_machine(bus, arg_machine); + if (use_vsock) { if (exit_status == INT_MAX) { - log_debug("Couldn't retrieve inner EXIT_STATUS from vsock"); + log_debug("Couldn't retrieve inner EXIT_STATUS from VSOCK"); return EXIT_SUCCESS; } if (exit_status != 0) @@ -713,20 +2167,52 @@ static int run_virtual_machine(void) { static int determine_names(void) { int r; - if (!arg_image) - return log_error_errno(SYNTHETIC_ERRNO(-EINVAL), "Missing required argument -i/--image=, quitting"); + if (!arg_directory && !arg_image) { + if (arg_machine) { + _cleanup_(image_unrefp) Image *i = NULL; - if (!arg_machine) { - char *e; + r = image_find(IMAGE_MACHINE, 
arg_machine, NULL, &i); + if (r == -ENOENT) + return log_error_errno(r, "No image for machine '%s'.", arg_machine); + if (r < 0) + return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine); + + if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK)) + r = free_and_strdup(&arg_image, i->path); + else if (IN_SET(i->type, IMAGE_DIRECTORY, IMAGE_SUBVOLUME)) + r = free_and_strdup(&arg_directory, i->path); + else + assert_not_reached(); + if (r < 0) + return log_oom(); + } else { + r = safe_getcwd(&arg_directory); + if (r < 0) + return log_error_errno(r, "Failed to determine current directory: %m"); + } + } - r = path_extract_filename(arg_image, &arg_machine); - if (r < 0) - return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_image); + if (!arg_machine) { + if (arg_directory && path_equal(arg_directory, "/")) { + arg_machine = gethostname_malloc(); + if (!arg_machine) + return log_oom(); + } else if (arg_image) { + char *e; - /* Truncate suffix if there is one */ - e = endswith(arg_machine, ".raw"); - if (e) - *e = 0; + r = path_extract_filename(arg_image, &arg_machine); + if (r < 0) + return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_image); + + /* Truncate suffix if there is one */ + e = endswith(arg_machine, ".raw"); + if (e) + *e = 0; + } else { + r = path_extract_filename(arg_directory, &arg_machine); + if (r < 0) + return log_error_errno(r, "Failed to extract file name from '%s': %m", arg_directory); + } hostname_cleanup(arg_machine); if (!hostname_is_valid(arg_machine, 0)) @@ -736,31 +2222,79 @@ static int determine_names(void) { return 0; } +static int verify_arguments(void) { + if (arg_network_stack == NETWORK_STACK_TAP && !arg_privileged) + return log_error_errno(SYNTHETIC_ERRNO(EPERM), "--network-tap requires root privileges, refusing."); + + if (!strv_isempty(arg_initrds) && !arg_linux) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Option --initrd= cannot be used without --linux=."); + + if 
(arg_register && !arg_privileged) + return log_error_errno(SYNTHETIC_ERRNO(EPERM), "--register= requires root privileges, refusing."); + + return 0; +} + static int run(int argc, char *argv[]) { - int r, ret = EXIT_SUCCESS; + int r, kvm_device_fd = -EBADF, vhost_device_fd = -EBADF; + _cleanup_strv_free_ char **names = NULL; log_setup(); + arg_privileged = getuid() == 0; + + /* don't attempt to register as a machine when running as a user */ + arg_register = arg_privileged; + + r = parse_environment(); + if (r < 0) + return r; + r = parse_argv(argc, argv); if (r <= 0) - goto finish; + return r; r = determine_names(); if (r < 0) - goto finish; + return r; + + r = verify_arguments(); + if (r < 0) + return r; - assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGTERM, SIGINT, SIGRTMIN+18, -1) >= 0); + if (!arg_quiet && arg_console_mode != CONSOLE_GUI) { + _cleanup_free_ char *u = NULL; + const char *vm_path = arg_image ?: arg_directory; + (void) terminal_urlify_path(vm_path, vm_path, &u); - r = run_virtual_machine(); - if (r > 0) - ret = r; -finish: - machine_credential_free_all(arg_credentials, arg_n_credentials); + log_info("%s %sSpawning VM %s on %s.%s", + special_glyph(SPECIAL_GLYPH_LIGHT_SHADE), ansi_grey(), arg_machine, u ?: vm_path, ansi_normal()); + if (arg_console_mode == CONSOLE_INTERACTIVE) + log_info("%s %sPress %sCtrl-]%s three times within 1s to kill VM.%s", + special_glyph(SPECIAL_GLYPH_LIGHT_SHADE), ansi_grey(), ansi_highlight(), ansi_grey(), ansi_normal()); + else if (arg_console_mode == CONSOLE_NATIVE) + log_info("%s %sPress %sCtrl-a x%s to kill VM.%s", + special_glyph(SPECIAL_GLYPH_LIGHT_SHADE), ansi_grey(), ansi_highlight(), ansi_grey(), ansi_normal()); + } + + r = sd_listen_fds_with_names(true, &names); if (r < 0) - return r; + return log_error_errno(r, "Failed to get passed file descriptors: %m"); + + for (int i = 0; i < r; i++) { + int fd = SD_LISTEN_FDS_START + i; + if (streq(names[i], "kvm")) + kvm_device_fd = fd; + else if (streq(names[i], 
"vhost-vsock")) + vhost_device_fd = fd; + else { + log_notice("Couldn't recognize passed fd %d (%s), closing fd and ignoring...", fd, names[i]); + safe_close(fd); + } + } - return ret; + return run_virtual_machine(kvm_device_fd, vhost_device_fd); } DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run); |