diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 13:00:47 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-27 13:00:47 +0000 |
commit | 2cb7e0aaedad73b076ea18c6900b0e86c5760d79 (patch) | |
tree | da68ca54bb79f4080079bf0828acda937593a4e1 /src/nspawn/nspawn-oci.c | |
parent | Initial commit. (diff) | |
download | systemd-upstream.tar.xz systemd-upstream.zip |
Adding upstream version 247.3.upstream/247.3upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r-- | src/nspawn/nspawn-oci.c | 2258 |
1 files changed, 2258 insertions, 0 deletions
diff --git a/src/nspawn/nspawn-oci.c b/src/nspawn/nspawn-oci.c new file mode 100644 index 0000000..ca708be --- /dev/null +++ b/src/nspawn/nspawn-oci.c @@ -0,0 +1,2258 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <linux/oom.h> +#if HAVE_SECCOMP +#include <seccomp.h> +#endif + +#include "bus-util.h" +#include "cap-list.h" +#include "cpu-set-util.h" +#include "env-util.h" +#include "format-util.h" +#include "fs-util.h" +#include "hostname-util.h" +#include "json.h" +#include "missing_sched.h" +#include "nspawn-oci.h" +#include "path-util.h" +#include "rlimit-util.h" +#if HAVE_SECCOMP +#include "seccomp-util.h" +#endif +#include "stat-util.h" +#include "stdio-util.h" +#include "string-util.h" +#include "strv.h" +#include "user-util.h" + +/* TODO: + * OCI runtime tool implementation + * hooks + * + * Spec issues: + * + * How is RLIM_INFINITY supposed to be encoded? + * configured effective caps is bullshit, as execv() corrupts it anyway + * pipes bind mounted is *very* different from pipes newly created, comments regarding bind mount or not are bogus + * annotation values structured? or string? + * configurable file system namespace path, but then also root path? wtf? + * apply sysctl inside of the container? or outside? + * how is unlimited pids tasks limit to be encoded? + * what are the defaults for caps if not specified? + * what are the default uid/gid mappings if one is missing but the other set, or when user ns is on but no namespace configured + * the source field of "mounts" is really weird, as it cannot realistically be relative to the bundle, since we never know if that's what the fs wants + * spec contradicts itself on the mount "type" field, as the example uses "bind" as type, but it's not listed in /proc/filesystem, and is something made up by /bin/mount + * if type of mount is left out, what shall be assumed? "bind"? + * readonly mounts is entirely redundant? + * should escaping be applied when joining mount options with ","? + * devices cgroup support is bogus, "allow" and "deny" on the kernel level is about adding/removing entries, not about access + * spec needs to say that "rwm" devices cgroup combination can't be the empty string + * cgrouspv1 crap: kernel, kernelTCP, swapiness, disableOOMKiller, swap, devices, leafWeight + * general: it shouldn't leak lower level abstractions this obviously + * unmanagable cgroups stuff: realtimeRuntime/realtimePeriod + * needs to say what happense when some option is not specified, i.e. which defaults apply + * no architecture? no personality? + * seccomp example and logic is simply broken: there's no constant "SCMP_ACT_ERRNO". + * spec should say what to do with unknown props + * /bin/mount regarding NFS and FUSE required? + * what does terminal=false mean? + * sysctl inside or outside? allow-listing? + * swapiness typo -> swappiness + * + * Unsupported: + * + * apparmorProfile + * selinuxLabel + mountLabel + * hugepageLimits + * network + * rdma + * intelRdt + * swappiness, disableOOMKiller, kernel, kernelTCP, leafWeight (because it's dead, cgroupsv2 can't do it and hence systemd neither) + * + * Non-slice cgroup paths + * Propagation that is not slave + shared + * more than one uid/gid mapping, mappings with a container base != 0, or non-matching uid/gid mappings + * device cgroups access = false items that are not catchall + * device cgroups matches where minor is specified, but major isn't. similar where major is specified but char/block is not. also, any match that only has a type set that has less than "rwm" set. also, any entry that has none of rwm set. + * + */ + +static int oci_unexpected(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "Unexpected OCI element '%s' of type '%s'.", name, json_variant_type_to_string(json_variant_type(v))); +} + +static int oci_unsupported(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + return json_log(v, flags, SYNTHETIC_ERRNO(EOPNOTSUPP), + "Unsupported OCI element '%s' of type '%s'.", name, json_variant_type_to_string(json_variant_type(v))); +} + +static int oci_terminal(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + Settings *s = userdata; + + /* If not specified, or set to true, we'll default to either an interactive or a read-only + * console. If specified as false, we'll forcibly move to "pipe" mode though. */ + s->console_mode = json_variant_boolean(v) ? _CONSOLE_MODE_INVALID : CONSOLE_PIPE; + return 0; +} + +static int oci_console_dimension(const char *name, JsonVariant *variant, JsonDispatchFlags flags, void *userdata) { + unsigned *u = userdata; + uintmax_t k; + + assert(u); + + k = json_variant_unsigned(variant); + if (k == 0) + return json_log(variant, flags, SYNTHETIC_ERRNO(ERANGE), + "Console size field '%s' is too small.", strna(name)); + if (k > USHRT_MAX) /* TIOCSWINSZ's struct winsize uses "unsigned short" for width and height */ + return json_log(variant, flags, SYNTHETIC_ERRNO(ERANGE), + "Console size field '%s' is too large.", strna(name)); + + *u = (unsigned) k; + return 0; +} + +static int oci_console_size(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + + static const JsonDispatch table[] = { + { "height", JSON_VARIANT_UNSIGNED, oci_console_dimension, offsetof(Settings, console_height), JSON_MANDATORY }, + { "width", JSON_VARIANT_UNSIGNED, oci_console_dimension, offsetof(Settings, console_width), JSON_MANDATORY }, + {} + }; + + return json_dispatch(v, table, oci_unexpected, flags, userdata); +} + +static int oci_absolute_path(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + char **p = userdata; + const char *n; + + assert(p); + + n = json_variant_string(v); + + if (!path_is_absolute(n)) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "Path in JSON field '%s' is not absolute: %s", strna(name), n); + + return free_and_strdup_warn(p, n); +} + +static int oci_env(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + char ***l = userdata; + JsonVariant *e; + int r; + + assert(l); + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + const char *n; + + if (!json_variant_is_string(e)) + return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), + "Environment array contains non-string."); + + assert_se(n = json_variant_string(e)); + + if (!env_assignment_is_valid(n)) + return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), + "Environment assignment not valid: %s", n); + + r = strv_extend(l, n); + if (r < 0) + return log_oom(); + } + + return 0; +} + +static int oci_args(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + _cleanup_strv_free_ char **l = NULL; + char ***value = userdata; + int r; + + assert(value); + + r = json_variant_strv(v, &l); + if (r < 0) + return json_log(v, flags, r, "Cannot parse arguments as list of strings: %m"); + + if (strv_isempty(l)) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "Argument list empty, refusing."); + + if (isempty(l[0])) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "Executable name is empty, refusing."); + + return strv_free_and_replace(*value, l); +} + +static int oci_rlimit_type(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + const char *z; + int t, *type = userdata; + + assert_se(type); + + z = startswith(json_variant_string(v), "RLIMIT_"); + if (!z) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "rlimit entry's name does not begin with 'RLIMIT_', refusing: %s", + json_variant_string(v)); + + t = rlimit_from_string(z); + if (t < 0) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "rlimit name unknown: %s", json_variant_string(v)); + + *type = t; + return 0; +} + +static int oci_rlimit_value(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + rlim_t z, *value = userdata; + + assert(value); + + if (json_variant_is_negative(v)) + z = RLIM_INFINITY; + else { + if (!json_variant_is_unsigned(v)) + return json_log(v, flags, SYNTHETIC_ERRNO(ERANGE), + "rlimits limit not unsigned, refusing."); + + z = (rlim_t) json_variant_unsigned(v); + + if ((uintmax_t) z != json_variant_unsigned(v)) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "rlimits limit out of range, refusing."); + } + + *value = z; + return 0; +} + +static int oci_rlimits(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + + Settings *s = userdata; + JsonVariant *e; + int r; + + assert(s); + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + + struct rlimit_data { + int type; + rlim_t soft; + rlim_t hard; + } data = { + .type = -1, + .soft = RLIM_INFINITY, + .hard = RLIM_INFINITY, + }; + + static const JsonDispatch table[] = { + { "soft", JSON_VARIANT_NUMBER, oci_rlimit_value, offsetof(struct rlimit_data, soft), JSON_MANDATORY }, + { "hard", JSON_VARIANT_NUMBER, oci_rlimit_value, offsetof(struct rlimit_data, hard), JSON_MANDATORY }, + { "type", JSON_VARIANT_STRING, oci_rlimit_type, offsetof(struct rlimit_data, type), JSON_MANDATORY }, + {} + }; + + r = json_dispatch(e, table, oci_unexpected, flags, &data); + if (r < 0) + return r; + + assert(data.type >= 0); + assert(data.type < _RLIMIT_MAX); + + if (s->rlimit[data.type]) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "rlimits array contains duplicate entry, refusing."); + + s->rlimit[data.type] = new(struct rlimit, 1); + if (!s->rlimit[data.type]) + return log_oom(); + + *s->rlimit[data.type] = (struct rlimit) { + .rlim_cur = data.soft, + .rlim_max = data.hard, + }; + + } + return 0; +} + +static int oci_capability_array(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + uint64_t *mask = userdata, m = 0; + JsonVariant *e; + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + const char *n; + int cap; + + if (!json_variant_is_string(e)) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "Entry in capabilities array is not a string."); + + assert_se(n = json_variant_string(e)); + + cap = capability_from_name(n); + if (cap < 0) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "Unknown capability: %s", n); + + m |= UINT64_C(1) << cap; + } + + if (*mask == (uint64_t) -1) + *mask = m; + else + *mask |= m; + + return 0; +} + +static int oci_capabilities(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + + static const JsonDispatch table[] = { + { "effective", JSON_VARIANT_ARRAY, oci_capability_array, offsetof(CapabilityQuintet, effective) }, + { "bounding", JSON_VARIANT_ARRAY, oci_capability_array, offsetof(CapabilityQuintet, bounding) }, + { "inheritable", JSON_VARIANT_ARRAY, oci_capability_array, offsetof(CapabilityQuintet, inheritable) }, + { "permitted", JSON_VARIANT_ARRAY, oci_capability_array, offsetof(CapabilityQuintet, permitted) }, + { "ambient", JSON_VARIANT_ARRAY, oci_capability_array, offsetof(CapabilityQuintet, ambient) }, + {} + }; + + Settings *s = userdata; + int r; + + assert(s); + + r = json_dispatch(v, table, oci_unexpected, flags, &s->full_capabilities); + if (r < 0) + return r; + + if (s->full_capabilities.bounding != (uint64_t) -1) { + s->capability = s->full_capabilities.bounding; + s->drop_capability = ~s->full_capabilities.bounding; + } + + return 0; +} + +static int oci_oom_score_adj(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + Settings *s = userdata; + intmax_t k; + + assert(s); + + k = json_variant_integer(v); + if (k < OOM_SCORE_ADJ_MIN || k > OOM_SCORE_ADJ_MAX) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "oomScoreAdj value out of range: %ji", k); + + s->oom_score_adjust = (int) k; + s->oom_score_adjust_set = true; + + return 0; +} + +static int oci_uid_gid(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + uid_t *uid = userdata, u; + uintmax_t k; + + assert(uid); + assert_cc(sizeof(uid_t) == sizeof(gid_t)); + + k = json_variant_unsigned(v); + u = (uid_t) k; + if ((uintmax_t) u != k) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "UID/GID out of range: %ji", k); + + if (!uid_is_valid(u)) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "UID/GID is not valid: " UID_FMT, u); + + *uid = u; + return 0; +} + +static int oci_supplementary_gids(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + Settings *s = userdata; + JsonVariant *e; + int r; + + assert(s); + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + gid_t gid, *a; + + if (!json_variant_is_unsigned(e)) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "Supplementary GID entry is not a UID."); + + r = oci_uid_gid(name, e, flags, &gid); + if (r < 0) + return r; + + a = reallocarray(s->supplementary_gids, s->n_supplementary_gids + 1, sizeof(gid_t)); + if (!a) + return log_oom(); + + s->supplementary_gids = a; + s->supplementary_gids[s->n_supplementary_gids++] = gid; + } + + return 0; +} + +static int oci_user(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + static const JsonDispatch table[] = { + { "uid", JSON_VARIANT_UNSIGNED, oci_uid_gid, offsetof(Settings, uid), JSON_MANDATORY }, + { "gid", JSON_VARIANT_UNSIGNED, oci_uid_gid, offsetof(Settings, gid), JSON_MANDATORY }, + { "additionalGids", JSON_VARIANT_ARRAY, oci_supplementary_gids, 0, 0 }, + {} + }; + + return json_dispatch(v, table, oci_unexpected, flags, userdata); +} + +static int oci_process(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + + static const JsonDispatch table[] = { + { "terminal", JSON_VARIANT_BOOLEAN, oci_terminal, 0, 0 }, + { "consoleSize", JSON_VARIANT_OBJECT, oci_console_size, 0, 0 }, + { "cwd", JSON_VARIANT_STRING, oci_absolute_path, offsetof(Settings, working_directory), 0 }, + { "env", JSON_VARIANT_ARRAY, oci_env, offsetof(Settings, environment), 0 }, + { "args", JSON_VARIANT_ARRAY, oci_args, offsetof(Settings, parameters), 0 }, + { "rlimits", JSON_VARIANT_ARRAY, oci_rlimits, 0, 0 }, + { "apparmorProfile", JSON_VARIANT_STRING, oci_unsupported, 0, JSON_PERMISSIVE }, + { "capabilities", JSON_VARIANT_OBJECT, oci_capabilities, 0, 0 }, + { "noNewPrivileges", JSON_VARIANT_BOOLEAN, json_dispatch_boolean, offsetof(Settings, no_new_privileges), 0 }, + { "oomScoreAdj", JSON_VARIANT_INTEGER, oci_oom_score_adj, 0, 0 }, + { "selinuxLabel", JSON_VARIANT_STRING, oci_unsupported, 0, JSON_PERMISSIVE }, + { "user", JSON_VARIANT_OBJECT, oci_user, 0, 0 }, + {} + }; + + return json_dispatch(v, table, oci_unexpected, flags, userdata); +} + +static int oci_root(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + + static const JsonDispatch table[] = { + { "path", JSON_VARIANT_STRING, json_dispatch_string, offsetof(Settings, root) }, + { "readonly", JSON_VARIANT_BOOLEAN, json_dispatch_boolean, offsetof(Settings, read_only) }, + {} + }; + + return json_dispatch(v, table, oci_unexpected, flags, userdata); +} + +static int oci_hostname(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + Settings *s = userdata; + const char *n; + + assert(s); + + assert_se(n = json_variant_string(v)); + + if (!hostname_is_valid(n, false)) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "Hostname string is not a valid hostname: %s", n); + + return free_and_strdup_warn(&s->hostname, n); +} + +static bool oci_exclude_mount(const char *path) { + + /* Returns "true" for all mounts we insist to mount on our own, and hence ignore the OCI data. */ + + if (PATH_IN_SET(path, + "/dev", + "/dev/mqueue", + "/dev/pts", + "/dev/shm", + "/proc", + "/proc/acpi", + "/proc/apm", + "/proc/asound", + "/proc/bus", + "/proc/fs", + "/proc/irq", + "/proc/kallsyms", + "/proc/kcore", + "/proc/keys", + "/proc/scsi", + "/proc/sys", + "/proc/sys/net", + "/proc/sysrq-trigger", + "/proc/timer_list", + "/run", + "/sys", + "/sys", + "/sys/fs/selinux", + "/tmp")) + return true; + + /* Similar, skip the whole /sys/fs/cgroups subtree */ + if (path_startswith(path, "/sys/fs/cgroup")) + return true; + + return false; +} + +typedef struct oci_mount_data { + char *destination; + char *source; + char *type; + char **options; +} oci_mount_data; + +static void cleanup_oci_mount_data(oci_mount_data *data) { + free(data->destination); + free(data->source); + strv_free(data->options); + free(data->type); +} + +static int oci_mounts(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + Settings *s = userdata; + JsonVariant *e; + int r; + + assert(s); + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + static const JsonDispatch table[] = { + { "destination", JSON_VARIANT_STRING, oci_absolute_path, offsetof(oci_mount_data, destination), JSON_MANDATORY }, + { "source", JSON_VARIANT_STRING, json_dispatch_string, offsetof(oci_mount_data, source), 0 }, + { "options", JSON_VARIANT_ARRAY, json_dispatch_strv, offsetof(oci_mount_data, options), 0, }, + { "type", JSON_VARIANT_STRING, json_dispatch_string, offsetof(oci_mount_data, type), 0 }, + {} + }; + + _cleanup_free_ char *joined_options = NULL; + CustomMount *m; + _cleanup_(cleanup_oci_mount_data) oci_mount_data data = {}; + + r = json_dispatch(e, table, oci_unexpected, flags, &data); + if (r < 0) + return r; + + if (!path_is_absolute(data.destination)) + return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), + "Mount destination not an absolute path: %s", data.destination); + + if (oci_exclude_mount(data.destination)) + continue; + + if (data.options) { + joined_options = strv_join(data.options, ","); + if (!joined_options) + return log_oom(); + } + + if (!data.type || streq(data.type, "bind")) { + if (data.source && !path_is_absolute(data.source)) { + char *joined; + + joined = path_join(s->bundle, data.source); + if (!joined) + return log_oom(); + + free_and_replace(data.source, joined); + } + + data.type = mfree(data.type); + + m = custom_mount_add(&s->custom_mounts, &s->n_custom_mounts, CUSTOM_MOUNT_BIND); + } else + m = custom_mount_add(&s->custom_mounts, &s->n_custom_mounts, CUSTOM_MOUNT_ARBITRARY); + if (!m) + return log_oom(); + + m->destination = TAKE_PTR(data.destination); + m->source = TAKE_PTR(data.source); + m->options = TAKE_PTR(joined_options); + m->type_argument = TAKE_PTR(data.type); + } + + return 0; +} + +static int oci_namespace_type(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + unsigned long *nsflags = userdata; + const char *n; + + assert(nsflags); + assert_se(n = json_variant_string(v)); + + /* We don't use namespace_flags_from_string() here, as the OCI spec uses slightly different names than the + * kernel here. */ + if (streq(n, "pid")) + *nsflags = CLONE_NEWPID; + else if (streq(n, "network")) + *nsflags = CLONE_NEWNET; + else if (streq(n, "mount")) + *nsflags = CLONE_NEWNS; + else if (streq(n, "ipc")) + *nsflags = CLONE_NEWIPC; + else if (streq(n, "uts")) + *nsflags = CLONE_NEWUTS; + else if (streq(n, "user")) + *nsflags = CLONE_NEWUSER; + else if (streq(n, "cgroup")) + *nsflags = CLONE_NEWCGROUP; + else + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "Unknown cgroup type, refusing: %s", n); + + return 0; +} + +static int oci_namespaces(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + Settings *s = userdata; + unsigned long n = 0; + JsonVariant *e; + int r; + + assert_se(s); + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + + struct namespace_data { + unsigned long type; + char *path; + } data = {}; + + static const JsonDispatch table[] = { + { "type", JSON_VARIANT_STRING, oci_namespace_type, offsetof(struct namespace_data, type), JSON_MANDATORY }, + { "path", JSON_VARIANT_STRING, oci_absolute_path, offsetof(struct namespace_data, path), 0 }, + {} + }; + + r = json_dispatch(e, table, oci_unexpected, flags, &data); + if (r < 0) { + free(data.path); + return r; + } + + if (data.path) { + if (data.type != CLONE_NEWNET) { + free(data.path); + return json_log(e, flags, SYNTHETIC_ERRNO(EOPNOTSUPP), + "Specifying namespace path for non-network namespace is not supported."); + } + + if (s->network_namespace_path) { + free(data.path); + return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), + "Network namespace path specified more than once, refusing."); + } + + free(s->network_namespace_path); + s->network_namespace_path = data.path; + } + + if (FLAGS_SET(n, data.type)) + return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), + "Duplicate namespace specification, refusing."); + + n |= data.type; + } + + if (!FLAGS_SET(n, CLONE_NEWNS)) + return json_log(v, flags, SYNTHETIC_ERRNO(EOPNOTSUPP), + "Containers without file system namespace aren't supported."); + + s->private_network = FLAGS_SET(n, CLONE_NEWNET); + s->userns_mode = FLAGS_SET(n, CLONE_NEWUSER) ? USER_NAMESPACE_FIXED : USER_NAMESPACE_NO; + s->use_cgns = FLAGS_SET(n, CLONE_NEWCGROUP); + + s->clone_ns_flags = n & (CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS); + + return 0; +} + +static int oci_uid_gid_range(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + uid_t *uid = userdata, u; + uintmax_t k; + + assert(uid); + assert_cc(sizeof(uid_t) == sizeof(gid_t)); + + /* This is very much like oci_uid_gid(), except the checks are a bit different, as this is a UID range rather + * than a specific UID, and hence (uid_t) -1 has no special significance. OTOH a range of zero makes no + * sense. */ + + k = json_variant_unsigned(v); + u = (uid_t) k; + if ((uintmax_t) u != k) + return json_log(v, flags, SYNTHETIC_ERRNO(ERANGE), + "UID/GID out of range: %ji", k); + if (u == 0) + return json_log(v, flags, SYNTHETIC_ERRNO(ERANGE), + "UID/GID range can't be zero."); + + *uid = u; + return 0; +} + +static int oci_uid_gid_mappings(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + struct mapping_data { + uid_t host_id; + uid_t container_id; + uid_t range; + } data = { + .host_id = UID_INVALID, + .container_id = UID_INVALID, + .range = 0, + }; + + static const JsonDispatch table[] = { + { "containerID", JSON_VARIANT_UNSIGNED, oci_uid_gid, offsetof(struct mapping_data, container_id), JSON_MANDATORY }, + { "hostID", JSON_VARIANT_UNSIGNED, oci_uid_gid, offsetof(struct mapping_data, host_id), JSON_MANDATORY }, + { "size", JSON_VARIANT_UNSIGNED, oci_uid_gid_range, offsetof(struct mapping_data, range), JSON_MANDATORY }, + {} + }; + + Settings *s = userdata; + JsonVariant *e; + int r; + + assert(s); + + if (json_variant_elements(v) == 0) + return 0; + + if (json_variant_elements(v) > 1) + return json_log(v, flags, SYNTHETIC_ERRNO(EOPNOTSUPP), + "UID/GID mappings with more than one entry are not supported."); + + assert_se(e = json_variant_by_index(v, 0)); + + r = json_dispatch(e, table, oci_unexpected, flags, &data); + if (r < 0) + return r; + + if (data.host_id + data.range < data.host_id || + data.container_id + data.range < data.container_id) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "UID/GID range goes beyond UID/GID validity range, refusing."); + + if (data.container_id != 0) + return json_log(v, flags, SYNTHETIC_ERRNO(EOPNOTSUPP), + "UID/GID mappings with a non-zero container base are not supported."); + + if (data.range < 0x10000) + json_log(v, flags|JSON_WARNING, 0, + "UID/GID mapping with less than 65536 UID/GIDS set up, you are looking for trouble."); + + if (s->uid_range != UID_INVALID && + (s->uid_shift != data.host_id || s->uid_range != data.range)) + return json_log(v, flags, SYNTHETIC_ERRNO(EOPNOTSUPP), + "Non-matching UID and GID mappings are not supported."); + + s->uid_shift = data.host_id; + s->uid_range = data.range; + + return 0; +} + +static int oci_device_type(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + mode_t *mode = userdata; + const char *t; + + assert(mode); + assert_se(t = json_variant_string(v)); + + if (STR_IN_SET(t, "c", "u")) + *mode = (*mode & ~S_IFMT) | S_IFCHR; + else if (streq(t, "b")) + *mode = (*mode & ~S_IFMT) | S_IFBLK; + else if (streq(t, "p")) + *mode = (*mode & ~S_IFMT) | S_IFIFO; + else + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "Unknown device type: %s", t); + + return 0; +} + +static int oci_device_major(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + unsigned *u = userdata; + uintmax_t k; + + assert_se(u); + + k = json_variant_unsigned(v); + if (!DEVICE_MAJOR_VALID(k)) + return json_log(v, flags, SYNTHETIC_ERRNO(ERANGE), + "Device major %ji out of range.", k); + + *u = (unsigned) k; + return 0; +} + +static int oci_device_minor(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + unsigned *u = userdata; + uintmax_t k; + + assert_se(u); + + k = json_variant_unsigned(v); + if (!DEVICE_MINOR_VALID(k)) + return json_log(v, flags, SYNTHETIC_ERRNO(ERANGE), + "Device minor %ji out of range.", k); + + *u = (unsigned) k; + return 0; +} + +static int oci_device_file_mode(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + mode_t *mode = userdata, m; + uintmax_t k; + + assert(mode); + + k = json_variant_unsigned(v); + m = (mode_t) k; + + if ((m & ~07777) != 0 || (uintmax_t) m != k) + return json_log(v, flags, SYNTHETIC_ERRNO(ERANGE), + "fileMode out of range, refusing."); + + *mode = m; + return 0; +} + +static int oci_devices(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + Settings *s = userdata; + JsonVariant *e; + int r; + + assert(s); + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + + static const JsonDispatch table[] = { + { "type", JSON_VARIANT_STRING, oci_device_type, offsetof(DeviceNode, mode), JSON_MANDATORY }, + { "path", JSON_VARIANT_STRING, oci_absolute_path, offsetof(DeviceNode, path), JSON_MANDATORY }, + { "major", JSON_VARIANT_UNSIGNED, oci_device_major, offsetof(DeviceNode, major), 0 }, + { "minor", JSON_VARIANT_UNSIGNED, oci_device_minor, offsetof(DeviceNode, minor), 0 }, + { "fileMode", JSON_VARIANT_UNSIGNED, oci_device_file_mode, offsetof(DeviceNode, mode), 0 }, + { "uid", JSON_VARIANT_UNSIGNED, oci_uid_gid, offsetof(DeviceNode, uid), 0 }, + { "gid", JSON_VARIANT_UNSIGNED, oci_uid_gid, offsetof(DeviceNode, gid), 0 }, + {} + }; + + DeviceNode *node, *nodes; + + nodes = reallocarray(s->extra_nodes, s->n_extra_nodes + 1, sizeof(DeviceNode)); + if (!nodes) + return log_oom(); + + s->extra_nodes = nodes; + + node = nodes + s->n_extra_nodes; + *node = (DeviceNode) { + .uid = UID_INVALID, + .gid = GID_INVALID, + .major = (unsigned) -1, + .minor = (unsigned) -1, + .mode = 0644, + }; + + r = json_dispatch(e, table, oci_unexpected, flags, node); + if (r < 0) + goto fail_element; + + if (S_ISCHR(node->mode) || S_ISBLK(node->mode)) { + _cleanup_free_ char *path = NULL; + + if (node->major == (unsigned) -1 || node->minor == (unsigned) -1) { + r = json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), + "Major/minor required when device node is device node"); + goto fail_element; + } + + /* Suppress a couple of implicit device nodes */ + r = device_path_make_canonical(node->mode, makedev(node->major, node->minor), &path); + if (r < 0) + json_log(e, flags|JSON_DEBUG, 0, "Failed to resolve device node %u:%u, ignoring: %m", node->major, node->minor); + else { + if (PATH_IN_SET(path, + "/dev/null", + "/dev/zero", + "/dev/full", + "/dev/random", + "/dev/urandom", + "/dev/tty", + "/dev/net/tun", + "/dev/ptmx", + "/dev/pts/ptmx", + "/dev/console")) { + + json_log(e, flags|JSON_DEBUG, 0, "Ignoring devices item for device '%s', as it is implicitly created anyway.", path); + free(node->path); + continue; + } + } + } + + s->n_extra_nodes++; + continue; + + fail_element: + free(node->path); + return r; + } + + return 0; +} + +static int oci_cgroups_path(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + _cleanup_free_ char *slice = NULL, *backwards = NULL; + Settings *s = userdata; + const char *p; + int r; + + assert(s); + + assert_se(p = json_variant_string(v)); + + r = cg_path_get_slice(p, &slice); + if (r < 0) + return json_log(v, flags, r, "Couldn't derive slice unit name from path '%s': %m", p); + + r = cg_slice_to_path(slice, &backwards); + if (r < 0) + return json_log(v, flags, r, "Couldn't convert slice unit name '%s' back to path: %m", slice); + + if (!path_equal(backwards, p)) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "Control group path '%s' does not refer to slice unit, refusing.", p); + + free_and_replace(s->slice, slice); + return 0; +} + +static int oci_cgroup_device_type(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + mode_t *mode = userdata; + const char *n; + + assert_se(n = json_variant_string(v)); + + if (streq(n, "c")) + *mode = S_IFCHR; + else if (streq(n, "b")) + *mode = S_IFBLK; + else + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "Control group device type unknown: %s", n); + + return 0; +} + +struct device_data { + bool allow; + bool r; + bool w; + bool m; + mode_t type; + unsigned major; + unsigned minor; +}; + +static int oci_cgroup_device_access(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + struct device_data *d = userdata; + bool r = false, w = false, m = false; + const char *s; + size_t i; + + assert_se(s = json_variant_string(v)); + + for (i = 0; s[i]; i++) + if (s[i] == 'r') + r = true; + else if (s[i] == 'w') + w = true; + else if (s[i] == 'm') + m = true; + else + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "Unknown device access character '%c'.", s[i]); + + d->r = r; + d->w = w; + d->m = m; + + return 0; +} + +static int oci_cgroup_devices(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + + _cleanup_free_ struct device_data *list = NULL; + Settings *s = userdata; + size_t n_list = 0, i; + bool noop = false; + JsonVariant *e; + int r; + + assert(s); + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + + struct device_data data = { + .major = (unsigned) -1, + .minor = (unsigned) -1, + }, *a; + + static const JsonDispatch table[] = { + { "allow", JSON_VARIANT_BOOLEAN, json_dispatch_boolean, offsetof(struct device_data, allow), JSON_MANDATORY }, + { "type", JSON_VARIANT_STRING, oci_cgroup_device_type, offsetof(struct device_data, type), 0 }, + { "major", JSON_VARIANT_UNSIGNED, oci_device_major, offsetof(struct device_data, major), 0 }, + { "minor", JSON_VARIANT_UNSIGNED, oci_device_minor, offsetof(struct device_data, minor), 0 }, + { "access", JSON_VARIANT_STRING, oci_cgroup_device_access, 0, 0 }, + {} + }; + + r = json_dispatch(e, table, oci_unexpected, flags, &data); + if (r < 0) + return r; + + if (!data.allow) { + /* The fact that OCI allows 'deny' entries makes really no sense, as 'allow' + * vs. 'deny' for the devices cgroup controller is really not about allow-listing and + * deny-listing but about adding and removing entries from the allow list. Since we + * always start out with an empty allow list we hence ignore the whole thing, as + * removing entries which don't exist make no sense. We'll log about this, since this + * is really borked in the spec, with one exception: the entry that's supposed to + * drop the kernel's default we ignore silently */ + + if (!data.r || !data.w || !data.m || data.type != 0 || data.major != (unsigned) -1 || data.minor != (unsigned) -1) + json_log(v, flags|JSON_WARNING, 0, "Devices cgroup allow list with arbitrary 'allow' entries not supported, ignoring."); + + /* We ignore the 'deny' entry as for us that's implied */ + continue; + } + + if (!data.r && !data.w && !data.m) { + json_log(v, flags|LOG_WARNING, 0, "Device cgroup allow list entry with no effect found, ignoring."); + continue; + } + + if (data.minor != (unsigned) -1 && data.major == (unsigned) -1) + return json_log(v, flags, SYNTHETIC_ERRNO(EOPNOTSUPP), + "Device cgroup allow list entries with minors but no majors not supported."); + + if (data.major != (unsigned) -1 && data.type == 0) + return json_log(v, flags, SYNTHETIC_ERRNO(EOPNOTSUPP), + "Device cgroup allow list entries with majors but no device node type not supported."); + + if (data.type == 0) { + if (data.r && data.w && data.m) /* a catchall allow list entry means we are looking at a noop */ + noop = true; + else + return json_log(v, flags, SYNTHETIC_ERRNO(EOPNOTSUPP), + "Device cgroup allow list entries with no type not supported."); + } + + a = reallocarray(list, n_list + 1, sizeof(struct device_data)); + if (!a) + return log_oom(); + + list = a; + list[n_list++] = data; + } + + if (noop) + return 0; + + r = settings_allocate_properties(s); + if (r < 0) + return r; + + r = sd_bus_message_open_container(s->properties, 'r', "sv"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_append(s->properties, "s", "DeviceAllow"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(s->properties, 'v', "a(ss)"); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_open_container(s->properties, 'a', "(ss)"); + if (r < 0) + return bus_log_create_error(r); + + for (i = 0; i < n_list; i++) { + _cleanup_free_ char *pattern = NULL; + char access[4]; + size_t n = 0; + + if (list[i].minor == (unsigned) -1) { + const char *t; + + if (list[i].type == S_IFBLK) + t = "block"; + else { + assert(list[i].type == S_IFCHR); + t = "char"; + } + + if (list[i].major == (unsigned) -1) { + pattern = strjoin(t, "-*"); + if (!pattern) + return log_oom(); + } else { + if (asprintf(&pattern, "%s-%u", t, list[i].major) < 0) + return log_oom(); + } + + } else { + assert(list[i].major != (unsigned) -1); /* If a minor is specified, then a major also needs to be specified */ + + r = device_path_make_major_minor(list[i].type, makedev(list[i].major, list[i].minor), &pattern); + if (r < 0) + return log_oom(); + } + + if (list[i].r) + access[n++] = 'r'; + if (list[i].w) + access[n++] = 'w'; + if (list[i].m) + access[n++] = 'm'; + access[n] = 0; + + assert(n > 0); + + r = sd_bus_message_append(s->properties, "(ss)", pattern, access); + if (r < 0) + return bus_log_create_error(r); + } + + r = sd_bus_message_close_container(s->properties); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(s->properties); + if (r < 0) + return bus_log_create_error(r); + + r = sd_bus_message_close_container(s->properties); + if (r < 0) + return bus_log_create_error(r); + + return 0; +} + +static int oci_cgroup_memory_limit(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + uint64_t *m = userdata; + uintmax_t k; + + assert(m); + + if (json_variant_is_negative(v)) { + *m = UINT64_MAX; + return 0; + } + + if (!json_variant_is_unsigned(v)) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "Memory limit is not an unsigned integer"); + + k = json_variant_unsigned(v); + if (k >= UINT64_MAX) + return json_log(v, flags, SYNTHETIC_ERRNO(ERANGE), + "Memory limit too large: %ji", k); + + *m = (uint64_t) k; + return 0; +} + +static int oci_cgroup_memory(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + + struct memory_data { + uint64_t limit; + uint64_t reservation; + uint64_t swap; + } data = { + .limit = UINT64_MAX, + .reservation = UINT64_MAX, + .swap = UINT64_MAX, + }; + + static const JsonDispatch table[] = { + { "limit", JSON_VARIANT_NUMBER, oci_cgroup_memory_limit, offsetof(struct memory_data, limit), 0 }, + { "reservation", JSON_VARIANT_NUMBER, oci_cgroup_memory_limit, offsetof(struct memory_data, reservation), 0 }, + { "swap", JSON_VARIANT_NUMBER, oci_cgroup_memory_limit, offsetof(struct memory_data, swap), 0 }, + { "kernel", JSON_VARIANT_NUMBER, oci_unsupported, 0, JSON_PERMISSIVE }, + { "kernelTCP", JSON_VARIANT_NUMBER, oci_unsupported, 0, JSON_PERMISSIVE }, + { "swapiness", JSON_VARIANT_NUMBER, oci_unsupported, 0, JSON_PERMISSIVE }, + { "disableOOMKiller", JSON_VARIANT_NUMBER, oci_unsupported, 0, JSON_PERMISSIVE }, + {} + }; + + Settings *s = userdata; + int r; + + r = json_dispatch(v, table, oci_unexpected, flags, &data); + if (r < 0) + return r; + + if (data.swap != UINT64_MAX) { + if (data.limit == UINT64_MAX) + json_log(v, flags|LOG_WARNING, 0, "swap limit without memory limit is not supported, ignoring."); + else if (data.swap < data.limit) + json_log(v, flags|LOG_WARNING, 0, "swap limit is below memory limit, ignoring."); + else { + r = settings_allocate_properties(s); + if (r < 0) + return r; + + r = sd_bus_message_append(s->properties, "(sv)", "MemorySwapMax", "t", data.swap - data.limit); + if (r < 0) + return bus_log_create_error(r); + } + } + + if (data.limit != UINT64_MAX) { + r = settings_allocate_properties(s); + if (r < 0) + return r; + + r = sd_bus_message_append(s->properties, "(sv)", "MemoryMax", "t", data.limit); + if (r < 0) + return bus_log_create_error(r); + } + + if (data.reservation != UINT64_MAX) { + r = settings_allocate_properties(s); + if (r < 0) + return r; + + r = sd_bus_message_append(s->properties, "(sv)", "MemoryLow", "t", data.reservation); + if (r < 0) + return bus_log_create_error(r); + } + + return 0; +} + +struct cpu_data { + uint64_t shares; + uint64_t quota; + uint64_t period; + CPUSet cpu_set; +}; + +static int oci_cgroup_cpu_shares(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + uint64_t *u = userdata; + uintmax_t k; + + assert(u); + + k = json_variant_unsigned(v); + if (k < CGROUP_CPU_SHARES_MIN || k > CGROUP_CPU_SHARES_MAX) + return json_log(v, flags, SYNTHETIC_ERRNO(ERANGE), + "shares value out of range."); + + *u = (uint64_t) k; + return 0; +} + +static int oci_cgroup_cpu_quota(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + uint64_t *u = userdata; + uintmax_t k; + + assert(u); + + k = json_variant_unsigned(v); + if (k <= 0 || k >= UINT64_MAX) + return json_log(v, flags, SYNTHETIC_ERRNO(ERANGE), + "period/quota value out of range."); + + *u = (uint64_t) k; + return 0; +} + +static int oci_cgroup_cpu_cpus(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + struct cpu_data *data = userdata; + CPUSet set; + const char *n; + int r; + + assert(data); + + assert_se(n = json_variant_string(v)); + + r = parse_cpu_set(n, &set); + if (r < 0) + return json_log(v, flags, r, "Failed to parse CPU set specification: %s", n); + + cpu_set_reset(&data->cpu_set); + data->cpu_set = set; + + return 0; +} + +static int oci_cgroup_cpu(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + + static const JsonDispatch table[] = { + { "shares", JSON_VARIANT_UNSIGNED, oci_cgroup_cpu_shares, offsetof(struct cpu_data, shares), 0 }, + { "quota", JSON_VARIANT_UNSIGNED, oci_cgroup_cpu_quota, offsetof(struct cpu_data, quota), 0 }, + { "period", JSON_VARIANT_UNSIGNED, oci_cgroup_cpu_quota, offsetof(struct cpu_data, period), 0 }, + { "realtimeRuntime", JSON_VARIANT_UNSIGNED, oci_unsupported, 0, 0 }, + { "realtimePeriod", JSON_VARIANT_UNSIGNED, oci_unsupported, 0, 0 }, + { "cpus", JSON_VARIANT_STRING, oci_cgroup_cpu_cpus, 0, 0 }, + { "mems", JSON_VARIANT_STRING, oci_unsupported, 0, 0 }, + {} + }; + + struct cpu_data data = { + .shares = UINT64_MAX, + .quota = UINT64_MAX, + .period = UINT64_MAX, + }; + + Settings *s = userdata; + int r; + + r = json_dispatch(v, table, oci_unexpected, flags, &data); + if (r < 0) { + cpu_set_reset(&data.cpu_set); + return r; + } + + cpu_set_reset(&s->cpu_set); + s->cpu_set = data.cpu_set; + + if (data.shares != UINT64_MAX) { + r = settings_allocate_properties(s); + if (r < 0) + return r; + + r = sd_bus_message_append(s->properties, "(sv)", "CPUShares", "t", data.shares); + if (r < 0) + return bus_log_create_error(r); + } + + if (data.quota != UINT64_MAX && data.period != UINT64_MAX) { + r = settings_allocate_properties(s); + if (r < 0) + return r; + + r = sd_bus_message_append(s->properties, "(sv)", "CPUQuotaPerSecUSec", "t", (uint64_t) (data.quota * USEC_PER_SEC / data.period)); + if (r < 0) + return bus_log_create_error(r); + + } else if ((data.quota != UINT64_MAX) != (data.period != UINT64_MAX)) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "CPU quota and period not used together."); + + return 0; +} + +static int oci_cgroup_block_io_weight(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + Settings *s = userdata; + uintmax_t k; + int r; + + assert(s); + + k = json_variant_unsigned(v); + if (k < CGROUP_BLKIO_WEIGHT_MIN || k > CGROUP_BLKIO_WEIGHT_MAX) + return json_log(v, flags, SYNTHETIC_ERRNO(ERANGE), + "Block I/O weight out of range."); + + r = settings_allocate_properties(s); + if (r < 0) + return r; + + r = sd_bus_message_append(s->properties, "(sv)", "BlockIOWeight", "t", (uint64_t) k); + if (r < 0) + return bus_log_create_error(r); + + return 0; +} + +static int oci_cgroup_block_io_weight_device(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + Settings *s = userdata; + JsonVariant *e; + int r; + + assert(s); + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + struct device_data { + unsigned major; + unsigned minor; + uintmax_t weight; + } data = { + .major = (unsigned) -1, + .minor = (unsigned) -1, + .weight = UINTMAX_MAX, + }; + + static const JsonDispatch table[] = { + { "major", JSON_VARIANT_UNSIGNED, oci_device_major, offsetof(struct device_data, major), JSON_MANDATORY }, + { "minor", JSON_VARIANT_UNSIGNED, oci_device_minor, offsetof(struct device_data, minor), JSON_MANDATORY }, + { "weight", JSON_VARIANT_UNSIGNED, json_dispatch_unsigned, offsetof(struct device_data, weight), 0 }, + { "leafWeight", JSON_VARIANT_INTEGER, oci_unsupported, 0, JSON_PERMISSIVE }, + {} + }; + + _cleanup_free_ char *path = NULL; + + r = json_dispatch(e, table, oci_unexpected, flags, &data); + if (r < 0) + return r; + + if (data.weight == UINTMAX_MAX) + continue; + + if (data.weight < CGROUP_BLKIO_WEIGHT_MIN || data.weight > CGROUP_BLKIO_WEIGHT_MAX) + return json_log(v, flags, SYNTHETIC_ERRNO(ERANGE), + "Block I/O device weight out of range."); + + r = device_path_make_major_minor(S_IFBLK, makedev(data.major, data.minor), &path); + if (r < 0) + return json_log(v, flags, r, "Failed to build device path: %m"); + + r = settings_allocate_properties(s); + if (r < 0) + return r; + + r = sd_bus_message_append(s->properties, "(sv)", "BlockIODeviceWeight", "a(st)", 1, path, (uint64_t) data.weight); + if (r < 0) + return bus_log_create_error(r); + } + + return 0; +} + +static int oci_cgroup_block_io_throttle(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + Settings *s = userdata; + const char *pname; + JsonVariant *e; + int r; + + assert(s); + + pname = streq(name, "throttleReadBpsDevice") ? "IOReadBandwidthMax" : + streq(name, "throttleWriteBpsDevice") ? "IOWriteBandwidthMax" : + streq(name, "throttleReadIOPSDevice") ? "IOReadIOPSMax" : + "IOWriteIOPSMax"; + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + struct device_data { + unsigned major; + unsigned minor; + uintmax_t rate; + } data = { + .major = (unsigned) -1, + .minor = (unsigned) -1, + }; + + static const JsonDispatch table[] = { + { "major", JSON_VARIANT_UNSIGNED, oci_device_major, offsetof(struct device_data, major), JSON_MANDATORY }, + { "minor", JSON_VARIANT_UNSIGNED, oci_device_minor, offsetof(struct device_data, minor), JSON_MANDATORY }, + { "rate", JSON_VARIANT_UNSIGNED, json_dispatch_unsigned, offsetof(struct device_data, rate), JSON_MANDATORY }, + {} + }; + + _cleanup_free_ char *path = NULL; + + r = json_dispatch(e, table, oci_unexpected, flags, &data); + if (r < 0) + return r; + + if (data.rate >= UINT64_MAX) + return json_log(v, flags, SYNTHETIC_ERRNO(ERANGE), + "Block I/O device rate out of range."); + + r = device_path_make_major_minor(S_IFBLK, makedev(data.major, data.minor), &path); + if (r < 0) + return json_log(v, flags, r, "Failed to build device path: %m"); + + r = settings_allocate_properties(s); + if (r < 0) + return r; + + r = sd_bus_message_append(s->properties, "(sv)", pname, "a(st)", 1, path, (uint64_t) data.rate); + if (r < 0) + return bus_log_create_error(r); + } + + return 0; +} + +static int oci_cgroup_block_io(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + + static const JsonDispatch table[] = { + { "weight", JSON_VARIANT_UNSIGNED, oci_cgroup_block_io_weight, 0, 0 }, + { "leafWeight", JSON_VARIANT_UNSIGNED, oci_unsupported, 0, JSON_PERMISSIVE }, + { "weightDevice", JSON_VARIANT_ARRAY, oci_cgroup_block_io_weight_device, 0, 0 }, + { "throttleReadBpsDevice", JSON_VARIANT_ARRAY, oci_cgroup_block_io_throttle, 0, 0 }, + { "throttleWriteBpsDevice", JSON_VARIANT_ARRAY, oci_cgroup_block_io_throttle, 0, 0 }, + { "throttleReadIOPSDevice", JSON_VARIANT_ARRAY, oci_cgroup_block_io_throttle, 0, 0 }, + { "throttleWriteIOPSDevice", JSON_VARIANT_ARRAY, oci_cgroup_block_io_throttle, 0, 0 }, + {} + }; + + return json_dispatch(v, table, oci_unexpected, flags, userdata); +} + +static int oci_cgroup_pids(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + + static const JsonDispatch table[] = { + { "limit", JSON_VARIANT_NUMBER, json_dispatch_variant, 0, JSON_MANDATORY }, + {} + }; + + _cleanup_(json_variant_unrefp) JsonVariant *k = NULL; + Settings *s = userdata; + uint64_t m; + int r; + + assert(s); + + r = json_dispatch(v, table, oci_unexpected, flags, &k); + if (r < 0) + return r; + + if (json_variant_is_negative(k)) + m = UINT64_MAX; + else { + if (!json_variant_is_unsigned(k)) + return json_log(k, flags, SYNTHETIC_ERRNO(EINVAL), + "pids limit not unsigned integer, refusing."); + + m = (uint64_t) json_variant_unsigned(k); + + if ((uintmax_t) m != json_variant_unsigned(k)) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "pids limit out of range, refusing."); + } + + r = settings_allocate_properties(s); + if (r < 0) + return r; + + r = sd_bus_message_append(s->properties, "(sv)", "TasksMax", "t", m); + if (r < 0) + return bus_log_create_error(r); + + return 0; +} + +static int oci_resources(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + + static const JsonDispatch table[] = { + { "devices", JSON_VARIANT_ARRAY, oci_cgroup_devices, 0, 0 }, + { "memory", JSON_VARIANT_OBJECT, oci_cgroup_memory, 0, 0 }, + { "cpu", JSON_VARIANT_OBJECT, oci_cgroup_cpu, 0, 0 }, + { "blockIO", JSON_VARIANT_OBJECT, oci_cgroup_block_io, 0, 0 }, + { "hugepageLimits", JSON_VARIANT_ARRAY, oci_unsupported, 0, 0 }, + { "network", JSON_VARIANT_OBJECT, oci_unsupported, 0, 0 }, + { "pids", JSON_VARIANT_OBJECT, oci_cgroup_pids, 0, 0 }, + { "rdma", JSON_VARIANT_OBJECT, oci_unsupported, 0, 0 }, + {} + }; + + return json_dispatch(v, table, oci_unexpected, flags, userdata); +} + +static bool sysctl_key_valid(const char *s) { + bool dot = true; + + /* Note that we are a bit stricter here than in systemd-sysctl, as that inherited semantics from the old sysctl + * tool, which were really weird (as it swaps / and . in both ways) */ + + if (isempty(s)) + return false; + + for (; *s; s++) { + + if (*s <= ' ' || *s >= 127) + return false; + if (*s == '/') + return false; + if (*s == '.') { + + if (dot) /* Don't allow two dots next to each other (or at the beginning) */ + return false; + + dot = true; + } else + dot = false; + } + + if (dot) /* don't allow a dot at the end */ + return false; + + return true; +} + +static int oci_sysctl(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + Settings *s = userdata; + JsonVariant *w; + const char *k; + int r; + + assert(s); + + JSON_VARIANT_OBJECT_FOREACH(k, w, v) { + const char *m; + + if (!json_variant_is_string(w)) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "sysctl parameter is not a string, refusing."); + + assert_se(m = json_variant_string(w)); + + if (sysctl_key_valid(k)) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "sysctl key invalid, refusing: %s", k); + + r = strv_extend_strv(&s->sysctl, STRV_MAKE(k, m), false); + if (r < 0) + return log_oom(); + } + + return 0; +} + +#if HAVE_SECCOMP +static int oci_seccomp_action_from_string(const char *name, uint32_t *ret) { + + static const struct { + const char *name; + uint32_t action; + } table[] = { + { "SCMP_ACT_ALLOW", SCMP_ACT_ALLOW }, + { "SCMP_ACT_ERRNO", SCMP_ACT_ERRNO(EPERM) }, /* the OCI spec doesn't document the error, but it appears EPERM is supposed to be used */ + { "SCMP_ACT_KILL", SCMP_ACT_KILL }, +#ifdef SCMP_ACT_KILL_PROCESS + { "SCMP_ACT_KILL_PROCESS", SCMP_ACT_KILL_PROCESS }, +#endif +#ifdef SCMP_ACT_KILL_THREAD + { "SCMP_ACT_KILL_THREAD", SCMP_ACT_KILL_THREAD }, +#endif +#ifdef SCMP_ACT_LOG + { "SCMP_ACT_LOG", SCMP_ACT_LOG }, +#endif + { "SCMP_ACT_TRAP", SCMP_ACT_TRAP }, + + /* We don't support SCMP_ACT_TRACE because that requires a tracer, and that doesn't really make sense + * here */ + }; + + size_t i; + + for (i = 0; i < ELEMENTSOF(table); i++) + if (streq_ptr(name, table[i].name)) { + *ret = table[i].action; + return 0; + } + + return -EINVAL; +} + +static int oci_seccomp_arch_from_string(const char *name, uint32_t *ret) { + + static const struct { + const char *name; + uint32_t arch; + } table[] = { + { "SCMP_ARCH_AARCH64", SCMP_ARCH_AARCH64 }, + { "SCMP_ARCH_ARM", SCMP_ARCH_ARM }, + { "SCMP_ARCH_MIPS", SCMP_ARCH_MIPS }, + { "SCMP_ARCH_MIPS64", SCMP_ARCH_MIPS64 }, + { "SCMP_ARCH_MIPS64N32", SCMP_ARCH_MIPS64N32 }, + { "SCMP_ARCH_MIPSEL", SCMP_ARCH_MIPSEL }, + { "SCMP_ARCH_MIPSEL64", SCMP_ARCH_MIPSEL64 }, + { "SCMP_ARCH_MIPSEL64N32", SCMP_ARCH_MIPSEL64N32 }, + { "SCMP_ARCH_NATIVE", SCMP_ARCH_NATIVE }, +#ifdef SCMP_ARCH_PARISC + { "SCMP_ARCH_PARISC", SCMP_ARCH_PARISC }, +#endif +#ifdef SCMP_ARCH_PARISC64 + { "SCMP_ARCH_PARISC64", SCMP_ARCH_PARISC64 }, +#endif + { "SCMP_ARCH_PPC", SCMP_ARCH_PPC }, + { "SCMP_ARCH_PPC64", SCMP_ARCH_PPC64 }, + { "SCMP_ARCH_PPC64LE", SCMP_ARCH_PPC64LE }, +#ifdef SCMP_ARCH_RISCV64 + { "SCMP_ARCH_RISCV64", SCMP_ARCH_RISCV64 }, +#endif + { "SCMP_ARCH_S390", SCMP_ARCH_S390 }, + { "SCMP_ARCH_S390X", SCMP_ARCH_S390X }, + { "SCMP_ARCH_X32", SCMP_ARCH_X32 }, + { "SCMP_ARCH_X86", SCMP_ARCH_X86 }, + { "SCMP_ARCH_X86_64", SCMP_ARCH_X86_64 }, + }; + + size_t i; + + for (i = 0; i < ELEMENTSOF(table); i++) + if (streq_ptr(table[i].name, name)) { + *ret = table[i].arch; + return 0; + } + + return -EINVAL; +} + +static int oci_seccomp_compare_from_string(const char *name, enum scmp_compare *ret) { + + static const struct { + const char *name; + enum scmp_compare op; + } table[] = { + { "SCMP_CMP_NE", SCMP_CMP_NE }, + { "SCMP_CMP_LT", SCMP_CMP_LT }, + { "SCMP_CMP_LE", SCMP_CMP_LE }, + { "SCMP_CMP_EQ", SCMP_CMP_EQ }, + { "SCMP_CMP_GE", SCMP_CMP_GE }, + { "SCMP_CMP_GT", SCMP_CMP_GT }, + { "SCMP_CMP_MASKED_EQ", SCMP_CMP_MASKED_EQ }, + }; + + size_t i; + + for (i = 0; i < ELEMENTSOF(table); i++) + if (streq_ptr(table[i].name, name)) { + *ret = table[i].op; + return 0; + } + + return -EINVAL; +} + +static int oci_seccomp_archs(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + scmp_filter_ctx *sc = userdata; + JsonVariant *e; + int r; + + assert(sc); + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + uint32_t a; + + if (!json_variant_is_string(e)) + return json_log(e, flags, SYNTHETIC_ERRNO(EINVAL), + "Architecture entry is not a string"); + + r = oci_seccomp_arch_from_string(json_variant_string(e), &a); + if (r < 0) + return json_log(e, flags, r, "Unknown architecture: %s", json_variant_string(e)); + + r = seccomp_arch_add(sc, a); + if (r == -EEXIST) + continue; + if (r < 0) + return json_log(e, flags, r, "Failed to add architecture to seccomp filter: %m"); + } + + return 0; +} + +struct syscall_rule { + char **names; + uint32_t action; + struct scmp_arg_cmp *arguments; + size_t n_arguments; +}; + +static void syscall_rule_free(struct syscall_rule *rule) { + assert(rule); + + strv_free(rule->names); + free(rule->arguments); +}; + +static int oci_seccomp_action(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + uint32_t *action = userdata; + int r; + + assert(action); + + r = oci_seccomp_action_from_string(json_variant_string(v), action); + if (r < 0) + return json_log(v, flags, r, "Unknown system call action '%s': %m", json_variant_string(v)); + + return 0; +} + +static int oci_seccomp_op(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + enum scmp_compare *op = userdata; + int r; + + assert(op); + + r = oci_seccomp_compare_from_string(json_variant_string(v), op); + if (r < 0) + return json_log(v, flags, r, "Unknown seccomp operator '%s': %m", json_variant_string(v)); + + return 0; +} + +static int oci_seccomp_args(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + struct syscall_rule *rule = userdata; + JsonVariant *e; + int r; + + assert(rule); + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + static const struct JsonDispatch table[] = { + { "index", JSON_VARIANT_UNSIGNED, json_dispatch_uint32, offsetof(struct scmp_arg_cmp, arg), JSON_MANDATORY }, + { "value", JSON_VARIANT_UNSIGNED, json_dispatch_uint64, offsetof(struct scmp_arg_cmp, datum_a), JSON_MANDATORY }, + { "valueTwo", JSON_VARIANT_UNSIGNED, json_dispatch_uint64, offsetof(struct scmp_arg_cmp, datum_b), 0 }, + { "op", JSON_VARIANT_STRING, oci_seccomp_op, offsetof(struct scmp_arg_cmp, op), JSON_MANDATORY }, + {}, + }; + + struct scmp_arg_cmp *a, *p; + int expected; + + a = reallocarray(rule->arguments, rule->n_arguments + 1, sizeof(struct syscall_rule)); + if (!a) + return log_oom(); + + rule->arguments = a; + p = rule->arguments + rule->n_arguments; + + *p = (struct scmp_arg_cmp) { + .arg = 0, + .datum_a = 0, + .datum_b = 0, + .op = 0, + }; + + r = json_dispatch(e, table, oci_unexpected, flags, p); + if (r < 0) + return r; + + expected = p->op == SCMP_CMP_MASKED_EQ ? 4 : 3; + if (r != expected) + json_log(e, flags|JSON_WARNING, 0, "Wrong number of system call arguments for JSON data data, ignoring."); + + /* Note that we are a bit sloppy here and do not insist that SCMP_CMP_MASKED_EQ gets two datum values, + * and the other only one. That's because buildah for example by default calls things with + * SCMP_CMP_MASKED_EQ but only one argument. We use 0 when the value is not specified. */ + + rule->n_arguments++; + } + + return 0; +} + +static int oci_seccomp_syscalls(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + scmp_filter_ctx *sc = userdata; + JsonVariant *e; + int r; + + assert(sc); + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + static const JsonDispatch table[] = { + { "names", JSON_VARIANT_ARRAY, json_dispatch_strv, offsetof(struct syscall_rule, names), JSON_MANDATORY }, + { "action", JSON_VARIANT_STRING, oci_seccomp_action, offsetof(struct syscall_rule, action), JSON_MANDATORY }, + { "args", JSON_VARIANT_ARRAY, oci_seccomp_args, 0, 0 }, + }; + struct syscall_rule rule = { + .action = (uint32_t) -1, + }; + char **i; + + r = json_dispatch(e, table, oci_unexpected, flags, &rule); + if (r < 0) + goto fail_rule; + + if (strv_isempty(rule.names)) { + json_log(e, flags, 0, "System call name list is empty."); + r = -EINVAL; + goto fail_rule; + } + + STRV_FOREACH(i, rule.names) { + int nr; + + nr = seccomp_syscall_resolve_name(*i); + if (nr == __NR_SCMP_ERROR) { + log_debug("Unknown syscall %s, skipping.", *i); + continue; + } + + r = seccomp_rule_add_array(sc, rule.action, nr, rule.n_arguments, rule.arguments); + if (r < 0) + goto fail_rule; + } + + syscall_rule_free(&rule); + continue; + + fail_rule: + syscall_rule_free(&rule); + return r; + } + + return 0; +} +#endif + +static int oci_seccomp(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + +#if HAVE_SECCOMP + static const JsonDispatch table[] = { + { "defaultAction", JSON_VARIANT_STRING, NULL, 0, JSON_MANDATORY }, + { "architectures", JSON_VARIANT_ARRAY, oci_seccomp_archs, 0, 0 }, + { "syscalls", JSON_VARIANT_ARRAY, oci_seccomp_syscalls, 0, 0 }, + {} + }; + + _cleanup_(seccomp_releasep) scmp_filter_ctx sc = NULL; + Settings *s = userdata; + JsonVariant *def; + uint32_t d; + int r; + + assert(s); + + def = json_variant_by_key(v, "defaultAction"); + if (!def) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), "defaultAction element missing."); + + if (!json_variant_is_string(def)) + return json_log(def, flags, SYNTHETIC_ERRNO(EINVAL), "defaultAction is not a string."); + + r = oci_seccomp_action_from_string(json_variant_string(def), &d); + if (r < 0) + return json_log(def, flags, r, "Unknown default action: %s", json_variant_string(def)); + + sc = seccomp_init(d); + if (!sc) + return json_log(v, flags, SYNTHETIC_ERRNO(ENOMEM), "Couldn't allocate seccomp object."); + + r = json_dispatch(v, table, oci_unexpected, flags, sc); + if (r < 0) + return r; + + seccomp_release(s->seccomp); + s->seccomp = TAKE_PTR(sc); + return 0; +#else + return json_log(v, flags, SYNTHETIC_ERRNO(EOPNOTSUPP), "libseccomp support not enabled, can't parse seccomp object."); +#endif +} + +static int oci_rootfs_propagation(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + const char *s; + + s = json_variant_string(v); + + if (streq(s, "shared")) + return 0; + + json_log(v, flags|JSON_DEBUG, 0, "Ignoring rootfsPropagation setting '%s'.", s); + return 0; +} + +static int oci_masked_paths(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + Settings *s = userdata; + JsonVariant *e; + + assert(s); + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + _cleanup_free_ char *destination = NULL; + CustomMount *m; + const char *p; + + if (!json_variant_is_string(e)) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "Path is not a string, refusing."); + + assert_se(p = json_variant_string(e)); + + if (!path_is_absolute(p)) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "Path is not not absolute, refusing: %s", p); + + if (oci_exclude_mount(p)) + continue; + + destination = strdup(p); + if (!destination) + return log_oom(); + + m = custom_mount_add(&s->custom_mounts, &s->n_custom_mounts, CUSTOM_MOUNT_INACCESSIBLE); + if (!m) + return log_oom(); + + m->destination = TAKE_PTR(destination); + + /* The spec doesn't say this, but apparently pre-existing implementations are lenient towards + * non-existing paths to mask. Let's hence be too. */ + m->graceful = true; + } + + return 0; +} + +static int oci_readonly_paths(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + Settings *s = userdata; + JsonVariant *e; + + assert(s); + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + _cleanup_free_ char *source = NULL, *destination = NULL; + CustomMount *m; + const char *p; + + if (!json_variant_is_string(e)) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "Path is not a string, refusing."); + + assert_se(p = json_variant_string(e)); + + if (!path_is_absolute(p)) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "Path is not not absolute, refusing: %s", p); + + if (oci_exclude_mount(p)) + continue; + + source = strjoin("+", p); + if (!source) + return log_oom(); + + destination = strdup(p); + if (!destination) + return log_oom(); + + m = custom_mount_add(&s->custom_mounts, &s->n_custom_mounts, CUSTOM_MOUNT_BIND); + if (!m) + return log_oom(); + + m->source = TAKE_PTR(source); + m->destination = TAKE_PTR(destination); + m->read_only = true; + } + + return 0; +} + +static int oci_linux(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + + static const JsonDispatch table[] = { + { "namespaces", JSON_VARIANT_ARRAY, oci_namespaces, 0, 0 }, + { "uidMappings", JSON_VARIANT_ARRAY, oci_uid_gid_mappings, 0, 0 }, + { "gidMappings", JSON_VARIANT_ARRAY, oci_uid_gid_mappings, 0, 0 }, + { "devices", JSON_VARIANT_ARRAY, oci_devices, 0, 0 }, + { "cgroupsPath", JSON_VARIANT_STRING, oci_cgroups_path, 0, 0 }, + { "resources", JSON_VARIANT_OBJECT, oci_resources, 0, 0 }, + { "intelRdt", JSON_VARIANT_OBJECT, oci_unsupported, 0, JSON_PERMISSIVE }, + { "sysctl", JSON_VARIANT_OBJECT, oci_sysctl, 0, 0 }, + { "seccomp", JSON_VARIANT_OBJECT, oci_seccomp, 0, 0 }, + { "rootfsPropagation", JSON_VARIANT_STRING, oci_rootfs_propagation, 0, 0 }, + { "maskedPaths", JSON_VARIANT_ARRAY, oci_masked_paths, 0, 0 }, + { "readonlyPaths", JSON_VARIANT_ARRAY, oci_readonly_paths, 0, 0 }, + { "mountLabel", JSON_VARIANT_STRING, oci_unsupported, 0, JSON_PERMISSIVE }, + {} + }; + + return json_dispatch(v, table, oci_unexpected, flags, userdata); +} + +static int oci_hook_timeout(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + usec_t *u = userdata; + uintmax_t k; + + k = json_variant_unsigned(v); + if (k == 0 || k > (UINT64_MAX-1)/USEC_PER_SEC) + return json_log(v, flags, SYNTHETIC_ERRNO(ERANGE), + "Hook timeout value out of range."); + + *u = k * USEC_PER_SEC; + return 0; +} + +static int oci_hooks_array(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + Settings *s = userdata; + JsonVariant *e; + int r; + + assert(s); + + JSON_VARIANT_ARRAY_FOREACH(e, v) { + + static const JsonDispatch table[] = { + { "path", JSON_VARIANT_STRING, oci_absolute_path, offsetof(OciHook, path), JSON_MANDATORY }, + { "args", JSON_VARIANT_ARRAY, oci_args, offsetof(OciHook, args), 0 }, + { "env", JSON_VARIANT_ARRAY, oci_env, offsetof(OciHook, env), 0 }, + { "timeout", JSON_VARIANT_UNSIGNED, oci_hook_timeout, offsetof(OciHook, timeout), 0 }, + {} + }; + + OciHook *a, **array, *new_item; + size_t *n_array; + + if (streq(name, "prestart")) { + array = &s->oci_hooks_prestart; + n_array = &s->n_oci_hooks_prestart; + } else if (streq(name, "poststart")) { + array = &s->oci_hooks_poststart; + n_array = &s->n_oci_hooks_poststart; + } else { + assert(streq(name, "poststop")); + array = &s->oci_hooks_poststop; + n_array = &s->n_oci_hooks_poststop; + } + + a = reallocarray(*array, *n_array + 1, sizeof(OciHook)); + if (!a) + return log_oom(); + + *array = a; + new_item = a + *n_array; + + *new_item = (OciHook) { + .timeout = USEC_INFINITY, + }; + + r = json_dispatch(e, table, oci_unexpected, flags, userdata); + if (r < 0) { + free(new_item->path); + strv_free(new_item->args); + strv_free(new_item->env); + return r; + } + + (*n_array) ++; + } + + return 0; +} + +static int oci_hooks(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + + static const JsonDispatch table[] = { + { "prestart", JSON_VARIANT_OBJECT, oci_hooks_array, 0, 0 }, + { "poststart", JSON_VARIANT_OBJECT, oci_hooks_array, 0, 0 }, + { "poststop", JSON_VARIANT_OBJECT, oci_hooks_array, 0, 0 }, + {} + }; + + return json_dispatch(v, table, oci_unexpected, flags, userdata); +} + +static int oci_annotations(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) { + JsonVariant *w; + const char *k; + + JSON_VARIANT_OBJECT_FOREACH(k, w, v) { + + if (isempty(k)) + return json_log(v, flags, SYNTHETIC_ERRNO(EINVAL), + "Annotation with empty key, refusing."); + + if (!json_variant_is_string(w)) + return json_log(w, flags, SYNTHETIC_ERRNO(EINVAL), + "Annotation has non-string value, refusing."); + + json_log(w, flags|JSON_DEBUG, 0, "Ignoring annotation '%s' with value '%s'.", k, json_variant_string(w)); + } + + return 0; +} + +int oci_load(FILE *f, const char *bundle, Settings **ret) { + + static const JsonDispatch table[] = { + { "ociVersion", JSON_VARIANT_STRING, NULL, 0, JSON_MANDATORY }, + { "process", JSON_VARIANT_OBJECT, oci_process, 0, 0 }, + { "root", JSON_VARIANT_OBJECT, oci_root, 0, 0 }, + { "hostname", JSON_VARIANT_STRING, oci_hostname, 0, 0 }, + { "mounts", JSON_VARIANT_ARRAY, oci_mounts, 0, 0 }, + { "linux", JSON_VARIANT_OBJECT, oci_linux, 0, 0 }, + { "hooks", JSON_VARIANT_OBJECT, oci_hooks, 0, 0 }, + { "annotations", JSON_VARIANT_OBJECT, oci_annotations, 0, 0 }, + {} + }; + + _cleanup_(json_variant_unrefp) JsonVariant *oci = NULL; + _cleanup_(settings_freep) Settings *s = NULL; + unsigned line = 0, column = 0; + JsonVariant *v; + const char *path; + int r; + + assert_se(bundle); + + path = strjoina(bundle, "/config.json"); + + r = json_parse_file(f, path, 0, &oci, &line, &column); + if (r < 0) { + if (line != 0 && column != 0) + return log_error_errno(r, "Failed to parse '%s' at %u:%u: %m", path, line, column); + else + return log_error_errno(r, "Failed to parse '%s': %m", path); + } + + v = json_variant_by_key(oci, "ociVersion"); + if (!v) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "JSON file '%s' is not an OCI bundle configuration file. Refusing.", + path); + if (!streq_ptr(json_variant_string(v), "1.0.0")) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), + "OCI bundle version not supported: %s", + strna(json_variant_string(v))); + + // { + // _cleanup_free_ char *formatted = NULL; + // assert_se(json_variant_format(oci, JSON_FORMAT_PRETTY|JSON_FORMAT_COLOR, &formatted) >= 0); + // fputs(formatted, stdout); + // } + + s = settings_new(); + if (!s) + return log_oom(); + + s->start_mode = START_PID1; + s->resolv_conf = RESOLV_CONF_OFF; + s->link_journal = LINK_NO; + s->timezone = TIMEZONE_OFF; + + s->bundle = strdup(bundle); + if (!s->bundle) + return log_oom(); + + r = json_dispatch(oci, table, oci_unexpected, 0, s); + if (r < 0) + return r; + + if (s->properties) { + r = sd_bus_message_seal(s->properties, 0, 0); + if (r < 0) + return log_error_errno(r, "Cannot seal properties bus message: %m"); + } + + *ret = TAKE_PTR(s); + return 0; +} |