diff options
Diffstat (limited to 'src/basic/virt.c')
-rw-r--r-- | src/basic/virt.c | 1071 |
1 files changed, 1071 insertions, 0 deletions
diff --git a/src/basic/virt.c b/src/basic/virt.c new file mode 100644 index 0000000..93ccfaa --- /dev/null +++ b/src/basic/virt.c @@ -0,0 +1,1071 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#if defined(__i386__) || defined(__x86_64__) +#include <cpuid.h> +#endif +#include <errno.h> +#include <stdint.h> +#include <stdlib.h> +#include <unistd.h> + +#include "alloc-util.h" +#include "cgroup-util.h" +#include "dirent-util.h" +#include "env-util.h" +#include "errno-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "macro.h" +#include "missing_threads.h" +#include "process-util.h" +#include "stat-util.h" +#include "string-table.h" +#include "string-util.h" +#include "virt.h" + +enum { + SMBIOS_VM_BIT_SET, + SMBIOS_VM_BIT_UNSET, + SMBIOS_VM_BIT_UNKNOWN, +}; + +static Virtualization detect_vm_cpuid(void) { + + /* CPUID is an x86 specific interface. */ +#if defined(__i386__) || defined(__x86_64__) + + static const struct { + const char sig[13]; + Virtualization id; + } vm_table[] = { + { "XenVMMXenVMM", VIRTUALIZATION_XEN }, + { "KVMKVMKVM", VIRTUALIZATION_KVM }, /* qemu with KVM */ + { "Linux KVM Hv", VIRTUALIZATION_KVM }, /* qemu with KVM + HyperV Enlightenments */ + { "TCGTCGTCGTCG", VIRTUALIZATION_QEMU }, /* qemu without KVM */ + /* http://kb.vmware.com/selfservice/microsites/search.do?language=en_US&cmd=displayKC&externalId=1009458 */ + { "VMwareVMware", VIRTUALIZATION_VMWARE }, + /* https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/reference/tlfs */ + { "Microsoft Hv", VIRTUALIZATION_MICROSOFT }, + /* https://wiki.freebsd.org/bhyve */ + { "bhyve bhyve ", VIRTUALIZATION_BHYVE }, + { "QNXQVMBSQG", VIRTUALIZATION_QNX }, + /* https://projectacrn.org */ + { "ACRNACRNACRN", VIRTUALIZATION_ACRN }, + /* https://www.lockheedmartin.com/en-us/products/Hardened-Security-for-Intel-Processors.html */ + { "SRESRESRESRE", VIRTUALIZATION_SRE }, + { "Apple VZ", VIRTUALIZATION_APPLE }, + }; + + uint32_t eax, ebx, ecx, edx; + bool hypervisor; + + /* http://lwn.net/Articles/301888/ */ + + /* First detect whether there is a hypervisor */ + if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) == 0) + return VIRTUALIZATION_NONE; + + hypervisor = ecx & 0x80000000U; + + if (hypervisor) { + union { + uint32_t sig32[3]; + char text[13]; + } sig = {}; + + /* There is a hypervisor, see what it is */ + __cpuid(0x40000000U, eax, ebx, ecx, edx); + + sig.sig32[0] = ebx; + sig.sig32[1] = ecx; + sig.sig32[2] = edx; + + log_debug("Virtualization found, CPUID=%s", sig.text); + + for (size_t i = 0; i < ELEMENTSOF(vm_table); i++) + if (memcmp_nn(sig.text, sizeof(sig.text), + vm_table[i].sig, sizeof(vm_table[i].sig)) == 0) + return vm_table[i].id; + + log_debug("Unknown virtualization with CPUID=%s. Add to vm_table[]?", sig.text); + return VIRTUALIZATION_VM_OTHER; + } +#endif + log_debug("No virtualization found in CPUID"); + + return VIRTUALIZATION_NONE; +} + +static Virtualization detect_vm_device_tree(void) { +#if defined(__arm__) || defined(__aarch64__) || defined(__powerpc__) || defined(__powerpc64__) || defined(__riscv) + _cleanup_free_ char *hvtype = NULL; + int r; + + r = read_one_line_file("/proc/device-tree/hypervisor/compatible", &hvtype); + if (r == -ENOENT) { + _cleanup_closedir_ DIR *dir = NULL; + _cleanup_free_ char *compat = NULL; + + if (access("/proc/device-tree/ibm,partition-name", F_OK) == 0 && + access("/proc/device-tree/hmc-managed?", F_OK) == 0 && + access("/proc/device-tree/chosen/qemu,graphic-width", F_OK) != 0) + return VIRTUALIZATION_POWERVM; + + dir = opendir("/proc/device-tree"); + if (!dir) { + if (errno == ENOENT) { + log_debug_errno(errno, "/proc/device-tree: %m"); + return VIRTUALIZATION_NONE; + } + return -errno; + } + + FOREACH_DIRENT(de, dir, return -errno) + if (strstr(de->d_name, "fw-cfg")) { + log_debug("Virtualization QEMU: \"fw-cfg\" present in /proc/device-tree/%s", de->d_name); + return VIRTUALIZATION_QEMU; + } + + r = read_one_line_file("/proc/device-tree/compatible", &compat); + if (r < 0 && r != -ENOENT) + return r; + if (r >= 0 && streq(compat, "qemu,pseries")) { + log_debug("Virtualization %s found in /proc/device-tree/compatible", compat); + return VIRTUALIZATION_QEMU; + } + + log_debug("No virtualization found in /proc/device-tree/*"); + return VIRTUALIZATION_NONE; + } else if (r < 0) + return r; + + log_debug("Virtualization %s found in /proc/device-tree/hypervisor/compatible", hvtype); + if (streq(hvtype, "linux,kvm")) + return VIRTUALIZATION_KVM; + else if (strstr(hvtype, "xen")) + return VIRTUALIZATION_XEN; + else if (strstr(hvtype, "vmware")) + return VIRTUALIZATION_VMWARE; + else + return VIRTUALIZATION_VM_OTHER; +#else + log_debug("This platform does not support /proc/device-tree"); + return VIRTUALIZATION_NONE; +#endif +} + +#if defined(__i386__) || defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || defined(__loongarch_lp64) || defined(__riscv) +static Virtualization detect_vm_dmi_vendor(void) { + static const char* const dmi_vendors[] = { + "/sys/class/dmi/id/product_name", /* Test this before sys_vendor to detect KVM over QEMU */ + "/sys/class/dmi/id/sys_vendor", + "/sys/class/dmi/id/board_vendor", + "/sys/class/dmi/id/bios_vendor", + "/sys/class/dmi/id/product_version", /* For Hyper-V VMs test */ + NULL + }; + + static const struct { + const char *vendor; + Virtualization id; + } dmi_vendor_table[] = { + { "KVM", VIRTUALIZATION_KVM }, + { "OpenStack", VIRTUALIZATION_KVM }, /* Detect OpenStack instance as KVM in non x86 architecture */ + { "KubeVirt", VIRTUALIZATION_KVM }, /* Detect KubeVirt instance as KVM in non x86 architecture */ + { "Amazon EC2", VIRTUALIZATION_AMAZON }, + { "QEMU", VIRTUALIZATION_QEMU }, + { "VMware", VIRTUALIZATION_VMWARE }, /* https://kb.vmware.com/s/article/1009458 */ + { "VMW", VIRTUALIZATION_VMWARE }, + { "innotek GmbH", VIRTUALIZATION_ORACLE }, + { "VirtualBox", VIRTUALIZATION_ORACLE }, + { "Xen", VIRTUALIZATION_XEN }, + { "Bochs", VIRTUALIZATION_BOCHS }, + { "Parallels", VIRTUALIZATION_PARALLELS }, + /* https://wiki.freebsd.org/bhyve */ + { "BHYVE", VIRTUALIZATION_BHYVE }, + { "Hyper-V", VIRTUALIZATION_MICROSOFT }, + { "Apple Virtualization", VIRTUALIZATION_APPLE }, + { "Google Compute Engine", VIRTUALIZATION_GOOGLE }, /* https://cloud.google.com/run/docs/container-contract#sandbox */ + }; + int r; + + STRV_FOREACH(vendor, dmi_vendors) { + _cleanup_free_ char *s = NULL; + + r = read_one_line_file(*vendor, &s); + if (r < 0) { + if (r == -ENOENT) + continue; + + return r; + } + + for (size_t i = 0; i < ELEMENTSOF(dmi_vendor_table); i++) + if (startswith(s, dmi_vendor_table[i].vendor)) { + log_debug("Virtualization %s found in DMI (%s)", s, *vendor); + return dmi_vendor_table[i].id; + } + } + log_debug("No virtualization found in DMI vendor table."); + return VIRTUALIZATION_NONE; +} + +static int detect_vm_smbios(void) { + /* The SMBIOS BIOS Characteristics Extension Byte 2 (Section 2.1.2.2 of + * https://www.dmtf.org/sites/default/files/standards/documents/DSP0134_3.4.0.pdf), specifies that + * the 4th bit being set indicates a VM. The BIOS Characteristics table is exposed via the kernel in + * /sys/firmware/dmi/entries/0-0. Note that in the general case, this bit being unset should not + * imply that the system is running on bare-metal. For example, QEMU 3.1.0 (with or without KVM) + * with SeaBIOS does not set this bit. */ + _cleanup_free_ char *s = NULL; + size_t readsize; + int r; + + r = read_full_virtual_file("/sys/firmware/dmi/entries/0-0/raw", &s, &readsize); + if (r < 0) { + log_debug_errno(r, "Unable to read /sys/firmware/dmi/entries/0-0/raw, " + "using the virtualization information found in DMI vendor table, ignoring: %m"); + return SMBIOS_VM_BIT_UNKNOWN; + } + if (readsize < 20 || s[1] < 20) { + /* The spec indicates that byte 1 contains the size of the table, 0x12 + the number of + * extension bytes. The data we're interested in is in extension byte 2, which would be at + * 0x13. If we didn't read that much data, or if the BIOS indicates that we don't have that + * much data, we don't infer anything from the SMBIOS. */ + log_debug("Only read %zu bytes from /sys/firmware/dmi/entries/0-0/raw (expected 20). " + "Using the virtualization information found in DMI vendor table.", readsize); + return SMBIOS_VM_BIT_UNKNOWN; + } + + uint8_t byte = (uint8_t) s[19]; + if (byte & (1U<<4)) { + log_debug("DMI BIOS Extension table indicates virtualization."); + return SMBIOS_VM_BIT_SET; + } + log_debug("DMI BIOS Extension table does not indicate virtualization."); + return SMBIOS_VM_BIT_UNSET; +} +#endif /* defined(__i386__) || defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || defined(__loongarch_lp64) */ + +static Virtualization detect_vm_dmi(void) { +#if defined(__i386__) || defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || defined(__loongarch_lp64) + + int r; + r = detect_vm_dmi_vendor(); + + /* The DMI vendor tables in /sys/class/dmi/id don't help us distinguish between Amazon EC2 + * virtual machines and bare-metal instances, so we need to look at SMBIOS. */ + if (r == VIRTUALIZATION_AMAZON) { + switch (detect_vm_smbios()) { + case SMBIOS_VM_BIT_SET: + return VIRTUALIZATION_AMAZON; + case SMBIOS_VM_BIT_UNSET: + return VIRTUALIZATION_NONE; + case SMBIOS_VM_BIT_UNKNOWN: { + /* The DMI information we are after is only accessible to the root user, + * so we fallback to using the product name which is less restricted + * to distinguish metal systems from virtualized instances */ + _cleanup_free_ char *s = NULL; + const char *e; + + r = read_full_virtual_file("/sys/class/dmi/id/product_name", &s, NULL); + /* In EC2, virtualized is much more common than metal, so if for some reason + * we fail to read the DMI data, assume we are virtualized. */ + if (r < 0) { + log_debug_errno(r, "Can't read /sys/class/dmi/id/product_name," + " assuming virtualized: %m"); + return VIRTUALIZATION_AMAZON; + } + e = strstrafter(truncate_nl(s), ".metal"); + if (e && IN_SET(*e, 0, '-')) { + log_debug("DMI product name has '.metal', assuming no virtualization"); + return VIRTUALIZATION_NONE; + } else + return VIRTUALIZATION_AMAZON; + } + default: + assert_not_reached(); + } + } + + /* If we haven't identified a VM, but the firmware indicates that there is one, indicate as much. We + * have no further information about what it is. */ + if (r == VIRTUALIZATION_NONE && detect_vm_smbios() == SMBIOS_VM_BIT_SET) + return VIRTUALIZATION_VM_OTHER; + return r; +#else + return VIRTUALIZATION_NONE; +#endif +} + +#define XENFEAT_dom0 11 /* xen/include/public/features.h */ +#define PATH_FEATURES "/sys/hypervisor/properties/features" +/* Returns -errno, or 0 for domU, or 1 for dom0 */ +static int detect_vm_xen_dom0(void) { + _cleanup_free_ char *domcap = NULL; + int r; + + r = read_one_line_file(PATH_FEATURES, &domcap); + if (r < 0 && r != -ENOENT) + return r; + if (r >= 0) { + unsigned long features; + + /* Here, we need to use sscanf() instead of safe_atoul() + * as the string lacks the leading "0x". */ + r = sscanf(domcap, "%lx", &features); + if (r == 1) { + r = !!(features & (1U << XENFEAT_dom0)); + log_debug("Virtualization XEN, found %s with value %08lx, " + "XENFEAT_dom0 (indicating the 'hardware domain') is%s set.", + PATH_FEATURES, features, r ? "" : " not"); + return r; + } + log_debug("Virtualization XEN, found %s, unhandled content '%s'", + PATH_FEATURES, domcap); + } + + r = read_one_line_file("/proc/xen/capabilities", &domcap); + if (r == -ENOENT) { + log_debug("Virtualization XEN because /proc/xen/capabilities does not exist"); + return 0; + } + if (r < 0) + return r; + + for (const char *i = domcap;;) { + _cleanup_free_ char *cap = NULL; + + r = extract_first_word(&i, &cap, ",", 0); + if (r < 0) + return r; + if (r == 0) { + log_debug("Virtualization XEN DomU found (/proc/xen/capabilities)"); + return 0; + } + + if (streq(cap, "control_d")) { + log_debug("Virtualization XEN Dom0 ignored (/proc/xen/capabilities)"); + return 1; + } + } +} + +static Virtualization detect_vm_xen(void) { + /* The presence of /proc/xen indicates some form of a Xen domain + The check for Dom0 is handled outside this function */ + if (access("/proc/xen", F_OK) < 0) { + log_debug("Virtualization XEN not found, /proc/xen does not exist"); + return VIRTUALIZATION_NONE; + } + log_debug("Virtualization XEN found (/proc/xen exists)"); + return VIRTUALIZATION_XEN; +} + +static Virtualization detect_vm_hypervisor(void) { + _cleanup_free_ char *hvtype = NULL; + int r; + + r = read_one_line_file("/sys/hypervisor/type", &hvtype); + if (r == -ENOENT) + return VIRTUALIZATION_NONE; + if (r < 0) + return r; + + log_debug("Virtualization %s found in /sys/hypervisor/type", hvtype); + + if (streq(hvtype, "xen")) + return VIRTUALIZATION_XEN; + else + return VIRTUALIZATION_VM_OTHER; +} + +static Virtualization detect_vm_uml(void) { + _cleanup_fclose_ FILE *f = NULL; + int r; + + /* Detect User-Mode Linux by reading /proc/cpuinfo */ + f = fopen("/proc/cpuinfo", "re"); + if (!f) { + if (errno == ENOENT) { + log_debug("/proc/cpuinfo not found, assuming no UML virtualization."); + return VIRTUALIZATION_NONE; + } + return -errno; + } + + for (;;) { + _cleanup_free_ char *line = NULL; + const char *t; + + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return r; + if (r == 0) + break; + + t = startswith(line, "vendor_id\t: "); + if (t) { + if (startswith(t, "User Mode Linux")) { + log_debug("UML virtualization found in /proc/cpuinfo"); + return VIRTUALIZATION_UML; + } + + break; + } + } + + log_debug("UML virtualization not found in /proc/cpuinfo."); + return VIRTUALIZATION_NONE; +} + +static Virtualization detect_vm_zvm(void) { + +#if defined(__s390__) + _cleanup_free_ char *t = NULL; + int r; + + r = get_proc_field("/proc/sysinfo", "VM00 Control Program", WHITESPACE, &t); + if (r == -ENOENT) + return VIRTUALIZATION_NONE; + if (r < 0) + return r; + + log_debug("Virtualization %s found in /proc/sysinfo", t); + if (streq(t, "z/VM")) + return VIRTUALIZATION_ZVM; + else + return VIRTUALIZATION_KVM; +#else + log_debug("This platform does not support /proc/sysinfo"); + return VIRTUALIZATION_NONE; +#endif +} + +/* Returns a short identifier for the various VM implementations */ +Virtualization detect_vm(void) { + static thread_local Virtualization cached_found = _VIRTUALIZATION_INVALID; + bool other = false; + int xen_dom0 = 0; + Virtualization v, dmi; + + if (cached_found >= 0) + return cached_found; + + /* We have to use the correct order here: + * + * → First, try to detect Oracle Virtualbox, Amazon EC2 Nitro, Parallels, and Google Compute Engine, even if they use KVM, + * as well as Xen even if it cloaks as Microsoft Hyper-V. Attempt to detect uml at this stage also + * since it runs as a user-process nested inside other VMs. Also check for Xen now, because Xen PV + * mode does not override CPUID when nested inside another hypervisor. + * + * → Second, try to detect from CPUID, this will report KVM for whatever software is used even if + * info in DMI is overwritten. + * + * → Third, try to detect from DMI. */ + + dmi = detect_vm_dmi(); + if (IN_SET(dmi, + VIRTUALIZATION_ORACLE, + VIRTUALIZATION_XEN, + VIRTUALIZATION_AMAZON, + VIRTUALIZATION_PARALLELS, + VIRTUALIZATION_GOOGLE)) { + v = dmi; + goto finish; + } + + /* Detect UML */ + v = detect_vm_uml(); + if (v < 0) + return v; + if (v != VIRTUALIZATION_NONE) + goto finish; + + /* Detect Xen */ + v = detect_vm_xen(); + if (v < 0) + return v; + if (v == VIRTUALIZATION_XEN) { + /* If we are Dom0, then we expect to not report as a VM. However, as we might be nested + * inside another hypervisor which can be detected via the CPUID check, wait to report this + * until after the CPUID check. */ + xen_dom0 = detect_vm_xen_dom0(); + if (xen_dom0 < 0) + return xen_dom0; + if (xen_dom0 == 0) + goto finish; + } else if (v != VIRTUALIZATION_NONE) + assert_not_reached(); + + /* Detect from CPUID */ + v = detect_vm_cpuid(); + if (v < 0) + return v; + if (v == VIRTUALIZATION_VM_OTHER) + other = true; + else if (v != VIRTUALIZATION_NONE) + goto finish; + + /* If we are in Dom0 and have not yet finished, finish with the result of detect_vm_cpuid */ + if (xen_dom0 > 0) + goto finish; + + /* Now, let's get back to DMI */ + if (dmi < 0) + return dmi; + if (dmi == VIRTUALIZATION_VM_OTHER) + other = true; + else if (dmi != VIRTUALIZATION_NONE) { + v = dmi; + goto finish; + } + + /* Check high-level hypervisor sysfs file */ + v = detect_vm_hypervisor(); + if (v < 0) + return v; + if (v == VIRTUALIZATION_VM_OTHER) + other = true; + else if (v != VIRTUALIZATION_NONE) + goto finish; + + v = detect_vm_device_tree(); + if (v < 0) + return v; + if (v == VIRTUALIZATION_VM_OTHER) + other = true; + else if (v != VIRTUALIZATION_NONE) + goto finish; + + v = detect_vm_zvm(); + if (v < 0) + return v; + +finish: + if (v == VIRTUALIZATION_NONE && other) + v = VIRTUALIZATION_VM_OTHER; + + cached_found = v; + log_debug("Found VM virtualization %s", virtualization_to_string(v)); + return v; +} + +static const char *const container_table[_VIRTUALIZATION_MAX] = { + [VIRTUALIZATION_LXC] = "lxc", + [VIRTUALIZATION_LXC_LIBVIRT] = "lxc-libvirt", + [VIRTUALIZATION_SYSTEMD_NSPAWN] = "systemd-nspawn", + [VIRTUALIZATION_DOCKER] = "docker", + [VIRTUALIZATION_PODMAN] = "podman", + [VIRTUALIZATION_RKT] = "rkt", + [VIRTUALIZATION_WSL] = "wsl", + [VIRTUALIZATION_PROOT] = "proot", + [VIRTUALIZATION_POUCH] = "pouch", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP_FROM_STRING(container, int); + +static int running_in_cgroupns(void) { + int r; + + if (!cg_ns_supported()) + return false; + + r = cg_all_unified(); + if (r < 0) + return r; + + if (r) { + /* cgroup v2 */ + + r = access("/sys/fs/cgroup/cgroup.events", F_OK); + if (r < 0) { + if (errno != ENOENT) + return -errno; + /* All kernel versions have cgroup.events in nested cgroups. */ + return false; + } + + /* There's no cgroup.type in the root cgroup, and future kernel versions + * are unlikely to add it since cgroup.type is something that makes no sense + * whatsoever in the root cgroup. */ + r = access("/sys/fs/cgroup/cgroup.type", F_OK); + if (r == 0) + return true; + if (r < 0 && errno != ENOENT) + return -errno; + + /* On older kernel versions, there's no cgroup.type */ + r = access("/sys/kernel/cgroup/features", F_OK); + if (r < 0) { + if (errno != ENOENT) + return -errno; + /* This is an old kernel that we know for sure has cgroup.events + * only in nested cgroups. */ + return true; + } + + /* This is a recent kernel, and cgroup.type doesn't exist, so we must be + * in the root cgroup. */ + return false; + } else { + /* cgroup v1 */ + + /* If systemd controller is not mounted, do not even bother. */ + r = access("/sys/fs/cgroup/systemd", F_OK); + if (r < 0) { + if (errno != ENOENT) + return -errno; + return false; + } + + /* release_agent only exists in the root cgroup. */ + r = access("/sys/fs/cgroup/systemd/release_agent", F_OK); + if (r < 0) { + if (errno != ENOENT) + return -errno; + return true; + } + + return false; + } +} + +static Virtualization detect_container_files(void) { + static const struct { + const char *file_path; + Virtualization id; + } container_file_table[] = { + /* https://github.com/containers/podman/issues/6192 */ + /* https://github.com/containers/podman/issues/3586#issuecomment-661918679 */ + { "/run/.containerenv", VIRTUALIZATION_PODMAN }, + /* https://github.com/moby/moby/issues/18355 */ + /* Docker must be the last in this table, see below. */ + { "/.dockerenv", VIRTUALIZATION_DOCKER }, + }; + + for (size_t i = 0; i < ELEMENTSOF(container_file_table); i++) { + if (access(container_file_table[i].file_path, F_OK) >= 0) + return container_file_table[i].id; + + if (errno != ENOENT) + log_debug_errno(errno, + "Checking if %s exists failed, ignoring: %m", + container_file_table[i].file_path); + } + + return VIRTUALIZATION_NONE; +} + +Virtualization detect_container(void) { + static thread_local Virtualization cached_found = _VIRTUALIZATION_INVALID; + _cleanup_free_ char *m = NULL, *o = NULL, *p = NULL; + const char *e = NULL; + Virtualization v; + int r; + + if (cached_found >= 0) + return cached_found; + + /* /proc/vz exists in container and outside of the container, /proc/bc only outside of the container. */ + if (access("/proc/vz", F_OK) < 0) { + if (errno != ENOENT) + log_debug_errno(errno, "Failed to check if /proc/vz exists, ignoring: %m"); + } else if (access("/proc/bc", F_OK) < 0) { + if (errno == ENOENT) { + v = VIRTUALIZATION_OPENVZ; + goto finish; + } + + log_debug_errno(errno, "Failed to check if /proc/bc exists, ignoring: %m"); + } + + /* "Official" way of detecting WSL https://github.com/Microsoft/WSL/issues/423#issuecomment-221627364 */ + r = read_one_line_file("/proc/sys/kernel/osrelease", &o); + if (r < 0) + log_debug_errno(r, "Failed to read /proc/sys/kernel/osrelease, ignoring: %m"); + else if (strstr(o, "Microsoft") || strstr(o, "WSL")) { + v = VIRTUALIZATION_WSL; + goto finish; + } + + /* proot doesn't use PID namespacing, so we can just check if we have a matching tracer for this + * invocation without worrying about it being elsewhere. + */ + r = get_proc_field("/proc/self/status", "TracerPid", WHITESPACE, &p); + if (r < 0) + log_debug_errno(r, "Failed to read our own trace PID, ignoring: %m"); + else if (!streq(p, "0")) { + pid_t ptrace_pid; + + r = parse_pid(p, &ptrace_pid); + if (r < 0) + log_debug_errno(r, "Failed to parse our own tracer PID, ignoring: %m"); + else { + _cleanup_free_ char *ptrace_comm = NULL; + const char *pf; + + pf = procfs_file_alloca(ptrace_pid, "comm"); + r = read_one_line_file(pf, &ptrace_comm); + if (r < 0) + log_debug_errno(r, "Failed to read %s, ignoring: %m", pf); + else if (startswith(ptrace_comm, "proot")) { + v = VIRTUALIZATION_PROOT; + goto finish; + } + } + } + + /* The container manager might have placed this in the /run/host/ hierarchy for us, which is best + * because we can be consumed just like that, without special privileges. */ + r = read_one_line_file("/run/host/container-manager", &m); + if (r > 0) { + e = m; + goto translate_name; + } + if (!IN_SET(r, -ENOENT, 0)) + return log_debug_errno(r, "Failed to read /run/host/container-manager: %m"); + + if (getpid_cached() == 1) { + /* If we are PID 1 we can just check our own environment variable, and that's authoritative. + * We distinguish three cases: + * - the variable is not defined → we jump to other checks + * - the variable is defined to an empty value → we are not in a container + * - anything else → some container, either one of the known ones or "container-other" + */ + e = getenv("container"); + if (!e) + goto check_files; + if (isempty(e)) { + v = VIRTUALIZATION_NONE; + goto finish; + } + + goto translate_name; + } + + /* Otherwise, PID 1 might have dropped this information into a file in /run. This is better than accessing + * /proc/1/environ, since we don't need CAP_SYS_PTRACE for that. */ + r = read_one_line_file("/run/systemd/container", &m); + if (r > 0) { + e = m; + goto translate_name; + } + if (!IN_SET(r, -ENOENT, 0)) + return log_debug_errno(r, "Failed to read /run/systemd/container: %m"); + + /* Fallback for cases where PID 1 was not systemd (for example, cases where init=/bin/sh is used. */ + r = getenv_for_pid(1, "container", &m); + if (r > 0) { + e = m; + goto translate_name; + } + if (r < 0) /* This only works if we have CAP_SYS_PTRACE, hence let's better ignore failures here */ + log_debug_errno(r, "Failed to read $container of PID 1, ignoring: %m"); + +check_files: + /* Check for existence of some well-known files. We only do this after checking + * for other specific container managers, otherwise we risk mistaking another + * container manager for Docker: the /.dockerenv file could inadvertently end up + * in a file system image. */ + v = detect_container_files(); + if (v < 0) + return v; + if (v != VIRTUALIZATION_NONE) + goto finish; + + r = running_in_cgroupns(); + if (r > 0) { + v = VIRTUALIZATION_CONTAINER_OTHER; + goto finish; + } + if (r < 0) + log_debug_errno(r, "Failed to detect cgroup namespace: %m"); + + /* If none of that worked, give up, assume no container manager. */ + v = VIRTUALIZATION_NONE; + goto finish; + +translate_name: + if (streq(e, "oci")) { + /* Some images hardcode container=oci, but OCI is not a specific container manager. + * Try to detect one based on well-known files. */ + v = detect_container_files(); + if (v == VIRTUALIZATION_NONE) + v = VIRTUALIZATION_CONTAINER_OTHER; + goto finish; + } + v = container_from_string(e); + if (v < 0) + v = VIRTUALIZATION_CONTAINER_OTHER; + +finish: + log_debug("Found container virtualization %s.", virtualization_to_string(v)); + cached_found = v; + return v; +} + +Virtualization detect_virtualization(void) { + int v; + + v = detect_container(); + if (v != VIRTUALIZATION_NONE) + return v; + + return detect_vm(); +} + +static int userns_has_mapping(const char *name) { + _cleanup_fclose_ FILE *f = NULL; + uid_t a, b, c; + int r; + + f = fopen(name, "re"); + if (!f) { + log_debug_errno(errno, "Failed to open %s: %m", name); + return errno == ENOENT ? false : -errno; + } + + errno = 0; + r = fscanf(f, UID_FMT " " UID_FMT " " UID_FMT "\n", &a, &b, &c); + if (r == EOF) { + if (ferror(f)) + return log_debug_errno(errno_or_else(EIO), "Failed to read %s: %m", name); + + log_debug("%s is empty, we're in an uninitialized user namespace", name); + return true; + } + if (r != 3) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "Failed to parse %s: %m", name); + + if (a == 0 && b == 0 && c == UINT32_MAX) { + /* The kernel calls mappings_overlap() and does not allow overlaps */ + log_debug("%s has a full 1:1 mapping", name); + return false; + } + + /* Anything else implies that we are in a user namespace */ + log_debug("Mapping found in %s, we're in a user namespace", name); + return true; +} + +int running_in_userns(void) { + _cleanup_free_ char *line = NULL; + int r; + + r = userns_has_mapping("/proc/self/uid_map"); + if (r != 0) + return r; + + r = userns_has_mapping("/proc/self/gid_map"); + if (r != 0) + return r; + + /* "setgroups" file was added in kernel v3.18-rc6-15-g9cc46516dd. It is also possible to compile a + * kernel without CONFIG_USER_NS, in which case "setgroups" also does not exist. We cannot + * distinguish those two cases, so assume that we're running on a stripped-down recent kernel, rather + * than on an old one, and if the file is not found, return false. */ + r = read_virtual_file("/proc/self/setgroups", SIZE_MAX, &line, NULL); + if (r < 0) { + log_debug_errno(r, "/proc/self/setgroups: %m"); + return r == -ENOENT ? false : r; + } + + strstrip(line); /* remove trailing newline */ + + r = streq(line, "deny"); + /* See user_namespaces(7) for a description of this "setgroups" contents. */ + log_debug("/proc/self/setgroups contains \"%s\", %s user namespace", line, r ? "in" : "not in"); + return r; +} + +int running_in_chroot(void) { + int r; + + /* If we're PID1, /proc may not be mounted (and most likely we're not in a chroot). But PID1 will + * mount /proc, so all other programs can assume that if /proc is *not* available, we're in some + * chroot. */ + + if (getenv_bool("SYSTEMD_IGNORE_CHROOT") > 0) + return 0; + + r = inode_same("/proc/1/root", "/", 0); + if (r == -ENOENT) { + r = proc_mounted(); + if (r == 0) { + if (getpid_cached() == 1) + return false; /* We will mount /proc, assuming we're not in a chroot. */ + + log_debug("/proc is not mounted, assuming we're in a chroot."); + return true; + } + if (r > 0) /* If we have fake /proc/, we can't do the check properly. */ + return -ENOSYS; + } + if (r < 0) + return r; + + return r == 0; +} + +#if defined(__i386__) || defined(__x86_64__) +struct cpuid_table_entry { + uint32_t flag_bit; + const char *name; +}; + +static const struct cpuid_table_entry leaf1_edx[] = { + { 0, "fpu" }, + { 1, "vme" }, + { 2, "de" }, + { 3, "pse" }, + { 4, "tsc" }, + { 5, "msr" }, + { 6, "pae" }, + { 7, "mce" }, + { 8, "cx8" }, + { 9, "apic" }, + { 11, "sep" }, + { 12, "mtrr" }, + { 13, "pge" }, + { 14, "mca" }, + { 15, "cmov" }, + { 16, "pat" }, + { 17, "pse36" }, + { 19, "clflush" }, + { 23, "mmx" }, + { 24, "fxsr" }, + { 25, "sse" }, + { 26, "sse2" }, + { 28, "ht" }, +}; + +static const struct cpuid_table_entry leaf1_ecx[] = { + { 0, "pni" }, + { 1, "pclmul" }, + { 3, "monitor" }, + { 9, "ssse3" }, + { 12, "fma3" }, + { 13, "cx16" }, + { 19, "sse4_1" }, + { 20, "sse4_2" }, + { 22, "movbe" }, + { 23, "popcnt" }, + { 25, "aes" }, + { 26, "xsave" }, + { 27, "osxsave" }, + { 28, "avx" }, + { 29, "f16c" }, + { 30, "rdrand" }, +}; + +static const struct cpuid_table_entry leaf7_ebx[] = { + { 3, "bmi1" }, + { 5, "avx2" }, + { 8, "bmi2" }, + { 18, "rdseed" }, + { 19, "adx" }, + { 29, "sha_ni" }, +}; + +static const struct cpuid_table_entry leaf81_edx[] = { + { 11, "syscall" }, + { 27, "rdtscp" }, + { 29, "lm" }, +}; + +static const struct cpuid_table_entry leaf81_ecx[] = { + { 0, "lahf_lm" }, + { 5, "abm" }, +}; + +static const struct cpuid_table_entry leaf87_edx[] = { + { 8, "constant_tsc" }, +}; + +static bool given_flag_in_set(const char *flag, const struct cpuid_table_entry *set, size_t set_size, uint32_t val) { + for (size_t i = 0; i < set_size; i++) { + if ((UINT32_C(1) << set[i].flag_bit) & val && + streq(flag, set[i].name)) + return true; + } + return false; +} + +static bool real_has_cpu_with_flag(const char *flag) { + uint32_t eax, ebx, ecx, edx; + + if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) { + if (given_flag_in_set(flag, leaf1_ecx, ELEMENTSOF(leaf1_ecx), ecx)) + return true; + + if (given_flag_in_set(flag, leaf1_edx, ELEMENTSOF(leaf1_edx), edx)) + return true; + } + + if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) { + if (given_flag_in_set(flag, leaf7_ebx, ELEMENTSOF(leaf7_ebx), ebx)) + return true; + } + + if (__get_cpuid(0x80000001U, &eax, &ebx, &ecx, &edx)) { + if (given_flag_in_set(flag, leaf81_ecx, ELEMENTSOF(leaf81_ecx), ecx)) + return true; + + if (given_flag_in_set(flag, leaf81_edx, ELEMENTSOF(leaf81_edx), edx)) + return true; + } + + if (__get_cpuid(0x80000007U, &eax, &ebx, &ecx, &edx)) + if (given_flag_in_set(flag, leaf87_edx, ELEMENTSOF(leaf87_edx), edx)) + return true; + + return false; +} +#endif + +bool has_cpu_with_flag(const char *flag) { + /* CPUID is an x86 specific interface. Assume on all others that no CPUs have those flags. */ +#if defined(__i386__) || defined(__x86_64__) + return real_has_cpu_with_flag(flag); +#else + return false; +#endif +} + +static const char *const virtualization_table[_VIRTUALIZATION_MAX] = { + [VIRTUALIZATION_NONE] = "none", + [VIRTUALIZATION_KVM] = "kvm", + [VIRTUALIZATION_AMAZON] = "amazon", + [VIRTUALIZATION_QEMU] = "qemu", + [VIRTUALIZATION_BOCHS] = "bochs", + [VIRTUALIZATION_XEN] = "xen", + [VIRTUALIZATION_UML] = "uml", + [VIRTUALIZATION_VMWARE] = "vmware", + [VIRTUALIZATION_ORACLE] = "oracle", + [VIRTUALIZATION_MICROSOFT] = "microsoft", + [VIRTUALIZATION_ZVM] = "zvm", + [VIRTUALIZATION_PARALLELS] = "parallels", + [VIRTUALIZATION_BHYVE] = "bhyve", + [VIRTUALIZATION_QNX] = "qnx", + [VIRTUALIZATION_ACRN] = "acrn", + [VIRTUALIZATION_POWERVM] = "powervm", + [VIRTUALIZATION_APPLE] = "apple", + [VIRTUALIZATION_SRE] = "sre", + [VIRTUALIZATION_GOOGLE] = "google", + [VIRTUALIZATION_VM_OTHER] = "vm-other", + + [VIRTUALIZATION_SYSTEMD_NSPAWN] = "systemd-nspawn", + [VIRTUALIZATION_LXC_LIBVIRT] = "lxc-libvirt", + [VIRTUALIZATION_LXC] = "lxc", + [VIRTUALIZATION_OPENVZ] = "openvz", + [VIRTUALIZATION_DOCKER] = "docker", + [VIRTUALIZATION_PODMAN] = "podman", + [VIRTUALIZATION_RKT] = "rkt", + [VIRTUALIZATION_WSL] = "wsl", + [VIRTUALIZATION_PROOT] = "proot", + [VIRTUALIZATION_POUCH] = "pouch", + [VIRTUALIZATION_CONTAINER_OTHER] = "container-other", +}; + +DEFINE_STRING_TABLE_LOOKUP(virtualization, Virtualization); |