diff options
Diffstat (limited to '')
23 files changed, 2927 insertions, 0 deletions
diff --git a/src/core/bpf-devices.c b/src/core/bpf-devices.c new file mode 100644 index 0000000..3af9e78 --- /dev/null +++ b/src/core/bpf-devices.c @@ -0,0 +1,531 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <fnmatch.h> +#include <linux/bpf_insn.h> + +#include "bpf-devices.h" +#include "bpf-program.h" +#include "devnum-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "nulstr-util.h" +#include "parse-util.h" +#include "path-util.h" +#include "stdio-util.h" +#include "string-util.h" + +#define PASS_JUMP_OFF 4096 + +static int bpf_access_type(const char *acc) { + int r = 0; + + assert(acc); + + for (; *acc; acc++) + switch (*acc) { + case 'r': + r |= BPF_DEVCG_ACC_READ; + break; + case 'w': + r |= BPF_DEVCG_ACC_WRITE; + break; + case 'm': + r |= BPF_DEVCG_ACC_MKNOD; + break; + default: + return -EINVAL; + } + + return r; +} + +static int bpf_prog_allow_list_device( + BPFProgram *prog, + char type, + int major, + int minor, + const char *acc) { + + int r, access; + + assert(prog); + assert(acc); + + log_trace("%s: %c %d:%d %s", __func__, type, major, minor, acc); + + access = bpf_access_type(acc); + if (access <= 0) + return -EINVAL; + + assert(IN_SET(type, 'b', 'c')); + const int bpf_type = type == 'c' ? BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK; + + const struct bpf_insn insn[] = { + BPF_MOV32_REG(BPF_REG_1, BPF_REG_3), + BPF_ALU32_IMM(BPF_AND, BPF_REG_1, access), + BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, 4), /* compare access type */ + + BPF_JMP_IMM(BPF_JNE, BPF_REG_2, bpf_type, 3), /* compare device type */ + BPF_JMP_IMM(BPF_JNE, BPF_REG_4, major, 2), /* compare major */ + BPF_JMP_IMM(BPF_JNE, BPF_REG_5, minor, 1), /* compare minor */ + BPF_JMP_A(PASS_JUMP_OFF), /* jump to PASS */ + }; + + if (FLAGS_SET(access, BPF_DEVCG_ACC_READ | BPF_DEVCG_ACC_WRITE | BPF_DEVCG_ACC_MKNOD)) + r = bpf_program_add_instructions(prog, insn + 3, ELEMENTSOF(insn) - 3); + else + r = bpf_program_add_instructions(prog, insn, ELEMENTSOF(insn)); + if (r < 0) + log_error_errno(r, "Extending device control BPF program failed: %m"); + + return r; +} + +static int bpf_prog_allow_list_major( + BPFProgram *prog, + char type, + int major, + const char *acc) { + + int r, access; + + assert(prog); + assert(acc); + + log_trace("%s: %c %d:* %s", __func__, type, major, acc); + + access = bpf_access_type(acc); + if (access <= 0) + return -EINVAL; + + assert(IN_SET(type, 'b', 'c')); + const int bpf_type = type == 'c' ? BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK; + + const struct bpf_insn insn[] = { + BPF_MOV32_REG(BPF_REG_1, BPF_REG_3), + BPF_ALU32_IMM(BPF_AND, BPF_REG_1, access), + BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, 3), /* compare access type */ + + BPF_JMP_IMM(BPF_JNE, BPF_REG_2, bpf_type, 2), /* compare device type */ + BPF_JMP_IMM(BPF_JNE, BPF_REG_4, major, 1), /* compare major */ + BPF_JMP_A(PASS_JUMP_OFF), /* jump to PASS */ + }; + + if (FLAGS_SET(access, BPF_DEVCG_ACC_READ | BPF_DEVCG_ACC_WRITE | BPF_DEVCG_ACC_MKNOD)) + r = bpf_program_add_instructions(prog, insn + 3, ELEMENTSOF(insn) - 3); + else + r = bpf_program_add_instructions(prog, insn, ELEMENTSOF(insn)); + if (r < 0) + log_error_errno(r, "Extending device control BPF program failed: %m"); + + return r; +} + +static int bpf_prog_allow_list_class( + BPFProgram *prog, + char type, + const char *acc) { + + int r, access; + + assert(prog); + assert(acc); + + log_trace("%s: %c *:* %s", __func__, type, acc); + + access = bpf_access_type(acc); + if (access <= 0) + return -EINVAL; + + assert(IN_SET(type, 'b', 'c')); + const int bpf_type = type == 'c' ? BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK; + + const struct bpf_insn insn[] = { + BPF_MOV32_REG(BPF_REG_1, BPF_REG_3), + BPF_ALU32_IMM(BPF_AND, BPF_REG_1, access), + BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, 2), /* compare access type */ + + BPF_JMP_IMM(BPF_JNE, BPF_REG_2, bpf_type, 1), /* compare device type */ + BPF_JMP_A(PASS_JUMP_OFF), /* jump to PASS */ + }; + + if (FLAGS_SET(access, BPF_DEVCG_ACC_READ | BPF_DEVCG_ACC_WRITE | BPF_DEVCG_ACC_MKNOD)) + r = bpf_program_add_instructions(prog, insn + 3, ELEMENTSOF(insn) - 3); + else + r = bpf_program_add_instructions(prog, insn, ELEMENTSOF(insn)); + if (r < 0) + log_error_errno(r, "Extending device control BPF program failed: %m"); + + return r; +} + +int bpf_devices_cgroup_init( + BPFProgram **ret, + CGroupDevicePolicy policy, + bool allow_list) { + + const struct bpf_insn pre_insn[] = { + /* load device type to r2 */ + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, + offsetof(struct bpf_cgroup_dev_ctx, access_type)), + BPF_ALU32_IMM(BPF_AND, BPF_REG_2, 0xFFFF), + + /* load access type to r3 */ + BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, + offsetof(struct bpf_cgroup_dev_ctx, access_type)), + BPF_ALU32_IMM(BPF_RSH, BPF_REG_3, 16), + + /* load major number to r4 */ + BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, + offsetof(struct bpf_cgroup_dev_ctx, major)), + + /* load minor number to r5 */ + BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1, + offsetof(struct bpf_cgroup_dev_ctx, minor)), + }; + + _cleanup_(bpf_program_freep) BPFProgram *prog = NULL; + int r; + + assert(ret); + + if (policy == CGROUP_DEVICE_POLICY_AUTO && !allow_list) + return 0; + + r = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE, "sd_devices", &prog); + if (r < 0) + return log_error_errno(r, "Loading device control BPF program failed: %m"); + + if (policy == CGROUP_DEVICE_POLICY_CLOSED || allow_list) { + r = bpf_program_add_instructions(prog, pre_insn, ELEMENTSOF(pre_insn)); + if (r < 0) + return log_error_errno(r, "Extending device control BPF program failed: %m"); + } + + *ret = TAKE_PTR(prog); + + return 0; +} + +int bpf_devices_apply_policy( + BPFProgram **prog, + CGroupDevicePolicy policy, + bool allow_list, + const char *cgroup_path, + BPFProgram **prog_installed) { + + _cleanup_free_ char *controller_path = NULL; + int r; + + /* This will assign *prog_installed if everything goes well. */ + + assert(prog); + if (!*prog) + goto finish; + + const bool deny_everything = policy == CGROUP_DEVICE_POLICY_STRICT && !allow_list; + + const struct bpf_insn post_insn[] = { + /* return DENY */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_JMP_A(1), + }; + + const struct bpf_insn exit_insn[] = { + /* finally return DENY if deny_everything else ALLOW */ + BPF_MOV64_IMM(BPF_REG_0, deny_everything ? 0 : 1), + BPF_EXIT_INSN() + }; + + if (!deny_everything) { + r = bpf_program_add_instructions(*prog, post_insn, ELEMENTSOF(post_insn)); + if (r < 0) + return log_error_errno(r, "Extending device control BPF program failed: %m"); + + /* Fixup PASS_JUMP_OFF jump offsets. */ + for (size_t off = 0; off < (*prog)->n_instructions; off++) { + struct bpf_insn *ins = &((*prog)->instructions[off]); + + if (ins->code == (BPF_JMP | BPF_JA) && ins->off == PASS_JUMP_OFF) + ins->off = (*prog)->n_instructions - off - 1; + } + } + + r = bpf_program_add_instructions(*prog, exit_insn, ELEMENTSOF(exit_insn)); + if (r < 0) + return log_error_errno(r, "Extending device control BPF program failed: %m"); + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, cgroup_path, NULL, &controller_path); + if (r < 0) + return log_error_errno(r, "Failed to determine cgroup path: %m"); + + r = bpf_program_cgroup_attach(*prog, BPF_CGROUP_DEVICE, controller_path, BPF_F_ALLOW_MULTI); + if (r < 0) + return log_error_errno(r, "Attaching device control BPF program to cgroup %s failed: %m", + empty_to_root(cgroup_path)); + + finish: + /* Unref the old BPF program (which will implicitly detach it) right before attaching the new program. */ + if (prog_installed) { + bpf_program_free(*prog_installed); + *prog_installed = TAKE_PTR(*prog); + } + return 0; +} + +int bpf_devices_supported(void) { + const struct bpf_insn trivial[] = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN() + }; + + _cleanup_(bpf_program_freep) BPFProgram *program = NULL; + static int supported = -1; + int r; + + /* Checks whether BPF device controller is supported. For this, we check five things: + * + * a) whether we are privileged + * b) whether the unified hierarchy is being used + * c) the BPF implementation in the kernel supports BPF_PROG_TYPE_CGROUP_DEVICE programs, which we require + */ + + if (supported >= 0) + return supported; + + if (geteuid() != 0) { + log_debug("Not enough privileges, BPF device control is not supported."); + return supported = 0; + } + + r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER); + if (r < 0) + return log_error_errno(r, "Can't determine whether the unified hierarchy is used: %m"); + if (r == 0) { + log_debug("Not running with unified cgroups, BPF device control is not supported."); + return supported = 0; + } + + r = bpf_program_new(BPF_PROG_TYPE_CGROUP_DEVICE, "sd_devices", &program); + if (r < 0) { + log_debug_errno(r, "Can't allocate CGROUP DEVICE BPF program, BPF device control is not supported: %m"); + return supported = 0; + } + + r = bpf_program_add_instructions(program, trivial, ELEMENTSOF(trivial)); + if (r < 0) { + log_debug_errno(r, "Can't add trivial instructions to CGROUP DEVICE BPF program, BPF device control is not supported: %m"); + return supported = 0; + } + + r = bpf_program_load_kernel(program, NULL, 0); + if (r < 0) { + log_debug_errno(r, "Can't load kernel CGROUP DEVICE BPF program, BPF device control is not supported: %m"); + return supported = 0; + } + + return supported = 1; +} + +static int allow_list_device_pattern( + BPFProgram *prog, + const char *path, + char type, + const unsigned *maj, + const unsigned *min, + const char *acc) { + + assert(IN_SET(type, 'b', 'c')); + + if (cg_all_unified() > 0) { + if (!prog) + return 0; + + if (maj && min) + return bpf_prog_allow_list_device(prog, type, *maj, *min, acc); + else if (maj) + return bpf_prog_allow_list_major(prog, type, *maj, acc); + else + return bpf_prog_allow_list_class(prog, type, acc); + + } else { + char buf[2+DECIMAL_STR_MAX(unsigned)*2+2+4]; + int r; + + if (maj && min) + xsprintf(buf, "%c %u:%u %s", type, *maj, *min, acc); + else if (maj) + xsprintf(buf, "%c %u:* %s", type, *maj, acc); + else + xsprintf(buf, "%c *:* %s", type, acc); + + /* Changing the devices list of a populated cgroup might result in EINVAL, hence ignore + * EINVAL here. */ + + r = cg_set_attribute("devices", path, "devices.allow", buf); + if (r < 0) + log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, + r, "Failed to set devices.allow on %s: %m", path); + + return r; + } +} + +int bpf_devices_allow_list_device( + BPFProgram *prog, + const char *path, + const char *node, + const char *acc) { + + mode_t mode; + dev_t rdev; + int r; + + assert(path); + assert(acc); + assert(strlen(acc) <= 3); + + log_trace("%s: %s %s", __func__, node, acc); + + /* Some special handling for /dev/block/%u:%u, /dev/char/%u:%u, /run/systemd/inaccessible/chr and + * /run/systemd/inaccessible/blk paths. Instead of stat()ing these we parse out the major/minor directly. This + * means clients can use these path without the device node actually around */ + r = device_path_parse_major_minor(node, &mode, &rdev); + if (r < 0) { + if (r != -ENODEV) + return log_warning_errno(r, "Couldn't parse major/minor from device path '%s': %m", node); + + struct stat st; + if (stat(node, &st) < 0) + return log_warning_errno(errno, "Couldn't stat device %s: %m", node); + + if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) + return log_warning_errno(SYNTHETIC_ERRNO(ENODEV), "%s is not a device.", node); + + mode = st.st_mode; + rdev = (dev_t) st.st_rdev; + } + + unsigned maj = major(rdev), min = minor(rdev); + return allow_list_device_pattern(prog, path, S_ISCHR(mode) ? 'c' : 'b', &maj, &min, acc); +} + +int bpf_devices_allow_list_major( + BPFProgram *prog, + const char *path, + const char *name, + char type, + const char *acc) { + + unsigned maj; + int r; + + assert(path); + assert(acc); + assert(IN_SET(type, 'b', 'c')); + + if (streq(name, "*")) + /* If the name is a wildcard, then apply this list to all devices of this type */ + return allow_list_device_pattern(prog, path, type, NULL, NULL, acc); + + if (safe_atou(name, &maj) >= 0 && DEVICE_MAJOR_VALID(maj)) + /* The name is numeric and suitable as major. In that case, let's take its major, and create + * the entry directly. */ + return allow_list_device_pattern(prog, path, type, &maj, NULL, acc); + + _cleanup_fclose_ FILE *f = NULL; + bool good = false, any = false; + + f = fopen("/proc/devices", "re"); + if (!f) + return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s: %m", name); + + for (;;) { + _cleanup_free_ char *line = NULL; + char *w, *p; + + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return log_warning_errno(r, "Failed to read /proc/devices: %m"); + if (r == 0) + break; + + if (type == 'c' && streq(line, "Character devices:")) { + good = true; + continue; + } + + if (type == 'b' && streq(line, "Block devices:")) { + good = true; + continue; + } + + if (isempty(line)) { + good = false; + continue; + } + + if (!good) + continue; + + p = strstrip(line); + + w = strpbrk(p, WHITESPACE); + if (!w) + continue; + *w = 0; + + r = safe_atou(p, &maj); + if (r < 0) + continue; + if (maj <= 0) + continue; + + w++; + w += strspn(w, WHITESPACE); + + if (fnmatch(name, w, 0) != 0) + continue; + + any = true; + (void) allow_list_device_pattern(prog, path, type, &maj, NULL, acc); + } + + if (!any) + return log_debug_errno(SYNTHETIC_ERRNO(ENOENT), + "Device allow list pattern \"%s\" did not match anything.", name); + + return 0; +} + +int bpf_devices_allow_list_static( + BPFProgram *prog, + const char *path) { + + static const char auto_devices[] = + "/dev/null\0" "rwm\0" + "/dev/zero\0" "rwm\0" + "/dev/full\0" "rwm\0" + "/dev/random\0" "rwm\0" + "/dev/urandom\0" "rwm\0" + "/dev/tty\0" "rwm\0" + "/dev/ptmx\0" "rwm\0" + /* Allow /run/systemd/inaccessible/{chr,blk} devices for mapping InaccessiblePaths */ + "/run/systemd/inaccessible/chr\0" "rwm\0" + "/run/systemd/inaccessible/blk\0" "rwm\0"; + int r = 0, k; + + const char *node, *acc; + NULSTR_FOREACH_PAIR(node, acc, auto_devices) { + k = bpf_devices_allow_list_device(prog, path, node, acc); + if (r >= 0 && k < 0) + r = k; + } + + /* PTS (/dev/pts) devices may not be duplicated, but accessed */ + k = bpf_devices_allow_list_major(prog, path, "pts", 'c', "rw"); + if (r >= 0 && k < 0) + r = k; + + return r; +} diff --git a/src/core/bpf-devices.h b/src/core/bpf-devices.h new file mode 100644 index 0000000..5106364 --- /dev/null +++ b/src/core/bpf-devices.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <inttypes.h> + +#include "cgroup.h" + +typedef struct BPFProgram BPFProgram; + +int bpf_devices_cgroup_init(BPFProgram **ret, CGroupDevicePolicy policy, bool allow_list); +int bpf_devices_apply_policy( + BPFProgram **prog, + CGroupDevicePolicy policy, + bool allow_list, + const char *cgroup_path, + BPFProgram **prog_installed); + +int bpf_devices_supported(void); +int bpf_devices_allow_list_device(BPFProgram *prog, const char *path, const char *node, const char *acc); +int bpf_devices_allow_list_major(BPFProgram *prog, const char *path, const char *name, char type, const char *acc); +int bpf_devices_allow_list_static(BPFProgram *prog, const char *path); diff --git a/src/core/bpf-firewall.c b/src/core/bpf-firewall.c new file mode 100644 index 0000000..ce3b76c --- /dev/null +++ b/src/core/bpf-firewall.c @@ -0,0 +1,969 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <arpa/inet.h> +#include <assert.h> +#include <errno.h> +#include <fcntl.h> +#include <linux/bpf_insn.h> +#include <net/ethernet.h> +#include <net/if.h> +#include <netinet/ip.h> +#include <netinet/ip6.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> + +#include "alloc-util.h" +#include "bpf-firewall.h" +#include "bpf-program.h" +#include "fd-util.h" +#include "in-addr-prefix-util.h" +#include "memory-util.h" +#include "missing_syscall.h" +#include "unit.h" +#include "strv.h" +#include "virt.h" + +enum { + MAP_KEY_PACKETS, + MAP_KEY_BYTES, +}; + +enum { + ACCESS_ALLOWED = 1, + ACCESS_DENIED = 2, +}; + +/* Compile instructions for one list of addresses, one direction and one specific verdict on matches. */ + +static int add_lookup_instructions( + BPFProgram *p, + int map_fd, + int protocol, + bool is_ingress, + int verdict) { + + int r, addr_offset, addr_size; + + assert(p); + assert(map_fd >= 0); + + switch (protocol) { + + case ETH_P_IP: + addr_size = sizeof(uint32_t); + addr_offset = is_ingress ? + offsetof(struct iphdr, saddr) : + offsetof(struct iphdr, daddr); + break; + + case ETH_P_IPV6: + addr_size = 4 * sizeof(uint32_t); + addr_offset = is_ingress ? + offsetof(struct ip6_hdr, ip6_src.s6_addr) : + offsetof(struct ip6_hdr, ip6_dst.s6_addr); + break; + + default: + return -EAFNOSUPPORT; + } + + do { + /* Compare IPv4 with one word instruction (32bit) */ + struct bpf_insn insn[] = { + /* If skb->protocol != ETH_P_IP, skip this whole block. The offset will be set later. */ + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, htobe16(protocol), 0), + + /* + * Call into BPF_FUNC_skb_load_bytes to load the dst/src IP address + * + * R1: Pointer to the skb + * R2: Data offset + * R3: Destination buffer on the stack (r10 - 4) + * R4: Number of bytes to read (4) + */ + + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_MOV32_IMM(BPF_REG_2, addr_offset), + + BPF_MOV64_REG(BPF_REG_3, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -addr_size), + + BPF_MOV32_IMM(BPF_REG_4, addr_size), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes), + + /* + * Call into BPF_FUNC_map_lookup_elem to see if the address matches any entry in the + * LPM trie map. For this to work, the prefixlen field of 'struct bpf_lpm_trie_key' + * has to be set to the maximum possible value. + * + * On success, the looked up value is stored in R0. For this application, the actual + * value doesn't matter, however; we just set the bit in @verdict in R8 if we found any + * matching value. + */ + + BPF_LD_MAP_FD(BPF_REG_1, map_fd), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -addr_size - sizeof(uint32_t)), + BPF_ST_MEM(BPF_W, BPF_REG_2, 0, addr_size * 8), + + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), + BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict), + }; + + /* Jump label fixup */ + insn[0].off = ELEMENTSOF(insn) - 1; + + r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn)); + if (r < 0) + return r; + + } while (false); + + return 0; +} + +static int add_instructions_for_ip_any( + BPFProgram *p, + int verdict) { + int r; + + assert(p); + + const struct bpf_insn insn[] = { + BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict), + }; + + r = bpf_program_add_instructions(p, insn, 1); + if (r < 0) + return r; + + return 0; +} + +static int bpf_firewall_compile_bpf( + Unit *u, + const char *prog_name, + bool is_ingress, + BPFProgram **ret, + bool ip_allow_any, + bool ip_deny_any) { + + const struct bpf_insn pre_insn[] = { + /* + * When the eBPF program is entered, R1 contains the address of the skb. + * However, R1-R5 are scratch registers that are not preserved when calling + * into kernel functions, so we need to save anything that's supposed to + * stay around to R6-R9. Save the skb to R6. + */ + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + + /* + * Although we cannot access the skb data directly from eBPF programs used in this + * scenario, the kernel has prepared some fields for us to access through struct __sk_buff. + * Load the protocol (IPv4, IPv6) used by the packet in flight once and cache it in R7 + * for later use. + */ + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct __sk_buff, protocol)), + + /* + * R8 is used to keep track of whether any address check has explicitly allowed or denied the packet + * through ACCESS_DENIED or ACCESS_ALLOWED bits. Reset them both to 0 in the beginning. + */ + BPF_MOV32_IMM(BPF_REG_8, 0), + }; + + /* + * The access checkers compiled for the configured allowance and denial lists + * write to R8 at runtime. The following code prepares for an early exit that + * skip the accounting if the packet is denied. + * + * R0 = 1 + * if (R8 == ACCESS_DENIED) + * R0 = 0 + * + * This means that if both ACCESS_DENIED and ACCESS_ALLOWED are set, the packet + * is allowed to pass. + */ + const struct bpf_insn post_insn[] = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_JMP_IMM(BPF_JNE, BPF_REG_8, ACCESS_DENIED, 1), + BPF_MOV64_IMM(BPF_REG_0, 0), + }; + + _cleanup_(bpf_program_freep) BPFProgram *p = NULL; + int accounting_map_fd, r; + bool access_enabled; + + assert(u); + assert(ret); + + accounting_map_fd = is_ingress ? + u->ip_accounting_ingress_map_fd : + u->ip_accounting_egress_map_fd; + + access_enabled = + u->ipv4_allow_map_fd >= 0 || + u->ipv6_allow_map_fd >= 0 || + u->ipv4_deny_map_fd >= 0 || + u->ipv6_deny_map_fd >= 0 || + ip_allow_any || + ip_deny_any; + + if (accounting_map_fd < 0 && !access_enabled) { + *ret = NULL; + return 0; + } + + r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, prog_name, &p); + if (r < 0) + return r; + + r = bpf_program_add_instructions(p, pre_insn, ELEMENTSOF(pre_insn)); + if (r < 0) + return r; + + if (access_enabled) { + /* + * The simple rule this function translates into eBPF instructions is: + * + * - Access will be granted when an address matches an entry in @list_allow + * - Otherwise, access will be denied when an address matches an entry in @list_deny + * - Otherwise, access will be granted + */ + + if (u->ipv4_deny_map_fd >= 0) { + r = add_lookup_instructions(p, u->ipv4_deny_map_fd, ETH_P_IP, is_ingress, ACCESS_DENIED); + if (r < 0) + return r; + } + + if (u->ipv6_deny_map_fd >= 0) { + r = add_lookup_instructions(p, u->ipv6_deny_map_fd, ETH_P_IPV6, is_ingress, ACCESS_DENIED); + if (r < 0) + return r; + } + + if (u->ipv4_allow_map_fd >= 0) { + r = add_lookup_instructions(p, u->ipv4_allow_map_fd, ETH_P_IP, is_ingress, ACCESS_ALLOWED); + if (r < 0) + return r; + } + + if (u->ipv6_allow_map_fd >= 0) { + r = add_lookup_instructions(p, u->ipv6_allow_map_fd, ETH_P_IPV6, is_ingress, ACCESS_ALLOWED); + if (r < 0) + return r; + } + + if (ip_allow_any) { + r = add_instructions_for_ip_any(p, ACCESS_ALLOWED); + if (r < 0) + return r; + } + + if (ip_deny_any) { + r = add_instructions_for_ip_any(p, ACCESS_DENIED); + if (r < 0) + return r; + } + } + + r = bpf_program_add_instructions(p, post_insn, ELEMENTSOF(post_insn)); + if (r < 0) + return r; + + if (accounting_map_fd >= 0) { + struct bpf_insn insn[] = { + /* + * If R0 == 0, the packet will be denied; skip the accounting instructions in this case. + * The jump label will be fixed up later. + */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 0), + + /* Count packets */ + BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_PACKETS), /* r0 = 0 */ + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */ + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */ + BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd), /* load map fd to r1 */ + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */ + BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ + + /* Count bytes */ + BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */ + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */ + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */ + BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */ + BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ + + /* Allow the packet to pass */ + BPF_MOV64_IMM(BPF_REG_0, 1), + }; + + /* Jump label fixup */ + insn[0].off = ELEMENTSOF(insn) - 1; + + r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn)); + if (r < 0) + return r; + } + + do { + /* + * Exit from the eBPF program, R0 contains the verdict. + * 0 means the packet is denied, 1 means the packet may pass. + */ + const struct bpf_insn insn[] = { + BPF_EXIT_INSN() + }; + + r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn)); + if (r < 0) + return r; + } while (false); + + *ret = TAKE_PTR(p); + + return 0; +} + +static int bpf_firewall_count_access_items(Set *prefixes, size_t *n_ipv4, size_t *n_ipv6) { + struct in_addr_prefix *a; + + assert(n_ipv4); + assert(n_ipv6); + + SET_FOREACH(a, prefixes) + switch (a->family) { + + case AF_INET: + (*n_ipv4)++; + break; + + case AF_INET6: + (*n_ipv6)++; + break; + + default: + return -EAFNOSUPPORT; + } + + return 0; +} + +static int bpf_firewall_add_access_items( + Set *prefixes, + int ipv4_map_fd, + int ipv6_map_fd, + int verdict) { + + struct bpf_lpm_trie_key *key_ipv4, *key_ipv6; + struct in_addr_prefix *a; + uint64_t value = verdict; + int r; + + key_ipv4 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t)); + key_ipv6 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t) * 4); + + SET_FOREACH(a, prefixes) + switch (a->family) { + + case AF_INET: + key_ipv4->prefixlen = a->prefixlen; + memcpy(key_ipv4->data, &a->address, sizeof(uint32_t)); + + r = bpf_map_update_element(ipv4_map_fd, key_ipv4, &value); + if (r < 0) + return r; + + break; + + case AF_INET6: + key_ipv6->prefixlen = a->prefixlen; + memcpy(key_ipv6->data, &a->address, 4 * sizeof(uint32_t)); + + r = bpf_map_update_element(ipv6_map_fd, key_ipv6, &value); + if (r < 0) + return r; + + break; + + default: + return -EAFNOSUPPORT; + } + + return 0; +} + +static int bpf_firewall_prepare_access_maps( + Unit *u, + int verdict, + int *ret_ipv4_map_fd, + int *ret_ipv6_map_fd, + bool *ret_has_any) { + + _cleanup_close_ int ipv4_map_fd = -1, ipv6_map_fd = -1; + size_t n_ipv4 = 0, n_ipv6 = 0; + Unit *p; + int r; + + assert(ret_ipv4_map_fd); + assert(ret_ipv6_map_fd); + assert(ret_has_any); + + for (p = u; p; p = UNIT_GET_SLICE(p)) { + CGroupContext *cc; + Set *prefixes; + bool *reduced; + + cc = unit_get_cgroup_context(p); + if (!cc) + continue; + + prefixes = verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny; + reduced = verdict == ACCESS_ALLOWED ? &cc->ip_address_allow_reduced : &cc->ip_address_deny_reduced; + + if (!*reduced) { + r = in_addr_prefixes_reduce(prefixes); + if (r < 0) + return r; + + *reduced = true; + } + + bpf_firewall_count_access_items(prefixes, &n_ipv4, &n_ipv6); + + /* Skip making the LPM trie map in cases where we are using "any" in order to hack around + * needing CAP_SYS_ADMIN for allocating LPM trie map. */ + if (in_addr_prefixes_is_any(prefixes)) { + *ret_has_any = true; + return 0; + } + } + + if (n_ipv4 > 0) { + ipv4_map_fd = bpf_map_new( + BPF_MAP_TYPE_LPM_TRIE, + offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t), + sizeof(uint64_t), + n_ipv4, + BPF_F_NO_PREALLOC); + if (ipv4_map_fd < 0) + return ipv4_map_fd; + } + + if (n_ipv6 > 0) { + ipv6_map_fd = bpf_map_new( + BPF_MAP_TYPE_LPM_TRIE, + offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t)*4, + sizeof(uint64_t), + n_ipv6, + BPF_F_NO_PREALLOC); + if (ipv6_map_fd < 0) + return ipv6_map_fd; + } + + for (p = u; p; p = UNIT_GET_SLICE(p)) { + CGroupContext *cc; + + cc = unit_get_cgroup_context(p); + if (!cc) + continue; + + r = bpf_firewall_add_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny, + ipv4_map_fd, ipv6_map_fd, verdict); + if (r < 0) + return r; + } + + *ret_ipv4_map_fd = TAKE_FD(ipv4_map_fd); + *ret_ipv6_map_fd = TAKE_FD(ipv6_map_fd); + *ret_has_any = false; + return 0; +} + +static int bpf_firewall_prepare_accounting_maps(Unit *u, bool enabled, int *fd_ingress, int *fd_egress) { + int r; + + assert(u); + assert(fd_ingress); + assert(fd_egress); + + if (enabled) { + if (*fd_ingress < 0) { + r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0); + if (r < 0) + return r; + + *fd_ingress = r; + } + + if (*fd_egress < 0) { + + r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0); + if (r < 0) + return r; + + *fd_egress = r; + } + + } else { + *fd_ingress = safe_close(*fd_ingress); + *fd_egress = safe_close(*fd_egress); + + zero(u->ip_accounting_extra); + } + + return 0; +} + +int bpf_firewall_compile(Unit *u) { + const char *ingress_name = NULL, *egress_name = NULL; + bool ip_allow_any = false, ip_deny_any = false; + CGroupContext *cc; + int r, supported; + + assert(u); + + cc = unit_get_cgroup_context(u); + if (!cc) + return -EINVAL; + + supported = bpf_firewall_supported(); + if (supported < 0) + return supported; + if (supported == BPF_FIREWALL_UNSUPPORTED) + return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), + "bpf-firewall: BPF firewalling not supported, proceeding without."); + if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE) + /* If BPF_F_ALLOW_MULTI is not supported we don't support any BPF magic on inner nodes (i.e. on slice + * units), since that would mean leaf nodes couldn't do any BPF anymore at all. Under the assumption + * that BPF is more interesting on leaf nodes we hence avoid it on inner nodes in that case. This is + * consistent with old systemd behaviour from before v238, where BPF wasn't supported in inner nodes at + * all, either. */ + return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), + "bpf-firewall: BPF_F_ALLOW_MULTI is not supported, not doing BPF firewall on slice units."); + + /* If BPF_F_ALLOW_MULTI flag is supported program name is also supported (both were added to v4.15 + * kernel). */ + if (supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI) { + ingress_name = "sd_fw_ingress"; + egress_name = "sd_fw_egress"; + } + + /* Note that when we compile a new firewall we first flush out the access maps and the BPF programs themselves, + * but we reuse the accounting maps. That way the firewall in effect always maps to the actual + * configuration, but we don't flush out the accounting unnecessarily */ + + u->ip_bpf_ingress = bpf_program_free(u->ip_bpf_ingress); + u->ip_bpf_egress = bpf_program_free(u->ip_bpf_egress); + + u->ipv4_allow_map_fd = safe_close(u->ipv4_allow_map_fd); + u->ipv4_deny_map_fd = safe_close(u->ipv4_deny_map_fd); + + u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd); + u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd); + + if (u->type != UNIT_SLICE) { + /* In inner nodes we only do accounting, we do not actually bother with access control. However, leaf + * nodes will incorporate all IP access rules set on all their parent nodes. This has the benefit that + * they can optionally cancel out system-wide rules. Since inner nodes can't contain processes this + * means that all configure IP access rules *will* take effect on processes, even though we never + * compile them for inner nodes. */ + + r = bpf_firewall_prepare_access_maps(u, ACCESS_ALLOWED, &u->ipv4_allow_map_fd, &u->ipv6_allow_map_fd, &ip_allow_any); + if (r < 0) + return log_unit_error_errno(u, r, "bpf-firewall: Preparation of BPF allow maps failed: %m"); + + r = bpf_firewall_prepare_access_maps(u, ACCESS_DENIED, &u->ipv4_deny_map_fd, &u->ipv6_deny_map_fd, &ip_deny_any); + if (r < 0) + return log_unit_error_errno(u, r, "bpf-firewall: Preparation of BPF deny maps failed: %m"); + } + + r = bpf_firewall_prepare_accounting_maps(u, cc->ip_accounting, &u->ip_accounting_ingress_map_fd, &u->ip_accounting_egress_map_fd); + if (r < 0) + return log_unit_error_errno(u, r, "bpf-firewall: Preparation of BPF accounting maps failed: %m"); + + r = bpf_firewall_compile_bpf(u, ingress_name, true, &u->ip_bpf_ingress, ip_allow_any, ip_deny_any); + if (r < 0) + return log_unit_error_errno(u, r, "bpf-firewall: Compilation of ingress BPF program failed: %m"); + + r = bpf_firewall_compile_bpf(u, egress_name, false, &u->ip_bpf_egress, ip_allow_any, ip_deny_any); + if (r < 0) + return log_unit_error_errno(u, r, "bpf-firewall: Compilation of egress BPF program failed: %m"); + + return 0; +} + +static int load_bpf_progs_from_fs_to_set(Unit *u, char **filter_paths, Set **set) { + set_clear(*set); + + STRV_FOREACH(bpf_fs_path, filter_paths) { + _cleanup_(bpf_program_freep) BPFProgram *prog = NULL; + int r; + + r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, NULL, &prog); + if (r < 0) + return log_unit_error_errno(u, r, "bpf-firewall: Allocation of SKB BPF program failed: %m"); + + r = bpf_program_load_from_bpf_fs(prog, *bpf_fs_path); + if (r < 0) + return log_unit_error_errno(u, r, "bpf-firewall: Loading of ingress BPF program %s failed: %m", *bpf_fs_path); + + r = set_ensure_consume(set, &bpf_program_hash_ops, TAKE_PTR(prog)); + if (r < 0) + return log_oom(); + } + + return 0; +} + +int bpf_firewall_load_custom(Unit *u) { + CGroupContext *cc; + int r, supported; + + assert(u); + + cc = unit_get_cgroup_context(u); + if (!cc) + return 0; + + if (!(cc->ip_filters_ingress || cc->ip_filters_egress)) + return 0; + + supported = bpf_firewall_supported(); + if (supported < 0) + return supported; + + if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI) + return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), + "bpf-firewall: BPF_F_ALLOW_MULTI not supported, cannot attach custom BPF programs."); + + r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_ingress, &u->ip_bpf_custom_ingress); + if (r < 0) + return r; + r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_egress, &u->ip_bpf_custom_egress); + if (r < 0) + return r; + + return 0; +} + +static int attach_custom_bpf_progs(Unit *u, const char *path, int attach_type, Set **set, Set **set_installed) { + BPFProgram *prog; + int r; + + assert(u); + + set_clear(*set_installed); + r = set_ensure_allocated(set_installed, &bpf_program_hash_ops); + if (r < 0) + return log_oom(); + + SET_FOREACH_MOVE(prog, *set_installed, *set) { + r = bpf_program_cgroup_attach(prog, attach_type, path, BPF_F_ALLOW_MULTI); + if (r < 0) + return log_unit_error_errno(u, r, "bpf-firewall: Attaching custom egress BPF program to cgroup %s failed: %m", path); + } + return 0; +} + +int bpf_firewall_install(Unit *u) { + _cleanup_(bpf_program_freep) BPFProgram *ip_bpf_ingress_uninstall = NULL, *ip_bpf_egress_uninstall = NULL; + _cleanup_free_ char *path = NULL; + CGroupContext *cc; + int r, supported; + uint32_t flags; + + assert(u); + + cc = unit_get_cgroup_context(u); + if (!cc) + return -EINVAL; + if (!u->cgroup_path) + return -EINVAL; + if (!u->cgroup_realized) + return -EINVAL; + + supported = bpf_firewall_supported(); + if (supported < 0) + return supported; + if (supported == BPF_FIREWALL_UNSUPPORTED) + return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), + "bpf-firewall: BPF firewalling not supported, proceeding without."); + if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE) + return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), + "bpf-firewall: BPF_F_ALLOW_MULTI not supported, not doing BPF firewall on slice units."); + if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && + (!set_isempty(u->ip_bpf_custom_ingress) || !set_isempty(u->ip_bpf_custom_egress))) + return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), + "bpf-firewall: BPF_F_ALLOW_MULTI not supported, cannot attach custom BPF programs."); + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &path); + if (r < 0) + return log_unit_error_errno(u, r, "bpf-firewall: Failed to determine cgroup path: %m"); + + flags = supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI ? BPF_F_ALLOW_MULTI : 0; + + if (FLAGS_SET(flags, BPF_F_ALLOW_MULTI)) { + /* If we have BPF_F_ALLOW_MULTI, then let's clear the fields, but destroy the programs only + * after attaching the new programs, so that there's no time window where neither program is + * attached. (There will be a program where both are attached, but that's OK, since this is a + * security feature where we rather want to lock down too much than too little */ + ip_bpf_egress_uninstall = TAKE_PTR(u->ip_bpf_egress_installed); + ip_bpf_ingress_uninstall = TAKE_PTR(u->ip_bpf_ingress_installed); + } else { + /* If we don't have BPF_F_ALLOW_MULTI then unref the old BPF programs (which will implicitly + * detach them) right before attaching the new program, to minimize the time window when we + * don't account for IP traffic. */ + u->ip_bpf_egress_installed = bpf_program_free(u->ip_bpf_egress_installed); + u->ip_bpf_ingress_installed = bpf_program_free(u->ip_bpf_ingress_installed); + } + + if (u->ip_bpf_egress) { + r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, flags); + if (r < 0) + return log_unit_error_errno(u, r, + "bpf-firewall: Attaching egress BPF program to cgroup %s failed: %m", path); + + /* Remember that this BPF program is installed now. */ + u->ip_bpf_egress_installed = TAKE_PTR(u->ip_bpf_egress); + } + + if (u->ip_bpf_ingress) { + r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, flags); + if (r < 0) + return log_unit_error_errno(u, r, + "bpf-firewall: Attaching ingress BPF program to cgroup %s failed: %m", path); + + u->ip_bpf_ingress_installed = TAKE_PTR(u->ip_bpf_ingress); + } + + /* And now, definitely get rid of the old programs, and detach them */ + ip_bpf_egress_uninstall = bpf_program_free(ip_bpf_egress_uninstall); + ip_bpf_ingress_uninstall = bpf_program_free(ip_bpf_ingress_uninstall); + + r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_EGRESS, &u->ip_bpf_custom_egress, &u->ip_bpf_custom_egress_installed); + if (r < 0) + return r; + + r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_INGRESS, &u->ip_bpf_custom_ingress, &u->ip_bpf_custom_ingress_installed); + if (r < 0) + return r; + + return 0; +} + +int bpf_firewall_read_accounting(int map_fd, uint64_t *ret_bytes, uint64_t *ret_packets) { + uint64_t key, packets; + int r; + + if (map_fd < 0) + return -EBADF; + + if (ret_packets) { + key = MAP_KEY_PACKETS; + r = bpf_map_lookup_element(map_fd, &key, &packets); + if (r < 0) + return r; + } + + if (ret_bytes) { + key = MAP_KEY_BYTES; + r = bpf_map_lookup_element(map_fd, &key, ret_bytes); + if (r < 0) + return r; + } + + if (ret_packets) + *ret_packets = packets; + + return 0; +} + +int bpf_firewall_reset_accounting(int map_fd) { + uint64_t key, value = 0; + int r; + + if (map_fd < 0) + return -EBADF; + + key = MAP_KEY_PACKETS; + r = bpf_map_update_element(map_fd, &key, &value); + if (r < 0) + return r; + + key = MAP_KEY_BYTES; + return bpf_map_update_element(map_fd, &key, &value); +} + +static int bpf_firewall_unsupported_reason = 0; + +int bpf_firewall_supported(void) { + const struct bpf_insn trivial[] = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN() + }; + + _cleanup_(bpf_program_freep) BPFProgram *program = NULL; + static int supported = -1; + union bpf_attr attr; + int r; + + /* Checks whether BPF firewalling is supported. For this, we check the following things: + * + * - whether the unified hierarchy is being used + * - the BPF implementation in the kernel supports BPF_PROG_TYPE_CGROUP_SKB programs, which we require + * - the BPF implementation in the kernel supports the BPF_PROG_DETACH call, which we require + */ + if (supported >= 0) + return supported; + + r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER); + if (r < 0) + return log_error_errno(r, "bpf-firewall: Can't determine whether the unified hierarchy is used: %m"); + if (r == 0) { + bpf_firewall_unsupported_reason = + log_debug_errno(SYNTHETIC_ERRNO(EUCLEAN), + "bpf-firewall: Not running with unified cgroup hierarchy, BPF firewalling is not supported."); + return supported = BPF_FIREWALL_UNSUPPORTED; + } + + /* prog_name is NULL since it is supported only starting from v4.15 kernel. */ + r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, NULL, &program); + if (r < 0) { + bpf_firewall_unsupported_reason = + log_debug_errno(r, "bpf-firewall: Can't allocate CGROUP SKB BPF program, BPF firewalling is not supported: %m"); + return supported = BPF_FIREWALL_UNSUPPORTED; + } + + r = bpf_program_add_instructions(program, trivial, ELEMENTSOF(trivial)); + if (r < 0) { + bpf_firewall_unsupported_reason = + log_debug_errno(r, "bpf-firewall: Can't add trivial instructions to CGROUP SKB BPF program, BPF firewalling is not supported: %m"); + return supported = BPF_FIREWALL_UNSUPPORTED; + } + + r = bpf_program_load_kernel(program, NULL, 0); + if (r < 0) { + bpf_firewall_unsupported_reason = + log_debug_errno(r, "bpf-firewall: Can't load kernel CGROUP SKB BPF program, BPF firewalling is not supported: %m"); + return supported = BPF_FIREWALL_UNSUPPORTED; + } + + /* Unfortunately the kernel allows us to create BPF_PROG_TYPE_CGROUP_SKB programs even when CONFIG_CGROUP_BPF + * is turned off at kernel compilation time. This sucks of course: why does it allow us to create a cgroup BPF + * program if we can't do a thing with it later? + * + * We detect this case by issuing the BPF_PROG_DETACH bpf() call with invalid file descriptors: if + * CONFIG_CGROUP_BPF is turned off, then the call will fail early with EINVAL. If it is turned on the + * parameters are validated however, and that'll fail with EBADF then. */ + + // FIXME: Clang doesn't 0-pad with structured initialization, causing + // the kernel to reject the bpf_attr as invalid. See: + // https://github.com/torvalds/linux/blob/v5.9/kernel/bpf/syscall.c#L65 + // Ideally it should behave like GCC, so that we can remove these workarounds. + zero(attr); + attr.attach_type = BPF_CGROUP_INET_EGRESS; + attr.target_fd = -1; + attr.attach_bpf_fd = -1; + + if (bpf(BPF_PROG_DETACH, &attr, sizeof(attr)) < 0) { + if (errno != EBADF) { + bpf_firewall_unsupported_reason = + log_debug_errno(errno, "bpf-firewall: Didn't get EBADF from BPF_PROG_DETACH, BPF firewalling is not supported: %m"); + return supported = BPF_FIREWALL_UNSUPPORTED; + } + + /* YAY! */ + } else { + bpf_firewall_unsupported_reason = + log_debug_errno(SYNTHETIC_ERRNO(EBADE), + "bpf-firewall: Wut? Kernel accepted our invalid BPF_PROG_DETACH call? " + "Something is weird, assuming BPF firewalling is broken and hence not supported."); + return supported = BPF_FIREWALL_UNSUPPORTED; + } + + /* So now we know that the BPF program is generally available, let's see if BPF_F_ALLOW_MULTI is also supported + * (which was added in kernel 4.15). We use a similar logic as before, but this time we use the BPF_PROG_ATTACH + * bpf() call and the BPF_F_ALLOW_MULTI flags value. Since the flags are checked early in the system call we'll + * get EINVAL if it's not supported, and EBADF as before if it is available. + * Use probe result as the indicator that program name is also supported since they both were + * added in kernel 4.15. */ + + zero(attr); + attr.attach_type = BPF_CGROUP_INET_EGRESS; + attr.target_fd = -1; + attr.attach_bpf_fd = -1; + attr.attach_flags = BPF_F_ALLOW_MULTI; + + if (bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)) < 0) { + if (errno == EBADF) { + log_debug_errno(errno, "bpf-firewall: Got EBADF when using BPF_F_ALLOW_MULTI, which indicates it is supported. Yay!"); + return supported = BPF_FIREWALL_SUPPORTED_WITH_MULTI; + } + + if (errno == EINVAL) + log_debug_errno(errno, "bpf-firewall: Got EINVAL error when using BPF_F_ALLOW_MULTI, which indicates it's not supported."); + else + log_debug_errno(errno, "bpf-firewall: Got unexpected error when using BPF_F_ALLOW_MULTI, assuming it's not supported: %m"); + + return supported = BPF_FIREWALL_SUPPORTED; + } else { + bpf_firewall_unsupported_reason = + log_debug_errno(SYNTHETIC_ERRNO(EBADE), + "bpf-firewall: Wut? Kernel accepted our invalid BPF_PROG_ATTACH+BPF_F_ALLOW_MULTI call? " + "Something is weird, assuming BPF firewalling is broken and hence not supported."); + return supported = BPF_FIREWALL_UNSUPPORTED; + } +} + +void emit_bpf_firewall_warning(Unit *u) { + static bool warned = false; + + assert(u); + assert(u->manager); + + if (warned || MANAGER_IS_TEST_RUN(u->manager)) + return; + + bool quiet = ERRNO_IS_PRIVILEGE(bpf_firewall_unsupported_reason) && detect_container() > 0; + + log_unit_full_errno(u, quiet ? LOG_DEBUG : LOG_WARNING, bpf_firewall_unsupported_reason, + "unit configures an IP firewall, but %s.\n" + "(This warning is only shown for the first unit using IP firewalling.)", + getuid() != 0 ? "not running as root" : + "the local system does not support BPF/cgroup firewalling"); + warned = true; +} + +void bpf_firewall_close(Unit *u) { + assert(u); + + u->ip_accounting_ingress_map_fd = safe_close(u->ip_accounting_ingress_map_fd); + u->ip_accounting_egress_map_fd = safe_close(u->ip_accounting_egress_map_fd); + + u->ipv4_allow_map_fd = safe_close(u->ipv4_allow_map_fd); + u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd); + u->ipv4_deny_map_fd = safe_close(u->ipv4_deny_map_fd); + u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd); + + u->ip_bpf_ingress = bpf_program_free(u->ip_bpf_ingress); + u->ip_bpf_ingress_installed = bpf_program_free(u->ip_bpf_ingress_installed); + u->ip_bpf_egress = bpf_program_free(u->ip_bpf_egress); + u->ip_bpf_egress_installed = bpf_program_free(u->ip_bpf_egress_installed); + + u->ip_bpf_custom_ingress = set_free(u->ip_bpf_custom_ingress); + u->ip_bpf_custom_egress = set_free(u->ip_bpf_custom_egress); + u->ip_bpf_custom_ingress_installed = set_free(u->ip_bpf_custom_ingress_installed); + u->ip_bpf_custom_egress_installed = set_free(u->ip_bpf_custom_egress_installed); +} diff --git a/src/core/bpf-firewall.h b/src/core/bpf-firewall.h new file mode 100644 index 0000000..58b401f --- /dev/null +++ b/src/core/bpf-firewall.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include <inttypes.h> + +#include "unit.h" + +enum { + BPF_FIREWALL_UNSUPPORTED = 0, + BPF_FIREWALL_SUPPORTED = 1, + BPF_FIREWALL_SUPPORTED_WITH_MULTI = 2, +}; + +int bpf_firewall_supported(void); + +int bpf_firewall_compile(Unit *u); +int bpf_firewall_install(Unit *u); +int bpf_firewall_load_custom(Unit *u); + +int bpf_firewall_read_accounting(int map_fd, uint64_t *ret_bytes, uint64_t *ret_packets); +int bpf_firewall_reset_accounting(int map_fd); + +void emit_bpf_firewall_warning(Unit *u); + +void bpf_firewall_close(Unit *u); diff --git a/src/core/bpf-foreign.c b/src/core/bpf-foreign.c new file mode 100644 index 0000000..cff2f61 --- /dev/null +++ b/src/core/bpf-foreign.c @@ -0,0 +1,154 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bpf-foreign.h" +#include "bpf-program.h" +#include "cgroup.h" +#include "memory-util.h" +#include "missing_magic.h" +#include "mountpoint-util.h" +#include "set.h" +#include "stat-util.h" + +typedef struct BPFForeignKey BPFForeignKey; +struct BPFForeignKey { + uint32_t prog_id; + uint32_t attach_type; +}; + +static int bpf_foreign_key_new(uint32_t prog_id, + enum bpf_attach_type attach_type, + BPFForeignKey **ret) { + _cleanup_free_ BPFForeignKey *p = NULL; + + assert(ret); + + p = new(BPFForeignKey, 1); + if (!p) + return -ENOMEM; + + *p = (BPFForeignKey) { + .prog_id = prog_id, + .attach_type = attach_type, + }; + + *ret = TAKE_PTR(p); + + return 0; +} + +static int bpf_foreign_key_compare_func(const BPFForeignKey *a, const BPFForeignKey *b) { + int r = CMP(a->prog_id, b->prog_id); + if (r != 0) + return r; + + return CMP(a->attach_type, b->attach_type); +} + +static void bpf_foreign_key_hash_func(const BPFForeignKey *p, struct siphash *h) { + siphash24_compress(&p->prog_id, sizeof(p->prog_id), h); + siphash24_compress(&p->attach_type, sizeof(p->attach_type), h); +} + +DEFINE_PRIVATE_HASH_OPS_FULL(bpf_foreign_by_key_hash_ops, + BPFForeignKey, bpf_foreign_key_hash_func, bpf_foreign_key_compare_func, free, + BPFProgram, bpf_program_free); + +static int attach_programs(Unit *u, const char *path, Hashmap* foreign_by_key, uint32_t attach_flags) { + const BPFForeignKey *key; + BPFProgram *prog; + int r, ret = 0; + + assert(u); + + HASHMAP_FOREACH_KEY(prog, key, foreign_by_key) { + r = bpf_program_cgroup_attach(prog, key->attach_type, path, attach_flags); + if (r < 0) { + log_unit_error_errno(u, r, "bpf-foreign: Attaching foreign BPF program to cgroup %s failed: %m", path); + if (ret >= 0) + ret = r; + } + } + + return ret; +} + +/* + * Prepare foreign BPF program for installation: + * - Load the program from BPF filesystem to the kernel; + * - Store program FD identified by program ID and attach type in the unit. + */ +static int bpf_foreign_prepare( + Unit *u, + enum bpf_attach_type attach_type, + const char *bpffs_path) { + _cleanup_(bpf_program_freep) BPFProgram *prog = NULL; + _cleanup_free_ BPFForeignKey *key = NULL; + uint32_t prog_id; + int r; + + assert(u); + assert(bpffs_path); + + r = path_is_fs_type(bpffs_path, BPF_FS_MAGIC); + if (r == -ENOENT) { + log_unit_warning_errno(u, r, "bpf-foreign: foreign program %s does not exist, skipping.", bpffs_path); + return 0; + } + if (r < 0) + return log_unit_error_errno(u, r, + "bpf-foreign: Failed to determine filesystem type of %s: %m", bpffs_path); + if (r == 0) + return log_unit_error_errno(u, SYNTHETIC_ERRNO(EINVAL), + "bpf-foreign: Path in BPF filesystem is expected."); + + r = bpf_program_new_from_bpffs_path(bpffs_path, &prog); + if (r < 0) + return log_unit_error_errno(u, r, "bpf-foreign: Failed to create foreign BPF program: %m"); + + r = bpf_program_get_id_by_fd(prog->kernel_fd, &prog_id); + if (r < 0) + return log_unit_error_errno(u, r, "bpf-foreign: Failed to get BPF program id from fd: %m"); + + r = bpf_foreign_key_new(prog_id, attach_type, &key); + if (r < 0) + return log_unit_error_errno(u, r, + "bpf-foreign: Failed to create foreign BPF program key from path '%s': %m", bpffs_path); + + r = hashmap_ensure_put(&u->bpf_foreign_by_key, &bpf_foreign_by_key_hash_ops, key, prog); + if (r == -EEXIST) { + log_unit_warning_errno(u, r, "bpf-foreign: Foreign BPF program already exists, ignoring: %m"); + return 0; + } + if (r < 0) + return log_unit_error_errno(u, r, "bpf-foreign: Failed to put foreign BPF program into map: %m"); + + TAKE_PTR(key); + TAKE_PTR(prog); + + return 0; +} + +int bpf_foreign_install(Unit *u) { + _cleanup_free_ char *cgroup_path = NULL; + CGroupContext *cc; + int r, ret = 0; + + assert(u); + + cc = unit_get_cgroup_context(u); + if (!cc) + return 0; + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &cgroup_path); + if (r < 0) + return log_unit_error_errno(u, r, "bpf-foreign: Failed to get cgroup path: %m"); + + LIST_FOREACH(programs, p, cc->bpf_foreign_programs) { + r = bpf_foreign_prepare(u, p->attach_type, p->bpffs_path); + if (r < 0 && ret >= 0) + ret = r; + } + + r = attach_programs(u, cgroup_path, u->bpf_foreign_by_key, BPF_F_ALLOW_MULTI); + return ret < 0 ? ret : r; +} diff --git a/src/core/bpf-foreign.h b/src/core/bpf-foreign.h new file mode 100644 index 0000000..e387b1b --- /dev/null +++ b/src/core/bpf-foreign.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#pragma once + +#include "unit.h" + +static inline int bpf_foreign_supported(void) { + return cg_all_unified(); +} + +/* + * Attach cgroup-bpf programs foreign to systemd, i.e. loaded to the kernel by an entity + * external to systemd. + */ +int bpf_foreign_install(Unit *u); diff --git a/src/core/bpf-lsm.c b/src/core/bpf-lsm.c new file mode 100644 index 0000000..a3726d9 --- /dev/null +++ b/src/core/bpf-lsm.c @@ -0,0 +1,364 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <errno.h> +#include <fcntl.h> +#include <linux/types.h> +#include <sys/resource.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <sys/types.h> +#include <unistd.h> + +#include "alloc-util.h" +#include "bpf-lsm.h" +#include "cgroup-util.h" +#include "fd-util.h" +#include "fileio.h" +#include "filesystems.h" +#include "log.h" +#include "manager.h" +#include "mkdir.h" +#include "nulstr-util.h" +#include "stat-util.h" +#include "strv.h" + +#if BPF_FRAMEWORK +/* libbpf, clang and llc compile time dependencies are satisfied */ +#include "bpf-dlopen.h" +#include "bpf-link.h" +#include "bpf-util.h" +#include "bpf/restrict_fs/restrict-fs-skel.h" + +#define CGROUP_HASH_SIZE_MAX 2048 + +static struct restrict_fs_bpf *restrict_fs_bpf_free(struct restrict_fs_bpf *obj) { + /* restrict_fs_bpf__destroy handles object == NULL case */ + (void) restrict_fs_bpf__destroy(obj); + + return NULL; +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(struct restrict_fs_bpf *, restrict_fs_bpf_free); + +static bool bpf_can_link_lsm_program(struct bpf_program *prog) { + _cleanup_(bpf_link_freep) struct bpf_link *link = NULL; + + assert(prog); + + link = sym_bpf_program__attach_lsm(prog); + + /* If bpf_program__attach_lsm fails the resulting value stores libbpf error code instead of memory + * pointer. That is the case when the helper is called on architectures where BPF trampoline (hence + * BPF_LSM_MAC attach type) is not supported. */ + return sym_libbpf_get_error(link) == 0; +} + +static int prepare_restrict_fs_bpf(struct restrict_fs_bpf **ret_obj) { + _cleanup_(restrict_fs_bpf_freep) struct restrict_fs_bpf *obj = NULL; + _cleanup_close_ int inner_map_fd = -1; + int r; + + assert(ret_obj); + + obj = restrict_fs_bpf__open(); + if (!obj) + return log_error_errno(errno, "bpf-lsm: Failed to open BPF object: %m"); + + /* TODO Maybe choose a number based on runtime information? */ + r = sym_bpf_map__set_max_entries(obj->maps.cgroup_hash, CGROUP_HASH_SIZE_MAX); + assert(r <= 0); + if (r < 0) + return log_error_errno(r, "bpf-lsm: Failed to resize BPF map '%s': %m", + sym_bpf_map__name(obj->maps.cgroup_hash)); + + /* Dummy map to satisfy the verifier */ + inner_map_fd = compat_bpf_map_create(BPF_MAP_TYPE_HASH, NULL, sizeof(uint32_t), sizeof(uint32_t), 128U, NULL); + if (inner_map_fd < 0) + return log_error_errno(errno, "bpf-lsm: Failed to create BPF map: %m"); + + r = sym_bpf_map__set_inner_map_fd(obj->maps.cgroup_hash, inner_map_fd); + assert(r <= 0); + if (r < 0) + return log_error_errno(r, "bpf-lsm: Failed to set inner map fd: %m"); + + r = restrict_fs_bpf__load(obj); + assert(r <= 0); + if (r < 0) + return log_error_errno(r, "bpf-lsm: Failed to load BPF object: %m"); + + *ret_obj = TAKE_PTR(obj); + + return 0; +} + +static int mac_bpf_use(void) { + _cleanup_free_ char *lsm_list = NULL; + static int cached_use = -1; + int r; + + if (cached_use >= 0) + return cached_use; + + cached_use = 0; + + r = read_one_line_file("/sys/kernel/security/lsm", &lsm_list); + if (r < 0) { + if (r != -ENOENT) + log_notice_errno(r, "bpf-lsm: Failed to read /sys/kernel/security/lsm, assuming bpf is unavailable: %m"); + return 0; + } + + for (const char *p = lsm_list;;) { + _cleanup_free_ char *word = NULL; + + r = extract_first_word(&p, &word, ",", 0); + if (r == 0) + return 0; + if (r == -ENOMEM) + return log_oom(); + if (r < 0) { + log_notice_errno(r, "bpf-lsm: Failed to parse /sys/kernel/security/lsm, assuming bpf is unavailable: %m"); + return 0; + } + + if (streq(word, "bpf")) + return cached_use = 1; + } +} + +bool lsm_bpf_supported(bool initialize) { + _cleanup_(restrict_fs_bpf_freep) struct restrict_fs_bpf *obj = NULL; + static int supported = -1; + int r; + + if (supported >= 0) + return supported; + if (!initialize) + return false; + + if (!cgroup_bpf_supported()) + return (supported = false); + + r = mac_bpf_use(); + if (r < 0) { + log_warning_errno(r, "bpf-lsm: Can't determine whether the BPF LSM module is used: %m"); + return (supported = false); + } + + if (r == 0) { + log_info_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "bpf-lsm: BPF LSM hook not enabled in the kernel, BPF LSM not supported"); + return (supported = false); + } + + r = prepare_restrict_fs_bpf(&obj); + if (r < 0) + return (supported = false); + + if (!bpf_can_link_lsm_program(obj->progs.restrict_filesystems)) { + log_warning_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "bpf-lsm: Failed to link program; assuming BPF LSM is not available"); + return (supported = false); + } + + return (supported = true); +} + +int lsm_bpf_setup(Manager *m) { + _cleanup_(restrict_fs_bpf_freep) struct restrict_fs_bpf *obj = NULL; + _cleanup_(bpf_link_freep) struct bpf_link *link = NULL; + int r; + + assert(m); + + r = prepare_restrict_fs_bpf(&obj); + if (r < 0) + return r; + + link = sym_bpf_program__attach_lsm(obj->progs.restrict_filesystems); + r = sym_libbpf_get_error(link); + if (r != 0) + return log_error_errno(r, "bpf-lsm: Failed to link '%s' LSM BPF program: %m", + sym_bpf_program__name(obj->progs.restrict_filesystems)); + + log_info("bpf-lsm: LSM BPF program attached"); + + obj->links.restrict_filesystems = TAKE_PTR(link); + m->restrict_fs = TAKE_PTR(obj); + + return 0; +} + +int lsm_bpf_unit_restrict_filesystems(Unit *u, const Set *filesystems, bool allow_list) { + uint32_t dummy_value = 1, zero = 0; + const char *fs; + const statfs_f_type_t *magic; + int r; + + assert(filesystems); + assert(u); + + if (!u->manager->restrict_fs) + return log_unit_error_errno(u, SYNTHETIC_ERRNO(EINVAL), + "bpf-lsm: BPF LSM object is not installed, has setup failed?"); + + int inner_map_fd = compat_bpf_map_create( + BPF_MAP_TYPE_HASH, + NULL, + sizeof(uint32_t), + sizeof(uint32_t), + 128U, /* Should be enough for all filesystem types */ + NULL); + if (inner_map_fd < 0) + return log_unit_error_errno(u, errno, "bpf-lsm: Failed to create inner BPF map: %m"); + + int outer_map_fd = sym_bpf_map__fd(u->manager->restrict_fs->maps.cgroup_hash); + if (outer_map_fd < 0) + return log_unit_error_errno(u, errno, "bpf-lsm: Failed to get BPF map fd: %m"); + + if (sym_bpf_map_update_elem(outer_map_fd, &u->cgroup_id, &inner_map_fd, BPF_ANY) != 0) + return log_unit_error_errno(u, errno, "bpf-lsm: Error populating BPF map: %m"); + + uint32_t allow = allow_list; + + /* Use key 0 to store whether this is an allow list or a deny list */ + if (sym_bpf_map_update_elem(inner_map_fd, &zero, &allow, BPF_ANY) != 0) + return log_unit_error_errno(u, errno, "bpf-lsm: Error initializing map: %m"); + + SET_FOREACH(fs, filesystems) { + r = fs_type_from_string(fs, &magic); + if (r < 0) { + log_unit_warning(u, "bpf-lsm: Invalid filesystem name '%s', ignoring.", fs); + continue; + } + + log_unit_debug(u, "bpf-lsm: Restricting filesystem access to '%s'", fs); + + for (int i = 0; i < FILESYSTEM_MAGIC_MAX; i++) { + if (magic[i] == 0) + break; + + if (sym_bpf_map_update_elem(inner_map_fd, &magic[i], &dummy_value, BPF_ANY) != 0) { + r = log_unit_error_errno(u, errno, "bpf-lsm: Failed to update BPF map: %m"); + + if (sym_bpf_map_delete_elem(outer_map_fd, &u->cgroup_id) != 0) + log_unit_debug_errno(u, errno, "bpf-lsm: Failed to delete cgroup entry from BPF map: %m"); + + return r; + } + } + } + + return 0; +} + +int lsm_bpf_cleanup(const Unit *u) { + assert(u); + assert(u->manager); + + /* If we never successfully detected support, there is nothing to clean up. */ + if (!lsm_bpf_supported(/* initialize = */ false)) + return 0; + + if (!u->manager->restrict_fs) + return 0; + + if (u->cgroup_id == 0) + return 0; + + int fd = sym_bpf_map__fd(u->manager->restrict_fs->maps.cgroup_hash); + if (fd < 0) + return log_unit_error_errno(u, errno, "bpf-lsm: Failed to get BPF map fd: %m"); + + if (sym_bpf_map_delete_elem(fd, &u->cgroup_id) != 0 && errno != ENOENT) + return log_unit_debug_errno(u, errno, "bpf-lsm: Failed to delete cgroup entry from LSM BPF map: %m"); + + return 0; +} + +int lsm_bpf_map_restrict_fs_fd(Unit *unit) { + assert(unit); + assert(unit->manager); + + if (!unit->manager->restrict_fs) + return -ENOMEDIUM; + + return sym_bpf_map__fd(unit->manager->restrict_fs->maps.cgroup_hash); +} + +void lsm_bpf_destroy(struct restrict_fs_bpf *prog) { + restrict_fs_bpf__destroy(prog); +} +#else /* ! BPF_FRAMEWORK */ +bool lsm_bpf_supported(bool initialize) { + return false; +} + +int lsm_bpf_setup(Manager *m) { + return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "bpf-lsm: Failed to set up LSM BPF: %m"); +} + +int lsm_bpf_unit_restrict_filesystems(Unit *u, const Set *filesystems, const bool allow_list) { + return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), "bpf-lsm: Failed to restrict filesystems using LSM BPF: %m"); +} + +int lsm_bpf_cleanup(const Unit *u) { + return 0; +} + +int lsm_bpf_map_restrict_fs_fd(Unit *unit) { + return -ENOMEDIUM; +} + +void lsm_bpf_destroy(struct restrict_fs_bpf *prog) { + return; +} +#endif + +int lsm_bpf_parse_filesystem( + const char *name, + Set **filesystems, + FilesystemParseFlags flags, + const char *unit, + const char *filename, + unsigned line) { + int r; + + assert(name); + assert(filesystems); + + if (name[0] == '@') { + const FilesystemSet *set; + const char *i; + + set = filesystem_set_find(name); + if (!set) { + log_syntax(unit, flags & FILESYSTEM_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0, + "bpf-lsm: Unknown filesystem group, ignoring: %s", name); + return 0; + } + + NULSTR_FOREACH(i, set->value) { + /* Call ourselves again, for the group to parse. Note that we downgrade logging here + * (i.e. take away the FILESYSTEM_PARSE_LOG flag) since any issues in the group table + * are our own problem, not a problem in user configuration data and we shouldn't + * pretend otherwise by complaining about them. */ + r = lsm_bpf_parse_filesystem(i, filesystems, flags &~ FILESYSTEM_PARSE_LOG, unit, filename, line); + if (r < 0) + return r; + } + } else { + /* If we previously wanted to forbid access to a filesystem and now + * we want to allow it, then remove it from the list. */ + if (!(flags & FILESYSTEM_PARSE_INVERT) == !!(flags & FILESYSTEM_PARSE_ALLOW_LIST)) { + r = set_put_strdup(filesystems, name); + if (r == -ENOMEM) + return flags & FILESYSTEM_PARSE_LOG ? log_oom() : -ENOMEM; + if (r < 0 && r != -EEXIST) /* When already in set, ignore */ + return r; + } else + free(set_remove(*filesystems, name)); + } + + return 0; +} diff --git a/src/core/bpf-lsm.h b/src/core/bpf-lsm.h new file mode 100644 index 0000000..dff5812 --- /dev/null +++ b/src/core/bpf-lsm.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "hashmap.h" + +typedef enum FilesystemParseFlags { + FILESYSTEM_PARSE_INVERT = 1 << 0, + FILESYSTEM_PARSE_ALLOW_LIST = 1 << 1, + FILESYSTEM_PARSE_LOG = 1 << 2, +} FilesystemParseFlags; + +typedef struct Unit Unit; +typedef struct Manager Manager; + +typedef struct restrict_fs_bpf restrict_fs_bpf; + +bool lsm_bpf_supported(bool initialize); +int lsm_bpf_setup(Manager *m); +int lsm_bpf_unit_restrict_filesystems(Unit *u, const Set *filesystems, bool allow_list); +int lsm_bpf_cleanup(const Unit *u); +int lsm_bpf_map_restrict_fs_fd(Unit *u); +void lsm_bpf_destroy(struct restrict_fs_bpf *prog); +int lsm_bpf_parse_filesystem(const char *name, + Set **filesystems, + FilesystemParseFlags flags, + const char *unit, + const char *filename, + unsigned line); diff --git a/src/core/bpf-socket-bind.c b/src/core/bpf-socket-bind.c new file mode 100644 index 0000000..660ffdb --- /dev/null +++ b/src/core/bpf-socket-bind.c @@ -0,0 +1,244 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#if BPF_FRAMEWORK +#include <bpf/bpf.h> +#endif + +#include "fd-util.h" +#include "bpf-socket-bind.h" + +#if BPF_FRAMEWORK +/* libbpf, clang, llvm and bpftool compile time dependencies are satisfied */ +#include "bpf-dlopen.h" +#include "bpf-link.h" +#include "bpf-util.h" +#include "bpf/socket_bind/socket-bind-api.bpf.h" +#include "bpf/socket_bind/socket-bind-skel.h" + +static struct socket_bind_bpf *socket_bind_bpf_free(struct socket_bind_bpf *obj) { + /* socket_bind_bpf__destroy handles object == NULL case */ + (void) socket_bind_bpf__destroy(obj); + + return NULL; +} + +DEFINE_TRIVIAL_CLEANUP_FUNC(struct socket_bind_bpf *, socket_bind_bpf_free); + +static int update_rules_map( + int map_fd, + CGroupSocketBindItem *head) { + + uint32_t i = 0; + + assert(map_fd >= 0); + + LIST_FOREACH(socket_bind_items, item, head) { + struct socket_bind_rule val = { + .address_family = (uint32_t) item->address_family, + .protocol = item->ip_protocol, + .nr_ports = item->nr_ports, + .port_min = item->port_min, + }; + + uint32_t key = i++; + + if (sym_bpf_map_update_elem(map_fd, &key, &val, BPF_ANY) != 0) + return -errno; + } + + return 0; +} + +static int prepare_socket_bind_bpf( + Unit *u, + CGroupSocketBindItem *allow, + CGroupSocketBindItem *deny, + struct socket_bind_bpf **ret_obj) { + + _cleanup_(socket_bind_bpf_freep) struct socket_bind_bpf *obj = NULL; + size_t allow_count = 0, deny_count = 0; + int allow_map_fd, deny_map_fd, r; + + assert(ret_obj); + + LIST_FOREACH(socket_bind_items, item, allow) + allow_count++; + + LIST_FOREACH(socket_bind_items, item, deny) + deny_count++; + + if (allow_count > SOCKET_BIND_MAX_RULES) + return log_unit_full_errno(u, u ? LOG_ERR : LOG_WARNING, SYNTHETIC_ERRNO(EINVAL), + "bpf-socket-bind: Maximum number of socket bind rules=%i is exceeded", SOCKET_BIND_MAX_RULES); + + if (deny_count > SOCKET_BIND_MAX_RULES) + return log_unit_full_errno(u, u ? LOG_ERR : LOG_WARNING, SYNTHETIC_ERRNO(EINVAL), + "bpf-socket-bind: Maximum number of socket bind rules=%i is exceeded", SOCKET_BIND_MAX_RULES); + + obj = socket_bind_bpf__open(); + if (!obj) + return log_unit_full_errno(u, u ? LOG_ERR : LOG_DEBUG, errno, "bpf-socket-bind: Failed to open BPF object: %m"); + + if (sym_bpf_map__set_max_entries(obj->maps.sd_bind_allow, MAX(allow_count, 1u)) != 0) + return log_unit_full_errno(u, u ? LOG_ERR : LOG_WARNING, errno, + "bpf-socket-bind: Failed to resize BPF map '%s': %m", sym_bpf_map__name(obj->maps.sd_bind_allow)); + + if (sym_bpf_map__set_max_entries(obj->maps.sd_bind_deny, MAX(deny_count, 1u)) != 0) + return log_unit_full_errno(u, u ? LOG_ERR : LOG_WARNING, errno, + "bpf-socket-bind: Failed to resize BPF map '%s': %m", sym_bpf_map__name(obj->maps.sd_bind_deny)); + + if (socket_bind_bpf__load(obj) != 0) + return log_unit_full_errno(u, u ? LOG_ERR : LOG_DEBUG, errno, + "bpf-socket-bind: Failed to load BPF object: %m"); + + allow_map_fd = sym_bpf_map__fd(obj->maps.sd_bind_allow); + assert(allow_map_fd >= 0); + + r = update_rules_map(allow_map_fd, allow); + if (r < 0) + return log_unit_full_errno(u, u ? LOG_ERR : LOG_WARNING, r, + "bpf-socket-bind: Failed to put socket bind allow rules into BPF map '%s'", + sym_bpf_map__name(obj->maps.sd_bind_allow)); + + deny_map_fd = sym_bpf_map__fd(obj->maps.sd_bind_deny); + assert(deny_map_fd >= 0); + + r = update_rules_map(deny_map_fd, deny); + if (r < 0) + return log_unit_full_errno(u, u ? LOG_ERR : LOG_WARNING, r, + "bpf-socket-bind: Failed to put socket bind deny rules into BPF map '%s'", + sym_bpf_map__name(obj->maps.sd_bind_deny)); + + *ret_obj = TAKE_PTR(obj); + return 0; +} + +int bpf_socket_bind_supported(void) { + _cleanup_(socket_bind_bpf_freep) struct socket_bind_bpf *obj = NULL; + int r; + + if (!cgroup_bpf_supported()) + return false; + + if (!compat_libbpf_probe_bpf_prog_type(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, /*opts=*/NULL)) { + log_debug("bpf-socket-bind: BPF program type cgroup_sock_addr is not supported"); + return false; + } + + r = prepare_socket_bind_bpf(/*unit=*/NULL, /*allow_rules=*/NULL, /*deny_rules=*/NULL, &obj); + if (r < 0) { + log_debug_errno(r, "bpf-socket-bind: socket bind filtering is not supported: %m"); + return false; + } + + return bpf_can_link_program(obj->progs.sd_bind4); +} + +int bpf_socket_bind_add_initial_link_fd(Unit *u, int fd) { + int r; + + assert(u); + + if (!u->initial_socket_bind_link_fds) { + u->initial_socket_bind_link_fds = fdset_new(); + if (!u->initial_socket_bind_link_fds) + return log_oom(); + } + + r = fdset_put(u->initial_socket_bind_link_fds, fd); + if (r < 0) + return log_unit_error_errno(u, r, "bpf-socket-bind: Failed to put BPF fd %d to initial fdset", fd); + + return 0; +} + +static int socket_bind_install_impl(Unit *u) { + _cleanup_(bpf_link_freep) struct bpf_link *ipv4 = NULL, *ipv6 = NULL; + _cleanup_(socket_bind_bpf_freep) struct socket_bind_bpf *obj = NULL; + _cleanup_free_ char *cgroup_path = NULL; + _cleanup_close_ int cgroup_fd = -1; + CGroupContext *cc; + int r; + + assert(u); + + cc = unit_get_cgroup_context(u); + if (!cc) + return 0; + + r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &cgroup_path); + if (r < 0) + return log_unit_error_errno(u, r, "bpf-socket-bind: Failed to get cgroup path: %m"); + + if (!cc->socket_bind_allow && !cc->socket_bind_deny) + return 0; + + r = prepare_socket_bind_bpf(u, cc->socket_bind_allow, cc->socket_bind_deny, &obj); + if (r < 0) + return log_unit_error_errno(u, r, "bpf-socket-bind: Failed to load BPF object: %m"); + + cgroup_fd = open(cgroup_path, O_RDONLY | O_CLOEXEC, 0); + if (cgroup_fd < 0) + return log_unit_error_errno(u, errno, "bpf-socket-bind: Failed to open cgroup %s for reading: %m", cgroup_path); + + ipv4 = sym_bpf_program__attach_cgroup(obj->progs.sd_bind4, cgroup_fd); + r = sym_libbpf_get_error(ipv4); + if (r != 0) + return log_unit_error_errno(u, r, "bpf-socket-bind: Failed to link '%s' cgroup-bpf program: %m", + sym_bpf_program__name(obj->progs.sd_bind4)); + + ipv6 = sym_bpf_program__attach_cgroup(obj->progs.sd_bind6, cgroup_fd); + r = sym_libbpf_get_error(ipv6); + if (r != 0) + return log_unit_error_errno(u, r, "bpf-socket-bind: Failed to link '%s' cgroup-bpf program: %m", + sym_bpf_program__name(obj->progs.sd_bind6)); + + u->ipv4_socket_bind_link = TAKE_PTR(ipv4); + u->ipv6_socket_bind_link = TAKE_PTR(ipv6); + + return 0; +} + +int bpf_socket_bind_install(Unit *u) { + int r; + + assert(u); + + r = socket_bind_install_impl(u); + if (r == -ENOMEM) + return r; + + fdset_close(u->initial_socket_bind_link_fds); + return r; +} + +int bpf_serialize_socket_bind(Unit *u, FILE *f, FDSet *fds) { + int r; + + assert(u); + + r = bpf_serialize_link(f, fds, "ipv4-socket-bind-bpf-link", u->ipv4_socket_bind_link); + if (r < 0) + return r; + + return bpf_serialize_link(f, fds, "ipv6-socket-bind-bpf-link", u->ipv6_socket_bind_link); +} + +#else /* ! BPF_FRAMEWORK */ +int bpf_socket_bind_supported(void) { + return false; +} + +int bpf_socket_bind_add_initial_link_fd(Unit *u, int fd) { + return 0; +} + +int bpf_socket_bind_install(Unit *u) { + return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), + "bpf-socket-bind: Failed to install; BPF framework is not supported"); +} + +int bpf_serialize_socket_bind(Unit *u, FILE *f, FDSet *fds) { + return 0; +} +#endif diff --git a/src/core/bpf-socket-bind.h b/src/core/bpf-socket-bind.h new file mode 100644 index 0000000..7d426df --- /dev/null +++ b/src/core/bpf-socket-bind.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ +#pragma once + +#include "fdset.h" +#include "unit.h" + +int bpf_socket_bind_supported(void); + +/* Add BPF link fd created before daemon-reload or daemon-reexec. FDs will be closed at the end of + * socket_bind_install. */ +int bpf_socket_bind_add_initial_link_fd(Unit *u, int fd); + +int bpf_socket_bind_install(Unit *u); + +int bpf_serialize_socket_bind(Unit *u, FILE *f, FDSet *fds); diff --git a/src/core/bpf-util.c b/src/core/bpf-util.c new file mode 100644 index 0000000..84170da --- /dev/null +++ b/src/core/bpf-util.c @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include "bpf-dlopen.h" +#include "bpf-util.h" +#include "cgroup-util.h" +#include "log.h" + +bool cgroup_bpf_supported(void) { + static int supported = -1; + int r; + + if (supported >= 0) + return supported; + + r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER); + if (r < 0) { + log_warning_errno(r, "Can't determine whether the unified hierarchy is used: %m"); + return (supported = false); + } + + if (r == 0) { + log_info_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), + "Not running with unified cgroup hierarchy, disabling cgroup BPF features."); + return (supported = false); + } + + r = dlopen_bpf(); + if (r < 0) { + log_full_errno(in_initrd() ? LOG_DEBUG : LOG_INFO, + r, "Failed to open libbpf, cgroup BPF features disabled: %m"); + return (supported = false); + } + + return (supported = true); +} diff --git a/src/core/bpf-util.h b/src/core/bpf-util.h new file mode 100644 index 0000000..a6c55cd --- /dev/null +++ b/src/core/bpf-util.h @@ -0,0 +1,5 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include <stdbool.h> + +bool cgroup_bpf_supported(void); diff --git a/src/core/bpf/meson.build b/src/core/bpf/meson.build new file mode 100644 index 0000000..f654016 --- /dev/null +++ b/src/core/bpf/meson.build @@ -0,0 +1,111 @@ +# SPDX-License-Identifier: LGPL-2.1+ + +if conf.get('BPF_FRAMEWORK') != 1 + subdir_done() +endif + +bpf_clang_flags = [ + '-std=gnu11', + '-Wno-compare-distinct-pointer-types', + '-O2', + '-target', + 'bpf', + '-g', + '-c', +] + +bpf_gcc_flags = [ + '-std=gnu11', + '-O2', + '-mkernel=5.2', + '-mcpu=v3', + '-mco-re', + '-gbtf', + '-c', +] + +# Generate defines that are appropriate to tell the compiler what architecture +# we're compiling for. By default we just map meson's cpu_family to __<cpu_family>__. +# This dictionary contains the exceptions where this doesn't work. +# +# C.f. https://mesonbuild.com/Reference-tables.html#cpu-families +# and src/basic/missing_syscall_def.h. +cpu_arch_defines = { + 'ppc' : ['-D__powerpc__'], + 'ppc64' : ['-D__powerpc64__', '-D_CALL_ELF=2'], + 'riscv32' : ['-D__riscv', '-D__riscv_xlen=32'], + 'riscv64' : ['-D__riscv', '-D__riscv_xlen=64'], + 'x86' : ['-D__i386__'], + + # For arm, assume hardware fp is available. + 'arm' : ['-D__arm__', '-D__ARM_PCS_VFP'], +} + +bpf_arch_flags = cpu_arch_defines.get(host_machine.cpu_family(), + ['-D__@0@__'.format(host_machine.cpu_family())]) +if bpf_compiler == 'gcc' + bpf_arch_flags += ['-m' + host_machine.endian() + '-endian'] +endif + +libbpf_include_dir = libbpf.get_variable(pkgconfig : 'includedir') + +bpf_o_unstripped_cmd = [] +if bpf_compiler == 'clang' + bpf_o_unstripped_cmd += [ + clang, + bpf_clang_flags, + bpf_arch_flags, + ] +elif bpf_compiler == 'gcc' + bpf_o_unstripped_cmd += [ + bpf_gcc, + bpf_gcc_flags, + bpf_arch_flags, + ] +endif + +bpf_o_unstripped_cmd += ['-I.'] + +if not meson.is_cross_build() and bpf_compiler == 'clang' + target_triplet_cmd = run_command('gcc', '-dumpmachine', check: false) + if target_triplet_cmd.returncode() == 0 + target_triplet = target_triplet_cmd.stdout().strip() + bpf_o_unstripped_cmd += [ + '-isystem', + '/usr/include/@0@'.format(target_triplet) + ] + endif +endif + +bpf_o_unstripped_cmd += [ + '-idirafter', + libbpf_include_dir, + '@INPUT@', + '-o', + '@OUTPUT@' +] + +if bpftool_strip + bpf_o_cmd = [ + bpftool, + 'gen', + 'object', + '@OUTPUT@', + '@INPUT@' + ] +elif bpf_compiler == 'clang' + bpf_o_cmd = [ + llvm_strip, + '-g', + '@INPUT@', + '-o', + '@OUTPUT@' + ] +endif + +skel_h_cmd = [ + bpftool, + 'gen', + 'skeleton', + '@INPUT@' +] diff --git a/src/core/bpf/restrict_fs/meson.build b/src/core/bpf/restrict_fs/meson.build new file mode 100644 index 0000000..69cde02 --- /dev/null +++ b/src/core/bpf/restrict_fs/meson.build @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +if conf.get('BPF_FRAMEWORK') != 1 + subdir_done() +endif + +restrict_fs_bpf_o_unstripped = custom_target( + 'restrict-fs.bpf.unstripped.o', + input : 'restrict-fs.bpf.c', + output : 'restrict-fs.bpf.unstripped.o', + command : bpf_o_unstripped_cmd) + +restrict_fs_bpf_o = custom_target( + 'restrict-fs.bpf.o', + input : restrict_fs_bpf_o_unstripped, + output : 'restrict-fs.bpf.o', + command : bpf_o_cmd) + +restrict_fs_skel_h = custom_target( + 'restrict-fs.skel.h', + input : restrict_fs_bpf_o, + output : 'restrict-fs.skel.h', + command : skel_h_cmd, + capture : true) diff --git a/src/core/bpf/restrict_fs/restrict-fs-skel.h b/src/core/bpf/restrict_fs/restrict-fs-skel.h new file mode 100644 index 0000000..412cf62 --- /dev/null +++ b/src/core/bpf/restrict_fs/restrict-fs-skel.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +/* The SPDX header above is actually correct in claiming this was + * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that + * compatible with GPL we will claim this to be GPL however, which should be + * fine given that LGPL-2.1-or-later downgrades to GPL if needed. + */ + +/* libbpf is used via dlopen(), so rename symbols */ +#define bpf_object__open_skeleton sym_bpf_object__open_skeleton +#define bpf_object__load_skeleton sym_bpf_object__load_skeleton +#define bpf_object__destroy_skeleton sym_bpf_object__destroy_skeleton + +#include "bpf/restrict_fs/restrict-fs.skel.h" diff --git a/src/core/bpf/restrict_fs/restrict-fs.bpf.c b/src/core/bpf/restrict_fs/restrict-fs.bpf.c new file mode 100644 index 0000000..eb5ed3e --- /dev/null +++ b/src/core/bpf/restrict_fs/restrict-fs.bpf.c @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +/* The SPDX header above is actually correct in claiming this was + * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that + * compatible with GPL we will claim this to be GPL however, which should be + * fine given that LGPL-2.1-or-later downgrades to GPL if needed. + */ + +#include <linux/types.h> +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> +#include <errno.h> +#include <stddef.h> +#include <stdint.h> + +struct super_block { + unsigned long int s_magic; +} __attribute__((preserve_access_index)); + +struct inode { + struct super_block *i_sb; +} __attribute__((preserve_access_index)); + +struct file { + struct inode *f_inode; +} __attribute__((preserve_access_index)); + +/* + * max_entries is set from user space with the bpf_map__set_max_entries helper. + * */ +struct { + __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS); + __type(key, uint64_t); /* cgroup ID */ + __type(value, uint32_t); /* fs magic set */ +} cgroup_hash SEC(".maps"); + +SEC("lsm/file_open") +int BPF_PROG(restrict_filesystems, struct file *file, int ret) +{ + unsigned long raw_magic_number; + uint64_t cgroup_id; + uint32_t *value, *magic_map, magic_number, zero = 0, *is_allow; + + /* ret is the return value from the previous BPF program or 0 if it's + * the first hook */ + if (ret != 0) + return ret; + + BPF_CORE_READ_INTO(&raw_magic_number, file, f_inode, i_sb, s_magic); + /* super_block.s_magic is unsigned long, but magic_map keys are + * uint32_t. Using s_magic as-is would fail on big-endian systems, + * which have 64-bit unsigned long. So cast it. */ + magic_number = (uint32_t)raw_magic_number; + + cgroup_id = bpf_get_current_cgroup_id(); + + magic_map = bpf_map_lookup_elem(&cgroup_hash, &cgroup_id); + if (!magic_map) + return 0; + + is_allow = bpf_map_lookup_elem(magic_map, &zero); + if (!is_allow) + /* Malformed map, it doesn't include whether it's an allow list + * or a deny list. Allow. */ + return 0; + + if (*is_allow) { + /* Allow-list: Allow access only if magic_number present in inner map */ + if (!bpf_map_lookup_elem(magic_map, &magic_number)) + return -EPERM; + } else { + /* Deny-list: Allow access only if magic_number is not present in inner map */ + if (bpf_map_lookup_elem(magic_map, &magic_number)) + return -EPERM; + } + + return 0; +} + +static const char _license[] SEC("license") = "GPL"; diff --git a/src/core/bpf/restrict_ifaces/meson.build b/src/core/bpf/restrict_ifaces/meson.build new file mode 100644 index 0000000..5f36178 --- /dev/null +++ b/src/core/bpf/restrict_ifaces/meson.build @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +if conf.get('BPF_FRAMEWORK') != 1 + subdir_done() +endif + +restrict_ifaces_bpf_o_unstripped = custom_target( + 'restrict-ifaces.bpf.unstripped.o', + input : 'restrict-ifaces.bpf.c', + output : 'restrict-ifaces.bpf.unstripped.o', + command : bpf_o_unstripped_cmd) + +restrict_ifaces_bpf_o = custom_target( + 'restrict-ifaces.bpf.o', + input : restrict_ifaces_bpf_o_unstripped, + output : 'restrict-ifaces.bpf.o', + command : bpf_o_cmd) + +restrict_ifaces_skel_h = custom_target( + 'restrict-ifaces.skel.h', + input : restrict_ifaces_bpf_o, + output : 'restrict-ifaces.skel.h', + command : skel_h_cmd, + capture : true) diff --git a/src/core/bpf/restrict_ifaces/restrict-ifaces-skel.h b/src/core/bpf/restrict_ifaces/restrict-ifaces-skel.h new file mode 100644 index 0000000..f937490 --- /dev/null +++ b/src/core/bpf/restrict_ifaces/restrict-ifaces-skel.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +/* The SPDX header above is actually correct in claiming this was + * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that + * compatible with GPL we will claim this to be GPL however, which should be + * fine given that LGPL-2.1-or-later downgrades to GPL if needed. + */ + +/* libbpf is used via dlopen(), so rename symbols */ +#define bpf_object__open_skeleton sym_bpf_object__open_skeleton +#define bpf_object__load_skeleton sym_bpf_object__load_skeleton +#define bpf_object__destroy_skeleton sym_bpf_object__destroy_skeleton + +#include "bpf/restrict_ifaces/restrict-ifaces.skel.h" diff --git a/src/core/bpf/restrict_ifaces/restrict-ifaces.bpf.c b/src/core/bpf/restrict_ifaces/restrict-ifaces.bpf.c new file mode 100644 index 0000000..32cde5c --- /dev/null +++ b/src/core/bpf/restrict_ifaces/restrict-ifaces.bpf.c @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +/* <linux/bpf.h> must precede <bpf/bpf_helpers.h> due to integer types + * in bpf helpers signatures. + */ +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +const volatile __u8 is_allow_list = 0; + +/* Map containing the network interfaces indexes. + * The interpretation of the map depends on the value of is_allow_list. + */ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, __u32); + __type(value, __u8); +} sd_restrictif SEC(".maps"); + +#define DROP 0 +#define PASS 1 + +static __always_inline int restrict_network_interfaces_impl(const struct __sk_buff *sk) { + __u32 zero = 0, ifindex; + __u8 *lookup_result; + + ifindex = sk->ifindex; + lookup_result = bpf_map_lookup_elem(&sd_restrictif, &ifindex); + if (is_allow_list) { + /* allow-list: let the packet pass if iface in the list */ + if (lookup_result) + return PASS; + } else { + /* deny-list: let the packet pass if iface *not* in the list */ + if (!lookup_result) + return PASS; + } + + return DROP; +} + +SEC("cgroup_skb/egress") +int sd_restrictif_e(const struct __sk_buff *sk) { + return restrict_network_interfaces_impl(sk); +} + +SEC("cgroup_skb/ingress") +int sd_restrictif_i(const struct __sk_buff *sk) { + return restrict_network_interfaces_impl(sk); +} + +static const char _license[] SEC("license") = "LGPL-2.1-or-later"; diff --git a/src/core/bpf/socket_bind/meson.build b/src/core/bpf/socket_bind/meson.build new file mode 100644 index 0000000..05a2b9d --- /dev/null +++ b/src/core/bpf/socket_bind/meson.build @@ -0,0 +1,24 @@ +# SPDX-License-Identifier: LGPL-2.1-or-later + +if conf.get('BPF_FRAMEWORK') != 1 + subdir_done() +endif + +socket_bind_bpf_o_unstripped = custom_target( + 'socket-bind.bpf.unstripped.o', + input : 'socket-bind.bpf.c', + output : 'socket-bind.bpf.unstripped.o', + command : bpf_o_unstripped_cmd) + +socket_bind_bpf_o = custom_target( + 'socket-bind.bpf.o', + input : socket_bind_bpf_o_unstripped, + output : 'socket-bind.bpf.o', + command : bpf_o_cmd) + +socket_bind_skel_h = custom_target( + 'socket-bind.skel.h', + input : socket_bind_bpf_o, + output : 'socket-bind.skel.h', + command : skel_h_cmd, + capture : true) diff --git a/src/core/bpf/socket_bind/socket-bind-api.bpf.h b/src/core/bpf/socket_bind/socket-bind-api.bpf.h new file mode 100644 index 0000000..277b9bb --- /dev/null +++ b/src/core/bpf/socket_bind/socket-bind-api.bpf.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +/* The SPDX header above is actually correct in claiming this was + * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that + * compatible with GPL we will claim this to be GPL however, which should be + * fine given that LGPL-2.1-or-later downgrades to GPL if needed. + */ + +#include <linux/types.h> + +/* + * Bind rule is matched with socket fields accessible to cgroup/bind{4,6} hook + * through bpf_sock_addr struct. + * 'address_family' is expected to be one of AF_UNSPEC, AF_INET or AF_INET6. + * Matching by family is bypassed for rules with AF_UNSPEC set, which makes the + * rest of a rule applicable for both IPv4 and IPv6 addresses. + * If matching by family is either successful or bypassed, a rule and a socket + * are matched by ip protocol. + * If 'protocol' is 0, matching is bypassed. + * 'nr_ports' and 'port_min' fields specify a set of ports to match a user port + * with. + * If 'nr_ports' is 0, matching by port is bypassed, making that rule applicable + * for all possible ports, e.g. [1, 65535] range. Thus a rule with + * 'address_family', 'protocol' and 'nr_ports' equal to AF_UNSPEC, 0 and 0 + * correspondingly forms 'allow any' or 'deny any' cases. + * For positive 'nr_ports', a user_port lying in a range from 'port_min' to' + * 'port_min' + 'nr_ports' exclusively is considered to be a match. 'nr_ports' + * equalling to 1 forms a rule for a single port. + * Ports are in host order. + * + * Examples: + * AF_UNSPEC, 1, 0, 7777: match IPv4 and IPv6 addresses with 7777 user port; + * + * AF_INET, 1023, 0, 1: match IPv4 addresses with user port in [1, 1023] + * range inclusively; + * + * AF_INET6, 0, 0, 0: match IPv6 addresses; + * + * AF_UNSPEC, 0, 0, 0: match IPv4 and IPv6 addresses; + * + * AF_INET6, IPPROTO_TCP, 0, 0: match IPv6/TCP addresses. + */ + +struct socket_bind_rule { + __u32 address_family; + __u32 protocol; + __u16 nr_ports; + __u16 port_min; +}; + +#define SOCKET_BIND_MAX_RULES 128 diff --git a/src/core/bpf/socket_bind/socket-bind-skel.h b/src/core/bpf/socket_bind/socket-bind-skel.h new file mode 100644 index 0000000..e0d1626 --- /dev/null +++ b/src/core/bpf/socket_bind/socket-bind-skel.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +/* The SPDX header above is actually correct in claiming this was + * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that + * compatible with GPL we will claim this to be GPL however, which should be + * fine given that LGPL-2.1-or-later downgrades to GPL if needed. + */ + +/* libbpf is used via dlopen(), so rename symbols */ +#define bpf_object__open_skeleton sym_bpf_object__open_skeleton +#define bpf_object__load_skeleton sym_bpf_object__load_skeleton +#define bpf_object__destroy_skeleton sym_bpf_object__destroy_skeleton + +#include "bpf/socket_bind/socket-bind.skel.h" diff --git a/src/core/bpf/socket_bind/socket-bind.bpf.c b/src/core/bpf/socket_bind/socket-bind.bpf.c new file mode 100644 index 0000000..b7972a8 --- /dev/null +++ b/src/core/bpf/socket_bind/socket-bind.bpf.c @@ -0,0 +1,111 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +/* The SPDX header above is actually correct in claiming this was + * LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that + * compatible with GPL we will claim this to be GPL however, which should be + * fine given that LGPL-2.1-or-later downgrades to GPL if needed. + */ + +#include "socket-bind-api.bpf.h" +/* <linux/types.h> must precede <bpf/bpf_helpers.h> due to + * <bpf/bpf_helpers.h> does not depend from type header by design. + */ +#include <linux/types.h> +#include <bpf/bpf_endian.h> +#include <bpf/bpf_helpers.h> +#include <linux/bpf.h> +#include <netinet/in.h> +#include <stdbool.h> + +/* + * max_entries is set from user space with bpf_map__set_max_entries helper. + */ +struct socket_bind_map_t { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, __u32); + __type(value, struct socket_bind_rule); +}; + +enum socket_bind_action { + SOCKET_BIND_DENY = 0, + SOCKET_BIND_ALLOW = 1, +}; + +struct socket_bind_map_t sd_bind_allow SEC(".maps"); +struct socket_bind_map_t sd_bind_deny SEC(".maps"); + +static __always_inline bool match_af( + __u8 address_family, const struct socket_bind_rule *r) { + return r->address_family == AF_UNSPEC || address_family == r->address_family; +} + +static __always_inline bool match_protocol( + __u32 protocol, const struct socket_bind_rule *r) { + return r->protocol == 0 || r->protocol == protocol; +} + +static __always_inline bool match_user_port( + __u16 port, const struct socket_bind_rule *r) { + return r->nr_ports == 0 || + (port >= r->port_min && port < r->port_min + (__u32) r->nr_ports); +} + +static __always_inline bool match( + __u8 address_family, + __u32 protocol, + __u16 port, + const struct socket_bind_rule *r) { + return match_af(address_family, r) && + match_protocol(protocol, r) && + match_user_port(port, r); +} + +static __always_inline bool match_rules( + struct bpf_sock_addr *ctx, + struct socket_bind_map_t *rules) { + volatile __u32 user_port = ctx->user_port; + __u16 port = (__u16)bpf_ntohs(user_port); + + for (__u32 i = 0; i < SOCKET_BIND_MAX_RULES; ++i) { + const __u32 key = i; + const struct socket_bind_rule *rule = bpf_map_lookup_elem(rules, &key); + + /* Lookup returns NULL if iterator is advanced past the last + * element put in the map. */ + if (!rule) + break; + + if (match(ctx->user_family, ctx->protocol, port, rule)) + return true; + } + + return false; +} + +static __always_inline int bind_socket(struct bpf_sock_addr *ctx) { + if (match_rules(ctx, &sd_bind_allow)) + return SOCKET_BIND_ALLOW; + + if (match_rules(ctx, &sd_bind_deny)) + return SOCKET_BIND_DENY; + + return SOCKET_BIND_ALLOW; +} + +SEC("cgroup/bind4") +int sd_bind4(struct bpf_sock_addr *ctx) { + if (ctx->user_family != AF_INET || ctx->family != AF_INET) + return SOCKET_BIND_ALLOW; + + return bind_socket(ctx); +} + +SEC("cgroup/bind6") +int sd_bind6(struct bpf_sock_addr *ctx) { + if (ctx->user_family != AF_INET6 || ctx->family != AF_INET6) + return SOCKET_BIND_ALLOW; + + return bind_socket(ctx); +} + +char _license[] SEC("license") = "GPL"; |