diff options
Diffstat (limited to '')
-rw-r--r-- | net/bpf/Makefile | 2 | ||||
-rw-r--r-- | net/bpf/test_run.c | 904 | ||||
-rw-r--r-- | net/bpfilter/.gitignore | 2 | ||||
-rw-r--r-- | net/bpfilter/Kconfig | 23 | ||||
-rw-r--r-- | net/bpfilter/Makefile | 20 | ||||
-rw-r--r-- | net/bpfilter/bpfilter_kern.c | 136 | ||||
-rw-r--r-- | net/bpfilter/bpfilter_umh_blob.S | 7 | ||||
-rw-r--r-- | net/bpfilter/main.c | 64 | ||||
-rw-r--r-- | net/bpfilter/msgfmt.h | 17 |
9 files changed, 1175 insertions, 0 deletions
diff --git a/net/bpf/Makefile b/net/bpf/Makefile new file mode 100644 index 000000000..1c0a98d8c --- /dev/null +++ b/net/bpf/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_BPF_SYSCALL) := test_run.o diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c new file mode 100644 index 000000000..7df14a0e3 --- /dev/null +++ b/net/bpf/test_run.c @@ -0,0 +1,904 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2017 Facebook + */ +#include <linux/bpf.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/etherdevice.h> +#include <linux/filter.h> +#include <linux/sched/signal.h> +#include <net/bpf_sk_storage.h> +#include <net/sock.h> +#include <net/tcp.h> +#include <net/net_namespace.h> +#include <linux/error-injection.h> +#include <linux/smp.h> +#include <linux/sock_diag.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/bpf_test_run.h> + +struct bpf_test_timer { + enum { NO_PREEMPT, NO_MIGRATE } mode; + u32 i; + u64 time_start, time_spent; +}; + +static void bpf_test_timer_enter(struct bpf_test_timer *t) + __acquires(rcu) +{ + rcu_read_lock(); + if (t->mode == NO_PREEMPT) + preempt_disable(); + else + migrate_disable(); + + t->time_start = ktime_get_ns(); +} + +static void bpf_test_timer_leave(struct bpf_test_timer *t) + __releases(rcu) +{ + t->time_start = 0; + + if (t->mode == NO_PREEMPT) + preempt_enable(); + else + migrate_enable(); + rcu_read_unlock(); +} + +static bool bpf_test_timer_continue(struct bpf_test_timer *t, u32 repeat, int *err, u32 *duration) + __must_hold(rcu) +{ + t->i++; + if (t->i >= repeat) { + /* We're done. */ + t->time_spent += ktime_get_ns() - t->time_start; + do_div(t->time_spent, t->i); + *duration = t->time_spent > U32_MAX ? U32_MAX : (u32)t->time_spent; + *err = 0; + goto reset; + } + + if (signal_pending(current)) { + /* During iteration: we've been cancelled, abort. */ + *err = -EINTR; + goto reset; + } + + if (need_resched()) { + /* During iteration: we need to reschedule between runs. */ + t->time_spent += ktime_get_ns() - t->time_start; + bpf_test_timer_leave(t); + cond_resched(); + bpf_test_timer_enter(t); + } + + /* Do another round. */ + return true; + +reset: + t->i = 0; + return false; +} + +static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, + u32 *retval, u32 *time, bool xdp) +{ + struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { NULL }; + struct bpf_test_timer t = { NO_MIGRATE }; + enum bpf_cgroup_storage_type stype; + int ret; + + for_each_cgroup_storage_type(stype) { + storage[stype] = bpf_cgroup_storage_alloc(prog, stype); + if (IS_ERR(storage[stype])) { + storage[stype] = NULL; + for_each_cgroup_storage_type(stype) + bpf_cgroup_storage_free(storage[stype]); + return -ENOMEM; + } + } + + if (!repeat) + repeat = 1; + + bpf_test_timer_enter(&t); + do { + ret = bpf_cgroup_storage_set(storage); + if (ret) + break; + + if (xdp) + *retval = bpf_prog_run_xdp(prog, ctx); + else + *retval = BPF_PROG_RUN(prog, ctx); + + bpf_cgroup_storage_unset(); + + } while (bpf_test_timer_continue(&t, repeat, &ret, time)); + bpf_test_timer_leave(&t); + + for_each_cgroup_storage_type(stype) + bpf_cgroup_storage_free(storage[stype]); + + return ret; +} + +static int bpf_test_finish(const union bpf_attr *kattr, + union bpf_attr __user *uattr, const void *data, + u32 size, u32 retval, u32 duration) +{ + void __user *data_out = u64_to_user_ptr(kattr->test.data_out); + int err = -EFAULT; + u32 copy_size = size; + + /* Clamp copy if the user has provided a size hint, but copy the full + * buffer if not to retain old behaviour. + */ + if (kattr->test.data_size_out && + copy_size > kattr->test.data_size_out) { + copy_size = kattr->test.data_size_out; + err = -ENOSPC; + } + + if (data_out && copy_to_user(data_out, data, copy_size)) + goto out; + if (copy_to_user(&uattr->test.data_size_out, &size, sizeof(size))) + goto out; + if (copy_to_user(&uattr->test.retval, &retval, sizeof(retval))) + goto out; + if (copy_to_user(&uattr->test.duration, &duration, sizeof(duration))) + goto out; + if (err != -ENOSPC) + err = 0; +out: + trace_bpf_test_finish(&err); + return err; +} + +/* Integer types of various sizes and pointer combinations cover variety of + * architecture dependent calling conventions. 7+ can be supported in the + * future. + */ +__diag_push(); +__diag_ignore(GCC, 8, "-Wmissing-prototypes", + "Global functions as their definitions will be in vmlinux BTF"); +int noinline bpf_fentry_test1(int a) +{ + return a + 1; +} + +int noinline bpf_fentry_test2(int a, u64 b) +{ + return a + b; +} + +int noinline bpf_fentry_test3(char a, int b, u64 c) +{ + return a + b + c; +} + +int noinline bpf_fentry_test4(void *a, char b, int c, u64 d) +{ + return (long)a + b + c + d; +} + +int noinline bpf_fentry_test5(u64 a, void *b, short c, int d, u64 e) +{ + return a + (long)b + c + d + e; +} + +int noinline bpf_fentry_test6(u64 a, void *b, short c, int d, void *e, u64 f) +{ + return a + (long)b + c + d + (long)e + f; +} + +struct bpf_fentry_test_t { + struct bpf_fentry_test_t *a; +}; + +int noinline bpf_fentry_test7(struct bpf_fentry_test_t *arg) +{ + return (long)arg; +} + +int noinline bpf_fentry_test8(struct bpf_fentry_test_t *arg) +{ + return (long)arg->a; +} + +int noinline bpf_modify_return_test(int a, int *b) +{ + *b += 1; + return a + *b; +} +__diag_pop(); + +ALLOW_ERROR_INJECTION(bpf_modify_return_test, ERRNO); + +static void *bpf_test_init(const union bpf_attr *kattr, u32 size, + u32 headroom, u32 tailroom) +{ + void __user *data_in = u64_to_user_ptr(kattr->test.data_in); + u32 user_size = kattr->test.data_size_in; + void *data; + + if (size < ETH_HLEN || size > PAGE_SIZE - headroom - tailroom) + return ERR_PTR(-EINVAL); + + if (user_size > size) + return ERR_PTR(-EMSGSIZE); + + size = SKB_DATA_ALIGN(size); + data = kzalloc(size + headroom + tailroom, GFP_USER); + if (!data) + return ERR_PTR(-ENOMEM); + + if (copy_from_user(data + headroom, data_in, user_size)) { + kfree(data); + return ERR_PTR(-EFAULT); + } + + return data; +} + +int bpf_prog_test_run_tracing(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + struct bpf_fentry_test_t arg = {}; + u16 side_effect = 0, ret = 0; + int b = 2, err = -EFAULT; + u32 retval = 0; + + if (kattr->test.flags || kattr->test.cpu) + return -EINVAL; + + switch (prog->expected_attach_type) { + case BPF_TRACE_FENTRY: + case BPF_TRACE_FEXIT: + if (bpf_fentry_test1(1) != 2 || + bpf_fentry_test2(2, 3) != 5 || + bpf_fentry_test3(4, 5, 6) != 15 || + bpf_fentry_test4((void *)7, 8, 9, 10) != 34 || + bpf_fentry_test5(11, (void *)12, 13, 14, 15) != 65 || + bpf_fentry_test6(16, (void *)17, 18, 19, (void *)20, 21) != 111 || + bpf_fentry_test7((struct bpf_fentry_test_t *)0) != 0 || + bpf_fentry_test8(&arg) != 0) + goto out; + break; + case BPF_MODIFY_RETURN: + ret = bpf_modify_return_test(1, &b); + if (b != 2) + side_effect = 1; + break; + default: + goto out; + } + + retval = ((u32)side_effect << 16) | ret; + if (copy_to_user(&uattr->test.retval, &retval, sizeof(retval))) + goto out; + + err = 0; +out: + trace_bpf_test_finish(&err); + return err; +} + +struct bpf_raw_tp_test_run_info { + struct bpf_prog *prog; + void *ctx; + u32 retval; +}; + +static void +__bpf_prog_test_run_raw_tp(void *data) +{ + struct bpf_raw_tp_test_run_info *info = data; + + rcu_read_lock(); + info->retval = BPF_PROG_RUN(info->prog, info->ctx); + rcu_read_unlock(); +} + +int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + void __user *ctx_in = u64_to_user_ptr(kattr->test.ctx_in); + __u32 ctx_size_in = kattr->test.ctx_size_in; + struct bpf_raw_tp_test_run_info info; + int cpu = kattr->test.cpu, err = 0; + int current_cpu; + + /* doesn't support data_in/out, ctx_out, duration, or repeat */ + if (kattr->test.data_in || kattr->test.data_out || + kattr->test.ctx_out || kattr->test.duration || + kattr->test.repeat) + return -EINVAL; + + if (ctx_size_in < prog->aux->max_ctx_offset || + ctx_size_in > MAX_BPF_FUNC_ARGS * sizeof(u64)) + return -EINVAL; + + if ((kattr->test.flags & BPF_F_TEST_RUN_ON_CPU) == 0 && cpu != 0) + return -EINVAL; + + if (ctx_size_in) { + info.ctx = kzalloc(ctx_size_in, GFP_USER); + if (!info.ctx) + return -ENOMEM; + if (copy_from_user(info.ctx, ctx_in, ctx_size_in)) { + err = -EFAULT; + goto out; + } + } else { + info.ctx = NULL; + } + + info.prog = prog; + + current_cpu = get_cpu(); + if ((kattr->test.flags & BPF_F_TEST_RUN_ON_CPU) == 0 || + cpu == current_cpu) { + __bpf_prog_test_run_raw_tp(&info); + } else if (cpu >= nr_cpu_ids || !cpu_online(cpu)) { + /* smp_call_function_single() also checks cpu_online() + * after csd_lock(). However, since cpu is from user + * space, let's do an extra quick check to filter out + * invalid value before smp_call_function_single(). + */ + err = -ENXIO; + } else { + err = smp_call_function_single(cpu, __bpf_prog_test_run_raw_tp, + &info, 1); + } + put_cpu(); + + if (!err && + copy_to_user(&uattr->test.retval, &info.retval, sizeof(u32))) + err = -EFAULT; + +out: + kfree(info.ctx); + return err; +} + +static void *bpf_ctx_init(const union bpf_attr *kattr, u32 max_size) +{ + void __user *data_in = u64_to_user_ptr(kattr->test.ctx_in); + void __user *data_out = u64_to_user_ptr(kattr->test.ctx_out); + u32 size = kattr->test.ctx_size_in; + void *data; + int err; + + if (!data_in && !data_out) + return NULL; + + data = kzalloc(max_size, GFP_USER); + if (!data) + return ERR_PTR(-ENOMEM); + + if (data_in) { + err = bpf_check_uarg_tail_zero(data_in, max_size, size); + if (err) { + kfree(data); + return ERR_PTR(err); + } + + size = min_t(u32, max_size, size); + if (copy_from_user(data, data_in, size)) { + kfree(data); + return ERR_PTR(-EFAULT); + } + } + return data; +} + +static int bpf_ctx_finish(const union bpf_attr *kattr, + union bpf_attr __user *uattr, const void *data, + u32 size) +{ + void __user *data_out = u64_to_user_ptr(kattr->test.ctx_out); + int err = -EFAULT; + u32 copy_size = size; + + if (!data || !data_out) + return 0; + + if (copy_size > kattr->test.ctx_size_out) { + copy_size = kattr->test.ctx_size_out; + err = -ENOSPC; + } + + if (copy_to_user(data_out, data, copy_size)) + goto out; + if (copy_to_user(&uattr->test.ctx_size_out, &size, sizeof(size))) + goto out; + if (err != -ENOSPC) + err = 0; +out: + return err; +} + +/** + * range_is_zero - test whether buffer is initialized + * @buf: buffer to check + * @from: check from this position + * @to: check up until (excluding) this position + * + * This function returns true if the there is a non-zero byte + * in the buf in the range [from,to). + */ +static inline bool range_is_zero(void *buf, size_t from, size_t to) +{ + return !memchr_inv((u8 *)buf + from, 0, to - from); +} + +static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) +{ + struct qdisc_skb_cb *cb = (struct qdisc_skb_cb *)skb->cb; + + if (!__skb) + return 0; + + /* make sure the fields we don't use are zeroed */ + if (!range_is_zero(__skb, 0, offsetof(struct __sk_buff, mark))) + return -EINVAL; + + /* mark is allowed */ + + if (!range_is_zero(__skb, offsetofend(struct __sk_buff, mark), + offsetof(struct __sk_buff, priority))) + return -EINVAL; + + /* priority is allowed */ + + if (!range_is_zero(__skb, offsetofend(struct __sk_buff, priority), + offsetof(struct __sk_buff, ifindex))) + return -EINVAL; + + /* ifindex is allowed */ + + if (!range_is_zero(__skb, offsetofend(struct __sk_buff, ifindex), + offsetof(struct __sk_buff, cb))) + return -EINVAL; + + /* cb is allowed */ + + if (!range_is_zero(__skb, offsetofend(struct __sk_buff, cb), + offsetof(struct __sk_buff, tstamp))) + return -EINVAL; + + /* tstamp is allowed */ + /* wire_len is allowed */ + /* gso_segs is allowed */ + + if (!range_is_zero(__skb, offsetofend(struct __sk_buff, gso_segs), + offsetof(struct __sk_buff, gso_size))) + return -EINVAL; + + /* gso_size is allowed */ + + if (!range_is_zero(__skb, offsetofend(struct __sk_buff, gso_size), + sizeof(struct __sk_buff))) + return -EINVAL; + + skb->mark = __skb->mark; + skb->priority = __skb->priority; + skb->tstamp = __skb->tstamp; + memcpy(&cb->data, __skb->cb, QDISC_CB_PRIV_LEN); + + if (__skb->wire_len == 0) { + cb->pkt_len = skb->len; + } else { + if (__skb->wire_len < skb->len || + __skb->wire_len > GSO_MAX_SIZE) + return -EINVAL; + cb->pkt_len = __skb->wire_len; + } + + if (__skb->gso_segs > GSO_MAX_SEGS) + return -EINVAL; + skb_shinfo(skb)->gso_segs = __skb->gso_segs; + skb_shinfo(skb)->gso_size = __skb->gso_size; + + return 0; +} + +static void convert_skb_to___skb(struct sk_buff *skb, struct __sk_buff *__skb) +{ + struct qdisc_skb_cb *cb = (struct qdisc_skb_cb *)skb->cb; + + if (!__skb) + return; + + __skb->mark = skb->mark; + __skb->priority = skb->priority; + __skb->ifindex = skb->dev->ifindex; + __skb->tstamp = skb->tstamp; + memcpy(__skb->cb, &cb->data, QDISC_CB_PRIV_LEN); + __skb->wire_len = cb->pkt_len; + __skb->gso_segs = skb_shinfo(skb)->gso_segs; +} + +static struct proto bpf_dummy_proto = { + .name = "bpf_dummy", + .owner = THIS_MODULE, + .obj_size = sizeof(struct sock), +}; + +int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + bool is_l2 = false, is_direct_pkt_access = false; + struct net *net = current->nsproxy->net_ns; + struct net_device *dev = net->loopback_dev; + u32 size = kattr->test.data_size_in; + u32 repeat = kattr->test.repeat; + struct __sk_buff *ctx = NULL; + u32 retval, duration; + int hh_len = ETH_HLEN; + struct sk_buff *skb; + struct sock *sk; + void *data; + int ret; + + if (kattr->test.flags || kattr->test.cpu) + return -EINVAL; + + data = bpf_test_init(kattr, size, NET_SKB_PAD + NET_IP_ALIGN, + SKB_DATA_ALIGN(sizeof(struct skb_shared_info))); + if (IS_ERR(data)) + return PTR_ERR(data); + + ctx = bpf_ctx_init(kattr, sizeof(struct __sk_buff)); + if (IS_ERR(ctx)) { + kfree(data); + return PTR_ERR(ctx); + } + + switch (prog->type) { + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_SCHED_ACT: + is_l2 = true; + fallthrough; + case BPF_PROG_TYPE_LWT_IN: + case BPF_PROG_TYPE_LWT_OUT: + case BPF_PROG_TYPE_LWT_XMIT: + is_direct_pkt_access = true; + break; + default: + break; + } + + sk = sk_alloc(net, AF_UNSPEC, GFP_USER, &bpf_dummy_proto, 1); + if (!sk) { + kfree(data); + kfree(ctx); + return -ENOMEM; + } + sock_init_data(NULL, sk); + + skb = build_skb(data, 0); + if (!skb) { + kfree(data); + kfree(ctx); + sk_free(sk); + return -ENOMEM; + } + skb->sk = sk; + + skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); + __skb_put(skb, size); + if (ctx && ctx->ifindex > 1) { + dev = dev_get_by_index(net, ctx->ifindex); + if (!dev) { + ret = -ENODEV; + goto out; + } + } + skb->protocol = eth_type_trans(skb, dev); + skb_reset_network_header(skb); + + switch (skb->protocol) { + case htons(ETH_P_IP): + sk->sk_family = AF_INET; + if (sizeof(struct iphdr) <= skb_headlen(skb)) { + sk->sk_rcv_saddr = ip_hdr(skb)->saddr; + sk->sk_daddr = ip_hdr(skb)->daddr; + } + break; +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): + sk->sk_family = AF_INET6; + if (sizeof(struct ipv6hdr) <= skb_headlen(skb)) { + sk->sk_v6_rcv_saddr = ipv6_hdr(skb)->saddr; + sk->sk_v6_daddr = ipv6_hdr(skb)->daddr; + } + break; +#endif + default: + break; + } + + if (is_l2) + __skb_push(skb, hh_len); + if (is_direct_pkt_access) + bpf_compute_data_pointers(skb); + ret = convert___skb_to_skb(skb, ctx); + if (ret) + goto out; + ret = bpf_test_run(prog, skb, repeat, &retval, &duration, false); + if (ret) + goto out; + if (!is_l2) { + if (skb_headroom(skb) < hh_len) { + int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb)); + + if (pskb_expand_head(skb, nhead, 0, GFP_USER)) { + ret = -ENOMEM; + goto out; + } + } + memset(__skb_push(skb, hh_len), 0, hh_len); + } + convert_skb_to___skb(skb, ctx); + + size = skb->len; + /* bpf program can never convert linear skb to non-linear */ + if (WARN_ON_ONCE(skb_is_nonlinear(skb))) + size = skb_headlen(skb); + ret = bpf_test_finish(kattr, uattr, skb->data, size, retval, duration); + if (!ret) + ret = bpf_ctx_finish(kattr, uattr, ctx, + sizeof(struct __sk_buff)); +out: + if (dev && dev != net->loopback_dev) + dev_put(dev); + kfree_skb(skb); + sk_free(sk); + kfree(ctx); + return ret; +} + +int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + u32 tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + u32 headroom = XDP_PACKET_HEADROOM; + u32 size = kattr->test.data_size_in; + u32 repeat = kattr->test.repeat; + struct netdev_rx_queue *rxqueue; + struct xdp_buff xdp = {}; + u32 retval, duration; + u32 max_data_sz; + void *data; + int ret; + + if (prog->expected_attach_type == BPF_XDP_DEVMAP || + prog->expected_attach_type == BPF_XDP_CPUMAP) + return -EINVAL; + if (kattr->test.ctx_in || kattr->test.ctx_out) + return -EINVAL; + + /* XDP have extra tailroom as (most) drivers use full page */ + max_data_sz = 4096 - headroom - tailroom; + + data = bpf_test_init(kattr, max_data_sz, headroom, tailroom); + if (IS_ERR(data)) + return PTR_ERR(data); + + xdp.data_hard_start = data; + xdp.data = data + headroom; + xdp.data_meta = xdp.data; + xdp.data_end = xdp.data + size; + xdp.frame_sz = headroom + max_data_sz + tailroom; + + rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0); + xdp.rxq = &rxqueue->xdp_rxq; + bpf_prog_change_xdp(NULL, prog); + ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration, true); + if (ret) + goto out; + if (xdp.data != data + headroom || xdp.data_end != xdp.data + size) + size = xdp.data_end - xdp.data; + ret = bpf_test_finish(kattr, uattr, xdp.data, size, retval, duration); +out: + bpf_prog_change_xdp(prog, NULL); + kfree(data); + return ret; +} + +static int verify_user_bpf_flow_keys(struct bpf_flow_keys *ctx) +{ + /* make sure the fields we don't use are zeroed */ + if (!range_is_zero(ctx, 0, offsetof(struct bpf_flow_keys, flags))) + return -EINVAL; + + /* flags is allowed */ + + if (!range_is_zero(ctx, offsetofend(struct bpf_flow_keys, flags), + sizeof(struct bpf_flow_keys))) + return -EINVAL; + + return 0; +} + +int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + struct bpf_test_timer t = { NO_PREEMPT }; + u32 size = kattr->test.data_size_in; + struct bpf_flow_dissector ctx = {}; + u32 repeat = kattr->test.repeat; + struct bpf_flow_keys *user_ctx; + struct bpf_flow_keys flow_keys; + const struct ethhdr *eth; + unsigned int flags = 0; + u32 retval, duration; + void *data; + int ret; + + if (prog->type != BPF_PROG_TYPE_FLOW_DISSECTOR) + return -EINVAL; + + if (kattr->test.flags || kattr->test.cpu) + return -EINVAL; + + if (size < ETH_HLEN) + return -EINVAL; + + data = bpf_test_init(kattr, size, 0, 0); + if (IS_ERR(data)) + return PTR_ERR(data); + + eth = (struct ethhdr *)data; + + if (!repeat) + repeat = 1; + + user_ctx = bpf_ctx_init(kattr, sizeof(struct bpf_flow_keys)); + if (IS_ERR(user_ctx)) { + kfree(data); + return PTR_ERR(user_ctx); + } + if (user_ctx) { + ret = verify_user_bpf_flow_keys(user_ctx); + if (ret) + goto out; + flags = user_ctx->flags; + } + + ctx.flow_keys = &flow_keys; + ctx.data = data; + ctx.data_end = (__u8 *)data + size; + + bpf_test_timer_enter(&t); + do { + retval = bpf_flow_dissect(prog, &ctx, eth->h_proto, ETH_HLEN, + size, flags); + } while (bpf_test_timer_continue(&t, repeat, &ret, &duration)); + bpf_test_timer_leave(&t); + + if (ret < 0) + goto out; + + ret = bpf_test_finish(kattr, uattr, &flow_keys, sizeof(flow_keys), + retval, duration); + if (!ret) + ret = bpf_ctx_finish(kattr, uattr, user_ctx, + sizeof(struct bpf_flow_keys)); + +out: + kfree(user_ctx); + kfree(data); + return ret; +} + +int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + struct bpf_test_timer t = { NO_PREEMPT }; + struct bpf_prog_array *progs = NULL; + struct bpf_sk_lookup_kern ctx = {}; + u32 repeat = kattr->test.repeat; + struct bpf_sk_lookup *user_ctx; + u32 retval, duration; + int ret = -EINVAL; + + if (prog->type != BPF_PROG_TYPE_SK_LOOKUP) + return -EINVAL; + + if (kattr->test.flags || kattr->test.cpu) + return -EINVAL; + + if (kattr->test.data_in || kattr->test.data_size_in || kattr->test.data_out || + kattr->test.data_size_out) + return -EINVAL; + + if (!repeat) + repeat = 1; + + user_ctx = bpf_ctx_init(kattr, sizeof(*user_ctx)); + if (IS_ERR(user_ctx)) + return PTR_ERR(user_ctx); + + if (!user_ctx) + return -EINVAL; + + if (user_ctx->sk) + goto out; + + if (!range_is_zero(user_ctx, offsetofend(typeof(*user_ctx), local_port), sizeof(*user_ctx))) + goto out; + + if (user_ctx->local_port > U16_MAX || user_ctx->remote_port > U16_MAX) { + ret = -ERANGE; + goto out; + } + + ctx.family = (u16)user_ctx->family; + ctx.protocol = (u16)user_ctx->protocol; + ctx.dport = (u16)user_ctx->local_port; + ctx.sport = (__force __be16)user_ctx->remote_port; + + switch (ctx.family) { + case AF_INET: + ctx.v4.daddr = (__force __be32)user_ctx->local_ip4; + ctx.v4.saddr = (__force __be32)user_ctx->remote_ip4; + break; + +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + ctx.v6.daddr = (struct in6_addr *)user_ctx->local_ip6; + ctx.v6.saddr = (struct in6_addr *)user_ctx->remote_ip6; + break; +#endif + + default: + ret = -EAFNOSUPPORT; + goto out; + } + + progs = bpf_prog_array_alloc(1, GFP_KERNEL); + if (!progs) { + ret = -ENOMEM; + goto out; + } + + progs->items[0].prog = prog; + + bpf_test_timer_enter(&t); + do { + ctx.selected_sk = NULL; + retval = BPF_PROG_SK_LOOKUP_RUN_ARRAY(progs, ctx, BPF_PROG_RUN); + } while (bpf_test_timer_continue(&t, repeat, &ret, &duration)); + bpf_test_timer_leave(&t); + + if (ret < 0) + goto out; + + user_ctx->cookie = 0; + if (ctx.selected_sk) { + if (ctx.selected_sk->sk_reuseport && !ctx.no_reuseport) { + ret = -EOPNOTSUPP; + goto out; + } + + user_ctx->cookie = sock_gen_cookie(ctx.selected_sk); + } + + ret = bpf_test_finish(kattr, uattr, NULL, 0, retval, duration); + if (!ret) + ret = bpf_ctx_finish(kattr, uattr, user_ctx, sizeof(*user_ctx)); + +out: + bpf_prog_array_free(progs); + kfree(user_ctx); + return ret; +} diff --git a/net/bpfilter/.gitignore b/net/bpfilter/.gitignore new file mode 100644 index 000000000..f34e85ee8 --- /dev/null +++ b/net/bpfilter/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +bpfilter_umh diff --git a/net/bpfilter/Kconfig b/net/bpfilter/Kconfig new file mode 100644 index 000000000..8ad0233ce --- /dev/null +++ b/net/bpfilter/Kconfig @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: GPL-2.0-only +menuconfig BPFILTER + bool "BPF based packet filtering framework (BPFILTER)" + depends on NET && BPF && INET + select USERMODE_DRIVER + help + This builds experimental bpfilter framework that is aiming to + provide netfilter compatible functionality via BPF + +if BPFILTER +config BPFILTER_UMH + tristate "bpfilter kernel module with user mode helper" + depends on CC_CAN_LINK + depends on m || CC_CAN_LINK_STATIC + default m + help + This builds bpfilter kernel module with embedded user mode helper + + Note: To compile this as built-in, your toolchain must support + building static binaries, since rootfs isn't mounted at the time + when __init functions are called and do_execv won't be able to find + the elf interpreter. +endif diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile new file mode 100644 index 000000000..cdac82b8c --- /dev/null +++ b/net/bpfilter/Makefile @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for the Linux BPFILTER layer. +# + +userprogs := bpfilter_umh +bpfilter_umh-objs := main.o +userccflags += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi + +ifeq ($(CONFIG_BPFILTER_UMH), y) +# builtin bpfilter_umh should be linked with -static +# since rootfs isn't mounted at the time of __init +# function is called and do_execv won't find elf interpreter +userldflags += -static +endif + +$(obj)/bpfilter_umh_blob.o: $(obj)/bpfilter_umh + +obj-$(CONFIG_BPFILTER_UMH) += bpfilter.o +bpfilter-objs += bpfilter_kern.o bpfilter_umh_blob.o diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c new file mode 100644 index 000000000..51a941b56 --- /dev/null +++ b/net/bpfilter/bpfilter_kern.c @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: GPL-2.0 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/init.h> +#include <linux/module.h> +#include <linux/umh.h> +#include <linux/bpfilter.h> +#include <linux/sched.h> +#include <linux/sched/signal.h> +#include <linux/fs.h> +#include <linux/file.h> +#include "msgfmt.h" + +extern char bpfilter_umh_start; +extern char bpfilter_umh_end; + +static void shutdown_umh(void) +{ + struct umd_info *info = &bpfilter_ops.info; + struct pid *tgid = info->tgid; + + if (tgid) { + kill_pid(tgid, SIGKILL, 1); + wait_event(tgid->wait_pidfd, thread_group_exited(tgid)); + bpfilter_umh_cleanup(info); + } +} + +static void __stop_umh(void) +{ + if (IS_ENABLED(CONFIG_INET)) + shutdown_umh(); +} + +static int bpfilter_send_req(struct mbox_request *req) +{ + struct mbox_reply reply; + loff_t pos = 0; + ssize_t n; + + if (!bpfilter_ops.info.tgid) + return -EFAULT; + pos = 0; + n = kernel_write(bpfilter_ops.info.pipe_to_umh, req, sizeof(*req), + &pos); + if (n != sizeof(*req)) { + pr_err("write fail %zd\n", n); + goto stop; + } + pos = 0; + n = kernel_read(bpfilter_ops.info.pipe_from_umh, &reply, sizeof(reply), + &pos); + if (n != sizeof(reply)) { + pr_err("read fail %zd\n", n); + goto stop; + } + return reply.status; +stop: + __stop_umh(); + return -EFAULT; +} + +static int bpfilter_process_sockopt(struct sock *sk, int optname, + sockptr_t optval, unsigned int optlen, + bool is_set) +{ + struct mbox_request req = { + .is_set = is_set, + .pid = current->pid, + .cmd = optname, + .addr = (uintptr_t)optval.user, + .len = optlen, + }; + if (uaccess_kernel() || sockptr_is_kernel(optval)) { + pr_err("kernel access not supported\n"); + return -EFAULT; + } + return bpfilter_send_req(&req); +} + +static int start_umh(void) +{ + struct mbox_request req = { .pid = current->pid }; + int err; + + /* fork usermode process */ + err = fork_usermode_driver(&bpfilter_ops.info); + if (err) + return err; + pr_info("Loaded bpfilter_umh pid %d\n", pid_nr(bpfilter_ops.info.tgid)); + + /* health check that usermode process started correctly */ + if (bpfilter_send_req(&req) != 0) { + shutdown_umh(); + return -EFAULT; + } + + return 0; +} + +static int __init load_umh(void) +{ + int err; + + err = umd_load_blob(&bpfilter_ops.info, + &bpfilter_umh_start, + &bpfilter_umh_end - &bpfilter_umh_start); + if (err) + return err; + + mutex_lock(&bpfilter_ops.lock); + err = start_umh(); + if (!err && IS_ENABLED(CONFIG_INET)) { + bpfilter_ops.sockopt = &bpfilter_process_sockopt; + bpfilter_ops.start = &start_umh; + } + mutex_unlock(&bpfilter_ops.lock); + if (err) + umd_unload_blob(&bpfilter_ops.info); + return err; +} + +static void __exit fini_umh(void) +{ + mutex_lock(&bpfilter_ops.lock); + if (IS_ENABLED(CONFIG_INET)) { + shutdown_umh(); + bpfilter_ops.start = NULL; + bpfilter_ops.sockopt = NULL; + } + mutex_unlock(&bpfilter_ops.lock); + + umd_unload_blob(&bpfilter_ops.info); +} +module_init(load_umh); +module_exit(fini_umh); +MODULE_LICENSE("GPL"); diff --git a/net/bpfilter/bpfilter_umh_blob.S b/net/bpfilter/bpfilter_umh_blob.S new file mode 100644 index 000000000..40311d10d --- /dev/null +++ b/net/bpfilter/bpfilter_umh_blob.S @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + .section .init.rodata, "a" + .global bpfilter_umh_start +bpfilter_umh_start: + .incbin "net/bpfilter/bpfilter_umh" + .global bpfilter_umh_end +bpfilter_umh_end: diff --git a/net/bpfilter/main.c b/net/bpfilter/main.c new file mode 100644 index 000000000..291a92546 --- /dev/null +++ b/net/bpfilter/main.c @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include <sys/uio.h> +#include <errno.h> +#include <stdio.h> +#include <sys/socket.h> +#include <fcntl.h> +#include <unistd.h> +#include "../../include/uapi/linux/bpf.h" +#include <asm/unistd.h> +#include "msgfmt.h" + +FILE *debug_f; + +static int handle_get_cmd(struct mbox_request *cmd) +{ + switch (cmd->cmd) { + case 0: + return 0; + default: + break; + } + return -ENOPROTOOPT; +} + +static int handle_set_cmd(struct mbox_request *cmd) +{ + return -ENOPROTOOPT; +} + +static void loop(void) +{ + while (1) { + struct mbox_request req; + struct mbox_reply reply; + int n; + + n = read(0, &req, sizeof(req)); + if (n != sizeof(req)) { + fprintf(debug_f, "invalid request %d\n", n); + return; + } + + reply.status = req.is_set ? + handle_set_cmd(&req) : + handle_get_cmd(&req); + + n = write(1, &reply, sizeof(reply)); + if (n != sizeof(reply)) { + fprintf(debug_f, "reply failed %d\n", n); + return; + } + } +} + +int main(void) +{ + debug_f = fopen("/dev/kmsg", "w"); + setvbuf(debug_f, 0, _IOLBF, 0); + fprintf(debug_f, "<5>Started bpfilter\n"); + loop(); + fclose(debug_f); + return 0; +} diff --git a/net/bpfilter/msgfmt.h b/net/bpfilter/msgfmt.h new file mode 100644 index 000000000..98d121c62 --- /dev/null +++ b/net/bpfilter/msgfmt.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _NET_BPFILTER_MSGFMT_H +#define _NET_BPFILTER_MSGFMT_H + +struct mbox_request { + __u64 addr; + __u32 len; + __u32 is_set; + __u32 cmd; + __u32 pid; +}; + +struct mbox_reply { + __u32 status; +}; + +#endif |